def _log_normalization(self, concentration=None, name='log_normalization'): """Returns the log normalization of a CholeskyLKJ distribution. Args: concentration: `float` or `double` `Tensor`. The positive concentration parameter of the CholeskyLKJ distributions. name: Python `str` name prefixed to Ops created by this function. Returns: log_z: A Tensor of the same shape and dtype as `concentration`, containing the corresponding log normalizers. """ # The formula is from D. Lewandowski et al [1], p. 1999, from the # proof that eqs 16 and 17 are equivalent. # Instead of using a for loop for k from 1 to (dimension - 1), we will # vectorize the computation by performing operations on the vector # `dimension_range = np.arange(1, dimension)`. with tf.name_scope(name or 'log_normalization_lkj'): if concentration is None: concentration = tf.convert_to_tensor(self.concentration) logpi = float(np.log(np.pi)) dimension_range = np.arange( 1., self.dimension, dtype=dtype_util.as_numpy_dtype(concentration.dtype)) effective_concentration = ( concentration[..., tf.newaxis] + (self.dimension - 1 - dimension_range) / 2.) ans = tf.reduce_sum( tfp_math.log_gamma_difference(dimension_range / 2., effective_concentration), axis=-1) # Then we add to `ans` the sum of `logpi / 2 * k` for `k` run from 1 to # `dimension - 1`. ans = ans + logpi * (self.dimension * (self.dimension - 1) / 4.) return ans
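The normalizer above is eq. 17 of Lewandowski et al. (2009) in vectorized form. As a rough sketch (not the library implementation), the same quantity can be computed with NumPy/SciPy for a scalar concentration; `lkj_log_normalization` is a hypothetical helper and it assumes `tfp_math.log_gamma_difference(x, y)` equals `lgamma(y) - lgamma(x + y)`.

```python
import numpy as np
from scipy.special import gammaln

def lkj_log_normalization(concentration, dimension):
  """Log normalizer of the LKJ density for one scalar concentration (sketch)."""
  k = np.arange(1., dimension)                              # k = 1, ..., dimension - 1
  effective = concentration + (dimension - 1. - k) / 2.
  # log Gamma(effective) - log Gamma(effective + k / 2), summed over k.
  log_gamma_diff = gammaln(effective) - gammaln(effective + k / 2.)
  return np.sum(log_gamma_diff) + np.log(np.pi) * dimension * (dimension - 1.) / 4.
```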
def _sample_n(self, n, seed=None): temperature = tf.convert_to_tensor(self.temperature) logits = self._logits_parameter_no_checks() # Uniform variates must be sampled from the open-interval `(0, 1)` rather # than `[0, 1)`. To do so, we use # `np.finfo(dtype_util.as_numpy_dtype(self.dtype)).tiny` because it is the # smallest, positive, 'normal' number. A 'normal' number is such that the # mantissa has an implicit leading 1. Normal, positive numbers x, y have the # reasonable property that, `x + y >= max(x, y)`. In this case, a subnormal # number (i.e., np.nextafter) can cause us to sample 0. uniform_shape = tf.concat( [[n], self._batch_shape_tensor(temperature=temperature, logits=logits), self._event_shape_tensor(logits=logits)], 0) uniform = tf.random.uniform( shape=uniform_shape, minval=np.finfo(dtype_util.as_numpy_dtype(self.dtype)).tiny, maxval=1., dtype=self.dtype, seed=seed) gumbel = -tf.math.log(-tf.math.log(uniform)) noisy_logits = (gumbel + logits) / temperature[..., tf.newaxis] return tf.math.log_softmax(noisy_logits)
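The Gumbel-perturbation-plus-softmax construction used above has a compact plain-NumPy analogue; `sample_relaxed_log_probs` is a hypothetical helper, and the `tiny` lower bound mirrors the open-interval trick described in the comment.

```python
import numpy as np

def sample_relaxed_log_probs(logits, temperature, n, rng=np.random.default_rng(0)):
  """Draws `n` log-space relaxed one-hot samples for a single `logits` vector."""
  tiny = np.finfo(logits.dtype).tiny            # keep uniforms in the open interval (0, 1)
  u = rng.uniform(low=tiny, high=1., size=(n,) + logits.shape)
  gumbel = -np.log(-np.log(u))
  noisy = (gumbel + logits) / temperature
  m = noisy.max(axis=-1, keepdims=True)         # log_softmax(x) = x - logsumexp(x)
  return noisy - (m + np.log(np.sum(np.exp(noisy - m), axis=-1, keepdims=True)))

samples = sample_relaxed_log_probs(np.array([0.5, 1.5, -1.0]), temperature=0.5, n=4)
```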
def _owens_t_method1(h, a, m): """OwensT Method T1 using series expansions.""" # Method T1, which is evaluation of a particular series expansion of OwensT. dtype = dtype_util.common_dtype([h, a], tf.float32) numpy_dtype = dtype_util.as_numpy_dtype(dtype) neg_half_h_squared = -0.5 * tf.math.square(h) a_squared = tf.math.square(a) def series_evaluation(should_stop, index, ai, di, gi, series_sum): new_ai = a_squared * ai new_di = gi - di new_gi = neg_half_h_squared / index * gi new_series_sum = tf.where( should_stop, series_sum, series_sum + new_di * new_ai / (2. * index - 1.)) should_stop = index >= m return should_stop, index + 1., new_ai, new_di, new_gi, new_series_sum broadcast_shape = prefer_static.broadcast_shape(prefer_static.shape(h), prefer_static.shape(a)) initial_ai = a / numpy_dtype(2 * np.pi) initial_di = tf.math.expm1(neg_half_h_squared) initial_gi = neg_half_h_squared * tf.math.exp(neg_half_h_squared) initial_sum = (tf.math.atan(a) / numpy_dtype(2 * np.pi) + initial_ai * initial_di) (_, _, _, _, _, series_sum) = tf.while_loop( cond=lambda stop, *_: tf.reduce_any(~stop), body=series_evaluation, loop_vars=(tf.zeros(broadcast_shape, dtype=tf.bool), tf.cast(2., dtype=dtype), initial_ai, initial_di, initial_gi, initial_sum)) return series_sum
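For a scalar `(h, a)` pair the vectorized loop above can be transcribed directly into NumPy with a fixed number of terms; `owens_t_series` is a hypothetical transcription (not the library routine), and the SciPy comparison assumes a SciPy version that ships `scipy.special.owens_t`.

```python
import numpy as np

def owens_t_series(h, a, num_terms=30):
  """Scalar transcription of the T1 series loop above."""
  neg_half_h_sq = -0.5 * h**2
  a_sq = a**2
  ai = a / (2. * np.pi)
  di = np.expm1(neg_half_h_sq)
  gi = neg_half_h_sq * np.exp(neg_half_h_sq)
  series_sum = np.arctan(a) / (2. * np.pi) + ai * di
  index = 2.
  for _ in range(num_terms):
    ai = a_sq * ai
    new_di = gi - di
    gi = neg_half_h_sq / index * gi
    series_sum = series_sum + new_di * ai / (2. * index - 1.)
    di = new_di
    index += 1.
  return series_sum

# Spot check for small h and |a| <= 1:
# from scipy.special import owens_t; owens_t(0.3, 0.5) should agree closely.
print(owens_t_series(0.3, 0.5))
```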
def _sample_n(self, n, seed=None): loc = tf.convert_to_tensor(self.loc) concentration = tf.convert_to_tensor(self.concentration) concentration = tf.broadcast_to( concentration, self._batch_shape_tensor(loc=loc, concentration=concentration)) # random_von_mises does not work for zero concentration, so round it up to # something very small. tiny = np.finfo(dtype_util.as_numpy_dtype(self.dtype)).tiny concentration = tf.maximum(concentration, tiny) sample_batch_shape = tf.concat( [[n], prefer_static.shape(concentration)], axis=0) samples = random_von_mises(sample_batch_shape, concentration, dtype=self.dtype, seed=seed) # vonMises(0, concentration) -> vonMises(loc, concentration) samples = samples + loc # Map the samples to [-pi, pi]. samples = samples - 2. * np.pi * tf.round(samples / (2. * np.pi)) return samples
def _log_prob(self, x): x = tf.convert_to_tensor(value=x, name='x') right_indices = tf.minimum( tf.size(input=self.outcomes) - 1, tf.reshape( tf.searchsorted(self.outcomes, values=tf.reshape(x, shape=[-1]), side='right'), dist_util.prefer_static_shape(x))) use_right_indices = self._is_equal_or_close( x, tf.gather(self.outcomes, indices=right_indices)) left_indices = tf.maximum(0, right_indices - 1) use_left_indices = self._is_equal_or_close( x, tf.gather(self.outcomes, indices=left_indices)) log_probs = self._categorical.log_prob( tf1.where(use_left_indices, left_indices, right_indices)) should_be_neg_inf = tf.broadcast_to( tf.logical_not(use_left_indices | use_right_indices), shape=dist_util.prefer_static_shape(log_probs)) return tf1.where( should_be_neg_inf, tf.fill(dist_util.prefer_static_shape(should_be_neg_inf), dtype_util.as_numpy_dtype(log_probs.dtype)(-np.inf)), log_probs)
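The searchsorted-based lookup (check the right neighbor, fall back to the left neighbor, and return -inf for off-support points) is easy to mirror in NumPy; `finite_discrete_log_prob`, `outcomes`, and `log_probs` below are hypothetical names for a 1-D sketch.

```python
import numpy as np

outcomes = np.array([0., 1.5, 2., 10.])
log_probs = np.log(np.array([0.1, 0.2, 0.3, 0.4]))

def finite_discrete_log_prob(x):
  """Maps `x` onto `outcomes` via searchsorted; off-support points get -inf."""
  idx = np.minimum(np.searchsorted(outcomes, x, side='right'), outcomes.size - 1)
  left = np.maximum(idx - 1, 0)
  use_right = np.isclose(x, outcomes[idx])
  use_left = np.isclose(x, outcomes[left])
  chosen = np.where(use_left, left, idx)
  return np.where(use_left | use_right, log_probs[chosen], -np.inf)

print(finite_discrete_log_prob(np.array([1.5, 3., 10.])))   # [log(0.2), -inf, log(0.4)]
```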
def testPoissonLogPmfContinuousRelaxation(self): batch_size = 12 lam = tf.constant([3.0] * batch_size) x = np.array([-3., -0.5, 0., 2., 2.2, 3., 3.1, 4., 5., 5.5, 6., 7.]).astype(np.float32) poisson = self._make_poisson(rate=lam, interpolate_nondiscrete=True) expected_continuous_log_pmf = (x * poisson.log_rate - tf.math.lgamma(1. + x) - poisson.rate) expected_continuous_log_pmf = tf.where( x >= 0., expected_continuous_log_pmf, dtype_util.as_numpy_dtype( expected_continuous_log_pmf.dtype)(-np.inf)) expected_continuous_pmf = tf.exp(expected_continuous_log_pmf) log_pmf = poisson.log_prob(x) self.assertEqual((batch_size, ), log_pmf.shape) self.assertAllClose(self.evaluate(log_pmf), self.evaluate(expected_continuous_log_pmf)) pmf = poisson.prob(x) self.assertEqual((batch_size, ), pmf.shape) self.assertAllClose(self.evaluate(pmf), self.evaluate(expected_continuous_pmf))
def verify_expectations(self, dimension, dtype): num_samples = int(1e6) # pylint: disable=protected-access x = tfd.lkj._tril_spherical_uniform(dimension=dimension, batch_shape=[num_samples], dtype=dtype, seed=test_util.test_seed()) # pylint: enable=protected-access self.assertEqual(dtype, dtype_util.as_numpy_dtype(x.dtype)) final_shape = [num_samples, dimension, dimension] self.assertAllEqual(final_shape, x.shape) sample_mean = tf.reduce_mean(x, axis=0) sample_var = tf.reduce_mean(tf.math.squared_difference(x, sample_mean), axis=0) samples, sample_mean, sample_var = self.evaluate( [x, sample_mean, sample_var]) self.assertAllMeansClose(samples, np.zeros_like(sample_mean), axis=0, atol=3e-3, rtol=1e-3) expected_var = np.tril(np.ones([dimension, dimension], dtype=dtype)) expected_var = expected_var / np.arange(1, dimension + 1)[..., None] self.assertAllClose(expected_var, sample_var, atol=2e-3, rtol=1e-2)
def _bessel_kve_naive(v, z): """Compute bessel_kve(v, z).""" dtype = dtype_util.common_dtype([v, z], tf.float32) numpy_dtype = dtype_util.as_numpy_dtype(dtype) v = tf.convert_to_tensor(v, dtype=dtype) z = tf.convert_to_tensor(z, dtype=dtype) # K_{-v} == K_{v} for negative values. v = tf.math.abs(v) z_abs = tf.math.abs(z) # Handle the zero case specially. z_abs = tf.where(tf.math.equal(z_abs, 0.), numpy_dtype(1.), z_abs) small_v = tf.where(v < 50., v, numpy_dtype(0.1)) large_v = tf.where(v >= 50., v, numpy_dtype(1000.)) _, olver_kve = _olver_asymptotic_uniform(large_v, z_abs) temme_kve = _temme_expansion(small_v, z_abs)[1] kve = tf.where(v >= 50., olver_kve, temme_kve) # Handle when z is zero. kve = tf.where(tf.math.equal(z, 0.), numpy_dtype(np.inf), kve) return tf.where(z < 0., numpy_dtype(np.nan), kve)
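For spot-checking an implementation like this, SciPy exposes the same exponentially scaled function; the identity below uses K_{1/2}(z) = sqrt(pi / (2 z)) exp(-z), so the scaled value has a closed form.

```python
import numpy as np
from scipy.special import kve    # kve(v, z) = exp(z) * K_v(z) for real z > 0

# For v = 1/2 the exponential factors cancel: kve(0.5, z) = sqrt(pi / (2 z)).
z = 2.0
print(kve(0.5, z), np.sqrt(np.pi / (2. * z)))   # both ~ 0.8862
```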
def fit_one_step( model_matrix, response, model, model_coefficients_start=None, predicted_linear_response_start=None, l2_regularizer=None, dispersion=None, offset=None, learning_rate=None, fast_unsafe_numerics=True, l2_regularization_penalty_factor=None, name=None): """Runs one step of Fisher scoring. Args: model_matrix: (Batch of) `float`-like, matrix-shaped `Tensor` where each row represents a sample's features. response: (Batch of) vector-shaped `Tensor` where each element represents a sample's observed response (to the corresponding row of features). Must have same `dtype` as `model_matrix`. model: `tfp.glm.ExponentialFamily`-like instance used to construct the negative log-likelihood loss, gradient, and expected Hessian (i.e., the Fisher information matrix). model_coefficients_start: Optional (batch of) vector-shaped `Tensor` representing the initial model coefficients, one for each column in `model_matrix`. Must have same `dtype` as `model_matrix`. Default value: Zeros. predicted_linear_response_start: Optional `Tensor` with `shape`, `dtype` matching `response`; represents `offset` shifted initial linear predictions based on `model_coefficients_start`. Default value: `offset` if `model_coefficients is None`, and `tf.linalg.matvec(model_matrix, model_coefficients_start) + offset` otherwise. l2_regularizer: Optional scalar `Tensor` representing L2 regularization penalty, i.e., `loss(w) = sum{-log p(y[i]|x[i],w) : i=1..n} + l2_regularizer ||w||_2^2`. Default value: `None` (i.e., no L2 regularization). dispersion: Optional (batch of) `Tensor` representing `response` dispersion, i.e., as in, `p(y|theta) := exp((y theta - A(theta)) / dispersion)`. Must broadcast with rows of `model_matrix`. Default value: `None` (i.e., "no dispersion"). offset: Optional `Tensor` representing constant shift applied to `predicted_linear_response`. Must broadcast to `response`. Default value: `None` (i.e., `tf.zeros_like(response)`). learning_rate: Optional (batch of) scalar `Tensor` used to dampen iterative progress. Typically only needed if optimization diverges, should be no larger than `1` and typically very close to `1`. Default value: `None` (i.e., `1`). fast_unsafe_numerics: Optional Python `bool` indicating if solve should be based on Cholesky or QR decomposition. Default value: `True` (i.e., "prefer speed via Cholesky decomposition"). l2_regularization_penalty_factor: Optional (batch of) vector-shaped `Tensor`, representing a separate penalty factor to apply to each model coefficient, length equal to columns in `model_matrix`. Each penalty factor multiplies l2_regularizer to allow differential regularization. Can be 0 for some coefficients, which implies no regularization. Default is 1 for all coefficients. `loss(w) = sum{-log p(y[i]|x[i],w) : i=1..n} + l2_regularizer ||w * l2_regularization_penalty_factor||_2^2` name: Python `str` used as name prefix to ops created by this function. Default value: `"fit_one_step"`. Returns: model_coefficients: (Batch of) vector-shaped `Tensor`; represents the next estimate of the model coefficients, one for each column in `model_matrix`. predicted_linear_response: `response`-shaped `Tensor` representing linear predictions based on new `model_coefficients`, i.e., `tf.linalg.matvec(model_matrix, model_coefficients_next) + offset`. 
""" with tf.name_scope(name or 'fit_one_step'): [ model_matrix, response, model_coefficients_start, predicted_linear_response_start, offset, ] = prepare_args( model_matrix, response, model_coefficients_start, predicted_linear_response_start, offset) # Compute: mean, grad(mean, predicted_linear_response_start), and variance. mean, variance, grad_mean = model(predicted_linear_response_start) # If either `grad_mean` or `variance is non-finite or zero, then we'll # replace it with a value such that the row is zeroed out. Although this # procedure may seem circuitous, it is necessary to ensure this algorithm is # itself differentiable. is_valid = ( tf.math.is_finite(grad_mean) & tf.not_equal(grad_mean, 0.) & tf.math.is_finite(variance) & (variance > 0.)) def mask_if_invalid(x, mask): return tf.where( is_valid, x, np.array(mask, dtype_util.as_numpy_dtype(x.dtype))) # Run one step of iteratively reweighted least-squares. # Compute "`z`", the adjusted predicted linear response. # z = predicted_linear_response_start # + learning_rate * (response - mean) / grad_mean z = (response - mean) / mask_if_invalid(grad_mean, 1.) # TODO(jvdillon): Rather than use learning rate, we should consider using # backtracking line search. if learning_rate is not None: z *= learning_rate[..., tf.newaxis] z += predicted_linear_response_start if offset is not None: z -= offset # Compute "`w`", the per-sample weight. if dispersion is not None: # For convenience, we'll now scale the variance by the dispersion factor. variance *= dispersion w = ( mask_if_invalid(grad_mean, 0.) * tf.math.rsqrt(mask_if_invalid(variance, np.inf))) a = model_matrix * w[..., tf.newaxis] b = z * w # Solve `min{ || A @ model_coefficients - b ||_2**2 : model_coefficients }` # where `@` denotes `matmul`. if l2_regularizer is None: l2_regularizer = np.array(0, dtype_util.as_numpy_dtype(a.dtype)) else: l2_regularizer_ = distribution_util.maybe_get_static_value( l2_regularizer, dtype_util.as_numpy_dtype(a.dtype)) if l2_regularizer_ is not None: l2_regularizer = l2_regularizer_ def _embed_l2_regularization(): """Adds synthetic observations to implement L2 regularization.""" # `tf.matrix_solve_ls` does not respect the `l2_regularization` argument # when `fast_unsafe_numerics` is `False`. This function adds synthetic # observations to the data to implement the regularization instead. # Adding observations `sqrt(l2_regularizer) * I` is mathematically # equivalent to adding the term # `-l2_regularizer ||coefficients||_2**2` to the log-likelihood. num_model_coefficients = num_cols(model_matrix) batch_shape = tf.shape(model_matrix)[:-2] if l2_regularization_penalty_factor is None: eye = tf.eye( num_model_coefficients, batch_shape=batch_shape, dtype=a.dtype) else: eye = tf.linalg.tensor_diag( tf.cast(l2_regularization_penalty_factor, dtype=a.dtype)) broadcasted_shape = prefer_static.concat( [batch_shape, [num_model_coefficients, num_model_coefficients]], axis=0) eye = tf.broadcast_to(eye, broadcasted_shape) a_ = tf.concat([a, tf.sqrt(l2_regularizer) * eye], axis=-2) b_ = distribution_util.pad( b, count=num_model_coefficients, axis=-1, back=True) # Return l2_regularizer=0 since its now embedded. l2_regularizer_ = np.array(0, dtype_util.as_numpy_dtype(a.dtype)) return a_, b_, l2_regularizer_ a, b, l2_regularizer = prefer_static.cond( prefer_static.reduce_all([ prefer_static.logical_or( not(fast_unsafe_numerics), l2_regularization_penalty_factor is not None), l2_regularizer > 0. 
]), _embed_l2_regularization, lambda: (a, b, l2_regularizer)) model_coefficients_next = tf.linalg.lstsq( a, b[..., tf.newaxis], fast=fast_unsafe_numerics, l2_regularizer=l2_regularizer, name='model_coefficients_next') model_coefficients_next = model_coefficients_next[..., 0] # TODO(b/79122261): The approach used in `matrix_solve_ls` could be made # faster by avoiding explicitly forming Q and instead keeping the # factorization in 'implicit' form with stacked (rescaled) Householder # vectors underneath the 'R' and then applying the (accumulated) # reflectors in the appropriate order to apply Q'. However, we don't # presently do this because we lack core TF functionality. For reference, # the vanilla QR approach is: # q, r = tf.linalg.qr(a) # c = tf.matmul(q, b, adjoint_a=True) # model_coefficients_next = tf.matrix_triangular_solve( # r, c, lower=False, name='model_coefficients_next') predicted_linear_response_next = compute_predicted_linear_response( model_matrix, model_coefficients_next, offset, name='predicted_linear_response_next') return model_coefficients_next, predicted_linear_response_next
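Stripped of batching, masking, and regularization, one Fisher-scoring step reduces to a weighted least-squares solve on the adjusted response `z` and weights `w` constructed above. The sketch below is a plain-NumPy illustration under that simplification; `glm_fit_one_step` and the three callables are hypothetical names, not the library API.

```python
import numpy as np

def glm_fit_one_step(x, y, beta, mean_fn, grad_mean_fn, variance_fn):
  """One IRLS / Fisher-scoring step: min || (x * w) @ beta - z * w ||_2."""
  eta = x @ beta
  mu, dmu, var = mean_fn(eta), grad_mean_fn(eta), variance_fn(eta)
  z = eta + (y - mu) / dmu               # adjusted (working) response
  w = dmu / np.sqrt(var)                 # per-sample weight
  beta_next, *_ = np.linalg.lstsq(x * w[:, None], z * w, rcond=None)
  return beta_next

# Example: Poisson regression with log link, where mean = d mean / d eta = variance = exp(eta).
rng = np.random.default_rng(0)
x = rng.normal(size=(100, 3))
y = rng.poisson(np.exp(x @ np.array([0.5, -0.25, 1.0])))
beta = np.zeros(3)
for _ in range(5):
  beta = glm_fit_one_step(x, y, beta, np.exp, np.exp, np.exp)
```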
def test_assert_all_nan_input_numpy_rand(self): a = np.random.rand(10, 10, 10).astype(dtype_util.as_numpy_dtype(self.dtype)) with self.assertRaisesRegexp(AssertionError, 'Arrays are not equal'): self.assertAllNan(a)
def _ones_like(input, dtype=None, name=None): # pylint: disable=redefined-builtin s = _shape(input) s_ = tf.get_static_value(s) if s_ is not None: return np.ones(s_, dtype_util.as_numpy_dtype(dtype or input.dtype)) return tf.ones(s, dtype or input.dtype, name)
def auto_correlation(x, axis=-1, max_lags=None, center=True, normalize=True, name='auto_correlation'): """Auto correlation along one axis. Given a `1-D` wide sense stationary (WSS) sequence `X`, the auto correlation `RXX` may be defined as (with `E` expectation and `Conj` complex conjugate) ``` RXX[m] := E{ W[m] Conj(W[0]) } = E{ W[0] Conj(W[-m]) }, W[n] := (X[n] - MU) / S, MU := E{ X[0] }, S**2 := E{ (X[0] - MU) Conj(X[0] - MU) }. ``` This function takes the viewpoint that `x` is (along one axis) a finite sub-sequence of a realization of (WSS) `X`, and then uses `x` to produce an estimate of `RXX[m]` as follows: After extending `x` from length `L` to `inf` by zero padding, the auto correlation estimate `rxx[m]` is computed for `m = 0, 1, ..., max_lags` as ``` rxx[m] := (L - m)**-1 sum_n w[n + m] Conj(w[n]), w[n] := (x[n] - mu) / s, mu := L**-1 sum_n x[n], s**2 := L**-1 sum_n (x[n] - mu) Conj(x[n] - mu) ``` The error in this estimate is proportional to `1 / sqrt(len(x) - m)`, so users often set `max_lags` small enough so that the entire output is meaningful. Note that since `mu` is an imperfect estimate of `E{ X[0] }`, and we divide by `len(x) - m` rather than `len(x) - m - 1`, our estimate of auto correlation contains a slight bias, which goes to zero as `len(x) - m --> infinity`. Args: x: `float32` or `complex64` `Tensor`. axis: Python `int`. The axis number along which to compute correlation. Other dimensions index different batch members. max_lags: Positive `int` tensor. The maximum value of `m` to consider (in equation above). If `max_lags >= x.shape[axis]`, we effectively re-set `max_lags` to `x.shape[axis] - 1`. center: Python `bool`. If `False`, do not subtract the mean estimate `mu` from `x[n]` when forming `w[n]`. normalize: Python `bool`. If `False`, do not divide by the variance estimate `s**2` when forming `w[n]`. name: `String` name to prepend to created ops. Returns: `rxx`: `Tensor` of same `dtype` as `x`. `rxx.shape[i] = x.shape[i]` for `i != axis`, and `rxx.shape[axis] = max_lags + 1`. Raises: TypeError: If `x` is not a supported type. """ # Implementation details: # Extend length N / 2 1-D array x to length N by zero padding onto the end. # Then, set # F[x]_k := sum_n x_n exp{-i 2 pi k n / N }. # It is not hard to see that # F[x]_k Conj(F[x]_k) = F[R]_k, where # R_m := sum_n x_n Conj(x_{(n - m) mod N}). # One can also check that R_m / (N / 2 - m) is an unbiased estimate of RXX[m]. # Since F[x] is the DFT of x, this leads us to a zero-padding and FFT/IFFT # based version of estimating RXX. # Note that this is a special case of the Wiener-Khinchin Theorem. with tf.name_scope(name): x = tf.convert_to_tensor(x, name='x') # Rotate dimensions of x in order to put axis at the rightmost dim. # FFT op requires this. rank = ps.rank(x) if axis < 0: axis = rank + axis shift = rank - 1 - axis # Suppose x.shape[axis] = T, so there are T 'time' steps. # ==> x_rotated.shape = B + [T], # where B is x_rotated's batch shape. x_rotated = distribution_util.rotate_transpose(x, shift) if center: x_rotated = x_rotated - tf.reduce_mean( x_rotated, axis=-1, keepdims=True) # x_len = N / 2 from above explanation. The length of x along axis. # Get a value for x_len that works in all cases. x_len = ps.shape(x_rotated)[-1] # TODO(langmore) Investigate whether this zero padding helps or hurts. At # the moment is necessary so that all FFT implementations work. # Zero pad to the next power of 2 greater than 2 * x_len, which equals # 2**(ceil(Log_2(2 * x_len))). Note: Log_2(X) = Log_e(X) / Log_e(2). 
x_len_float64 = ps.cast(x_len, np.float64) target_length = ps.pow(np.float64(2.), ps.ceil(ps.log(x_len_float64 * 2) / np.log(2.))) pad_length = ps.cast(target_length - x_len_float64, np.int32) # We should have: # x_rotated_pad.shape = x_rotated.shape[:-1] + [T + pad_length] # = B + [T + pad_length] x_rotated_pad = distribution_util.pad(x_rotated, axis=-1, back=True, count=pad_length) dtype = x.dtype if not dtype_util.is_complex(dtype): if not dtype_util.is_floating(dtype): raise TypeError( 'Argument x must have either float or complex dtype' ' found: {}'.format(dtype)) x_rotated_pad = tf.complex( x_rotated_pad, dtype_util.as_numpy_dtype(dtype_util.real_dtype(dtype))(0.)) # Autocorrelation is IFFT of power-spectral density (up to some scaling). fft_x_rotated_pad = tf.signal.fft(x_rotated_pad) spectral_density = fft_x_rotated_pad * tf.math.conj(fft_x_rotated_pad) # shifted_product is R[m] from above detailed explanation. # It is the inner product sum_n X[n] * Conj(X[n - m]). shifted_product = tf.signal.ifft(spectral_density) # Cast back to real-valued if x was real to begin with. shifted_product = tf.cast(shifted_product, dtype) # Figure out if we can deduce the final static shape, and set max_lags. # Use x_rotated as a reference, because it has the time dimension in the far # right, and was created before we performed all sorts of crazy shape # manipulations. know_static_shape = True if not tensorshape_util.is_fully_defined(x_rotated.shape): know_static_shape = False if max_lags is None: max_lags = x_len - 1 else: max_lags = tf.convert_to_tensor(max_lags, name='max_lags') max_lags_ = tf.get_static_value(max_lags) if max_lags_ is None or not know_static_shape: know_static_shape = False max_lags = tf.minimum(x_len - 1, max_lags) else: max_lags = min(x_len - 1, max_lags_) # Chop off the padding. # We allow users to provide a huge max_lags, but cut it off here. # shifted_product_chopped.shape = x_rotated.shape[:-1] + [max_lags] shifted_product_chopped = shifted_product[..., :max_lags + 1] # If possible, set shape. if know_static_shape: chopped_shape = tensorshape_util.as_list(x_rotated.shape) chopped_shape[-1] = min(x_len, max_lags + 1) tensorshape_util.set_shape(shifted_product_chopped, chopped_shape) # Recall R[m] is a sum of N / 2 - m nonzero terms x[n] Conj(x[n - m]). The # other terms were zeros arising only due to zero padding. # `denominator = (N / 2 - m)` (defined below) is the proper term to # divide by to make this an unbiased estimate of the expectation # E[X[n] Conj(X[n - m])]. x_len = ps.cast(x_len, dtype_util.real_dtype(dtype)) max_lags = ps.cast(max_lags, dtype_util.real_dtype(dtype)) denominator = x_len - ps.range(0., max_lags + 1.) denominator = ps.cast(denominator, dtype) shifted_product_rotated = shifted_product_chopped / denominator if normalize: shifted_product_rotated /= shifted_product_rotated[..., :1] # Transpose dimensions back to those of x. return distribution_util.rotate_transpose(shifted_product_rotated, -shift)
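The zero-pad / FFT / IFFT recipe in the comments has a compact 1-D NumPy analogue; `auto_correlation_1d` is a hypothetical sketch that keeps the unbiased `1 / (L - m)` scaling but normalizes by dividing `w` by its sample standard deviation rather than by `rxx[0]`.

```python
import numpy as np

def auto_correlation_1d(x, max_lags=None):
  """FFT-based estimate of rxx[m] for a 1-D real array."""
  x = np.asarray(x, dtype=np.float64)
  n = x.size
  max_lags = n - 1 if max_lags is None else min(max_lags, n - 1)
  w = x - x.mean()
  w = w / np.sqrt(np.mean(w * w))                  # center and scale, so rxx[0] ~ 1
  n_fft = int(2 ** np.ceil(np.log2(2 * n)))        # pad to a power of two >= 2 * n
  f = np.fft.fft(w, n_fft)
  r = np.fft.ifft(f * np.conj(f)).real[:max_lags + 1]
  return r / (n - np.arange(max_lags + 1))         # unbiased 1 / (L - m) scaling
```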
def _log_cdf(self, x): # The CDF is (p**x * (1 - p)**(1 - x) + p - 1) / (2 * p - 1). # We do this computation in logit space to be more numerically stable. # p**x * (1- p)**(1 - x) becomes # 1 / (1 + exp(-logits))**x * # exp(-logits * (1 - x)) / (1 + exp(-logits)) ** (1 - x) = # exp(-logits * (1 - x)) / (1 + exp(-logits)) # p - 1 becomes -exp(-logits) / (1 + exp(-logits)) # Thus the whole numerator is # (exp(-logits * (1 - x)) - exp(-logits)) / (1 + exp(-logits)) # The denominator is (1 - exp(-logits)) / (1 + exp(-logits)) # Putting it all together, this gives: # (exp(-logits * (1 - x)) - exp(-logits)) / (1 - exp(-logits)) = # (exp(logits * x) - 1) / (exp(logits) - 1) logits = self._logits_parameter_no_checks() # For logits < 0, we can directly use the expression. safe_logits = tf.where(logits < 0., logits, -1.) result_negative_logits = ( tfp_math.log1mexp( tf.math.multiply_no_nan(safe_logits, x)) - tfp_math.log1mexp(safe_logits)) # For logits > 0, to avoid infs with large arguments we rewrite the # expression. Let z = log(exp(logits) - 1) # log_cdf = log((exp(logits * x) - 1) / (exp(logits) - 1)) # = log(exp(logits * x) - 1) - log(exp(logits) - 1) # = log(exp(logits * x) - 1) - log(exp(z)) # = log(exp(logits * x - z) - exp(-z)) # Because logits > 0, logits * x - z > -z, so we can pull it out to get # = log(exp(logits * x - z) * (1 - exp(-logits * x))) # = logits * x - z + tf.math.log(1 - exp(-logits * x)) dtype = dtype_util.as_numpy_dtype(x.dtype) eps = np.finfo(dtype).eps # log(exp(logits) - 1) safe_logits = tf.where(logits > 0., logits, 1.) z = tf.where( safe_logits > -np.log(eps), safe_logits, tf.math.log(tf.math.expm1(safe_logits))) result_positive_logits = tf.math.multiply_no_nan( safe_logits, x) - z + tfp_math.log1mexp( -tf.math.multiply_no_nan(safe_logits, x)) result = tf.where( logits < 0., result_negative_logits, result_positive_logits) # Finally, handle the case where `logits` and `p` are on the boundary, # as the above expressions can result in ratio of `infs` in that case as # well. result = tf.where( tf.math.equal(logits, np.inf), dtype(-np.inf), result) result = tf.where( (tf.math.equal(logits, -np.inf) & tf.math.not_equal(x, 0.)) | ( tf.math.equal(logits, np.inf) & tf.math.equal(x, 1.)), tf.zeros_like(logits), result) result = tf.where( x < 0., dtype(-np.inf), tf.where(x > 1., tf.zeros_like(x), result)) return result
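The closed form quoted in the comment can be evaluated naively in probability space, which is handy for testing the logit-space branches above at moderate parameters; `continuous_bernoulli_cdf` is a hypothetical helper valid for `p != 0.5` and `0 <= x <= 1`.

```python
import numpy as np

def continuous_bernoulli_cdf(x, p):
  """Naive CDF from the comment: (p**x * (1 - p)**(1 - x) + p - 1) / (2 * p - 1)."""
  return (p**x * (1. - p)**(1. - x) + p - 1.) / (2. * p - 1.)

print(continuous_bernoulli_cdf(0.0, 0.25), continuous_bernoulli_cdf(1.0, 0.25))  # 0.0 1.0
```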
def _stddev(self): if self.allow_nan_stats: return tf.fill(self.batch_shape_tensor(), dtype_util.as_numpy_dtype(self.dtype)(np.nan)) else: raise ValueError('`stddev` is undefined for Cauchy distribution.')
def _sample_n(self, n, seed=None): power = tf.convert_to_tensor(self.power) shape = tf.concat([[n], tf.shape(power)], axis=0) has_seed = seed is not None seed = SeedStream(seed, salt='zipf') minval_u = self._hat_integral(0.5, power=power) + 1. maxval_u = self._hat_integral(tf.int64.max - 0.5, power=power) def loop_body(should_continue, k): """Resample the non-accepted points.""" # The range of U is chosen so that the resulting sample K lies in # [0, tf.int64.max). The final sample, if accepted, is K + 1. u = tf.random.uniform( shape, minval=minval_u, maxval=maxval_u, dtype=power.dtype, seed=seed()) # Sample the point X from the continuous density h(x) \propto x^(-power). x = self._hat_integral_inverse(u, power=power) # Rejection-inversion requires a `hat` function, h(x) such that # \int_{k - .5}^{k + .5} h(x) dx >= pmf(k + 1) for points k in the # support. A natural hat function for us is h(x) = x^(-power). # # After sampling X from h(x), suppose it lies in the interval # (K - .5, K + .5) for integer K. Then the corresponding K is accepted if # if lies to the left of x_K, where x_K is defined by: # \int_{x_k}^{K + .5} h(x) dx = H(x_K) - H(K + .5) = pmf(K + 1), # where H(x) = \int_x^inf h(x) dx. # Solving for x_K, we find that x_K = H_inverse(H(K + .5) + pmf(K + 1)). # Or, the acceptance condition is X <= H_inverse(H(K + .5) + pmf(K + 1)). # Since X = H_inverse(U), this simplifies to U <= H(K + .5) + pmf(K + 1). # Update the non-accepted points. # Since X \in (K - .5, K + .5), the sample K is chosen as floor(X + 0.5). k = tf.where(should_continue, tf.floor(x + 0.5), k) accept = (u <= self._hat_integral(k + .5, power=power) + tf.exp( self._log_prob(k + 1, power=power))) return [should_continue & (~accept), k] should_continue, samples = tf.while_loop( cond=lambda should_continue, *ignore: tf.reduce_any(should_continue), body=loop_body, loop_vars=[ tf.ones(shape, dtype=tf.bool), # should_continue tf.zeros(shape, dtype=power.dtype), # k ], parallel_iterations=1 if has_seed else 10, maximum_iterations=self.sample_maximum_iterations, ) samples = samples + 1. if self.validate_args and dtype_util.is_integer(self.dtype): samples = distribution_util.embed_check_integer_casting_closed( samples, target_dtype=self.dtype, assert_positive=True) samples = tf.cast(samples, self.dtype) if self.validate_args: npdt = dtype_util.as_numpy_dtype(self.dtype) v = npdt(dtype_util.min(npdt) if dtype_util.is_integer(npdt) else np.nan) samples = tf.where(should_continue, v, samples) return samples
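The rejection-inversion idea in the comments can be sketched for a plain Zipf target on {1, 2, ...} with unnormalized pmf k**(-power); `zipf_rejection_inversion` is a hypothetical scalar-loop sketch with its own hat-integral convention, not the class's `_hat_integral`.

```python
import numpy as np

def zipf_rejection_inversion(power, size, rng=np.random.default_rng(0)):
  """Rejection-inversion sampler for Zipf(power), power > 1."""
  def hat_integral(x):                       # H(x) = int_x^inf t**(-power) dt
    return x**(1. - power) / (power - 1.)
  def hat_integral_inverse(u):
    return ((power - 1.) * u)**(1. / (1. - power))

  samples = np.empty(size)
  for i in range(size):
    while True:
      u = rng.uniform(np.finfo(float).tiny, hat_integral(0.5))
      k = np.floor(hat_integral_inverse(u) + 0.5)
      # Accept k iff u <= H(k + 0.5) + hat-weight(k); the accepted mass is then
      # exactly k**(-power), i.e. proportional to the Zipf pmf.
      if u <= hat_integral(k + 0.5) + k**(-power):
        samples[i] = k
        break
  return samples

print(zipf_rejection_inversion(2.5, 5))
```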
def _log_loosum_exp_impl(logx, axis, keepdims, compute_mean): """Implementation for `*loosum*` functions.""" with tf.name_scope('log_loosum_exp_impl'): logx = tf.convert_to_tensor(logx, name='logx') dtype = dtype_util.as_numpy_dtype(logx.dtype) if axis is not None: x = np.array(axis) axis = (tf.convert_to_tensor( axis, name='axis', dtype_hint=tf.int32) if x.dtype is np.object else x.astype(np.int32)) log_sum_x = tf.reduce_logsumexp(logx, axis=axis, keepdims=True) # Later we'll want to compute the mean from a sum so we calculate the number # of reduced elements, n. n = prefer_static.size(logx) // prefer_static.size(log_sum_x) n = prefer_static.cast(n, dtype) # log_loosum_x[i] = # = logsumexp(logx[j] : j != i) # = log( exp(logsumexp(logx)) - exp(logx[i]) ) # = log( exp(logsumexp(logx - logx[i])) exp(logx[i]) - exp(logx[i])) # = logx[i] + log(exp(logsumexp(logx - logx[i])) - 1) # = logx[i] + log(exp(logsumexp(logx) - logx[i]) - 1) # = logx[i] + softplus_inverse(logsumexp(logx) - logx[i]) d = log_sum_x - logx # We use `d != 0` rather than `d > 0.` because `d < 0.` should never happen; # if it does we want to complain loudly (which `softplus_inverse` will). d_ok = tf.not_equal(d, 0.) safe_d = tf.where(d_ok, d, 1.) d_ok_result = logx + softplus_inverse(safe_d) neg_inf = tf.constant(-np.inf, dtype=dtype) # When not(d_ok) and is_positive_and_largest then we manually compute the # log_loosum_x. (We can efficiently do this for any one point but not all, # hence we still need the above calculation.) This is good because when # this condition is met, we cannot use the above calculation; its -inf. # We now compute the log-leave-out-max-sum, replicate it to every # point and make sure to select it only when we need to. max_logx = tf.reduce_max(logx, axis=axis, keepdims=True) is_positive_and_largest = (logx > 0.) & tf.equal(logx, max_logx) log_lomsum_x = tf.reduce_logsumexp(tf.where(is_positive_and_largest, neg_inf, logx), axis=axis, keepdims=True) d_not_ok_result = tf.where(is_positive_and_largest, log_lomsum_x, neg_inf) log_loosum_x = tf.where(d_ok, d_ok_result, d_not_ok_result) # We now squeeze log_sum_x so as if we used `keepdims=False`. # TODO(b/136176077): These mental gymnastics could all be replaced with # `tf.squeeze(log_sum_x, axis)` if tf.squeeze supported Tensor valued `axis` # arguments. if not keepdims: if axis is None: keepdims = np.array([], dtype=np.int32) else: rank = prefer_static.rank(logx) keepdims = prefer_static.setdiff1d( prefer_static.range(rank), prefer_static.non_negative_axis(axis, rank)) squeeze_shape = tf.gather(prefer_static.shape(logx), indices=keepdims) log_sum_x = tf.reshape(log_sum_x, shape=squeeze_shape) if prefer_static.is_numpy(keepdims): tensorshape_util.set_shape(log_sum_x, np.array(logx.shape)[keepdims]) # Set static shapes just in case we lost them. tensorshape_util.set_shape(n, []) tensorshape_util.set_shape(log_loosum_x, logx.shape) if not compute_mean: return log_loosum_x, log_sum_x, n log_nm1 = prefer_static.log(max(1., n - 1.)) log_n = prefer_static.log(n) return log_loosum_x - log_nm1, log_sum_x - log_n, n
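The leave-one-out identity in the comment is easy to sanity check in NumPy: the `softplus_inverse` term is just `log(expm1(.))` applied to the gap between the full log-sum and each entry.

```python
import numpy as np
from scipy.special import logsumexp

logx = np.log(np.array([0.2, 0.5, 1.5, 3.0]))
naive = np.array([logsumexp(np.delete(logx, i)) for i in range(logx.size)])
loo = logx + np.log(np.expm1(logsumexp(logx) - logx))   # logx[i] + softplus_inverse(...)
print(np.allclose(naive, loo))                           # True
```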
def _assertions(self, t): if not self.validate_args: return [] return [assert_util.assert_none_equal( t, dtype_util.as_numpy_dtype(t.dtype)(0.), message="All elements must be non-zero.")]
def _variance(self): if self.allow_nan_stats: return tf.fill(self.batch_shape_tensor(), dtype_util.as_numpy_dtype(self.dtype)(np.nan)) raise ValueError( '`variance` is undefined for the half-Cauchy distribution.')
def quadrature_scheme_softmaxnormal_gauss_hermite(normal_loc, normal_scale, quadrature_size, validate_args=False, name=None): """Use Gauss-Hermite quadrature to form a quadrature scheme on the `K - 1` simplex. A `SoftmaxNormal` random variable `Y` may be generated via ``` Y = SoftmaxCentered(X), X = Normal(normal_loc, normal_scale) ``` Note: for a given `quadrature_size`, this method is generally less accurate than `quadrature_scheme_softmaxnormal_quantiles`. Args: normal_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0. The location parameter of the Normal used to construct the SoftmaxNormal. normal_scale: `float`-like `Tensor`. Broadcastable with `normal_loc`. The scale parameter of the Normal used to construct the SoftmaxNormal. quadrature_size: Python `int` scalar representing the number of quadrature points. validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. name: Python `str` name prefixed to Ops created by this class. Returns: grid: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the convex combination of affine parameters for `K` components. `grid[..., :, n]` is the `n`-th grid point, living in the `K - 1` simplex. probs: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the probabilities associated with each grid point. """ with tf.name_scope(name or "quadrature_scheme_softmaxnormal_gauss_hermite"): normal_loc = tf.convert_to_tensor(value=normal_loc, name="normal_loc") npdt = dtype_util.as_numpy_dtype(normal_loc.dtype) normal_scale = tf.convert_to_tensor(value=normal_scale, dtype=npdt, name="normal_scale") normal_scale = maybe_check_quadrature_param(normal_scale, "normal_scale", validate_args) grid, probs = np.polynomial.hermite.hermgauss(deg=quadrature_size) grid = grid.astype(npdt) probs = probs.astype(npdt) probs /= np.linalg.norm(probs, ord=1, keepdims=True) probs = tf.convert_to_tensor(value=probs, name="probs", dtype=npdt) grid = softmax(-distribution_util.pad( (normal_loc[..., tf.newaxis] + np.sqrt(2.) * normal_scale[..., tf.newaxis] * grid), axis=-2, front=True), axis=-2) # shape: [B, components, deg] return grid, probs
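The `hermgauss` change of variables used above (`loc + sqrt(2) * scale * grid`, with `probs` normalized to sum to one) is the standard Gauss-Hermite rule for Gaussian expectations; a minimal NumPy check with arbitrary values:

```python
import numpy as np

loc, scale = 0.3, 1.2
grid, weights = np.polynomial.hermite.hermgauss(deg=20)
probs = weights / np.sum(weights)                        # raw weights sum to sqrt(pi)
# E[X**2] for X ~ Normal(loc, scale) via sum_i probs[i] * f(loc + sqrt(2) * scale * grid[i]).
approx = np.sum(probs * (loc + np.sqrt(2.) * scale * grid)**2)
print(approx, loc**2 + scale**2)                         # ~ equal
```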
def _get_cdf_pdf(c): dtype = dtype_util.as_numpy_dtype(c.dtype) d = normal_lib.Normal(dtype(0), 1) return d.cdf, d.prob
def pinv(a, rcond=None, validate_args=False, name=None): """Compute the Moore-Penrose pseudo-inverse of a matrix. Calculate the [generalized inverse of a matrix]( https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse) using its singular-value decomposition (SVD) and including all large singular values. The pseudo-inverse of a matrix `A`, is defined as: 'the matrix that 'solves' [the least-squares problem] `A @ x = b`,' i.e., if `x_hat` is a solution, then `A_pinv` is the matrix such that `x_hat = A_pinv @ b`. It can be shown that if `U @ Sigma @ V.T = A` is the singular value decomposition of `A`, then `A_pinv = V @ inv(Sigma) U^T`. [(Strang, 1980)][1] This function is analogous to [`numpy.linalg.pinv`]( https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.pinv.html). It differs only in default value of `rcond`. In `numpy.linalg.pinv`, the default `rcond` is `1e-15`. Here the default is `10. * max(num_rows, num_cols) * np.finfo(dtype).eps`. Args: a: (Batch of) `float`-like matrix-shaped `Tensor`(s) which are to be pseudo-inverted. rcond: `Tensor` of small singular value cutoffs. Singular values smaller (in modulus) than `rcond` * largest_singular_value (again, in modulus) are set to zero. Must broadcast against `tf.shape(a)[:-2]`. Default value: `10. * max(num_rows, num_cols) * np.finfo(a.dtype).eps`. validate_args: When `True`, additional assertions might be embedded in the graph. Default value: `False` (i.e., no graph assertions are added). name: Python `str` prefixed to ops created by this function. Default value: 'pinv'. Returns: a_pinv: The pseudo-inverse of input `a`. Has same shape as `a` except rightmost two dimensions are transposed. Raises: TypeError: if input `a` does not have `float`-like `dtype`. ValueError: if input `a` has fewer than 2 dimensions. #### Examples ```python import tensorflow as tf import tensorflow_probability as tfp a = tf.constant([[1., 0.4, 0.5], [0.4, 0.2, 0.25], [0.5, 0.25, 0.35]]) tf.matmul(tfp.math.pinv(a), a) # ==> array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], dtype=float32) a = tf.constant([[1., 0.4, 0.5, 1.], [0.4, 0.2, 0.25, 2.], [0.5, 0.25, 0.35, 3.]]) tf.matmul(tfp.math.pinv(a), a) # ==> array([[ 0.76, 0.37, 0.21, -0.02], [ 0.37, 0.43, -0.33, 0.02], [ 0.21, -0.33, 0.81, 0.01], [-0.02, 0.02, 0.01, 1. ]], dtype=float32) ``` #### References [1]: G. Strang. 'Linear Algebra and Its Applications, 2nd Ed.' Academic Press, Inc., 1980, pp. 139-142. """ with tf.name_scope(name or 'pinv'): a = tf.convert_to_tensor(a, name='a') assertions = _maybe_validate_matrix(a, validate_args) if assertions: with tf.control_dependencies(assertions): a = tf.identity(a) dtype = dtype_util.as_numpy_dtype(a.dtype) if rcond is None: def get_dim_size(dim): if tf.compat.dimension_value(a.shape[dim]) is not None: return tf.compat.dimension_value(a.shape[dim]) return tf.shape(a)[dim] num_rows = get_dim_size(-2) num_cols = get_dim_size(-1) if isinstance(num_rows, int) and isinstance(num_cols, int): max_rows_cols = float(max(num_rows, num_cols)) else: max_rows_cols = tf.cast(tf.maximum(num_rows, num_cols), dtype) rcond = 10. * max_rows_cols * np.finfo(dtype).eps rcond = tf.convert_to_tensor(rcond, dtype=dtype, name='rcond') # Calculate pseudo inverse via SVD. # Note: if a is symmetric then u == v. (We might observe additional # performance by explicitly setting `v = u` in such cases.) 
[ singular_values, # Sigma left_singular_vectors, # U right_singular_vectors, # V ] = tf.linalg.svd(a, full_matrices=False, compute_uv=True) # Saturate small singular values to inf. This has the effect of making # `1. / s = 0.` while not resulting in `NaN` gradients. cutoff = rcond * tf.reduce_max(singular_values, axis=-1) singular_values = tf.where(singular_values > cutoff[..., tf.newaxis], singular_values, np.array(np.inf, dtype)) # Although `a == tf.matmul(u, s * v, transpose_b=True)` we swap # `u` and `v` here so that `tf.matmul(pinv(A), A) = tf.eye()`, i.e., # a matrix inverse has 'transposed' semantics. a_pinv = tf.matmul(right_singular_vectors / singular_values[..., tf.newaxis, :], left_singular_vectors, adjoint_b=True) if tensorshape_util.rank(a.shape) is not None: a_pinv.set_shape(a.shape[:-2].concatenate( [a.shape[-1], a.shape[-2]])) return a_pinv
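The SVD construction (with the `rcond` cutoff that zeroes the reciprocals of small singular values) has a direct NumPy analogue; `pinv_via_svd` is a hypothetical 2-D sketch, not the batched implementation above.

```python
import numpy as np

def pinv_via_svd(a, rcond=None):
  """Moore-Penrose pseudo-inverse of a 2-D matrix via SVD with an rcond cutoff."""
  u, s, vt = np.linalg.svd(a, full_matrices=False)
  if rcond is None:
    rcond = 10. * max(a.shape) * np.finfo(a.dtype).eps
  s_inv = np.where(s > rcond * s.max(), 1. / s, 0.)      # drop small singular values
  return (vt.T * s_inv) @ u.T                            # V @ inv(Sigma) @ U^T

a = np.array([[1., 0.4, 0.5], [0.4, 0.2, 0.25], [0.5, 0.25, 0.35]])
print(np.allclose(pinv_via_svd(a), np.linalg.pinv(a)))   # True
```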
def _sample_n(self, n, seed=None): if self._use_static_graph: # This sampling approach is almost the same as the approach used by # `MixtureSameFamily`. The differences are due to having a list of # `Distribution` objects rather than a single object, and maintaining # random seed management that is consistent with the non-static code # path. samples = [] cat_samples = self.cat.sample(n, seed=seed) stream = SeedStream(seed, salt='Mixture') for c in range(self.num_components): samples.append(self.components[c].sample(n, seed=stream())) stack_axis = -1 - tensorshape_util.rank(self._static_event_shape) x = tf.stack(samples, axis=stack_axis) # [n, B, k, E] npdt = dtype_util.as_numpy_dtype(x.dtype) mask = tf.one_hot( indices=cat_samples, # [n, B] depth=self._num_components, # == k on_value=npdt(1), off_value=npdt(0)) # [n, B, k] mask = distribution_util.pad_mixture_dimensions( mask, self, self._cat, tensorshape_util.rank( self._static_event_shape)) # [n, B, k, [1]*e] return tf.reduce_sum(x * mask, axis=stack_axis) # [n, B, E] n = tf.convert_to_tensor(n, name='n') static_n = tf.get_static_value(n) n = int(static_n) if static_n is not None else n cat_samples = self.cat.sample(n, seed=seed) static_samples_shape = cat_samples.shape if tensorshape_util.is_fully_defined(static_samples_shape): samples_shape = tensorshape_util.as_list(static_samples_shape) samples_size = tensorshape_util.num_elements(static_samples_shape) else: samples_shape = tf.shape(cat_samples) samples_size = tf.size(cat_samples) static_batch_shape = self.batch_shape if tensorshape_util.is_fully_defined(static_batch_shape): batch_shape = tensorshape_util.as_list(static_batch_shape) batch_size = tensorshape_util.num_elements(static_batch_shape) else: batch_shape = tf.shape(cat_samples)[1:] batch_size = tf.reduce_prod(batch_shape) static_event_shape = self.event_shape if tensorshape_util.is_fully_defined(static_event_shape): event_shape = np.array( tensorshape_util.as_list(static_event_shape), dtype=np.int32) else: event_shape = None # Get indices into the raw cat sampling tensor. We will # need these to stitch sample values back out after sampling # within the component partitions. samples_raw_indices = tf.reshape(tf.range(0, samples_size), samples_shape) # Partition the raw indices so that we can use # dynamic_stitch later to reconstruct the samples from the # known partitions. partitioned_samples_indices = tf.dynamic_partition( data=samples_raw_indices, partitions=cat_samples, num_partitions=self.num_components) # Copy the batch indices n times, as we will need to know # these to pull out the appropriate rows within the # component partitions. batch_raw_indices = tf.reshape(tf.tile(tf.range(0, batch_size), [n]), samples_shape) # Explanation of the dynamic partitioning below: # batch indices are i.e., [0, 1, 0, 1, 0, 1] # Suppose partitions are: # [1 1 0 0 1 1] # After partitioning, batch indices are cut as: # [batch_indices[x] for x in 2, 3] # [batch_indices[x] for x in 0, 1, 4, 5] # i.e. # [1 1] and [0 0 0 0] # Now we sample n=2 from part 0 and n=4 from part 1. # For part 0 we want samples from batch entries 1, 1 (samples 0, 1), # and for part 1 we want samples from batch entries 0, 0, 0, 0 # (samples 0, 1, 2, 3). 
partitioned_batch_indices = tf.dynamic_partition( data=batch_raw_indices, partitions=cat_samples, num_partitions=self.num_components) samples_class = [None for _ in range(self.num_components)] stream = SeedStream(seed, salt='Mixture') for c in range(self.num_components): n_class = tf.size(partitioned_samples_indices[c]) samples_class_c = self.components[c].sample(n_class, seed=stream()) if event_shape is None: batch_ndims = prefer_static.rank_from_shape(batch_shape) event_shape = tf.shape(samples_class_c)[1 + batch_ndims:] # Pull out the correct batch entries from each index. # To do this, we may have to flatten the batch shape. # For sample s, batch element b of component c, we get the # partitioned batch indices from # partitioned_batch_indices[c]; and shift each element by # the sample index. The final lookup can be thought of as # a matrix gather along locations (s, b) in # samples_class_c where the n_class rows correspond to # samples within this component and the batch_size columns # correspond to batch elements within the component. # # Thus the lookup index is # lookup[c, i] = batch_size * s[i] + b[c, i] # for i = 0 ... n_class[c] - 1. lookup_partitioned_batch_indices = ( batch_size * tf.range(n_class) + partitioned_batch_indices[c]) samples_class_c = tf.reshape( samples_class_c, tf.concat([[n_class * batch_size], event_shape], 0)) samples_class_c = tf.gather(samples_class_c, lookup_partitioned_batch_indices, name='samples_class_c_gather') samples_class[c] = samples_class_c # Stitch back together the samples across the components. lhs_flat_ret = tf.dynamic_stitch(indices=partitioned_samples_indices, data=samples_class) # Reshape back to proper sample, batch, and event shape. ret = tf.reshape(lhs_flat_ret, tf.concat([samples_shape, event_shape], 0)) tensorshape_util.set_shape( ret, tensorshape_util.concatenate(static_samples_shape, self.event_shape)) return ret
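The partition-then-stitch pattern in the non-static path (dynamic_partition the categorical draws, sample each component for its partition, dynamic_stitch the results back) can be sketched in NumPy for a scalar mixture; `sample_mixture` and its arguments are hypothetical names.

```python
import numpy as np

def sample_mixture(n, probs, component_samplers, rng=np.random.default_rng(0)):
  """Samples a scalar mixture by partitioning indices per component and stitching back."""
  cat = rng.choice(len(probs), size=n, p=probs)
  out = np.empty(n)
  for c, sampler in enumerate(component_samplers):
    idx = np.flatnonzero(cat == c)            # the 'partition' for component c
    out[idx] = sampler(idx.size, rng)         # 'stitch' the draws back into place
  return out

draws = sample_mixture(
    5, [0.3, 0.7],
    [lambda k, r: r.normal(-2., 1., k), lambda k, r: r.normal(3., 0.5, k)])
```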
def __init__(self, loc, scale, validate_args=False, allow_nan_stats=True, name="Gumbel"): """Construct Gumbel distributions with location and scale `loc` and `scale`. The parameters `loc` and `scale` must be shaped in a way that supports broadcasting (e.g. `loc + scale` is a valid operation). Args: loc: Floating point tensor, the means of the distribution(s). scale: Floating point tensor, the scales of the distribution(s). scale must contain only positive values. validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. Default value: `False`. allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. Default value: `True`. name: Python `str` name prefixed to Ops created by this class. Default value: `'Gumbel'`. Raises: TypeError: if loc and scale are different dtypes. """ parameters = dict(locals()) with tf.name_scope(name) as name: dtype = dtype_util.common_dtype([loc, scale], preferred_dtype=tf.float32) loc = tf.convert_to_tensor(value=loc, name="loc", dtype=dtype) scale = tf.convert_to_tensor(value=scale, name="scale", dtype=dtype) with tf.control_dependencies( [assert_util.assert_positive(scale)] if validate_args else []): loc = tf.identity(loc, name="loc") scale = tf.identity(scale, name="scale") tf.debugging.assert_same_float_dtype([loc, scale]) self._gumbel_bijector = gumbel_bijector.Gumbel( loc=loc, scale=scale, validate_args=validate_args) # Because the uniform sampler generates samples in `[0, 1)` this would # cause samples to lie in `(inf, -inf]` instead of `(inf, -inf)`. To fix # this, we use `np.finfo(dtype_util.as_numpy_dtype(self.dtype).tiny` # because it is the smallest, positive, "normal" number. super(Gumbel, self).__init__( distribution=uniform.Uniform(low=np.finfo( dtype_util.as_numpy_dtype(dtype)).tiny, high=tf.ones([], dtype=loc.dtype), allow_nan_stats=allow_nan_stats), # The Gumbel bijector encodes the quantile # function as the forward, and hence needs to # be inverted. bijector=invert_bijector.Invert(self._gumbel_bijector), batch_shape=distribution_util.get_broadcast_shape(loc, scale), parameters=parameters, name=name)
def _sample_n(self, n, seed=None): dim0_seed, otherdims_seed = samplers.split_seed(seed, salt='von_mises_fisher') # The sampling strategy relies on the fact that vMF variates are symmetric # about the mean direction. Accordingly, if we have a sampling strategy for # the away-from-mean angle, then we can uniformly sample the remaining # dimensions on the S^{dim-2} sphere for , and rotate these samples from a # (1, 0, 0, ..., 0)-mode distribution into the target orientation. # # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a # von-Mises distributed `x` value in [-1, 1], then uniformly select what # amounts to a "up" or "down" additional degree of freedom after unit # normalizing, followed by a final rotation to the desired mean direction # from a basis of (1, 0). # # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the # unit sphere over which the distribution is uniform, in particular the # circle where x = \hat{x} intersects the unit sphere. We pick a point on # that circle, then rotate to the desired mean direction from a basis of # (1, 0, 0). mean_direction = tf.convert_to_tensor(self.mean_direction) concentration = tf.convert_to_tensor(self.concentration) event_dim = ( tf.compat.dimension_value(self.event_shape[0]) or self._event_shape_tensor(mean_direction=mean_direction)[0]) sample_batch_shape = ps.concat([[n], self._batch_shape_tensor( mean_direction=mean_direction, concentration=concentration)], axis=0) dim = tf.cast(event_dim - 1, self.dtype) if event_dim == 3: samples_dim0 = self._sample_3d(n, mean_direction=mean_direction, concentration=concentration, seed=dim0_seed) else: # Wood'94 provides a rejection algorithm to sample the x coordinate. # Wood'94 definition of b: # b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim # https://stats.stackexchange.com/questions/156729 suggests: b = dim / (2 * concentration + tf.sqrt(4 * concentration**2 + dim**2)) # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE # https://github.com/nicola-decao/s-vae-tf/ x = (1 - b) / (1 + b) c = concentration * x + dim * tf.math.log1p(-x**2) beta = beta_lib.Beta(dim / 2, dim / 2) def cond_fn(w, should_continue, seed): del w, seed return tf.reduce_any(should_continue) def body_fn(w, should_continue, seed): """While loop body for sampling the angle `w`.""" beta_seed, unif_seed, next_seed = samplers.split_seed(seed, n=3) z = beta.sample(sample_shape=sample_batch_shape, seed=beta_seed) # set_shape needed here because of b/139013403 tensorshape_util.set_shape(z, w.shape) w = tf.where(should_continue, (1. - (1. + b) * z) / (1. - (1. - b) * z), w) if not self.allow_nan_stats: w = tf.debugging.check_numerics(w, 'w') unif = samplers.uniform( sample_batch_shape, seed=unif_seed, dtype=self.dtype) # set_shape needed here because of b/139013403 tensorshape_util.set_shape(unif, w.shape) should_continue = should_continue & ( concentration * w + dim * tf.math.log1p(-x * w) - c < # Use log1p(-unif) to prevent log(0) and ensure that log(1) is # possible. tf.math.log1p(-unif)) return w, should_continue, next_seed w = tf.zeros(sample_batch_shape, dtype=self.dtype) should_continue = tf.ones(sample_batch_shape, dtype=tf.bool) samples_dim0, _, _ = tf.while_loop( cond=cond_fn, body=body_fn, loop_vars=(w, should_continue, dim0_seed)) samples_dim0 = samples_dim0[..., tf.newaxis] if not self._allow_nan_stats: # Verify samples are w/in -1, 1, with useful error output tensors (top # value rather than all values). 
with tf.control_dependencies([ assert_util.assert_less_equal( samples_dim0, dtype_util.as_numpy_dtype(self.dtype)(1.01)), assert_util.assert_greater_equal( samples_dim0, dtype_util.as_numpy_dtype(self.dtype)(-1.01)), ]): samples_dim0 = tf.identity(samples_dim0) samples_otherdims_shape = ps.concat([sample_batch_shape, [event_dim - 1]], axis=0) unit_otherdims = tf.math.l2_normalize( samplers.normal( samples_otherdims_shape, seed=otherdims_seed, dtype=self.dtype), axis=-1) samples = tf.concat([ samples_dim0, # we must avoid sqrt(1 - (>1)**2) tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims ], axis=-1) samples = tf.math.l2_normalize(samples, axis=-1) if not self.allow_nan_stats: samples = tf.debugging.check_numerics(samples, 'samples') # Runtime assert that samples are unit length. if not self.allow_nan_stats: worst, _ = tf.math.top_k( tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1])) with tf.control_dependencies([ assert_util.assert_near( dtype_util.as_numpy_dtype(self.dtype)(0), worst, atol=1e-4, summarize=100) ]): samples = tf.identity(samples) # The samples generated are symmetric around a mode at (1, 0, 0, ...., 0). # Now, we move the mode to `self.mean_direction` using a rotation matrix. if not self.allow_nan_stats: # Assert that the basis vector rotates to the mean direction, as expected. basis = tf.cast(tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0), self.dtype) with tf.control_dependencies([ assert_util.assert_less( tf.linalg.norm( self._rotate(basis, mean_direction=mean_direction) - mean_direction, axis=-1), dtype_util.as_numpy_dtype(self.dtype)(1e-5)) ]): return self._rotate(samples, mean_direction=mean_direction) return self._rotate(samples, mean_direction=mean_direction)
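The rejection step for the away-from-mean coordinate can be isolated into a small NumPy sketch of the Wood (1994) scheme mirrored by `body_fn` above; `sample_vmf_w` is a hypothetical helper, with `dim` playing the role of `event_dim - 1`.

```python
import numpy as np

def sample_vmf_w(dim, concentration, n, rng=np.random.default_rng(0)):
  """Samples the first (away-from-mean) coordinate of von Mises-Fisher draws."""
  b = dim / (2. * concentration + np.sqrt(4. * concentration**2 + dim**2))
  x = (1. - b) / (1. + b)
  c = concentration * x + dim * np.log1p(-x**2)
  out = np.empty(n)
  filled = 0
  while filled < n:
    z = rng.beta(dim / 2., dim / 2., size=n - filled)
    w = (1. - (1. + b) * z) / (1. - (1. - b) * z)
    u = rng.uniform(size=n - filled)
    accept = concentration * w + dim * np.log1p(-x * w) - c >= np.log(u)
    m = accept.sum()
    out[filled:filled + m] = w[accept]
    filled += m
  return out

w = sample_vmf_w(dim=2, concentration=5., n=1000)
```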
def _numpy_dtype(dtype): if dtype is None: return None return dtype_util.as_numpy_dtype(dtype)
def _variance(self): if self.allow_nan_stats: return tf.fill(self.batch_shape_tensor(), dtype_util.as_numpy_dtype(self.dtype)(np.nan)) raise ValueError("`variance` is undefined for Horseshoe distribution.")
def test_assert_all_nan_input_placeholder_with_default(self): all_nan = np.full((10, 10, 10), np.nan).astype(dtype_util.as_numpy_dtype(self.dtype)) a = tf1.placeholder_with_default(all_nan, shape=all_nan.shape) self.assertAllNan(a)
def _potential_scale_reduction_single_state(state, independent_chain_ndims, split_chains, validate_args): """potential_scale_reduction for one single state `Tensor`.""" # casting integers to floats for floating-point division # check to see if the `state` is a numpy object for the numpy test suite if dtype_util.as_numpy_dtype(state.dtype) is np.int64: state = tf.cast(state, tf.float64) elif dtype_util.is_integer(state.dtype): state = tf.cast(state, tf.float32) with tf.name_scope('potential_scale_reduction_single_state'): # We assume exactly one leading dimension indexes e.g. correlated samples # from each Markov chain. state = tf.convert_to_tensor(state, name='state') n_samples_ = tf.compat.dimension_value(state.shape[0]) if n_samples_ is not None: # If available statically. if split_chains and n_samples_ < 4: raise ValueError( 'Must provide at least 4 samples when splitting chains. ' 'Found {}'.format(n_samples_)) if not split_chains and n_samples_ < 2: raise ValueError( 'Must provide at least 2 samples. Found {}'.format( n_samples_)) elif validate_args: if split_chains: assertions = [ assert_util.assert_greater( ps.shape(state)[0], 4, message= 'Must provide at least 4 samples when splitting chains.' ) ] with tf.control_dependencies(assertions): state = tf.identity(state) else: assertions = [ assert_util.assert_greater( ps.shape(state)[0], 2, message='Must provide at least 2 samples.') ] with tf.control_dependencies(assertions): state = tf.identity(state) # Define so it's not a magic number. # Warning! `if split_chains` logic assumes this is 1! sample_ndims = 1 if split_chains: # Split the sample dimension in half, doubling the number of # independent chains. # For odd number of samples, keep all but the last sample. state_shape = ps.shape(state) n_samples = state_shape[0] state = state[:n_samples - n_samples % 2] # Suppose state = [0, 1, 2, 3, 4, 5] # Step 1: reshape into [[0, 1, 2], [3, 4, 5]] # E.g. reshape states of shape [a, b] into [2, a//2, b]. state = tf.reshape( state, ps.concat([[2, n_samples // 2], state_shape[1:]], axis=0)) # Step 2: Put the size `2` dimension in the right place to be treated as a # chain, changing [[0, 1, 2], [3, 4, 5]] into [[0, 3], [1, 4], [2, 5]], # reshaping [2, a//2, b] into [a//2, 2, b]. state = tf.transpose( a=state, perm=ps.concat([[1, 0], tf.range(2, tf.rank(state))], axis=0)) # We're treating the new dim as indexing 2 chains, so increment. independent_chain_ndims += 1 sample_axis = tf.range(0, sample_ndims) chain_axis = tf.range(sample_ndims, sample_ndims + independent_chain_ndims) sample_and_chain_axis = tf.range( 0, sample_ndims + independent_chain_ndims) n = _axis_size(state, sample_axis) m = _axis_size(state, chain_axis) # In the language of Brooks and Gelman (1998), # B / n is the between chain variance, the variance of the chain means. # W is the within sequence variance, the mean of the chain variances. b_div_n = _reduce_variance(tf.reduce_mean(state, axis=sample_axis, keepdims=True), sample_and_chain_axis, biased=False) w = tf.reduce_mean(_reduce_variance(state, sample_axis, keepdims=True, biased=False), axis=sample_and_chain_axis) # sigma^2_+ is an estimate of the true variance, which would be unbiased if # each chain was drawn from the target. c.f. "law of total variance." sigma_2_plus = ((n - 1) / n) * w + b_div_n return ((m + 1.) / m) * sigma_2_plus / w - (n - 1.) / (m * n)
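Without the splitting and batching machinery, the final formula reduces to a few lines for a `[n_samples, n_chains]` array; `rhat` below is a hypothetical NumPy sketch of that reduction.

```python
import numpy as np

def rhat(state):
  """Potential scale reduction for `state` shaped [n_samples, n_chains]."""
  n, m = state.shape
  b_div_n = state.mean(axis=0).var(ddof=1)       # variance of the chain means
  w = state.var(axis=0, ddof=1).mean()           # mean of the within-chain variances
  sigma2_plus = (n - 1.) / n * w + b_div_n       # estimate of the true variance
  return (m + 1.) / m * sigma2_plus / w - (n - 1.) / (m * n)
```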
def mask_if_invalid(x, mask): return tf.where( is_valid, x, np.array(mask, dtype_util.as_numpy_dtype(x.dtype)))
def _forward_log_det_jacobian(self, x): # Let Y be a symmetric, positive definite matrix and write: # Y = X X.T # where X is lower-triangular. # # Observe that, # dY[i,j]/dX[a,b] # = d/dX[a,b] { X[i,:] X[j,:] } # = sum_{d=1}^p { I[i=a] I[d=b] X[j,d] + I[j=a] I[d=b] X[i,d] } # # To compute the Jacobian dX/dY we must represent X,Y as vectors. Since Y is # symmetric and X is lower-triangular, we need vectors of dimension: # d = p (p + 1) / 2 # where X, Y are p x p matrices, p > 0. We use a row-major mapping, i.e., # k = { i (i + 1) / 2 + j i>=j # { undef i<j # and assume zero-based indexes. When k is undef, the element is dropped. # Example: # j k # 0 1 2 3 / # 0 [ 0 . . . ] # i 1 [ 1 2 . . ] # 2 [ 3 4 5 . ] # 3 [ 6 7 8 9 ] # Write vec[.] to indicate transforming a matrix to vector via k(i,j). (With # slight abuse: k(i,j)=undef means the element is dropped.) # # We now show d vec[Y] / d vec[X] is lower triangular. Assuming both are # defined, observe that k(i,j) < k(a,b) iff (1) i<a or (2) i=a and j<b. # In both cases dvec[Y]/dvec[X]@[k(i,j),k(a,b)] = 0 since: # (1) j<=i<a thus i,j!=a. # (2) i=a>j thus i,j!=a. # # Since the Jacobian is lower-triangular, we need only compute the product # of diagonal elements: # d vec[Y] / d vec[X] @[k(i,j), k(i,j)] # = X[j,j] + I[i=j] X[i,j] # = 2 X[j,j]. # Since there is a 2 X[j,j] term for every lower-triangular element of X we # conclude: # |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}. diag = tf.linalg.diag_part(x) # We now ensure diag is columnar. Eg, if `diag = [1, 2, 3]` then the output # is `[[1], [2], [3]]` and if `diag = [[1, 2, 3], [4, 5, 6]]` then the # output is unchanged. diag = self._make_columnar(diag) with tf.control_dependencies(self._assertions(x)): # Create a vector equal to: [p, p-1, ..., 2, 1]. if tf.compat.dimension_value(x.shape[-1]) is None: p_int = tf.shape(x)[-1] p_float = tf.cast(p_int, dtype=x.dtype) else: p_int = tf.compat.dimension_value(x.shape[-1]) p_float = dtype_util.as_numpy_dtype(x.dtype)(p_int) exponents = tf.linspace(p_float, 1., p_int) sum_weighted_log_diag = tf.squeeze( tf.matmul(tf.math.log(diag), exponents[..., tf.newaxis]), axis=-1) fldj = p_float * np.log(2.) + sum_weighted_log_diag # We finally need to undo adding an extra column in non-scalar cases # where there is a single matrix as input. if tensorshape_util.rank(x.shape) is not None: if tensorshape_util.rank(x.shape) == 2: fldj = tf.squeeze(fldj, axis=-1) return fldj shape = tf.shape(fldj) maybe_squeeze_shape = tf.concat([ shape[:-1], distribution_util.pick_vector( tf.equal(tf.rank(x), 2), np.array([], dtype=np.int32), shape[-1:])], 0) return tf.reshape(fldj, maybe_squeeze_shape)
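The determinant formula derived in the comments, |Jac| = 2^p prod_j X[j,j]^(p-j), can be checked against a brute-force finite-difference Jacobian of the half-vectorized map; everything below is a hypothetical NumPy sketch.

```python
import numpy as np

def cholesky_outer_product_fldj(x):
  """log |det d vech(X X^T) / d vech(X)| for lower-triangular x with positive diagonal."""
  p = x.shape[-1]
  exponents = np.arange(p, 0, -1, dtype=x.dtype)     # [p, p-1, ..., 1]
  return p * np.log(2.) + np.sum(
      exponents * np.log(np.diagonal(x, axis1=-2, axis2=-1)), axis=-1)

# Brute-force check on a 3 x 3 example.
p = 3
tril = np.tril_indices(p)
x = np.tril(np.array([[1.3, 0., 0.], [0.4, 0.9, 0.], [-0.2, 0.7, 1.7]]))

def vech_to_y(v):
  m = np.zeros((p, p))
  m[tril] = v
  return (m @ m.T)[tril]

v0, eps = x[tril], 1e-6
jac = np.stack([(vech_to_y(v0 + eps * e) - vech_to_y(v0 - eps * e)) / (2. * eps)
                for e in np.eye(v0.size)], axis=-1)
print(np.log(np.abs(np.linalg.det(jac))), cholesky_outer_product_fldj(x))   # ~ equal
```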