def _log_unnormalized_prob(self, x, log_rate):
  # The log-probability at negative points is always -inf.
  # Catch such x's and set the output value accordingly.
  safe_x = tf.maximum(x if self.interpolate_nondiscrete else tf.floor(x), 0.)
  y = safe_x * log_rate - tf.math.lgamma(1. + safe_x)
  return tf.where(
      tf.equal(x, safe_x), y,
      dtype_util.as_numpy_dtype(y.dtype)(-np.inf))
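
# A minimal NumPy/SciPy cross-check (not library code) of the identity the
# method above relies on: for integer x >= 0 the unnormalized Poisson
# log-mass is x * log(rate) - lgamma(1 + x); subtracting the normalizer
# (which is `rate`) recovers scipy's logpmf.
import numpy as np
from scipy import special, stats

rate = 3.5
x = np.arange(10, dtype=np.float64)
log_unnormalized = x * np.log(rate) - special.gammaln(1. + x)
np.testing.assert_allclose(log_unnormalized - rate,
                           stats.poisson.logpmf(x, rate))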
def _log_prob(self, x, power=None):
  # The log probability at positive integer points x is log(x^(-power) / Z)
  # where Z is the normalization constant. For x < 1 and non-integer points,
  # the log-probability is -inf.
  #
  # However, if interpolate_nondiscrete is True, we return the natural
  # continuous relaxation for x >= 1 which agrees with the log probability at
  # positive integer points.
  #
  # If interpolate_nondiscrete is False and validate_args is True, we check
  # that the sample point x is in the support. That is, x is equivalent to a
  # positive integer.
  power = power if power is not None else tf.convert_to_tensor(self.power)
  x = tf.cast(x, power.dtype)
  if self.validate_args and not self.interpolate_nondiscrete:
    x = distribution_util.embed_check_integer_casting_closed(
        x, target_dtype=self.dtype, assert_positive=True)
  log_normalization = tf.math.log(tf.math.zeta(power, 1.))

  safe_x = tf.maximum(x if self.interpolate_nondiscrete else tf.floor(x), 1.)
  y = -power * tf.math.log(safe_x)
  log_unnormalized_prob = tf.where(
      tf.equal(x, safe_x), y,
      dtype_util.as_numpy_dtype(y.dtype)(-np.inf))

  return log_unnormalized_prob - log_normalization
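
# Hedged SciPy cross-check (not library code) of the normalized density
# above: at positive integers, -power * log(x) - log(zeta(power, 1)) matches
# scipy.stats.zipf.logpmf.
import numpy as np
from scipy import special, stats

power = 3.0
x = np.arange(1, 8, dtype=np.float64)
log_prob = -power * np.log(x) - np.log(special.zeta(power, 1.))
np.testing.assert_allclose(log_prob, stats.zipf.logpmf(x, power))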
def _inverse(self, y):
  map_values = tf.convert_to_tensor(self.map_values)
  flat_y = tf.reshape(y, shape=[-1])
  # Search for the indices of map_values that are closest to flat_y.
  # Since map_values is strictly increasing, the closest is either the
  # first one that is strictly greater than flat_y, or the one before it.
  upper_candidates = tf.minimum(
      tf.size(map_values) - 1,
      tf.searchsorted(map_values, values=flat_y, side='right'))
  lower_candidates = tf.maximum(0, upper_candidates - 1)
  candidates = tf.stack([lower_candidates, upper_candidates], axis=-1)
  lower_cand_diff = tf.abs(flat_y - self._forward(lower_candidates))
  upper_cand_diff = tf.abs(flat_y - self._forward(upper_candidates))
  if self.validate_args:
    with tf.control_dependencies([
        assert_util.assert_near(
            tf.minimum(lower_cand_diff, upper_cand_diff),
            0,
            message='inverse value not found')
    ]):
      candidates = tf.identity(candidates)
  candidate_selector = tf.stack([
      tf.range(tf.size(flat_y), dtype=tf.int32),
      tf.argmin([lower_cand_diff, upper_cand_diff], output_type=tf.int32)
  ], axis=-1)
  return tf.reshape(
      tf.gather_nd(candidates, candidate_selector), shape=y.shape)
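
# Hedged NumPy sketch of the candidate search above: for sorted map values,
# the nearest entry to each query is either the first strictly greater value
# (the clipped 'right' insertion point) or its left neighbor, whichever has
# the smaller absolute difference.
import numpy as np

map_values = np.array([0.1, 0.5, 2.0, 10.0])
y = np.array([0.5, 1.9, 11.0])
upper = np.minimum(map_values.size - 1,
                   np.searchsorted(map_values, y, side='right'))
lower = np.maximum(0, upper - 1)
pick_upper = np.abs(y - map_values[upper]) < np.abs(y - map_values[lower])
print(np.where(pick_upper, upper, lower))  # ==> [1 2 3]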
def calculate_reshape(original_shape, new_shape, validate=False, name=None):
  """Calculates the reshaped dimensions (replacing up to one -1 in reshape)."""
  batch_shape_static = tensorshape_util.constant_value_as_shape(new_shape)
  if tensorshape_util.is_fully_defined(batch_shape_static):
    return np.int32(batch_shape_static), batch_shape_static, []
  with tf.name_scope(name or 'calculate_reshape'):
    original_size = tf.reduce_prod(original_shape)
    implicit_dim = tf.equal(new_shape, -1)
    size_implicit_dim = (
        original_size // tf.maximum(1, -tf.reduce_prod(new_shape)))
    expanded_new_shape = tf.where(  # Assumes exactly one `-1`.
        implicit_dim, size_implicit_dim, new_shape)
    validations = [] if not validate else [  # pylint: disable=g-long-ternary
        assert_util.assert_rank(
            original_shape, 1, message='Original shape must be a vector.'),
        assert_util.assert_rank(
            new_shape, 1, message='New shape must be a vector.'),
        assert_util.assert_less_equal(
            tf.math.count_nonzero(implicit_dim, dtype=tf.int32),
            1,
            message='At most one dimension can be unknown.'),
        assert_util.assert_positive(
            expanded_new_shape, message='Shape elements must be >=-1.'),
        assert_util.assert_equal(
            tf.reduce_prod(expanded_new_shape),
            original_size,
            message='Shape sizes do not match.'),
    ]
    return expanded_new_shape, batch_shape_static, validations
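
# Hedged NumPy sketch of the `-1` replacement rule above: with exactly one
# -1 in new_shape, the product of the remaining entries equals
# -prod(new_shape), so the implicit dimension is original_size divided by it.
import numpy as np

original_shape = np.array([6, 4])
new_shape = np.array([3, -1, 2])
original_size = np.prod(original_shape)                    # 24
size_implicit_dim = original_size // -np.prod(new_shape)   # 24 // 6 == 4
print(np.where(new_shape == -1, size_implicit_dim, new_shape))  # ==> [3 4 2]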
def _cdf(self, x):
  # CDF is the probability that the Poisson variable is less than or equal
  # to x. For fractional x, the CDF is equal to the CDF at n = floor(x).
  # For negative x, the CDF is zero, but tf.igammac gives NaNs, so we impute
  # the values and handle this case explicitly.
  safe_x = tf.maximum(x if self.interpolate_nondiscrete else tf.floor(x), 0.)
  cdf = tf.math.igammac(1. + safe_x, self._rate_parameter_no_checks())
  return tf.where(x < 0., tf.zeros_like(cdf), cdf)
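
# Hedged SciPy cross-check (not library code) of the identity used above:
# the Poisson CDF at n = floor(x) equals the regularized upper incomplete
# gamma function Q(n + 1, rate), which is what `tf.math.igammac` computes.
import numpy as np
from scipy import special, stats

rate = 4.2
x = np.array([0., 1.7, 3., 9.5])
np.testing.assert_allclose(special.gammaincc(np.floor(x) + 1., rate),
                           stats.poisson.cdf(x, rate))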
def _cdf(self, x):
  # CDF(x) at positive integer x is the probability that the Zipf variable is
  # less than or equal to x; given by the formula:
  #     CDF(x) = 1 - (zeta(power, x + 1) / Z)
  # For fractional x, the CDF is equal to the CDF at n = floor(x).
  # For x < 1, the CDF is zero.
  #
  # If interpolate_nondiscrete is True, we return a continuous relaxation
  # which agrees with the CDF at integer points.
  power = tf.convert_to_tensor(self.power)
  x = tf.cast(x, power.dtype)
  safe_x = tf.maximum(x if self.interpolate_nondiscrete else tf.floor(x), 0.)
  cdf = 1. - (tf.math.zeta(power, safe_x + 1.) / tf.math.zeta(power, 1.))
  return tf.where(x < 1., tf.zeros_like(cdf), cdf)
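
# Hedged SciPy cross-check (not library code) of the zeta-based formula
# above at integer points: 1 - zeta(power, x + 1) / zeta(power, 1) matches
# scipy.stats.zipf.cdf.
import numpy as np
from scipy import special, stats

power = 2.5
x = np.arange(1, 6, dtype=np.float64)
cdf = 1. - special.zeta(power, x + 1.) / special.zeta(power, 1.)
np.testing.assert_allclose(cdf, stats.zipf.cdf(x, power))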
def log_add_exp(x, y, name=None):
  """Computes `log(exp(x) + exp(y))` in a numerically stable way.

  Args:
    x: `float` `Tensor` broadcastable with `y`.
    y: `float` `Tensor` broadcastable with `x`.
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., `'log_add_exp'`).

  Returns:
    log_add_exp: `log(exp(x) + exp(y))` computed in a numerically stable way.
  """
  with tf.name_scope(name or 'log_add_exp'):
    dtype = dtype_util.common_dtype([x, y], dtype_hint=tf.float32)
    x = tf.convert_to_tensor(x, dtype=dtype, name='x')
    y = tf.convert_to_tensor(y, dtype=dtype, name='y')
    return tf.maximum(x, y) + tf.math.softplus(-abs(x - y))
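
# Hedged NumPy sketch of the identity above: log(e**x + e**y) equals
# max(x, y) + softplus(-|x - y|), which stays finite even where the naive
# formula overflows.
import numpy as np

def log_add_exp(x, y):
  return np.maximum(x, y) + np.log1p(np.exp(-np.abs(x - y)))

print(log_add_exp(1000., 1000.))  # ==> 1000.6931... (1000 + log(2))
print(np.log(np.exp(1000.) + np.exp(1000.)))  # ==> inf (overflow)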
def _log_prob(self, x):
  x = tf.convert_to_tensor(x, name='x')
  right_indices = tf.minimum(
      tf.size(self.outcomes) - 1,
      tf.reshape(
          tf.searchsorted(
              self.outcomes, values=tf.reshape(x, shape=[-1]), side='right'),
          dist_util.prefer_static_shape(x)))
  use_right_indices = self._is_equal_or_close(
      x, tf.gather(self.outcomes, indices=right_indices))
  left_indices = tf.maximum(0, right_indices - 1)
  use_left_indices = self._is_equal_or_close(
      x, tf.gather(self.outcomes, indices=left_indices))
  log_probs = self._categorical.log_prob(
      tf.where(use_left_indices, left_indices, right_indices))
  return tf.where(
      tf.logical_not(use_left_indices | use_right_indices),
      dtype_util.as_numpy_dtype(log_probs.dtype)(-np.inf),
      log_probs)
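
# Hedged NumPy sketch of the outcome lookup above: searchsorted gives the
# 'right' insertion point; a sample lies in the support iff it (nearly)
# equals the outcome at that index or at the one before it, else the
# log-probability is -inf.
import numpy as np

outcomes = np.array([1., 2., 4., 8.])
log_probs = np.log([0.1, 0.2, 0.3, 0.4])
x = np.array([2., 3., 8.])
right = np.minimum(outcomes.size - 1,
                   np.searchsorted(outcomes, x, side='right'))
left = np.maximum(0, right - 1)
use_left = np.isclose(x, outcomes[left])
use_right = np.isclose(x, outcomes[right])
lp = log_probs[np.where(use_left, left, right)]
print(np.where(use_left | use_right, lp, -np.inf))
# ==> [log(0.2)  -inf  log(0.4)]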
def _sample_n(self, n, seed=None):
  seed = SeedStream(seed, salt='vom_mises_fisher')
  # The sampling strategy relies on the fact that vMF variates are symmetric
  # about the mean direction. Accordingly, if we have a sampling strategy for
  # the away-from-mean angle, then we can uniformly sample the remaining
  # dimensions on the S^{dim-2} sphere for free, and rotate these samples
  # from a (1, 0, 0, ..., 0)-mode distribution into the target orientation.
  #
  # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a
  # von-Mises distributed `x` value in [-1, 1], then uniformly select what
  # amounts to an "up" or "down" additional degree of freedom after unit
  # normalizing, followed by a final rotation to the desired mean direction
  # from a basis of (1, 0).
  #
  # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the
  # unit sphere over which the distribution is uniform, in particular the
  # circle where x = \hat{x} intersects the unit sphere. We pick a point on
  # that circle, then rotate to the desired mean direction from a basis of
  # (1, 0, 0).
  event_dim = (
      tf.compat.dimension_value(self.event_shape[0]) or
      self._event_shape_tensor()[0])

  sample_batch_shape = tf.concat([[n], self._batch_shape_tensor()], axis=0)
  dim = tf.cast(event_dim - 1, self.dtype)
  if event_dim == 3:
    samples_dim0 = self._sample_3d(n, seed=seed)
  else:
    # Wood'94 provides a rejection algorithm to sample the x coordinate.
    # Wood'94 definition of b:
    #     b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim
    # https://stats.stackexchange.com/questions/156729 suggests:
    b = dim / (2 * self.concentration +
               tf.sqrt(4 * self.concentration**2 + dim**2))
    # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE
    #     https://github.com/nicola-decao/s-vae-tf/
    x = (1 - b) / (1 + b)
    c = self.concentration * x + dim * tf.math.log1p(-x**2)
    beta = beta_lib.Beta(dim / 2, dim / 2)

    def cond_fn(w, should_continue):
      del w
      return tf.reduce_any(should_continue)

    def body_fn(w, should_continue):
      z = beta.sample(sample_shape=sample_batch_shape, seed=seed())
      # set_shape needed here because of b/139013403
      z.set_shape(w.shape)
      w = tf.where(should_continue, (1 - (1 + b) * z) / (1 - (1 - b) * z), w)
      w = tf.debugging.check_numerics(w, 'w')
      unif = tf.random.uniform(
          sample_batch_shape, seed=seed(), dtype=self.dtype)
      # set_shape needed here because of b/139013403
      unif.set_shape(w.shape)
      should_continue = tf.logical_and(
          should_continue,
          self.concentration * w + dim * tf.math.log1p(-x * w) - c <
          tf.math.log(unif))
      return w, should_continue

    w = tf.zeros(sample_batch_shape, dtype=self.dtype)
    should_continue = tf.ones(sample_batch_shape, dtype=tf.bool)
    samples_dim0 = tf.while_loop(
        cond=cond_fn, body=body_fn, loop_vars=(w, should_continue))[0]
    samples_dim0 = samples_dim0[..., tf.newaxis]
  if not self._allow_nan_stats:
    # Verify samples are w/in -1, 1, with useful error output tensors (top
    # value rather than all values).
    with tf.control_dependencies([
        assert_util.assert_less_equal(
            samples_dim0,
            dtype_util.as_numpy_dtype(self.dtype)(1.01),
            data=[tf.math.top_k(tf.reshape(samples_dim0, [-1]))[0]]),
        assert_util.assert_greater_equal(
            samples_dim0,
            dtype_util.as_numpy_dtype(self.dtype)(-1.01),
            data=[-tf.math.top_k(tf.reshape(-samples_dim0, [-1]))[0]])
    ]):
      samples_dim0 = tf.identity(samples_dim0)
  samples_otherdims_shape = tf.concat(
      [sample_batch_shape, [event_dim - 1]], axis=0)
  unit_otherdims = tf.math.l2_normalize(
      tf.random.normal(
          samples_otherdims_shape, seed=seed(), dtype=self.dtype),
      axis=-1)
  samples = tf.concat([
      samples_dim0,  # we must avoid sqrt(1 - (>1)**2)
      tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims
  ], axis=-1)
  samples = tf.math.l2_normalize(samples, axis=-1)
  if not self._allow_nan_stats:
    samples = tf.debugging.check_numerics(samples, 'samples')
  # Runtime assert that samples are unit length.
  if not self._allow_nan_stats:
    worst, idx = tf.math.top_k(
        tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1]))
    with tf.control_dependencies([
        assert_util.assert_near(
            dtype_util.as_numpy_dtype(self.dtype)(0),
            worst,
            data=[
                worst, idx,
                tf.gather(tf.reshape(samples, [-1, event_dim]), idx)
            ],
            atol=1e-4,
            summarize=100)
    ]):
      samples = tf.identity(samples)
  # The samples generated are symmetric around a mode at (1, 0, 0, ..., 0).
  # Now, we move the mode to `self.mean_direction` using a rotation matrix.
  if not self._allow_nan_stats:
    # Assert that the basis vector rotates to the mean direction, as
    # expected.
    basis = tf.cast(
        tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0), self.dtype)
    with tf.control_dependencies([
        assert_util.assert_less(
            tf.linalg.norm(
                self._rotate(basis) - self.mean_direction, axis=-1),
            dtype_util.as_numpy_dtype(self.dtype)(1e-5))
    ]):
      return self._rotate(samples)
  return self._rotate(samples)
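
# Hedged NumPy sketch (assumed scalar parameters, not library code) of one
# Wood'94 rejection round for the first coordinate `w`, mirroring the
# while_loop body above: propose via a symmetric Beta, accept where the
# log-acceptance ratio beats a uniform draw.
import numpy as np

rng = np.random.default_rng(0)
kappa, dim = 10.0, 4.0  # concentration and event_dim - 1
b = dim / (2. * kappa + np.sqrt(4. * kappa**2 + dim**2))
x = (1. - b) / (1. + b)
c = kappa * x + dim * np.log1p(-x**2)

accepted = np.empty(0)
while accepted.size < 5:  # collect five accepted draws
  z = rng.beta(dim / 2., dim / 2., size=100)
  w = (1. - (1. + b) * z) / (1. - (1. - b) * z)
  keep = kappa * w + dim * np.log1p(-x * w) - c >= np.log(
      rng.uniform(size=100))
  accepted = np.concatenate([accepted, w[keep]])
print(accepted[:5])  # away-from-mean cosines in (-1, 1]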
def pinv(a, rcond=None, validate_args=False, name=None):
  """Compute the Moore-Penrose pseudo-inverse of a matrix.

  Calculate the [generalized inverse of a matrix](
  https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse) using its
  singular-value decomposition (SVD) and including all large singular values.

  The pseudo-inverse of a matrix `A`, is defined as: 'the matrix that
  'solves' [the least-squares problem] `A @ x = b`,' i.e., if `x_hat` is a
  solution, then `A_pinv` is the matrix such that `x_hat = A_pinv @ b`. It
  can be shown that if `U @ Sigma @ V.T = A` is the singular value
  decomposition of `A`, then `A_pinv = V @ inv(Sigma) @ U^T`.
  [(Strang, 1980)][1]

  This function is analogous to [`numpy.linalg.pinv`](
  https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.pinv.html).
  It differs only in default value of `rcond`. In `numpy.linalg.pinv`, the
  default `rcond` is `1e-15`. Here the default is
  `10. * max(num_rows, num_cols) * np.finfo(dtype).eps`.

  Args:
    a: (Batch of) `float`-like matrix-shaped `Tensor`(s) which are to be
      pseudo-inverted.
    rcond: `Tensor` of small singular value cutoffs. Singular values smaller
      (in modulus) than `rcond` * largest_singular_value (again, in modulus)
      are set to zero. Must broadcast against `tf.shape(a)[:-2]`.
      Default value: `10. * max(num_rows, num_cols) * np.finfo(a.dtype).eps`.
    validate_args: When `True`, additional assertions might be embedded in
      the graph.
      Default value: `False` (i.e., no graph assertions are added).
    name: Python `str` prefixed to ops created by this function.
      Default value: 'pinv'.

  Returns:
    a_pinv: The pseudo-inverse of input `a`. Has same shape as `a` except
      rightmost two dimensions are transposed.

  Raises:
    TypeError: if input `a` does not have `float`-like `dtype`.
    ValueError: if input `a` has fewer than 2 dimensions.

  #### Examples

  ```python
  from tensorflow_probability.python.internal.backend import numpy as tf
  import tensorflow_probability as tfp; tfp = tfp.experimental.substrates.numpy

  a = tf.constant([[1.,  0.4,  0.5],
                   [0.4, 0.2,  0.25],
                   [0.5, 0.25, 0.35]])
  tf.matmul(tfp.math.pinv(a), a)
  # ==> array([[1., 0., 0.],
  #            [0., 1., 0.],
  #            [0., 0., 1.]], dtype=float32)

  a = tf.constant([[1.,  0.4,  0.5,  1.],
                   [0.4, 0.2,  0.25, 2.],
                   [0.5, 0.25, 0.35, 3.]])
  tf.matmul(tfp.math.pinv(a), a)
  # ==> array([[ 0.76,  0.37,  0.21, -0.02],
  #            [ 0.37,  0.43, -0.33,  0.02],
  #            [ 0.21, -0.33,  0.81,  0.01],
  #            [-0.02,  0.02,  0.01,  1.  ]], dtype=float32)
  ```

  #### References

  [1]: G. Strang. 'Linear Algebra and Its Applications, 2nd Ed.' Academic
       Press, Inc., 1980, pp. 139-142.
  """
  with tf.name_scope(name or 'pinv'):
    a = tf.convert_to_tensor(a, name='a')

    assertions = _maybe_validate_matrix(a, validate_args)
    if assertions:
      with tf.control_dependencies(assertions):
        a = tf.identity(a)

    dtype = dtype_util.as_numpy_dtype(a.dtype)

    if rcond is None:

      def get_dim_size(dim):
        if tf.compat.dimension_value(a.shape[dim]) is not None:
          return tf.compat.dimension_value(a.shape[dim])
        return tf.shape(a)[dim]

      num_rows = get_dim_size(-2)
      num_cols = get_dim_size(-1)
      if isinstance(num_rows, int) and isinstance(num_cols, int):
        max_rows_cols = float(max(num_rows, num_cols))
      else:
        max_rows_cols = tf.cast(tf.maximum(num_rows, num_cols), dtype)
      rcond = 10. * max_rows_cols * np.finfo(dtype).eps

    rcond = tf.convert_to_tensor(rcond, dtype=dtype, name='rcond')

    # Calculate pseudo inverse via SVD.
    # Note: if a is symmetric then u == v. (We might observe additional
    # performance by explicitly setting `v = u` in such cases.)
    [
        singular_values,  # Sigma
        left_singular_vectors,  # U
        right_singular_vectors,  # V
    ] = tf.linalg.svd(a, full_matrices=False, compute_uv=True)

    # Saturate small singular values to inf. This has the effect of making
    # `1. / s = 0.` while not resulting in `NaN` gradients.
    cutoff = rcond * tf.reduce_max(singular_values, axis=-1)
    singular_values = tf.where(
        singular_values > cutoff[..., tf.newaxis], singular_values,
        np.array(np.inf, dtype))

    # Although `a == tf.matmul(u, s * v, transpose_b=True)` we swap
    # `u` and `v` here so that `tf.matmul(pinv(A), A) = tf.eye()`, i.e.,
    # a matrix inverse has 'transposed' semantics.
    a_pinv = tf.matmul(
        right_singular_vectors / singular_values[..., tf.newaxis, :],
        left_singular_vectors,
        adjoint_b=True)

    if tensorshape_util.rank(a.shape) is not None:
      a_pinv.set_shape(a.shape[:-2].concatenate([a.shape[-1], a.shape[-2]]))

    return a_pinv
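
# Hedged plain-NumPy sketch of the same recipe: invert only the singular
# values above the cutoff, then swap the roles of U and V. The result agrees
# with np.linalg.pinv for a well-conditioned matrix.
import numpy as np

a = np.array([[1., 0.4, 0.5], [0.4, 0.2, 0.25], [0.5, 0.25, 0.35]])
u, s, vt = np.linalg.svd(a, full_matrices=False)
cutoff = 10. * max(a.shape) * np.finfo(a.dtype).eps * s.max()
s_inv = np.where(s > cutoff, 1. / s, 0.)  # zero out tiny singular values
a_pinv = (vt.T * s_inv) @ u.T
np.testing.assert_allclose(a_pinv, np.linalg.pinv(a), atol=1e-10)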
def _log_prob(self, x):
  if self.input_output_cholesky:
    x_sqrt = x
  else:
    # Complexity: O(nbk**3)
    x_sqrt = tf.linalg.cholesky(x)

  batch_shape = self.batch_shape_tensor()
  event_shape = self.event_shape_tensor()
  x_ndims = tf.rank(x_sqrt)
  num_singleton_axes_to_prepend = (
      tf.maximum(tf.size(batch_shape) + 2, x_ndims) - x_ndims)
  x_with_prepended_singletons_shape = tf.concat([
      tf.ones([num_singleton_axes_to_prepend], dtype=tf.int32),
      tf.shape(x_sqrt)
  ], 0)
  x_sqrt = tf.reshape(x_sqrt, x_with_prepended_singletons_shape)
  ndims = tf.rank(x_sqrt)
  # sample_ndims = ndims - batch_ndims - event_ndims
  sample_ndims = ndims - tf.size(batch_shape) - 2
  sample_shape = tf.shape(x_sqrt)[:sample_ndims]

  # We need to be able to pre-multiply each matrix by its corresponding
  # batch scale matrix. Since a Distribution Tensor supports multiple
  # samples per batch, this means we need to reshape the input matrix `x`
  # so that the first b dimensions are batch dimensions and the last two
  # are of shape [dimension, dimension * number_of_samples]. Doing these
  # gymnastics allows us to do a batch_solve.
  #
  # After we're done with sqrt_solve (the batch operation) we need to undo
  # this reshaping so what we're left with is a Tensor partitionable by
  # sample, batch, event dimensions.

  # Complexity: O(nbk**2) since transpose must access every element.
  scale_sqrt_inv_x_sqrt = x_sqrt
  perm = tf.concat(
      [tf.range(sample_ndims, ndims), tf.range(0, sample_ndims)], 0)
  scale_sqrt_inv_x_sqrt = tf.transpose(a=scale_sqrt_inv_x_sqrt, perm=perm)
  last_dim_size = (
      tf.cast(self.dimension, dtype=tf.int32) *
      tf.reduce_prod(x_with_prepended_singletons_shape[:sample_ndims]))
  shape = tf.concat([
      x_with_prepended_singletons_shape[sample_ndims:-2],
      [tf.cast(self.dimension, dtype=tf.int32), last_dim_size]
  ], axis=0)
  scale_sqrt_inv_x_sqrt = tf.reshape(scale_sqrt_inv_x_sqrt, shape)

  # Complexity: O(nbM*k) where M is the complexity of the operator solving
  # a vector system. For LinearOperatorLowerTriangular, each solve is
  # O(k**2) so this step has complexity O(nbk^3).
  scale_sqrt_inv_x_sqrt = self.scale_operator.solve(scale_sqrt_inv_x_sqrt)

  # Undo make batch-op ready.
  # Complexity: O(nbk**2)
  shape = tf.concat(
      [tf.shape(scale_sqrt_inv_x_sqrt)[:-2], event_shape, sample_shape],
      axis=0)
  scale_sqrt_inv_x_sqrt = tf.reshape(scale_sqrt_inv_x_sqrt, shape)
  perm = tf.concat([
      tf.range(ndims - sample_ndims, ndims),
      tf.range(0, ndims - sample_ndims)
  ], 0)
  scale_sqrt_inv_x_sqrt = tf.transpose(a=scale_sqrt_inv_x_sqrt, perm=perm)

  # Write V = SS', X = LL'. Then:
  # tr[inv(V) X] = tr[inv(S)' inv(S) L L']
  #              = tr[inv(S) L L' inv(S)']
  #              = tr[(inv(S) L) (inv(S) L)']
  #              = sum_{ik} (inv(S) L)_{ik}**2
  # The second equality follows from the cyclic permutation property.
  # Complexity: O(nbk**2)
  trace_scale_inv_x = tf.reduce_sum(
      tf.square(scale_sqrt_inv_x_sqrt), axis=[-2, -1])

  # Complexity: O(nbk)
  half_log_det_x = tf.reduce_sum(
      tf.math.log(tf.linalg.diag_part(x_sqrt)), axis=[-1])

  # Complexity: O(nbk**2)
  log_prob = ((self.df - self.dimension - 1.) * half_log_det_x -
              0.5 * trace_scale_inv_x - self.log_normalization())

  # Set shape hints.
  # Try to merge what we know from the input x with what we know from the
  # parameters of this distribution.
  if (tensorshape_util.rank(x.shape) is not None and
      tensorshape_util.rank(self.batch_shape) is not None):
    tensorshape_util.set_shape(
        log_prob, tf.broadcast_static_shape(x.shape[:-2], self.batch_shape))

  return log_prob
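
# Hedged NumPy sketch of the trace identity in the comment above: with
# V = S S' and X = L L' (Cholesky factors),
# tr[inv(V) X] equals the sum of squares of the entries of inv(S) L.
import numpy as np

v = np.array([[2., 0.3], [0.3, 1.]])
x = np.array([[3., 0.5], [0.5, 2.]])
s = np.linalg.cholesky(v)
l = np.linalg.cholesky(x)
lhs = np.trace(np.linalg.inv(v) @ x)
rhs = np.sum(np.linalg.solve(s, l)**2)  # inv(S) L via a linear solve
np.testing.assert_allclose(lhs, rhs)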
def soft_threshold(x, threshold, name=None):
  """Soft Thresholding operator.

  This operator is defined by the equations

  ```none
                                { x[i] - gamma,  x[i] >   gamma
  SoftThreshold(x, gamma)[i] =  { 0,             |x[i]| <= gamma
                                { x[i] + gamma,  x[i] <  -gamma
  ```

  In the context of proximal gradient methods, we have

  ```none
  SoftThreshold(x, gamma) = prox_{gamma L1}(x)
  ```

  where `prox` is the proximity operator. Thus the soft thresholding
  operator is used in proximal gradient descent for optimizing a smooth
  function with (non-smooth) L1 regularization, as outlined below.

  The proximity operator is defined as:

  ```none
  prox_r(x) = argmin{ r(z) + 0.5 ||x - z||_2**2 : z },
  ```

  where `r` is a (weakly) convex function, not necessarily differentiable.
  Because the L2 norm is strictly convex, the above argmin is unique.

  One important application of the proximity operator is as follows. Let
  `L` be a convex and differentiable function with Lipschitz-continuous
  gradient. Let `R` be a convex lower semicontinuous function which is
  possibly nondifferentiable. Let `gamma` be an arbitrary positive real.
  Then

  ```none
  x_star = argmin{ L(x) + R(x) : x }
  ```

  if and only if the fixed-point equation is satisfied:

  ```none
  x_star = prox_{gamma R}(x_star - gamma grad L(x_star))
  ```

  Proximal gradient descent thus typically consists of choosing an initial
  value `x^{(0)}` and repeatedly applying the update

  ```none
  x^{(k+1)} = prox_{gamma^{(k)} R}(x^{(k)} - gamma^{(k)} grad L(x^{(k)}))
  ```

  where `gamma` is allowed to vary from iteration to iteration. Specializing
  to the case where `R(x) = ||x||_1`, we minimize `L(x) + ||x||_1` by
  repeatedly applying the update

  ```none
  x^{(k+1)} = SoftThreshold(x^{(k)} - gamma grad L(x^{(k)}), gamma)
  ```

  (This idea can also be extended to second-order approximations, although
  the multivariate case does not have a known closed form like above.)

  Args:
    x: `float` `Tensor` representing the input to the SoftThreshold
      function.
    threshold: nonnegative scalar, `float` `Tensor` representing the radius
      of the interval on which each coordinate of SoftThreshold takes the
      value zero. Denoted `gamma` above.
    name: Python string indicating the name of the TensorFlow operation.
      Default value: `'soft_threshold'`.

  Returns:
    softthreshold: `float` `Tensor` with the same shape and dtype as `x`,
      representing the value of the SoftThreshold function.

  #### References

  [1]: Yu, Yao-Liang. The Proximity Operator.
       https://www.cs.cmu.edu/~suvrit/teach/yaoliang_proximity.pdf
  [2]: Wikipedia Contributors. Proximal gradient methods for learning.
       _Wikipedia, The Free Encyclopedia_, 2018.
       https://en.wikipedia.org/wiki/Proximal_gradient_methods_for_learning
  """
  # https://math.stackexchange.com/questions/471339/derivation-of-soft-thresholding-operator
  with tf.name_scope(name or 'soft_threshold'):
    x = tf.convert_to_tensor(x, name='x')
    threshold = tf.convert_to_tensor(
        threshold, dtype=x.dtype, name='threshold')
    return tf.sign(x) * tf.maximum(tf.abs(x) - threshold, 0.)
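
# Hedged NumPy sketch of the one-liner above: shrink each coordinate toward
# zero by `threshold`, zeroing anything inside [-threshold, threshold].
import numpy as np

def soft_threshold(x, threshold):
  return np.sign(x) * np.maximum(np.abs(x) - threshold, 0.)

print(soft_threshold(np.array([-2., -0.5, 0., 0.5, 2.]), 1.))
# ==> [-1.  0.  0.  0.  1.]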
def _validate_sample_arg(self, x):
  """Helper which validates sample arg, e.g., input to `log_prob`."""
  with tf.name_scope('validate_sample_arg'):
    x_ndims = (
        tf.rank(x) if tensorshape_util.rank(x.shape) is None else
        tensorshape_util.rank(x.shape))
    event_ndims = (
        tf.size(self.event_shape_tensor())
        if tensorshape_util.rank(self.event_shape) is None else
        tensorshape_util.rank(self.event_shape))
    batch_ndims = (
        tf.size(self._batch_shape_unexpanded)
        if tensorshape_util.rank(self.batch_shape) is None else
        tensorshape_util.rank(self.batch_shape))
    expected_batch_event_ndims = batch_ndims + event_ndims

    if (isinstance(x_ndims, int) and
        isinstance(expected_batch_event_ndims, int)):
      if x_ndims < expected_batch_event_ndims:
        raise NotImplementedError(
            'Broadcasting is not supported; too few batch and event dims '
            '(expected at least {}, saw {}).'.format(
                expected_batch_event_ndims, x_ndims))
      ndims_assertion = []
    elif self.validate_args:
      ndims_assertion = [
          assert_util.assert_greater_equal(
              x_ndims,
              expected_batch_event_ndims,
              message=('Broadcasting is not supported; too few '
                       'batch and event dims.'),
              name='assert_batch_and_event_ndims_large_enough'),
      ]

    if (tensorshape_util.is_fully_defined(self.batch_shape) and
        tensorshape_util.is_fully_defined(self.event_shape)):
      expected_batch_event_shape = np.int32(
          tensorshape_util.concatenate(self.batch_shape, self.event_shape))
    else:
      expected_batch_event_shape = tf.concat([
          self.batch_shape_tensor(),
          self.event_shape_tensor(),
      ], axis=0)

    sample_ndims = x_ndims - expected_batch_event_ndims
    if isinstance(sample_ndims, int):
      sample_ndims = max(sample_ndims, 0)
    if (isinstance(sample_ndims, int) and
        tensorshape_util.is_fully_defined(x.shape[sample_ndims:])):
      actual_batch_event_shape = np.int32(x.shape[sample_ndims:])
    else:
      sample_ndims = tf.maximum(sample_ndims, 0)
      actual_batch_event_shape = tf.shape(x)[sample_ndims:]

    if (isinstance(expected_batch_event_shape, np.ndarray) and
        isinstance(actual_batch_event_shape, np.ndarray)):
      if any(expected_batch_event_shape != actual_batch_event_shape):
        raise NotImplementedError(
            'Broadcasting is not supported; '
            'unexpected batch and event shape '
            '(expected {}, saw {}).'.format(expected_batch_event_shape,
                                            actual_batch_event_shape))
      # We need to set the final runtime-assertions to `ndims_assertion`
      # since it's possible this assertion was created. We could add a
      # condition to only do so if `self.validate_args == True`, however
      # this is redundant as `ndims_assertion` already encodes this
      # information.
      runtime_assertions = ndims_assertion
    elif self.validate_args:
      # We need to make the `ndims_assertion` a control dep because
      # otherwise TF itself might raise an exception owing to this assertion
      # being ill-defined, i.e., one cannot even compare different rank
      # Tensors.
      with tf.control_dependencies(ndims_assertion):
        shape_assertion = assert_util.assert_equal(
            expected_batch_event_shape,
            actual_batch_event_shape,
            message=('Broadcasting is not supported; '
                     'unexpected batch and event shape.'),
            name='assert_batch_and_event_shape_same')
      runtime_assertions = [shape_assertion]
    else:
      runtime_assertions = []

    return runtime_assertions
def log_ndtr(x, series_order=3, name="log_ndtr"):
  """Log Normal distribution function.

  For details of the Normal distribution function see `ndtr`.

  This function calculates `(log o ndtr)(x)` by either calling
  `log(ndtr(x))` or using an asymptotic series. Specifically:
  - For `x > upper_segment`, use the approximation `-ndtr(-x)` based on
    `log(1-x) ~= -x, x << 1`.
  - For `lower_segment < x <= upper_segment`, use the existing `ndtr`
    technique and take a log.
  - For `x <= lower_segment`, we use the series approximation of `erf` to
    compute the log CDF directly.

  The `lower_segment` is set based on the precision of the input:

  ```
  lower_segment = { -20,  x.dtype=float64
                  { -10,  x.dtype=float32
  upper_segment = {   8,  x.dtype=float64
                  {   5,  x.dtype=float32
  ```

  When `x < lower_segment`, the `ndtr` asymptotic series approximation is:

  ```
  ndtr(x) = scale * (1 + sum) + R_N
  scale   = exp(-0.5 x**2) / (-x sqrt(2 pi))
  sum     = Sum{(-1)^n (2n-1)!! / (x**2)^n, n=1:N}
  R_N     = O(exp(-0.5 x**2) (2N+1)!! / |x|^{2N+3})
  ```

  where `(2n-1)!! = (2n-1) (2n-3) (2n-5) ... (3) (1)` is a
  [double-factorial](https://en.wikipedia.org/wiki/Double_factorial).

  Args:
    x: `Tensor` of type `float32`, `float64`.
    series_order: Positive Python `integer`. Maximum depth to evaluate the
      asymptotic expansion. This is the `N` above.
    name: Python string. A name for the operation (default="log_ndtr").

  Returns:
    log_ndtr: `Tensor` with `dtype=x.dtype`.

  Raises:
    TypeError: if `x.dtype` is not handled.
    TypeError: if `series_order` is not a Python `integer`.
    ValueError: if `series_order` is not in `[0, 30]`.
  """
  if not isinstance(series_order, int):
    raise TypeError("series_order must be a Python integer.")
  if series_order < 0:
    raise ValueError("series_order must be non-negative.")
  if series_order > 30:
    raise ValueError("series_order must be <= 30.")

  with tf.name_scope(name):
    x = tf.convert_to_tensor(x, name="x")

    if dtype_util.base_equal(x.dtype, tf.float64):
      lower_segment = LOGNDTR_FLOAT64_LOWER
      upper_segment = LOGNDTR_FLOAT64_UPPER
    elif dtype_util.base_equal(x.dtype, tf.float32):
      lower_segment = LOGNDTR_FLOAT32_LOWER
      upper_segment = LOGNDTR_FLOAT32_UPPER
    else:
      raise TypeError("x.dtype=%s is not supported." % x.dtype)

    # The basic idea here was ported from:
    #   https://root.cern.ch/doc/v608/SpecFuncCephesInv_8cxx_source.html
    # We copy the main idea, with a few changes:
    # * For x >> 1, and X ~ Normal(0, 1),
    #     Log[P[X < x]] = Log[1 - P[X < -x]] approx -P[X < -x],
    #   which extends the range of validity of this function.
    # * We use one fixed series_order for all of 'x', rather than adaptive.
    # * Our docstring properly reflects that this is an asymptotic series,
    #   not a Taylor series. We also provided a correct bound on the
    #   remainder.
    # * We need to use the max/min in the _log_ndtr_lower arg to avoid nan
    #   when x=0. This happens even though the branch is unchosen because
    #   when x=0 the gradient of a select involves the calculation
    #   1*dy+0*(-inf)=nan regardless of whether dy is finite. Note that the
    #   minimum is a NOP if the branch is chosen.
    return tf.where(
        x > upper_segment,
        -_ndtr(-x),  # log(1-x) ~= -x, x << 1
        tf.where(
            x > lower_segment,
            tf.math.log(_ndtr(tf.maximum(x, lower_segment))),
            _log_ndtr_lower(tf.minimum(x, lower_segment), series_order)))
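
# Hedged SciPy cross-check (not library code): scipy.special.log_ndtr
# implements the same function; a naive log(ndtr(x)) underflows to -inf in
# the far left tail, which is where the asymptotic series above takes over.
import numpy as np
from scipy import special

x = np.array([-40., -10., 0., 5.])
print(special.log_ndtr(x))      # finite even at x = -40
print(np.log(special.ndtr(x)))  # ==> -inf at x = -40 (underflow)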