def _variance(self):
  # Truncated-normal variance: scale**2 * (1 + (a*phi(a) - b*phi(b))/Z
  # - ((phi(a) - phi(b))/Z)**2), where a, b are the standardized bounds
  # and Z is the normalizer.
  var = (tf.square(self.scale) * (
      1. +
      (self._standardized_low * self._normal_pdf(self._standardized_low) -
       self._standardized_high * self._normal_pdf(self._standardized_high)) /
      self._normalizer -
      tf.square((self._normal_pdf(self._standardized_low) -
                 self._normal_pdf(self._standardized_high)) /
                self._normalizer)))
  return var
def normal_conjugates_known_scale_posterior(prior, scale, s, n):
  """Posterior Normal distribution with conjugate prior on the mean.

  This model assumes that `n` observations (with sum `s`) come from a
  Normal with unknown mean `loc` (described by the Normal `prior`)
  and known variance `scale**2`. The "known scale posterior" is
  the distribution of the unknown `loc`.

  Accepts a prior Normal distribution object, having parameters
  `loc0` and `scale0`, as well as known `scale` values of the predictive
  distribution(s) (also assumed Normal),
  and statistical estimates `s` (the sum(s) of the observations) and
  `n` (the number(s) of observations).

  Returns a posterior (also Normal) distribution object, with parameters
  `(loc', scale'**2)`, where:

  ```
  mu ~ N(mu', sigma'**2)
  sigma'**2 = 1/(1/sigma0**2 + n/sigma**2),
  mu' = (mu0/sigma0**2 + s/sigma**2) * sigma'**2.
  ```

  Distribution parameters from `prior`, as well as `scale`, `s`, and `n`
  will broadcast in the case of multidimensional sets of parameters.

  Args:
    prior: `Normal` object of type `dtype`:
      the prior distribution having parameters `(loc0, scale0)`.
    scale: tensor of type `dtype`, taking values `scale > 0`.
      The known stddev parameter(s).
    s: Tensor of type `dtype`. The sum(s) of observations.
    n: Tensor of type `int`. The number(s) of observations.

  Returns:
    A new Normal posterior distribution object for the unknown observation
    mean `loc`.

  Raises:
    TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
      Normal object.
  """
  if not isinstance(prior, normal.Normal):
    raise TypeError("Expected prior to be an instance of type Normal")

  if s.dtype != prior.dtype:
    raise TypeError(
        "Observation sum s.dtype does not match prior dtype: %s vs. %s" %
        (s.dtype, prior.dtype))

  n = tf.cast(n, prior.dtype)
  scale0_2 = tf.square(prior.scale)
  scale_2 = tf.square(scale)
  scalep_2 = 1.0 / (1 / scale0_2 + n / scale_2)
  return normal.Normal(
      loc=(prior.loc / scale0_2 + s / scale_2) * scalep_2,
      scale=tf.sqrt(scalep_2))
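
# A minimal usage sketch (not part of the source above). It assumes this
# function is the one exported by TensorFlow Probability as
# `tfp.distributions.normal_conjugates_known_scale_posterior`; adjust the
# import if the module lives elsewhere.
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

prior = tfd.Normal(loc=0., scale=1.)        # N(loc0=0, scale0=1) prior on `loc`.
observations = tf.constant([2.1, 1.9, 2.3])
posterior = tfd.normal_conjugates_known_scale_posterior(
    prior=prior,
    scale=tf.constant(0.5),                 # Known observation stddev.
    s=tf.reduce_sum(observations),          # Sum of the observations.
    n=tf.size(observations))                # Number of observations.
print(posterior.loc, posterior.scale)       # Posterior mean and stddev of `loc`.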
def _log_ndtr_lower(x, series_order):
  """Asymptotic expansion version of `Log[cdf(x)]`, appropriate for `x<<-1`."""
  x_2 = tf.square(x)
  # Log of the term multiplying (1 + sum).
  log_scale = -0.5 * x_2 - tf.math.log(-x) - 0.5 * np.log(2. * np.pi)
  return log_scale + tf.math.log(_log_ndtr_asymptotic_series(x, series_order))
def log1psquare(x, name=None):
  """Numerically stable calculation of `log(1 + x**2)` for small or large `|x|`.

  For sufficiently large `x` we use the following observation:

  ```none
  log(1 + x**2) = 2 log(|x|) + log(1 + 1 / x**2)
               --> 2 log(|x|)  as x --> inf
  ```

  Numerically, `log(1 + 1 / x**2)` is `0` when `1 / x**2` is small relative to
  machine epsilon.

  Args:
    x: Float `Tensor` input.
    name: Python string indicating the name of the TensorFlow operation.
      Default value: `'log1psquare'`.

  Returns:
    log1psq: Float `Tensor` representing `log(1. + x**2.)`.
  """
  with tf.name_scope(name or 'log1psquare'):
    x = tf.convert_to_tensor(x, dtype_hint=tf.float32, name='x')
    dtype = dtype_util.as_numpy_dtype(x.dtype)

    eps = np.finfo(dtype).eps.astype(np.float64)
    is_large = tf.abs(x) > (eps**-0.5).astype(dtype)

    # Mask out small x's so the gradient correctly propagates.
    abs_large_x = tf.where(is_large, tf.abs(x), tf.ones([], x.dtype))
    return tf.where(is_large,
                    2. * tf.math.log(abs_large_x),
                    tf.math.log1p(tf.square(x)))
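
# A small standalone illustration (not part of the source above) of why the
# large-|x| branch exists: in float32, `x**2` overflows long before
# `2 * log(|x|)` does, so the naive formula returns inf while the rewritten
# form stays finite.
import numpy as np

x = np.float32(3e20)
naive = np.log1p(np.square(x))     # inf: x**2 overflows float32.
stable = 2. * np.log(np.abs(x))    # ~94.3, since log(1 + 1/x**2) ~= 0 here.
print(naive, stable)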
def squared_frobenius_norm(x):
  """Helper to make KL calculation slightly more readable."""
  # http://mathworld.wolfram.com/FrobeniusNorm.html
  # The gradient of KL[p,q] is not defined when p==q. The culprit is
  # tf.norm, i.e., we cannot use the commented out code.
  # return tf.square(tf.norm(x, ord="fro", axis=[-2, -1]))
  return tf.reduce_sum(tf.square(x), axis=[-2, -1])
def _variance(self):
  concentration = tf.convert_to_tensor(self.concentration)
  scale = tf.convert_to_tensor(self.scale)
  var = (tf.square(scale) / tf.square(concentration - 1.) /
         (concentration - 2.))
  if self.allow_nan_stats:
    assertions = []
  else:
    assertions = [
        assert_util.assert_less(
            tf.constant(2., dtype=self.dtype),
            concentration,
            message='variance undefined when any concentration <= 2')
    ]
  with tf.control_dependencies(assertions):
    return tf.where(
        concentration > 2., var,
        dtype_util.as_numpy_dtype(self.dtype)(np.nan))
def _variance(self):
  # Because df is a scalar, we need to expand dimensions to match
  # scale_operator. We use ellipses notation (...) to select all dimensions
  # and add two dimensions to the end.
  df = self.df[..., tf.newaxis, tf.newaxis]
  x = tf.sqrt(df) * self._square_scale_operator()
  d = tf.expand_dims(tf.linalg.diag_part(x), -1)
  v = tf.square(x) + tf.matmul(d, d, adjoint_b=True)
  return v
def _variance(self):
  if distribution_util.is_diagonal_scale(self.scale):
    return tf.square(self.scale.diag_part())
  elif (isinstance(self.scale, tf.linalg.LinearOperatorLowRankUpdate) and
        self.scale.is_self_adjoint):
    return tf.linalg.diag_part(self.scale.matmul(self.scale.to_dense()))
  else:
    return tf.linalg.diag_part(
        self.scale.matmul(self.scale.to_dense(), adjoint_arg=True))
def _covariance(self):
  # Let
  #   W = (w1,...,wk), with wj ~ iid Exponential(0, 1).
  # Then this distribution is
  #   X = loc + LW,
  # and then since Cov(wi, wj) = 1 if i=j, and 0 otherwise,
  #   Cov(X) = L Cov(W W^T) L^T = L L^T.
  if distribution_util.is_diagonal_scale(self.scale):
    return tf.linalg.diag(tf.square(self.scale.diag_part()))
  else:
    return self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)
def _covariance(self):
  if distribution_util.is_diagonal_scale(self.scale):
    mvn_cov = tf.linalg.diag(tf.square(self.scale.diag_part()))
  else:
    mvn_cov = self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)
  cov_shape = tf.concat(
      [self._sample_shape(), self._event_shape_tensor()], -1)
  mvn_cov = tf.broadcast_to(mvn_cov, cov_shape)
  return self._std_var_helper(mvn_cov, 'covariance', 2, lambda x: x)
def _variance(self):
  if distribution_util.is_diagonal_scale(self.scale):
    mvn_var = tf.square(self.scale.diag_part())
  elif (isinstance(self.scale, tf.linalg.LinearOperatorLowRankUpdate) and
        self.scale.is_self_adjoint):
    mvn_var = tf.linalg.diag_part(self.scale.matmul(self.scale.to_dense()))
  else:
    mvn_var = tf.linalg.diag_part(
        self.scale.matmul(self.scale.to_dense(), adjoint_arg=True))
  mvn_var = tf.broadcast_to(mvn_var, self._sample_shape())
  return self._std_var_helper(mvn_var, 'variance', 1, lambda x: x)
def _covariance(self):
  # Let
  #   W = (w1,...,wk), with wj ~ iid Laplace(0, 1).
  # Then this distribution is
  #   X = loc + LW,
  # and since E[X] = loc,
  #   Cov(X) = E[LW W^T L^T] = L E[W W^T] L^T.
  # Since E[wi wj] = 0 if i != j, and 2 if i == j, we have
  #   Cov(X) = 2 LL^T
  if distribution_util.is_diagonal_scale(self.scale):
    return 2. * tf.linalg.diag(tf.square(self.scale.diag_part()))
  else:
    return 2. * self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)
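
# A standalone Monte Carlo check (not part of the source above) of the
# comment's claim that Cov(X) = 2 L L^T when X = loc + L W with W having
# iid Laplace(0, 1) entries (whose variance is 2), written with NumPy only.
import numpy as np

rng = np.random.default_rng(0)
L = np.array([[1.0, 0.0],
              [0.5, 2.0]])
W = rng.laplace(loc=0., scale=1., size=(2, 1_000_000))
X = L @ W
print(np.cov(X))       # Sample covariance of X; approaches...
print(2. * L @ L.T)    # ...the closed form 2 L L^T.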
def _log_ndtr_asymptotic_series(x, series_order):
  """Calculates the asymptotic series used in log_ndtr."""
  npdt = dtype_util.as_numpy_dtype(x.dtype)
  if series_order <= 0:
    return npdt(1)
  x_2 = tf.square(x)
  even_sum = tf.zeros_like(x)
  odd_sum = tf.zeros_like(x)
  x_2n = x_2  # Start with x^{2*1} = x^{2*n} with n = 1.
  for n in range(1, series_order + 1):
    y = npdt(_double_factorial(2 * n - 1)) / x_2n
    if n % 2:
      odd_sum += y
    else:
      even_sum += y
    x_2n *= x_2
  return 1. + even_sum - odd_sum
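
# A standalone numerical check (not part of the source above) of the
# truncated asymptotic series, compared against SciPy's reference
# implementation of log(ndtr(x)). Uses only NumPy/SciPy; `series_order=3`
# is an assumed, illustrative truncation order.
import numpy as np
from scipy.special import factorial2, log_ndtr

def approx_log_ndtr_lower(x, series_order=3):
  """log(ndtr(x)) via the asymptotic expansion, valid for x << -1."""
  x = np.asarray(x, dtype=np.float64)
  series = np.ones_like(x)
  for n in range(1, series_order + 1):
    series += (-1.)**n * factorial2(2 * n - 1) / x**(2 * n)
  return -0.5 * x**2 - np.log(-x) - 0.5 * np.log(2. * np.pi) + np.log(series)

x = np.array([-4., -6., -10.])
print(approx_log_ndtr_lower(x))   # Asymptotic approximation.
print(log_ndtr(x))                # SciPy reference.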
def _variance(self):
  concentration = tf.convert_to_tensor(self.concentration)
  mixing_concentration = tf.convert_to_tensor(self.mixing_concentration)
  mixing_rate = tf.convert_to_tensor(self.mixing_rate)

  variance = (
      tf.square(concentration * mixing_rate / (mixing_concentration - 1.)) /
      (mixing_concentration - 2.))
  if self.allow_nan_stats:
    return tf.where(mixing_concentration > 2., variance,
                    dtype_util.as_numpy_dtype(self.dtype)(np.nan))
  else:
    with tf.control_dependencies([
        assert_util.assert_less(
            tf.ones([], self.dtype) * 2.,
            mixing_concentration,
            message='variance undefined when `mixing_concentration` <= 2')
    ]):
      return tf.identity(variance)
def _variance(self):
  df = tf.convert_to_tensor(self.df)
  scale = tf.convert_to_tensor(self.scale)
  # We need to put the tf.where inside the outer tf.where to ensure we never
  # hit a NaN in the gradient.
  denom = tf.where(df > 2., df - 2., tf.ones_like(df))
  # Abs(scale) superfluous.
  var = (tf.ones(self._batch_shape_tensor(df=df, scale=scale),
                 dtype=self.dtype) * tf.square(scale) * df / denom)
  # When 1 < df <= 2, variance is infinite.
  result_where_defined = tf.where(
      df > 2., var, dtype_util.as_numpy_dtype(self.dtype)(np.inf))

  if self.allow_nan_stats:
    return tf.where(df > 1., result_where_defined,
                    dtype_util.as_numpy_dtype(self.dtype)(np.nan))
  else:
    return distribution_util.with_dependencies([
        assert_util.assert_less(
            tf.ones([], dtype=self.dtype),
            df,
            message='variance not defined for components of df <= 1'),
    ], result_where_defined)
def _mean_of_covariance_given_quadrature_component(self, diag_only):
  p = self.mixture_distribution.probs_parameter()

  # To compute E[Cov(Z|V)], we'll add matrices within three categories:
  # scaled-identity, diagonal, and full. Then we'll combine these at the end.
  scale_identity_multiplier = None
  diag = None
  full = None

  for k, aff in enumerate(self.interpolated_affine):
    s = aff.scale  # Just in case aff.scale has side-effects, we'll call once.
    if (s is None or
        isinstance(s, tf.linalg.LinearOperatorIdentity)):
      scale_identity_multiplier = add(scale_identity_multiplier,
                                      p[..., k, tf.newaxis])
    elif isinstance(s, tf.linalg.LinearOperatorScaledIdentity):
      scale_identity_multiplier = add(
          scale_identity_multiplier,
          (p[..., k, tf.newaxis] * tf.square(s.multiplier)))
    elif isinstance(s, tf.linalg.LinearOperatorDiag):
      diag = add(diag, (p[..., k, tf.newaxis] * tf.square(s.diag_part())))
    else:
      x = (p[..., k, tf.newaxis, tf.newaxis] *
           s.matmul(s.to_dense(), adjoint_arg=True))
      if diag_only:
        x = tf.linalg.diag_part(x)
      full = add(full, x)

  # We must now account for the fact that the base distribution might have a
  # non-unity variance. Recall that, since X ~ iid Law(X_0),
  #   `Cov(SX+m) = S Cov(X) S.T = S S.T Diag(Var(X_0))`.
  # We can scale by `Var(X)` (vs `Cov(X)`) since X corresponds to `d` iid
  # samples from a scalar-event distribution.
  v = self.distribution.variance()
  if scale_identity_multiplier is not None:
    scale_identity_multiplier = scale_identity_multiplier * v
  if diag is not None:
    diag = diag * v[..., tf.newaxis]
  if full is not None:
    full = full * v[..., tf.newaxis]

  if diag_only:
    # Apparently we don't need the full matrix, just the diagonal.
    r = add(diag, full)
    if r is None and scale_identity_multiplier is not None:
      ones = tf.ones(self.event_shape_tensor(), dtype=self.dtype)
      return scale_identity_multiplier[..., tf.newaxis] * ones
    return add(r, scale_identity_multiplier)

  # `None` indicates we don't know if the result is positive-definite.
  is_positive_definite = (
      True if all(aff.scale.is_positive_definite
                  for aff in self.endpoint_affine)
      else None)

  to_add = []
  if diag is not None:
    to_add.append(
        tf.linalg.LinearOperatorDiag(
            diag=diag, is_positive_definite=is_positive_definite))
  if full is not None:
    to_add.append(
        tf.linalg.LinearOperatorFullMatrix(
            matrix=full, is_positive_definite=is_positive_definite))
  if scale_identity_multiplier is not None:
    to_add.append(
        tf.linalg.LinearOperatorScaledIdentity(
            num_rows=self.event_shape_tensor()[0],
            multiplier=scale_identity_multiplier,
            is_positive_definite=is_positive_definite))

  return (linop_add_lib.add_operators(to_add)[0].to_dense()
          if to_add else None)
def _stddev(self):
  # Empirical standard deviation: root of the mean squared deviation of the
  # stored samples about their mean, taken along the samples axis.
  samples = tf.convert_to_tensor(self._samples)
  axis = self._samples_axis
  r = samples - tf.expand_dims(self._mean(samples), axis=axis)
  var = tf.reduce_mean(tf.square(r), axis=axis)
  return tf.sqrt(var)
def _log_prob(self, x):
  scale = tf.convert_to_tensor(self.scale)
  log_unnormalized_prob = -tf.math.log1p(tf.square(self._z(x, scale=scale)))
  log_normalization = np.log(np.pi) + tf.math.log(scale)
  return log_unnormalized_prob - log_normalization
def _forward_log_det_jacobian(self, x):
  # Log of the standard normal density evaluated at x.
  return -0.5 * np.log(2 * np.pi) - tf.square(x) / 2.
def _log_prob(self, x):
  if self.input_output_cholesky:
    x_sqrt = x
  else:
    # Complexity: O(nbk**3)
    x_sqrt = tf.linalg.cholesky(x)

  batch_shape = self.batch_shape_tensor()
  event_shape = self.event_shape_tensor()
  x_ndims = tf.rank(x_sqrt)
  num_singleton_axes_to_prepend = (
      tf.maximum(tf.size(batch_shape) + 2, x_ndims) - x_ndims)
  x_with_prepended_singletons_shape = tf.concat([
      tf.ones([num_singleton_axes_to_prepend], dtype=tf.int32),
      tf.shape(x_sqrt)
  ], 0)
  x_sqrt = tf.reshape(x_sqrt, x_with_prepended_singletons_shape)
  ndims = tf.rank(x_sqrt)
  # sample_ndims = ndims - batch_ndims - event_ndims
  sample_ndims = ndims - tf.size(batch_shape) - 2
  sample_shape = tf.shape(x_sqrt)[:sample_ndims]

  # We need to be able to pre-multiply each matrix by its corresponding
  # batch scale matrix. Since a Distribution Tensor supports multiple
  # samples per batch, this means we need to reshape the input matrix `x`
  # so that the first b dimensions are batch dimensions and the last two
  # are of shape [dimension, dimensions*number_of_samples]. Doing these
  # gymnastics allows us to do a batch_solve.
  #
  # After we're done with sqrt_solve (the batch operation) we need to undo
  # this reshaping so what we're left with is a Tensor partitionable by
  # sample, batch, event dimensions.

  # Complexity: O(nbk**2) since transpose must access every element.
  scale_sqrt_inv_x_sqrt = x_sqrt
  perm = tf.concat([tf.range(sample_ndims, ndims),
                    tf.range(0, sample_ndims)], 0)
  scale_sqrt_inv_x_sqrt = tf.transpose(a=scale_sqrt_inv_x_sqrt, perm=perm)
  last_dim_size = (
      tf.cast(self.dimension, dtype=tf.int32) *
      tf.reduce_prod(x_with_prepended_singletons_shape[:sample_ndims]))
  shape = tf.concat(
      [x_with_prepended_singletons_shape[sample_ndims:-2],
       [tf.cast(self.dimension, dtype=tf.int32), last_dim_size]],
      axis=0)
  scale_sqrt_inv_x_sqrt = tf.reshape(scale_sqrt_inv_x_sqrt, shape)

  # Complexity: O(nbM*k) where M is the complexity of the operator solving a
  # vector system. For LinearOperatorLowerTriangular, each solve is O(k**2) so
  # this step has complexity O(nbk^3).
  scale_sqrt_inv_x_sqrt = self.scale_operator.solve(scale_sqrt_inv_x_sqrt)

  # Undo make batch-op ready.
  # Complexity: O(nbk**2)
  shape = tf.concat(
      [tf.shape(scale_sqrt_inv_x_sqrt)[:-2], event_shape, sample_shape],
      axis=0)
  scale_sqrt_inv_x_sqrt = tf.reshape(scale_sqrt_inv_x_sqrt, shape)
  perm = tf.concat([
      tf.range(ndims - sample_ndims, ndims),
      tf.range(0, ndims - sample_ndims)
  ], 0)
  scale_sqrt_inv_x_sqrt = tf.transpose(a=scale_sqrt_inv_x_sqrt, perm=perm)

  # Write V = SS', X = LL'. Then:
  # tr[inv(V) X] = tr[inv(S)' inv(S) L L']
  #              = tr[inv(S) L L' inv(S)']
  #              = tr[(inv(S) L) (inv(S) L)']
  #              = sum_{ik} (inv(S) L)_{ik}**2
  # The second equality follows from the cyclic permutation property.
  # Complexity: O(nbk**2)
  trace_scale_inv_x = tf.reduce_sum(
      tf.square(scale_sqrt_inv_x_sqrt), axis=[-2, -1])

  # Complexity: O(nbk)
  half_log_det_x = tf.reduce_sum(
      tf.math.log(tf.linalg.diag_part(x_sqrt)), axis=[-1])

  # Complexity: O(nbk**2)
  log_prob = ((self.df - self.dimension - 1.) * half_log_det_x -
              0.5 * trace_scale_inv_x -
              self.log_normalization())

  # Set shape hints.
  # Try to merge what we know from the input x with what we know from the
  # parameters of this distribution.
  if (tensorshape_util.rank(x.shape) is not None and
      tensorshape_util.rank(self.batch_shape) is not None):
    tensorshape_util.set_shape(
        log_prob,
        tf.broadcast_static_shape(x.shape[:-2], self.batch_shape))

  return log_prob
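
# A standalone numeric check (not part of the source above) of the trace
# identity used for `trace_scale_inv_x`: with V = S S' and X = L L',
# tr[inv(V) X] equals the squared Frobenius norm of inv(S) L. NumPy only.
import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(3, 3)); V = A @ A.T + 3. * np.eye(3)   # SPD scale matrix.
B = rng.normal(size=(3, 3)); X = B @ B.T + 3. * np.eye(3)   # SPD observation.
S = np.linalg.cholesky(V)
L = np.linalg.cholesky(X)
lhs = np.trace(np.linalg.solve(V, X))
rhs = np.sum(np.linalg.solve(S, L)**2)
print(lhs, rhs)   # Agree to numerical precision.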
def _variance(self):
  return self.concentration / tf.square(self.rate)
def _covariance(self):
  if distribution_util.is_diagonal_scale(self.scale):
    return tf.linalg.diag(tf.square(self.scale.diag_part()))
  else:
    return self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)
def _normal_pdf(self, x):
  return 1. / np.sqrt(2 * np.pi) * tf.exp(-0.5 * tf.square(x))
def _variance(self):
  return tf.square(self.range()) / 12.
def _forward(self, x):
  with tf.control_dependencies(self._assertions(x)):
    return tf.square(x)