def __init__(self, loc):
  """Stores `loc` as a tensor and initializes the parent class.

  Args:
    loc: Value convertible to a `Tensor`; kept on the instance as `_loc`.
  """
  loc_tensor = tf.convert_to_tensor(loc)
  self._loc = loc_tensor

  # Fixed configuration forwarded to the parent initializer.
  parent_kwargs = dict(
      dtype=tf.float32,
      reparameterization_type=tfd.FULLY_REPARAMETERIZED,
      validate_args=False,
      allow_nan_stats=False)
  super(StatefulNormal, self).__init__(**parent_kwargs)
def log_average_probs(logits, sample_axis=0, event_axis=None, keepdims=False,
                      validate_args=False, name=None):
  """Computes `log(average(to_probs(logits)))` in a numerically stable manner.

  The meaning of `to_probs` is controlled by the `event_axis` argument. When
  `event_axis` is `None`, `to_probs = tf.math.sigmoid` and otherwise
  `to_probs = lambda x: tf.math.softmax(x, axis=event_axis)`.

  `sample_axis` and `event_axis` should have a null intersection. This
  requirement is always verified when `validate_args` is `True`.

  Args:
    logits: A `float` `Tensor` representing logits.
    sample_axis: Scalar or vector `Tensor` designating axis holding samples, or
      `None` (meaning all axis hold samples).
      Default value: `0` (leftmost dimension).
    event_axis: Scalar or vector `Tensor` designating the axis representing
      categorical logits.
      Default value: `None` (i.e., Bernoulli logits).
    keepdims: Boolean. Whether to keep the sample axis as singletons.
      Default value: `False` (i.e., squeeze the reduced dimensions).
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
      Default value: `False` (i.e., do not validate args).
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., `'log_average_probs'`).

  Returns:
    log_avg_probs: The natural log of the average of probs computed from
      logits.
  """
  # The default scope name matches the documented default (and the function
  # name) rather than the stale 'average_sigmoid'.
  with tf.name_scope(name or 'log_average_probs'):
    logits = tf.convert_to_tensor(logits, dtype_hint=tf.float32, name='logits')
    if sample_axis is not None:
      sample_axis = tf.convert_to_tensor(
          sample_axis, dtype_hint=tf.int32, name='sample_axis')
    if event_axis is not None:
      event_axis = tf.convert_to_tensor(
          event_axis, dtype_hint=tf.int32, name='event_axis')

    if event_axis is None:
      # Bernoulli logits:
      # log(sigmoid(x)) = log(1 / (1 + exp(-x))) = -log1p(exp(-x)) = -sp(-x)
      log_probs = -tf.math.softplus(-logits)
    else:
      # Categorical logits: normalize over `event_axis` in log space.
      sample_axis, event_axis = _log_average_probs_process_args(
          logits, validate_args, sample_axis, event_axis)
      with tf.control_dependencies(
          _log_average_probs_maybe_check_args(
              sample_axis, event_axis, validate_args)):
        log_probs = _log_softmax(logits, axis=event_axis)

    # Averaging probabilities is a log-mean-exp of the log-probabilities.
    return reduce_logmeanexp(log_probs, axis=sample_axis, keepdims=keepdims)
def reduce_weighted_logsumexp(logx, w=None, axis=None, keep_dims=False,
                              return_sign=False, name=None):
  """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.

  When every weight in `w` is known to be positive, prefer plain
  `reduce_logsumexp`: `tf.reduce_logsumexp(logx + tf.math.log(w))` is more
  efficient than `du.reduce_weighted_logsumexp(logx, w)`.

  The reduction runs over the dimensions listed in `axis`. Each entry of
  `axis` removes one dimension from the result unless `keep_dims` is true, in
  which case reduced dimensions are kept with length 1. With no `axis`, all
  dimensions are reduced and a single-element tensor is returned.

  Compared with `log(sum(w * exp(input)))` this is numerically more stable: it
  avoids overflow from exponentiating large inputs and underflow from taking
  the log of small inputs.

  For example:

  ```python
  x = tf.constant([[0., 0, 0],
                   [0, 0, 0]])

  w = tf.constant([[-1., 1, 1],
                   [1, 1, 1]])

  du.reduce_weighted_logsumexp(x, w)
  # ==> log(-1*1 + 1*1 + 1*1 + 1*1 + 1*1 + 1*1) = log(4)

  du.reduce_weighted_logsumexp(x, w, axis=0)
  # ==> [log(-1+1), log(1+1), log(1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1)
  # ==> [log(-1+1+1), log(1+1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)
  # ==> [[log(-1+1+1)], [log(1+1+1)]]

  du.reduce_weighted_logsumexp(x, w, axis=[0, 1])
  # ==> log(-1+5)
  ```

  Args:
    logx: The tensor to reduce. Should have numeric type.
    w: The weight tensor. Should have numeric type identical to `logx`.
    axis: The dimensions to reduce. If `None` (the default), reduces all
      dimensions. Must be in the range `[-rank(input_tensor),
      rank(input_tensor))`.
    keep_dims: If true, retains reduced dimensions with length 1.
    return_sign: If `True`, returns the sign of the result.
    name: A name for the operation (optional).

  Returns:
    lswe: The `log(abs(sum(weight * exp(x))))` reduced tensor.
    sign: (Optional) The sign of `sum(weight * exp(x))`.
  """
  with tf.name_scope(name or 'reduce_weighted_logsumexp'):
    logx = tf.convert_to_tensor(logx, name='logx')

    # Unweighted case: an ordinary logsumexp whose sign is always +1.
    if w is None:
      unweighted = tf.reduce_logsumexp(logx, axis=axis, keepdims=keep_dims)
      if not return_sign:
        return unweighted
      return unweighted, tf.ones_like(unweighted)

    w = tf.convert_to_tensor(w, dtype=logx.dtype, name='w')
    log_abs_terms = logx + tf.math.log(tf.abs(w))
    shift = tf.reduce_max(log_abs_terms, axis=axis, keepdims=True)
    # Any shift is mathematically valid as long as it is added back after the
    # `log(sum(...))`. When the max is `-inf` or `inf`, subtracting it would
    # produce `inf - inf = NaN`, so shift by zero instead.
    shift = tf.where(
        tf.math.is_inf(shift), tf.zeros([], shift.dtype), shift)
    signed_scaled_terms = tf.sign(w) * tf.exp(log_abs_terms - shift)
    signed_scaled_sum = tf.reduce_sum(
        signed_scaled_terms, axis=axis, keepdims=keep_dims)
    if not keep_dims:
      shift = tf.squeeze(shift, axis)
    result_sign = tf.sign(signed_scaled_sum)
    log_magnitude = shift + tf.math.log(result_sign * signed_scaled_sum)

    if return_sign:
      return log_magnitude, result_sign
    return log_magnitude
def __init__(self, num_timesteps, design_matrix, drift_scale,
             initial_state_prior, observation_noise_scale=0., initial_step=0,
             validate_args=False, allow_nan_stats=True, name=None):
  """State space model for a dynamic linear regression.

  Args:
    num_timesteps: Scalar `int` `Tensor` number of timesteps to model
      with this distribution.
    design_matrix: float `Tensor` of shape `concat([batch_shape,
      [num_timesteps, num_features]])`.
    drift_scale: Scalar (any additional dimensions are treated as batch
      dimensions) `float` `Tensor` indicating the standard deviation of the
      latent state transitions.
    initial_state_prior: instance of `tfd.MultivariateNormal`
      representing the prior distribution on latent states. Must have
      event shape `[num_features]`.
    observation_noise_scale: Scalar (any additional dimensions are
      treated as batch dimensions) `float` `Tensor` indicating the standard
      deviation of the observation noise.
      Default value: `0.`.
    initial_step: scalar `int` `Tensor` specifying the starting timestep.
      Default value: `0`.
    validate_args: Python `bool`. Whether to validate input with asserts. If
      `validate_args` is `False`, and the inputs are invalid, correct behavior
      is not guaranteed.
      Default value: `False`.
    allow_nan_stats: Python `bool`. If `False`, raise an exception if a
      statistic (e.g. mean/mode/etc...) is undefined for any batch member. If
      `True`, batch members with valid parameters leading to undefined
      statistics will return NaN for this statistic.
      Default value: `True`.
    name: Python `str` name prefixed to ops created by this class.
      Default value: 'DynamicLinearRegressionStateSpaceModel'.
  """
  with tf.name_scope(
      name or 'DynamicLinearRegressionStateSpaceModel') as name:
    dtype = dtype_util.common_dtype(
        [design_matrix, drift_scale, initial_state_prior])

    design_matrix = tf.convert_to_tensor(
        value=design_matrix, name='design_matrix', dtype=dtype)
    # Time-major copy so a timestep can be selected with a single gather.
    time_major_design = distribution_util.move_dimension(
        design_matrix, -2, 0)

    drift_scale = tf.convert_to_tensor(
        value=drift_scale, name='drift_scale', dtype=dtype)
    observation_noise_scale = tf.convert_to_tensor(
        value=observation_noise_scale,
        name='observation_noise_scale',
        dtype=dtype)

    num_features = prefer_static.shape(design_matrix)[-1]

    def _observation_matrix_at(t):
      # The design-matrix row for step `t`, with a new second-to-last axis.
      design_row = tf.gather(time_major_design, t)[..., tf.newaxis, :]
      return tf.linalg.LinearOperatorFullMatrix(
          design_row, name='observation_matrix')

    self._drift_scale = drift_scale
    self._observation_noise_scale = observation_noise_scale

    # Identity transition: the only dynamics come from the transition noise.
    identity_transition = tf.linalg.LinearOperatorIdentity(
        num_rows=num_features, dtype=dtype, name='transition_matrix')
    drift_noise = tfd.MultivariateNormalDiag(
        scale_diag=(drift_scale[..., tf.newaxis] *
                    tf.ones([num_features], dtype=dtype)),
        name='transition_noise')
    measurement_noise = tfd.MultivariateNormalDiag(
        scale_diag=observation_noise_scale[..., tf.newaxis],
        name='observation_noise')

    super(DynamicLinearRegressionStateSpaceModel, self).__init__(
        num_timesteps=num_timesteps,
        transition_matrix=identity_transition,
        transition_noise=drift_noise,
        observation_matrix=_observation_matrix_at,
        observation_noise=measurement_noise,
        initial_state_prior=initial_state_prior,
        initial_step=initial_step,
        allow_nan_stats=allow_nan_stats,
        validate_args=validate_args,
        name=name)
def covariance(x, y=None, sample_axis=0, event_axis=-1, keepdims=False,
               name=None):
  """Sample covariance between observations indexed by `event_axis`.

  Given `N` samples of scalar random variables `X` and `Y`, covariance may be
  estimated as

  ```none
  Cov[X, Y] := N^{-1} sum_{n=1}^N (X_n - Xbar) Conj{(Y_n - Ybar)}
  Xbar := N^{-1} sum_{n=1}^N X_n
  Ybar := N^{-1} sum_{n=1}^N Y_n
  ```

  For vector-variate random variables `X = (X1, ..., Xd)`, `Y = (Y1, ..., Yd)`,
  one is often interested in the covariance matrix, `C_{ij} := Cov[Xi, Yj]`.

  ```python
  x = tf.random.normal(shape=(100, 2, 3))
  y = tf.random.normal(shape=(100, 2, 3))

  # cov[i, j] is the sample covariance between x[:, i, j] and y[:, i, j].
  cov = tfp.stats.covariance(x, y, sample_axis=0, event_axis=None)

  # cov_matrix[i, m, n] is the sample covariance of x[:, i, m] and y[:, i, n]
  cov_matrix = tfp.stats.covariance(x, y, sample_axis=0, event_axis=-1)
  ```

  Notice we divide by `N`, which does not create `NaN` when `N = 1`, but is
  slightly biased.

  `x` and `y` must have compatible static shapes; this is checked with
  `tensorshape_util.assert_is_compatible_with`, which raises on a mismatch.

  Args:
    x: A numeric `Tensor` holding samples.
    y: Optional `Tensor` with same `dtype` and `shape` as `x`.
      Default value: `None` (`y` is effectively set to `x`).
    sample_axis: Scalar or vector `Tensor` designating axis holding samples, or
      `None` (meaning all axis hold samples).
      Default value: `0` (leftmost dimension).
    event_axis: Scalar or vector `Tensor`, or `None` (scalar events).
      Axis indexing random events, whose covariance we are interested in.
      If a vector, entries must form a contiguous block of dims. `sample_axis`
      and `event_axis` should not intersect.
      Default value: `-1` (rightmost axis holds events).
    keepdims: Boolean. Whether to keep the sample axis as singletons.
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., `'covariance'`).

  Returns:
    cov: A `Tensor` of same `dtype` as the `x`, and rank equal to
      `rank(x) - len(sample_axis) + 2 * len(event_axis)`.

  Raises:
    ValueError: If `sample_axis` is `None` while `event_axis` is not `None`.
    ValueError: If `sample_axis` and `event_axis` are found to overlap.
    ValueError: If `event_axis` is found to not be contiguous.
  """
  with tf.name_scope(name or 'covariance'):
    x = tf.convert_to_tensor(x, name='x')
    # Covariance *only* uses the centered versions of x (and y).
    x = x - tf.reduce_mean(x, axis=sample_axis, keepdims=True)

    if y is None:
      y = x
    else:
      y = tf.convert_to_tensor(y, name='y', dtype=x.dtype)
      # If x and y have different shape, sample_axis and event_axis will likely
      # be wrong for one of them!
      tensorshape_util.assert_is_compatible_with(x.shape, y.shape)
      y = y - tf.reduce_mean(y, axis=sample_axis, keepdims=True)

    if event_axis is None:
      # Scalar events: an elementwise product averaged over the samples.
      return tf.reduce_mean(
          x * tf.math.conj(y), axis=sample_axis, keepdims=keepdims)

    if sample_axis is None:
      raise ValueError(
          'sample_axis was None, which means all axis hold samples, and this '
          'overlaps with event_axis ({})'.format(event_axis))

    event_axis = _make_positive_axis(event_axis, ps.rank(x))
    sample_axis = _make_positive_axis(sample_axis, ps.rank(x))

    # If we get lucky and axis is statically defined, we can do some checks.
    if _is_list_like(event_axis) and _is_list_like(sample_axis):
      event_axis = tuple(map(int, event_axis))
      sample_axis = tuple(map(int, sample_axis))
      if set(event_axis).intersection(sample_axis):
        raise ValueError(
            'sample_axis ({}) and event_axis ({}) overlapped'.format(
                sample_axis, event_axis))
      if (np.diff(np.array(sorted(event_axis))) > 1).any():
        raise ValueError(
            'event_axis must be contiguous. Found: {}'.format(event_axis))
      batch_axis = list(
          sorted(
              set(range(tensorshape_util.rank(
                  x.shape))).difference(sample_axis + event_axis)))
    else:
      batch_axis = ps.setdiff1d(
          ps.range(0, ps.rank(x)), ps.concat((sample_axis, event_axis), 0))

    event_axis = ps.cast(event_axis, dtype=tf.int32)
    sample_axis = ps.cast(sample_axis, dtype=tf.int32)
    batch_axis = ps.cast(batch_axis, dtype=tf.int32)

    # Permute x/y until shape = B + E + S
    perm_for_xy = ps.concat((batch_axis, event_axis, sample_axis), 0)
    x_permed = tf.transpose(a=x, perm=perm_for_xy)
    y_permed = tf.transpose(a=y, perm=perm_for_xy)

    batch_ndims = ps.size(batch_axis)
    batch_shape = ps.shape(x_permed)[:batch_ndims]
    event_ndims = ps.size(event_axis)
    event_shape = ps.shape(x_permed)[batch_ndims:batch_ndims + event_ndims]
    sample_shape = ps.shape(x_permed)[batch_ndims + event_ndims:]
    sample_ndims = ps.size(sample_shape)
    n_samples = ps.reduce_prod(sample_shape)
    n_events = ps.reduce_prod(event_shape)

    # Flatten the event dims into one long dim and the sample dims into
    # another, in a single reshape. (A separate sample-only flatten was
    # previously computed and immediately discarded.)
    flat_shape = ps.concat((batch_shape, [n_events], [n_samples]), 0)
    x_permed_flat = tf.reshape(x_permed, flat_shape)
    y_permed_flat = tf.reshape(y_permed, flat_shape)

    # After matmul, cov.shape = batch_shape + [n_events, n_events]
    cov = tf.matmul(
        x_permed_flat, y_permed_flat, adjoint_b=True) / ps.cast(
            n_samples, x.dtype)

    # Insert some singletons to make
    # cov.shape = batch_shape + event_shape**2 + [1,...,1]
    # This is just like x_permed.shape, except the sample_axis is all 1's, and
    # the [n_events] became event_shape**2.
    cov = tf.reshape(
        cov,
        ps.concat(
            (
                batch_shape,
                # event_shape**2 used here because it is the same length as
                # event_shape, and has the same number of elements as one
                # batch of covariance.
                event_shape**2,
                ps.ones([sample_ndims], tf.int32)),
            0))
    # Permuting by the argsort inverts the permutation, making
    # cov.shape have ones in the position where there were samples, and
    # [n_events * n_events] in the event position.
    cov = tf.transpose(a=cov, perm=ps.invert_permutation(perm_for_xy))

    # Now expand event_shape**2 into event_shape + event_shape.
    # We here use (for the first time) the fact that we require event_axis to be
    # contiguous.
    e_start = event_axis[0]
    e_len = 1 + event_axis[-1] - event_axis[0]
    cov = tf.reshape(
        cov,
        ps.concat((ps.shape(cov)[:e_start], event_shape, event_shape,
                   ps.shape(cov)[e_start + e_len:]), 0))

    # tf.squeeze requires python ints for axis, not Tensor. This is enough to
    # require our axis args to be constants.
    if not keepdims:
      # Sample axes located after the event block moved right by `e_len` when
      # the event dims were duplicated above.
      squeeze_axis = ps.where(sample_axis < e_start, sample_axis,
                              sample_axis + e_len)
      cov = _squeeze(cov, axis=squeeze_axis)

    return cov
def value(self):
  """Returns the constant `42.` converted to a tensor."""
  constant = 42.
  return tf.convert_to_tensor(constant)
def __init__(self, input_):
  """Keeps `input_` and sets `value` to a fixed nested-list tensor.

  Args:
    input_: Arbitrary object stored unchanged on the instance as `_input`.
  """
  self._input = input_
  fixed_value = [[42.]]
  self.value = tf.convert_to_tensor(fixed_value)
def __init__(self, leverage_fn, variance_process, risk_free_rate=None,
             dividend_yield=None, rho=None, dtype=None, name=None):
  """Initializes the Local stochastic volatility model.

  Args:
    leverage_fn: A Python callable which returns the Leverage function
      `L(t, S(t))` as a function of state and time. The function must accept
      a scalar `Tensor` corresponding to time 't' and a real `Tensor` of shape
      `[num_samples, 1]` corresponding to the underlying price (S) as
      inputs and return a real `Tensor` containing the leverage function
      computed at (S,t).
    variance_process: An instance of `ItoProcess` specifying the
      dynamics of the variance process of the LSV model. The
      `variance_process` should implement a one-factor stochastic process.
      For the common version of Heston like variance model use
      `LSVVarianceModel`.
    risk_free_rate: An optional scalar real `Tensor` specifying the
      (continuously compounded) risk free interest rate. If the underlying is
      an FX rate, then use this input to specify the domestic interest rate.
      Note that the current implementation supports constant interest rates
      and dividend yield.
      Default value: `None` in which case the input is set to zero.
    dividend_yield: An optional real scalar `Tensor` specifying the
      (continuously compounded) dividend yield. If the underlying is an FX
      rate, then use this input to specify the foreign interest rate.
      Note that the current implementation supports constant interest rates
      and dividend yield.
      Default value: `None` in which case the input is set to zero.
    rho: A real scalar `Tensor` specifying the correlation between the
      underlying spot price and the variance process.
      Default value: `None` in which case cross correlations are assumed
      to be zero.
    dtype: The default dtype to use when converting values to `Tensor`s.
      Default value: `None` which means that the dtype is taken from the
      converted `risk_free_rate`.
    name: Python string. The name to give to the ops created by this class.
      Default value: `None` which maps to the default name
      `local_stochastic_volatility_model`.
  """
  self._name = name or "local_stochastic_volatility_model"
  with tf.name_scope(self._name):
    if risk_free_rate is None:
      risk_free_rate = 0.0
    if dividend_yield is None:
      dividend_yield = 0.0
    self._risk_free_rate = tf.convert_to_tensor(risk_free_rate, dtype=dtype)
    # Fall back to the dtype of the converted rate. The previous code read
    # `self._domestic_rate`, which is never assigned, so `dtype=None` always
    # raised AttributeError.
    self._dtype = dtype or self._risk_free_rate.dtype
    # Convert with the resolved dtype so both rates agree in the drift.
    self._dividend_yield = tf.convert_to_tensor(
        dividend_yield, dtype=self._dtype)
    self._leverage_fn = leverage_fn
    self._variance_process = variance_process
    # One dimension for the spot plus the variance process dimensions.
    dim = 1 + variance_process.dim()
    # Compare against `None` explicitly: truth-testing a `Tensor` is either
    # an error (graph mode) or silently replaces a zero-valued tensor.
    if rho is None:
      rho = 0.0
    self._rho = _create_corr_matrix(rho, self._dtype)
    self._sqrt_rho = tf.linalg.cholesky(self._rho)

  def _vol_fn(t, state):
    """Volatility function of LSV model."""
    num_samples = state.shape.as_list()[0]
    broadcasted_t = tf.broadcast_to(t, [1, num_samples])
    # Column 0 of the state is the spot; remaining columns are the variance.
    spot_prices = state[:, 0]
    variance = state[:, 1:]
    level_fun = self._leverage_fn(
        broadcasted_t, tf.expand_dims(spot_prices, axis=0))
    spot_diffusion = tf.expand_dims(
        level_fun[0, :], axis=-1) * tf.expand_dims(
            spot_prices, axis=-1) * tf.math.sqrt(variance)
    variance_diffusion = self._variance_process.volatility_fn()(
        t, variance)

    diffusion = tf.concat([spot_diffusion, variance_diffusion], axis=1)
    diffusion = tf.expand_dims(diffusion, axis=-2)
    # Broadcast against the Cholesky factor of the correlation matrix.
    return diffusion * self._sqrt_rho

  # Drift function
  def _drift_fn(t, state):
    """Drift function of LSV model."""
    spot_drift = (self._risk_free_rate - self._dividend_yield) * state[:, :1]
    variance_drift = self._variance_process.drift_fn()(t, state[:, 1:])
    return tf.concat([spot_drift, variance_drift], axis=1)

  super(LocalStochasticVolatilityModel, self).__init__(
      dim, _drift_fn, _vol_fn, self._dtype, self._name)
def expected_calibration_error(num_bins, logits=None, labels_true=None,
                               labels_predicted=None, name=None):
  """Compute the Expected Calibration Error (ECE).

  Implements equation (3) of [1], where the probability that the decided
  label is correct is used to estimate how well calibrated the predictor is.

  Note: there is a trade-off between a small `num_bins` and the reliability of
  the ECE estimate. In particular, the estimate may be unreliable when some
  bins contain few samples. As an alternative, consider
  `bayesian_expected_calibration_error`.

  #### References
  [1]: Chuan Guo, Geoff Pleiss, Yu Sun, Kilian Q. Weinberger,
       On Calibration of Modern Neural Networks.
       Proceedings of the 34th International Conference on Machine Learning
       (ICML 2017).
       arXiv:1706.04599
       https://arxiv.org/pdf/1706.04599.pdf

  Args:
    num_bins: int, number of probability bins, e.g. 10.
    logits: Tensor, (n,nlabels), with logits for n instances and nlabels.
    labels_true: Tensor, (n,), with tf.int32 or tf.int64 elements containing
      ground truth class labels in the range [0,nlabels].
    labels_predicted: Tensor, (n,), with tf.int32 or tf.int64 elements
      containing decisions of the predictive system.  If `None`, we will use
      the argmax decision using the `logits`.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    ece: Tensor, scalar, tf.float32.
  """
  with tf.name_scope(name or 'expected_calibration_error'):
    logits = tf.convert_to_tensor(logits)
    labels_true = tf.convert_to_tensor(labels_true)
    if labels_predicted is not None:
      labels_predicted = tf.convert_to_tensor(labels_predicted)

    # Empirical counts over {incorrect,correct} x {0,1,..,num_bins-1}, plus
    # the average predicted probability observed in each probability bin.
    raw_counts, mean_prob_per_bin = _compute_calibration_bin_statistics(
        num_bins,
        logits=logits,
        labels_true=labels_true,
        labels_predicted=labels_predicted)

    # Marginal probability of landing in each probability bin.
    counts = tf.cast(raw_counts, tf.float32)
    per_bin_total = tf.reduce_sum(counts, axis=0)
    bin_marginal = per_bin_total / tf.reduce_sum(per_bin_total)

    # Probability of a correct decision given the bin; the tiny offset keeps
    # the division finite for empty bins.
    smallest_float = np.finfo(np.float32).tiny
    correct_counts = counts[1, :]
    accuracy_per_bin = correct_counts / (per_bin_total + smallest_float)

    # ECE statistic as defined in reference [1].
    calibration_gap = tf.abs(accuracy_per_bin - mean_prob_per_bin)
    return tf.reduce_sum(bin_marginal * calibration_gap)
def from_volatility_surface(cls, implied_volatility_surface, variance_process,
                            initial_spot, initial_variance, rho=None,
                            risk_free_rate=None, dividend_yield=None,
                            time_step=None, num_grid_points=None,
                            grid_minimums=None, grid_maximums=None,
                            dtype=None):
  """Creates a `LocalStochasticVolatilityModel` from volatility surface.

  This function computes the leverage function for the LSV model by first
  computing the joint probability density function `p(t, X(t), v(t))` where
  `X(t)` is the log of the spot price and `v(t)` is the variance at time `t`.
  The joint probability density is computed using the Fokker-Planck equation
  of the LSV model (see 6.8.2 in Ref [1]):

  ```None
  dp/dt = 1/2 d^2 [v L(t,X)^2 p]/dX^2 + 1/2 d^2 [b(v)^2 p]/dv^2 +
          rho d^2 [sqrt(v)L(t,X)b(v) p]/dXdv -
          d[(r - d - 1/2 v L(t,X)^2)p]/dX -
          d[a(v) p]/dv
  ```

  where `a(v)` and `b(v)` are the drift and diffusion functions for the
  variance process. Defining

  ```None
  I_n(k,t) = int v^n p(t, k, v) dv
  ```

  we can calculate the leverage function as follows:

  ```None
  L(k, t) = sigma(exp(k), t) sqrt(I_0(k, t)/I_1(k, t)).
  ```

  Args:
    implied_volatility_surface: Either an instance of
      `processed_market_data.VolatilitySurface` or a Python object containing
      the implied volatility market data. If the input is a Python object,
      then the object must implement a function `volatility(strike,
      expiry_times)` which takes real `Tensor`s corresponding to option
      strikes and time to expiry and returns a real `Tensor` containing the
      corresponding market implied volatility.
    variance_process: An instance of `LSVVarianceModel` or
      `ItoProcess` specifying the dynamics of the variance process of
      the LSV model.
    initial_spot: A real scalar `Tensor` specifying the underlying spot price
      on the valuation date.
    initial_variance: A real scalar `Tensor` specifying the initial variance
      on the valuation date.
    rho: A real scalar `Tensor` specifying the correlation between spot price
      and the stochastic variance.
      Default value: `None` in which case the correlation is set to zero.
    risk_free_rate: A real scalar `Tensor` specifying the (continuously
      compounded) risk free interest rate. If the underlying is an FX rate,
      then use this input to specify the domestic interest rate.
      Default value: `None` in which case the input is set to zero.
    dividend_yield: A real scalar `Tensor` specifying the (continuously
      compounded) dividend yield. If the underlying is an FX rate, then use
      this input to specify the foreign interest rate.
      Default value: `None` in which case the input is set to zero.
    time_step: An optional real scalar `Tensor` specifying the time step
      during the numerical solution of the Fokker-Planck PDE.
      Default value: None, in which case `time_step` corresponding to 100 time
        steps is used.
    num_grid_points: A scalar integer `Tensor` specifying the number of
      discretization points for each spatial dimension.
      Default value: None, in which case number of grid points is set to 100.
    grid_minimums: An optional `Tensor` of size 2 containing the minimum grid
      points for PDE spatial discretization. `grid_minimums[0]` correspond
      to the minimum spot price in the spatial grid and `grid_minimums[1]`
      correspond to the minimum variance value.
    grid_maximums: An optional `Tensor` of size 2 containing the maximum grid
      points for PDE spatial discretization. `grid_maximums[0]` correspond
      to the maximum spot price in the spatial grid and `grid_maximums[1]`
      correspond to the maximum variance value.
    dtype: The default dtype to use when converting values to `Tensor`s.
      Default value: `None` which means that the dtype reported by the
      calibrated local volatility model is used.

  Returns:
    An instance of `LocalStochasticVolatilityModel` constructed using the
    input data.
  """
  # NOTE: both lambdas read `dtype`/`r` at call time, so they see the dtype
  # resolved a few lines below.
  if risk_free_rate is None:
    discount_factor_fn = lambda t: tf.ones_like(t, dtype=dtype)
  else:
    r = tf.convert_to_tensor(risk_free_rate, dtype=dtype)
    discount_factor_fn = lambda t: tf.math.exp(-r * t)

  lv_model = lvm.LocalVolatilityModel.from_volatility_surface(
      dim=1,
      spot=initial_spot,
      implied_volatility_surface=implied_volatility_surface,
      discount_factor_fn=discount_factor_fn,
      dividend_yield=dividend_yield,
      dtype=dtype)

  dtype = dtype or lv_model.dtype()
  day_count_fn = utils.get_daycount_fn(
      implied_volatility_surface.daycount_convention)
  # Evolve the PDE up to the longest expiry present on the surface.
  max_time = tf.math.reduce_max(
      day_count_fn(
          start_date=implied_volatility_surface.settlement_date(),
          end_date=implied_volatility_surface.node_expiries()))
  if time_step is None:
    time_step = max_time / 100.0

  # Explicit `None` checks: these arguments may be `Tensor`s, which must not
  # be truth-tested.
  if rho is None:
    rho = 0.0
  if num_grid_points is None:
    num_grid_points = 100

  # The PDE helper converts both rates with `tf.convert_to_tensor`, which
  # rejects `None`; apply the documented "None means zero" default here.
  pde_risk_free_rate = 0.0 if risk_free_rate is None else risk_free_rate
  pde_dividend_yield = 0.0 if dividend_yield is None else dividend_yield

  leverage_fn = _leverage_function_using_pde(
      risk_free_rate=pde_risk_free_rate,
      dividend_yield=pde_dividend_yield,
      lv_model=lv_model,
      variance_model=variance_process,
      rho=[rho],
      initial_spot=initial_spot,
      initial_variance=initial_variance,
      time_step=time_step,
      max_time=max_time,
      num_grid_points=num_grid_points,
      grid_minimums=grid_minimums,
      grid_maximums=grid_maximums,
      dtype=dtype)
  return LocalStochasticVolatilityModel(
      leverage_fn,
      variance_process,
      risk_free_rate=risk_free_rate,
      dividend_yield=dividend_yield,
      rho=rho,
      dtype=dtype)
def _leverage_function_using_pde(*, risk_free_rate, dividend_yield, lv_model,
                                 variance_model, rho, initial_spot,
                                 initial_variance, max_time, time_step,
                                 num_grid_points, grid_minimums,
                                 grid_maximums, dtype):
  """Computes Leverage function using Fokker-Planck PDE for joint density.

  This function computes the leverage function for the LSV model by first
  computing the joint probability density function `p(t, X(t), v(t))` where
  `X(t)` is the log of the spot price and `v(t)` is the variance at time `t`.
  The joint probability density is computed using the Fokker-Planck equation
  of the LSV model (see 6.8.2 in Ref [1]):

  ```None
  dp/dt = 1/2 d^2 [v L(t,X)^2 p]/dX^2 + 1/2 d^2 [b(v)^2 p]/dv^2 +
          rho d^2 [sqrt(v)L(t,X)b(v) p]/dXdv -
          d[(r - d - 1/2 v L(t,X)^2)p]/dX -
          d[a(v) p]/dv
  ```

  where `a(v)` and `b(v)` are the drift and diffusion functions for the
  variance process. Defining

  ```None
  I_n(k,t) = int v^n p(t, k, v) dv
  ```

  we can calculate the leverage function as follows:

  ```None
  L(k, t) = sigma(exp(k), t) sqrt(I_0(k, t)/I_1(k, t)).
  ```

  Args:
    risk_free_rate: A scalar real `Tensor` specifying the (continuously
      compounded) risk free interest rate. If the underlying is an FX rate,
      then use this input to specify the domestic interest rate. `None` is
      treated as zero.
    dividend_yield: A real scalar `Tensor` specifying the (continuously
      compounded) dividend yield. If the underlying is an FX rate, then use
      this input to specify the foreign interest rate. `None` is treated as
      zero.
    lv_model: An instance of `LocalVolatilityModel` specifying the local
      volatility for the spot price.
    variance_model: An instance of `LSVVarianceModel` specifying the dynamics
      of the variance process of the LSV model.
    rho: A real scalar `Tensor` specifying the correlation between spot price
      and the stochastic variance.
    initial_spot: A real scalar `Tensor` specifying the underlying spot price
      on the valuation date.
    initial_variance: A real scalar `Tensor` specifying the initial variance
      on the valuation date.
    max_time: A real scalar `Tensor` specifying the maximum time to which the
      Fokker-Planck PDE is evolved.
    time_step: A real scalar `Tensor` specifying the time step during the
      numerical solution of the Fokker-Planck PDE.
    num_grid_points: A scalar integer `Tensor` specifying the number of
      discretization points for each spatial dimension.
    grid_minimums: An optional `Tensor` of size 2 containing the minimum grid
      points for PDE spatial discretization. `grid_minimums[0]` correspond
      to the minimum spot price in the spatial grid and `grid_minimums[1]`
      correspond to the minimum variance value.
    grid_maximums: An optional `Tensor` of size 2 containing the maximum grid
      points for PDE spatial discretization. `grid_maximums[0]` correspond
      to the maximum spot price in the spatial grid and `grid_maximums[1]`
      correspond to the maximum variance value.
    dtype: The default dtype to use when converting values to `Tensor`s.

  Returns:
    A Python callable which computes the Leverage function `L(t, S(t))`. The
    function accepts a scalar `Tensor` corresponding to time 't' and a real
    `Tensor` of shape `[num_samples, 1]` corresponding to the spot price (S)
    as inputs and return a real `Tensor` corresponding to the leverage
    function computed at (S,t).

  Raises:
    ValueError: If `variance_model.dim()` is greater than 1.
  """
  if variance_model.dim() > 1:
    raise ValueError(
        "The default model of Leverage function doesn\'t support "
        "the variance process with more than 1 factor.")

  pde_grid_tol = _machine_eps(dtype)
  rho = tf.convert_to_tensor(rho, dtype=dtype)
  initial_spot = tf.convert_to_tensor(initial_spot, dtype=dtype)
  initial_log_spot = tf.math.log(initial_spot)
  initial_variance = tf.convert_to_tensor(initial_variance, dtype=dtype)
  # `tf.convert_to_tensor` rejects `None`; a missing rate means zero.
  risk_free_rate = tf.convert_to_tensor(
      0.0 if risk_free_rate is None else risk_free_rate, dtype=dtype)
  dividend_yield = tf.convert_to_tensor(
      0.0 if dividend_yield is None else dividend_yield, dtype=dtype)

  x_scale = initial_log_spot
  y_scale = initial_variance
  # scaled log spot = log(spot/initial_spot)
  # scaled variance = variance / initial_variance
  scaled_initial_point = tf.convert_to_tensor([0.0, 1.0], dtype=dtype)

  # These are minimums and maximums for scaled log spot and scaled variance
  if grid_minimums is None:
    grid_minimums = [0.01, 0.0001]
  else:
    grid_minimums = tf.convert_to_tensor(grid_minimums, dtype=dtype)
    grid_minimums = [
        grid_minimums[0] / initial_spot, grid_minimums[1] / initial_variance
    ]
  if grid_maximums is None:
    grid_maximums = [10.0, 5.0]
  else:
    grid_maximums = tf.convert_to_tensor(grid_maximums, dtype=dtype)
    grid_maximums = [
        grid_maximums[0] / initial_spot, grid_maximums[1] / initial_variance
    ]

  # The first grid axis is in log-spot units, the second in variance units.
  log_spot_min = tf.math.log(
      tf.convert_to_tensor([grid_minimums[0]], dtype=dtype))
  log_spot_max = tf.math.log(
      tf.convert_to_tensor([grid_maximums[0]], dtype=dtype))
  variance_min = tf.convert_to_tensor([grid_minimums[1]], dtype=dtype)
  variance_max = tf.convert_to_tensor([grid_maximums[1]], dtype=dtype)

  grid_minimums = tf.concat([log_spot_min, variance_min], axis=0)
  grid_maximums = tf.concat([log_spot_max, variance_max], axis=0)

  grid = _tavella_randell_nonuniform_grid(grid_minimums, grid_maximums,
                                          scaled_initial_point,
                                          num_grid_points, 0.3, dtype)
  grid = [tf.expand_dims(grid[0], axis=0), tf.expand_dims(grid[1], axis=0)]

  # Smallest spacing along each axis; used to size the initial delta bump.
  delta_x = tf.math.reduce_min(grid[0][0, 1:] - grid[0][0, :-1])
  delta_y = tf.math.reduce_min(grid[1][0, 1:] - grid[1][0, :-1])

  # Initialize leverage function L(t=0, S) = 1
  # NOTE: the coefficient functions below read `leverage_fn` from this scope
  # at call time, so they pick up the value reassigned on each loop iteration.
  leverage_fn = functools.partial(
      linear.interpolate,
      x_data=[[0.0, 1.0]],
      y_data=[[1.0, 1.0]],
      dtype=dtype)

  def _initial_value():
    """Computes initial value as a delta function delta(log_spot(t), var(0))."""
    log_spot, variance = tf.meshgrid(*grid)

    init_value = tf.where(
        tf.math.logical_and(
            tf.math.abs(log_spot - scaled_initial_point[0]) <
            delta_x + pde_grid_tol,
            tf.math.abs(variance - scaled_initial_point[1]) <
            delta_y + pde_grid_tol), 1.0 / (delta_x * delta_y * 4), 0.0)
    # initial_value.shape = (1, num_grid_x, num_grid_y)
    return tf.expand_dims(init_value, axis=0)

  def _second_order_coeff_fn(t, grid):
    # Map the scaled grid coordinates back to log spot and variance.
    log_spot = grid[0] + x_scale
    variance = grid[1] * y_scale
    leverage_fn_t_x = leverage_fn(log_spot)
    val_xx = 0.5 * variance * leverage_fn_t_x**2
    val_xy = 0.5 * (rho * tf.math.sqrt(variance) * leverage_fn_t_x *
                    variance_model.volatility_fn()(t, variance)) / y_scale
    val_yx = val_xy
    val_yy = 0.5 * variance_model.volatility_fn()(t, variance)**2 / y_scale**2
    # return list of shape = (2,2). Each element has shape = grid.shape
    return [[-val_yy, -val_yx], [-val_xy, -val_xx]]

  def _first_order_coeff_fn(t, grid):
    log_spot = grid[0] + x_scale
    variance = grid[1] * y_scale
    leverage_fn_t_x = leverage_fn(log_spot)
    val_x = (risk_free_rate - dividend_yield -
             0.5 * variance * leverage_fn_t_x**2)
    val_y = variance_model.drift_fn()(t, variance)
    # return list of shape = (2,). Each element has shape = grid.shape
    return [val_y / y_scale, val_x]

  def _compute_leverage_fn(t, coord_grid, value_grid):
    log_spot = tf.expand_dims(coord_grid[0], axis=-1) + x_scale
    local_volatility_values = lv_model.local_volatility_fn()(
        t, tf.math.exp(log_spot))
    # TODO(b/176826650): Large values represent instability. Eventually this
    # should be addressed inside local vol model.
    local_volatility_values = tf.where(
        tf.math.abs(local_volatility_values) > 1e4, 0.0,
        local_volatility_values)
    # variance_given_logspot.shape = (num_grid_x, 1)
    variance_given_logspot = _conditional_expected_variance_from_pde_solution(
        [coord_grid[0] + x_scale, coord_grid[1] * y_scale], value_grid,
        dtype)(log_spot)

    # divide_no_nan yields zero where the conditional variance is zero.
    leverage_fn_values = tf.math.divide_no_nan(
        local_volatility_values, tf.math.sqrt(variance_given_logspot))
    leverage_fn = functools.partial(
        linear.interpolate,
        x_data=grid[0] + x_scale,
        y_data=tf.transpose(leverage_fn_values),
        dtype=dtype)
    return leverage_fn

  @pde.boundary_conditions.neumann
  def _trivial_neumann_boundary(t, location_grid):
    del t, location_grid
    return 0.0

  leverage_fn_values = []
  leverage_fn_values.append(leverage_fn(grid[0][0])[0])

  # joint_density.shape = (1, num_grid_x, num_grid_y)
  joint_density = _initial_value()

  for tstart in np.arange(0.0, max_time, time_step):
    joint_density, coord_grid, _, _ = pde.fd_solvers.solve_forward(
        tstart,
        tstart + time_step,
        coord_grid=[grid[0][0], grid[1][0]],
        values_grid=joint_density,
        time_step=time_step / 10.0,
        values_transform_fn=None,
        inner_second_order_coeff_fn=_second_order_coeff_fn,
        inner_first_order_coeff_fn=_first_order_coeff_fn,
        zeroth_order_coeff_fn=None,
        boundary_conditions=[[
            _trivial_neumann_boundary, _trivial_neumann_boundary
        ], [_trivial_neumann_boundary, _trivial_neumann_boundary]],
        dtype=dtype)
    # Clip negative values and renormalize by the integral over the grid.
    joint_density = tf.math.maximum(joint_density, 0.0)
    area_under_joint_density = _two_d_integration(
        [grid[0][0, :], grid[1][0, :]], joint_density)
    joint_density = joint_density / area_under_joint_density

    # TODO(b/176826743): Perform fixed point iteration instead of one step
    # update
    leverage_fn = _compute_leverage_fn(
        tf.convert_to_tensor(tstart + time_step), coord_grid, joint_density)
    leverage_fn_values.append(leverage_fn(grid[0][0, :] + x_scale)[0, :])

  # leverage_fn_values.shape = (num_pde_timesteps, num_grid_x,)
  leverage_fn_values = tf.convert_to_tensor(leverage_fn_values, dtype=dtype)
  times = tf.range(0.0, max_time + time_step, time_step, dtype=dtype)

  def _return_fn(t, spot):
    # Interpolate the tabulated leverage values in (time, log spot).
    leverage_fn_interpolator = (
        math.interpolation.interpolation_2d.Interpolation2D(
            x_data=[times],
            y_data=tf.expand_dims(
                tf.repeat(grid[0] + x_scale, times.shape[0], axis=0), axis=0),
            z_data=tf.expand_dims(leverage_fn_values, axis=0),
            dtype=dtype))
    return leverage_fn_interpolator.interpolate(t, tf.math.log(spot))

  return _return_fn
def update(value_and_gradients_function, val_left, val_right, val_trial, f_lim,
           active=None):
  """Shrinks a bracketing interval around a minimum using a trial point.

  Takes an interval that brackets a minimum of a line function together with a
  trial point, and returns a smaller nested interval that still brackets the
  minimum. If the trial point does not lie strictly inside the interval, the
  current interval is returned.

  The description is phrased for a single line function, but the
  implementation also accepts batches of points so that several line functions
  can be minimized at once; see `value_and_gradients_function` below.

  Writing `a` for the left end point, `b` for the right end point, `f` for the
  function being minimized and `df` for its derivative, the procedure relies
  on the opposite slope conditions:

  ```None
    f(a) <= f(0) + epsilon   (1)
    df(a) < 0                (2)
    df(b) > 0                (3)
  ```

  In (1), epsilon is a small positive constant: the value at the left end
  point must not be much larger than the value at the starting point (i.e. 0).
  This is easy to satisfy because the search direction is assumed to be a
  descent direction. Conditions (2) and (3) together imply that the derivative
  has at least one zero between `a` and `b`.

  The third input is a trial point `c`, usually inside `[a, b]`. When it is,
  the function value and slope at `c` decide which end point it replaces, so
  that the opposite slope conditions keep holding for the new interval. For
  further details see the procedure U0-U3 on page 123 of the
  [Hager and Zhang (2006)][2] article.

  Note that this function does not verify that the supplied interval satisfies
  the opposite slope conditions; it is assumed that it does.

  Args:
    value_and_gradients_function: A Python callable that accepts a real scalar
      tensor and returns an object that can be converted to a namedtuple. The
      namedtuple should have fields 'f' and 'df' that correspond to scalar
      tensors of real dtype containing the value of the function and its
      derivative at that point. The other namedtuple fields, if present,
      should be tensors or sequences (possibly nested) of tensors.
      In the usual optimization application, this function is generated by
      projecting the multivariate objective function along some specific
      direction. The direction is determined by some other procedure but
      should be a descent direction (i.e. the derivative of the projected
      univariate function must be negative at 0.).
      Alternatively, the function may represent the batching of `n` such line
      functions (e.g. projecting a single multivariate objective function
      along `n` distinct directions at once) accepting n points as input, i.e.
      a tensor of shape [n], and the fields 'f' and 'df' in the returned
      namedtuple should each be a tensor of shape [n], with the corresponding
      function values and derivatives at the input points.
    val_left: Return value of value_and_gradients_function at the left end
      point of the bracketing interval (labelled 'a' above).
    val_right: Return value of value_and_gradients_function at the right end
      point of the bracketing interval (labelled 'b' above).
    val_trial: Return value of value_and_gradients_function at the trial point
      to be used to shrink the interval (labelled 'c' above).
    f_lim: real `Tensor` of shape [n]. The function value threshold for the
      approximate Wolfe conditions to be checked for each batch member.
    active: optional boolean `Tensor` of shape [n]. Relevant in batching mode
      only, indicates batch members on which the update procedure should be
      applied. On non-active members the current left/right interval is
      returned unmodified.

  Returns:
    A namedtuple containing the following fields:
      iteration: An int32 scalar `Tensor`. The number of iterations performed
        by the bisect algorithm.
      stopped: A boolean `Tensor` of shape [n]. True for those batch members
        where the bisection algorithm terminated.
      failed: A boolean `Tensor` of shape [n]. True for those batch members
        where an error was encountered.
      num_evals: An int32 scalar `Tensor`. The number of times the objective
        function was evaluated.
      left: Return value of value_and_gradients_function at the updated left
        end point of the interval found.
      right: Return value of value_and_gradients_function at the updated right
        end point of the interval found.
  """
  # Only trial points strictly inside (a, b) may replace an end point.
  trial_x = val_trial.x
  inside = (val_left.x < trial_x) & (trial_x < val_right.x)
  if active is not None:
    inside = inside & active

  descending = val_trial.df < 0

  # A descending trial point whose value is not too large can become the new
  # left end point.
  accept_as_left = descending & (val_trial.f <= f_lim)

  # A descending trial point whose value is too high cannot be a left end
  # point; bisection then narrows an interval between the current left end
  # point and the trial point.
  bisect_needed = inside & descending & (val_trial.f > f_lim)

  # Whenever the trial point is not an acceptable left end point, either its
  # slope is non-negative (so it is a valid right end point) or bisection is
  # needed. Both cases keep the current left end point and move the right end
  # point to the trial point.
  new_left = val_where(inside & accept_as_left, val_trial, val_left)
  new_right = val_where(inside & ~accept_as_left, val_trial, val_right)

  initial_bisect_state = _IntermediateResult(
      iteration=tf.convert_to_tensor(0),
      stopped=~bisect_needed,
      failed=tf.zeros_like(inside),  # No failures so far (all False).
      num_evals=tf.convert_to_tensor(0),
      left=new_left,
      right=new_right)
  return _bisect(value_and_gradients_function, initial_bisect_state, f_lim)
def minimize(objective_function,
             initial_simplex=None,
             initial_vertex=None,
             step_sizes=None,
             objective_at_initial_simplex=None,
             objective_at_initial_vertex=None,
             batch_evaluate_objective=False,
             func_tolerance=1e-8,
             position_tolerance=1e-8,
             parallel_iterations=1,
             max_iterations=None,
             reflection=None,
             expansion=None,
             contraction=None,
             shrinkage=None,
             name=None):
  """Minimum of the objective function using the Nelder Mead simplex algorithm.

  Performs an unconstrained minimization of a (possibly non-smooth) function
  using the Nelder Mead simplex method. The method does not support univariate
  functions, so the dimension of the domain must be 2 or greater. For details
  of the algorithm, see
  [Press, Teukolsky, Vetterling and Flannery(2007)][1].

  Points in the domain of the objective function may be represented as a
  `Tensor` of general shape but with rank at least 1. The algorithm proceeds by
  modifying a full rank simplex in the domain. The initial simplex is either
  supplied directly or built from a single user-supplied vertex. In the latter
  case, if `v0` is the supplied vertex, the simplex is the convex hull of the
  set:

  ```None
  S = {v0} + {v0 + step_i * e_i}
  ```

  Here `e_i` is a vector which is `1` along the `i`-th axis and zero elsewhere
  and `step_i` is a characteristic length scale along the `i`-th axis. If no
  step size is supplied, a unit step is used along every axis. A single step
  size may be given, in which case it is used for every axis; the most
  flexible option is to supply a bespoke step size for each axis.

  ### Usage:

  The following example demonstrates the usage of the Nelder Mead minimization
  on a two dimensional problem with the minimum located at a
  non-differentiable point.

  ```python
    # The objective function
    def sqrt_quadratic(x):
      return tf.sqrt(tf.reduce_sum(x ** 2, axis=-1))

    start = tf.constant([6.0, -21.0])  # Starting point for the search.
    optim_results = tfp.optimizer.nelder_mead_minimize(
        sqrt_quadratic, initial_vertex=start, func_tolerance=1e-8,
        batch_evaluate_objective=True)

    # Check that the search converged
    assert(optim_results.converged)
    # Check that the argmin is close to the actual value.
    np.testing.assert_allclose(optim_results.position, np.array([0.0, 0.0]),
                               atol=1e-7)
    # Print out the total number of function evaluations it took.
    print("Function evaluations: %d" % optim_results.num_objective_evaluations)
  ```

  ### References:
  [1]: William Press, Saul Teukolsky, William Vetterling and Brian Flannery.
    Numerical Recipes in C++, third edition. pp. 502-507. (2007).
    http://numerical.recipes/cpppages/chap0sel.pdf
  [2]: Jeffrey Lagarias, James Reeds, Margaret Wright and Paul Wright.
    Convergence properties of the Nelder-Mead simplex method in low dimensions,
    Siam J. Optim., Vol 9, No. 1, pp. 112-147. (1998).
    http://www.math.kent.edu/~reichel/courses/Opt/reading.material.2/nelder.mead.pdf
  [3]: Fuchang Gao and Lixing Han. Implementing the Nelder-Mead simplex
    algorithm with adaptive parameters. Computational Optimization and
    Applications, Vol 51, Issue 1, pp 259-277. (2012).
    https://pdfs.semanticscholar.org/15b4/c4aa7437df4d032c6ee6ce98d6030dd627be.pdf

  Args:
    objective_function: A Python callable that accepts a point as a real
      `Tensor` and returns a `Tensor` of real dtype containing the value of
      the function at that point. The function to be minimized. If
      `batch_evaluate_objective` is `True`, the callable may be evaluated on a
      `Tensor` of shape `[n+1] + s ` where `n` is the dimension of the problem
      and `s` is the shape of a single point in the domain (so `n` is the size
      of a `Tensor` representing a single point). In this case, the expected
      return value is a `Tensor` of shape `[n+1]`. Note that this method does
      not support univariate functions so the problem dimension `n` must be
      strictly greater than 1.
    initial_simplex: (Optional) `Tensor` of real dtype. The initial simplex to
      start the search. If supplied, should be a `Tensor` of shape `[n+1] + s`
      where `n` is the dimension of the problem and `s` is the shape of a
      single point in the domain. Each row (i.e. the `Tensor` with a given
      value of the first index) is interpreted as a vertex of a simplex and
      hence the rows must be affinely independent. If not supplied, an axes
      aligned simplex is constructed using the `initial_vertex` and
      `step_sizes`. Exactly one of `initial_simplex` and `initial_vertex` must
      be supplied.
    initial_vertex: (Optional) `Tensor` of real dtype and any shape that can
      be consumed by the `objective_function`. A single point in the domain
      that will be used to construct an axes aligned initial simplex.
    step_sizes: (Optional) `Tensor` of real dtype and shape broadcasting
      compatible with `initial_vertex`. Supplies the simplex scale along each
      axes. Only used if `initial_simplex` is not supplied. See description
      above for details on how step sizes and initial vertex are used to
      construct the initial simplex.
    objective_at_initial_simplex: (Optional) Rank `1` `Tensor` of real dtype.
      The value of the objective function at the initial simplex. May be
      supplied only if `initial_simplex` is supplied. If not supplied, it will
      be computed.
    objective_at_initial_vertex: (Optional) Scalar `Tensor` of real dtype. The
      value of the objective function at the initial vertex. May be supplied
      only if the `initial_vertex` is also supplied.
    batch_evaluate_objective: (Optional) Python `bool`. If True, the objective
      function will be evaluated on all the vertices of the simplex packed
      into a single tensor. If False, the objective will be mapped across each
      vertex separately. Evaluating the objective function in a batch allows
      use of vectorization and should be preferred if the objective function
      allows it.
    func_tolerance: (Optional) Scalar `Tensor` of real dtype. The algorithm
      stops if the absolute difference between the largest and the smallest
      function value on the vertices of the simplex is below this number.
    position_tolerance: (Optional) Scalar `Tensor` of real dtype. The
      algorithm stops if the largest absolute difference between the
      coordinates of the vertices is below this threshold.
    parallel_iterations: (Optional) Positive integer. The number of iterations
      allowed to run in parallel.
    max_iterations: (Optional) Scalar positive `Tensor` of dtype `int32`.
      The maximum number of iterations allowed. If `None` then no limit is
      applied.
    reflection: (Optional) Positive Scalar `Tensor` of same dtype as
      `initial_vertex`. This parameter controls the scaling of the reflected
      vertex. See, [Press et al(2007)][1] for details. If not specified, uses
      the dimension dependent prescription of [Gao and Han(2012)][3].
    expansion: (Optional) Positive Scalar `Tensor` of same dtype as
      `initial_vertex`. Should be greater than `1` and `reflection`. This
      parameter controls the expanded scaling of a reflected vertex.
      See, [Press et al(2007)][1] for details. If not specified, uses the
      dimension dependent prescription of [Gao and Han(2012)][3].
    contraction: (Optional) Positive scalar `Tensor` of same dtype as
      `initial_vertex`. Must be between `0` and `1`. This parameter controls
      the contraction of the reflected vertex when the objective function at
      the reflected point fails to show sufficient decrease.
      See, [Press et al(2007)][1] for more details. If not specified, uses the
      dimension dependent prescription of [Gao and Han(2012)][3].
    shrinkage: (Optional) Positive scalar `Tensor` of same dtype as
      `initial_vertex`. Must be between `0` and `1`. This parameter is the
      scale by which the simplex is shrunk around the best point when the
      other steps fail to produce improvements.
      See, [Press et al(2007)][1] for more details. If not specified, uses the
      dimension dependent prescription of [Gao and Han(2012)][3].
    name: (Optional) Python str. The name prefixed to the ops created by this
      function. If not supplied, the default name 'minimize' is used.

  Returns:
    optimizer_results: A namedtuple containing the following items:
      converged: Scalar boolean tensor indicating whether the minimum was
        found within tolerance.
      num_objective_evaluations: The total number of objective
        evaluations performed.
      position: A `Tensor` containing the last argument value found
        during the search. If the search converged, then
        this value is the argmin of the objective function.
      objective_value: A tensor containing the value of the objective
        function at the `position`. If the search
        converged, then this is the (local) minimum of
        the objective function.
      final_simplex: The last simplex constructed before stopping.
      final_objective_values: The objective function evaluated at the
        vertices of the final simplex.
      initial_simplex: The starting simplex.
      initial_objective_values: The objective function evaluated at the
        vertices of the initial simplex.
      num_iterations: The number of iterations of the main algorithm body.

  Raises:
    ValueError: If any of the following conditions hold
      1. If none or more than one of `initial_simplex` and `initial_vertex`
        are supplied.
      2. If `initial_simplex` and `step_sizes` are both specified.
  """
  scope_values = [
      initial_simplex, initial_vertex, step_sizes,
      objective_at_initial_simplex, objective_at_initial_vertex,
      func_tolerance, position_tolerance
  ]
  with tf1.name_scope(name, 'minimize', scope_values):
    prepared = _prepare_args(objective_function,
                             initial_simplex,
                             initial_vertex,
                             step_sizes,
                             objective_at_initial_simplex,
                             objective_at_initial_vertex,
                             batch_evaluate_objective)
    dim, _, simplex, objective_at_simplex, num_evaluations = prepared

    reflection, expansion, contraction, shrinkage = _resolve_parameters(
        dim, reflection, expansion, contraction, shrinkage, simplex.dtype)

    # Arguments of a single Nelder Mead step that stay fixed across iterations.
    step_kwargs = {
        'objective_function': objective_function,
        'dim': dim,
        'func_tolerance': func_tolerance,
        'position_tolerance': position_tolerance,
        'batch_evaluate_objective': batch_evaluate_objective,
        'reflection': reflection,
        'expansion': expansion,
        'contraction': contraction,
        'shrinkage': shrinkage,
    }

    def _advance(_, iteration_count, current_simplex, current_objective,
                 evaluation_count):
      """Runs one simplex step and updates the running counters."""
      (step_converged,
       new_simplex,
       new_objective,
       step_evaluations) = nelder_mead_one_step(
           current_simplex, current_objective, **step_kwargs)
      return (step_converged,
              iteration_count + 1,
              new_simplex,
              new_objective,
              evaluation_count + step_evaluations)

    def _keep_iterating(converged, num_iterations, *unused_loop_vars):  # pylint:disable=unused-argument
      """Loop condition: not converged and iteration budget not exhausted."""
      # `converged` must be negated as a tensor. On a Python bool the
      # overloaded `~` is a bitwise complement (~True = -2, ~False = -1), and
      # the loop would then never terminate.
      still_searching = tf.logical_not(converged)
      if max_iterations is None:
        return still_searching
      return still_searching & (num_iterations < max_iterations)

    loop_state = (False, 0, simplex, objective_at_simplex, num_evaluations)

    # Iterate until convergence or, when an iteration budget is supplied,
    # until convergence or the budget runs out, whichever comes first.
    (converged,
     num_iterations,
     final_simplex,
     final_objective_values,
     final_evaluations) = tf.while_loop(
         cond=_keep_iterating,
         body=_advance,
         loop_vars=loop_state,
         parallel_iterations=parallel_iterations)

    # The best vertex is the one with the smallest objective value.
    best_index = tf.argsort(final_objective_values,
                            direction='ASCENDING',
                            stable=True)[0]

    # `converged` and `num_iterations` can still be plain Python values in
    # eager mode. They are converted explicitly so that the result tuple holds
    # only Tensors and can be evaluated in a single call.
    return NelderMeadOptimizerResults(
        converged=tf.convert_to_tensor(value=converged),
        num_objective_evaluations=final_evaluations,
        position=final_simplex[best_index],
        objective_value=final_objective_values[best_index],
        final_simplex=final_simplex,
        final_objective_values=final_objective_values,
        num_iterations=tf.convert_to_tensor(value=num_iterations),
        initial_simplex=simplex,
        initial_objective_values=objective_at_simplex)
def __init__(self, logits):
  """Stores `logits` as a tensor and initializes the base class.

  Args:
    logits: Value convertible to a `Tensor` via `tf.convert_to_tensor`; kept
      on the instance as `self._logits`.
  """
  logits_tensor = tf.convert_to_tensor(logits)
  self._logits = logits_tensor
  # The base class is configured with an int32 dtype and is marked as not
  # reparameterized; argument validation and NaN statistics are disabled.
  base_kwargs = {
      'dtype': tf.int32,
      'reparameterization_type': tfd.NOT_REPARAMETERIZED,
      'validate_args': False,
      'allow_nan_stats': False,
  }
  super(StatefulCategorical, self).__init__(**base_kwargs)
def _logits_parameter_no_checks(self):
  """Returns the logits, deriving them from `self._probs` when unset.

  Returns:
    `tf.identity(self._logits)` when `self._logits` is not `None`; otherwise
    `log(probs) - log1p(-probs)` computed from `self._probs`.
  """
  if self._logits is not None:
    return tf.identity(self._logits)
  probabilities = tf.convert_to_tensor(self._probs)
  # logit(p) = log(p / (1 - p)) = log(p) - log1p(-p).
  log_p = tf.math.log(probabilities)
  log_one_minus_p = tf.math.log1p(-probabilities)
  return log_p - log_one_minus_p
def brier_decomposition(labels, logits, name=None):
  r"""Decompose the Brier score into uncertainty, resolution, and reliability.

  [Proper scoring rules][1] measure the quality of probabilistic predictions;
  any proper scoring rule admits a [unique decomposition][2] as
  `Score = Uncertainty - Resolution + Reliability`, where:

  * `Uncertainty`, is a generalized entropy of the average predictive
    distribution; it can both be positive or negative.
  * `Resolution`, is a generalized variance of individual predictive
    distributions; it is always non-negative.  Difference in predictions reveal
    information, that is why a larger resolution improves the predictive score.
  * `Reliability`, a measure of calibration of predictions against the true
    frequency of events.  It is always non-negative and a lower value here
    indicates better calibration.

  This method estimates the above decomposition for the case of the Brier
  scoring rule for discrete outcomes.  For this, we need to discretize the space
  of probability distributions; we choose a simple partition of the space into
  `nlabels` events: given a distribution `p` over `nlabels` outcomes, the index
  `k` for which `p_k > p_i` for all `i != k` determines the discretization
  outcome; that is, `p in M_k`, where `M_k` is the set of all distributions for
  which `p_k` is the largest value among all probabilities.

  The estimation error of each component is O(k/n), where n is the number
  of instances and k is the number of labels.  There may be an error of this
  order when compared to `brier_score`.

  #### References
  [1]: Tilmann Gneiting, Adrian E. Raftery.
       Strictly Proper Scoring Rules, Prediction, and Estimation.
       Journal of the American Statistical Association, Vol. 102, 2007.
       https://www.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf
  [2]: Jochen Broecker.  Reliability, sufficiency, and the decomposition of
       proper scores.
       Quarterly Journal of the Royal Meteorological Society, Vol. 135, 2009.
       https://rmets.onlinelibrary.wiley.com/doi/epdf/10.1002/qj.456

  Args:
    labels: Tensor, (n,), with tf.int32 or tf.int64 elements containing ground
      truth class labels in the range [0, nlabels).
    logits: Tensor, (n, nlabels), with logits for n instances and nlabels.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    uncertainty: Tensor, scalar, the uncertainty component of the
      decomposition.
    resolution: Tensor, scalar, the resolution component of the decomposition.
    reliability: Tensor, scalar, the reliability component of the
      decomposition.
  """
  with tf.name_scope(name or 'brier_decomposition'):
    labels = tf.convert_to_tensor(labels)
    logits = tf.convert_to_tensor(logits)
    num_classes = logits.shape[-1]

    # Compute pbar, the average distribution
    pred_class = tf.argmax(logits, axis=-1, output_type=labels.dtype)

    # The predicted class is passed in the `labels` slot of
    # `tf.math.confusion_matrix` on purpose: row `k` of the result then counts
    # the true labels of all instances whose prediction fell into `M_k`.
    if tensorshape_util.rank(logits.shape) > 2:
      flatten, unflatten = _make_flatten_unflatten_fns(logits.shape[:-2])
      def fn_to_map(args):
        yhat, y = args
        return tf.math.confusion_matrix(
            yhat, y, num_classes=num_classes, dtype=logits.dtype)
      confusion_matrix = tf.map_fn(
          fn_to_map, [flatten(pred_class), flatten(labels)],
          dtype=logits.dtype)
      confusion_matrix = unflatten(confusion_matrix)
    else:
      confusion_matrix = tf.math.confusion_matrix(
          pred_class, labels, num_classes=num_classes, dtype=logits.dtype)

    # dist_weights[k]: fraction of instances whose prediction lies in M_k.
    dist_weights = tf.reduce_sum(confusion_matrix, axis=-1)
    dist_weights /= tf.reduce_sum(dist_weights, axis=-1, keepdims=True)
    # pbar[l]: empirical frequency of true label l (summed over the rows).
    pbar = tf.reduce_sum(confusion_matrix, axis=-2)
    pbar /= tf.reduce_sum(pbar, axis=-1, keepdims=True)

    eps = np.finfo(dtype_util.as_numpy_dtype(confusion_matrix.dtype)).eps
    # dist_mean[k,:] contains the empirical distribution for the set M_k
    # Some outcomes may not realize, corresponding to dist_weights[k] = 0
    dist_mean = confusion_matrix / (
        eps + tf.reduce_sum(confusion_matrix, axis=-1, keepdims=True))

    # Uncertainty: quadratic entropy of the average label distribution
    uncertainty = -tf.reduce_sum(tf.square(pbar), axis=-1)

    # Resolution: expected quadratic divergence of predictive to mean.
    # `pbar` is indexed by true label (the last axis of `dist_mean`), so it is
    # broadcast across the rows: resolution[k, l] = (pbar[l] - dist_mean[k, l])^2.
    # Expanding on the last axis instead would subtract pbar[k], which mixes up
    # the predicted-class and true-label axes.
    resolution = tf.square(tf.expand_dims(pbar, -2) - dist_mean)
    resolution = tf.reduce_sum(
        dist_weights * tf.reduce_sum(resolution, axis=-1),
        axis=-1)

    # Reliability: expected quadratic divergence of predictive to true
    if tensorshape_util.rank(logits.shape) > 2:
      # TODO(b/139094519): Avoid using tf.map_fn here.
      prob_true = tf.map_fn(
          lambda args: tf.gather(args[0], args[1]),
          [flatten(dist_mean), flatten(pred_class)],
          dtype=dist_mean.dtype)
      prob_true = unflatten(prob_true)
    else:
      prob_true = tf.gather(dist_mean, pred_class, axis=0)
    # The reliability is accumulated in log space.
    log_prob_true = tf.math.log(prob_true)

    log_prob_pred = logits - tf.math.reduce_logsumexp(
        logits, axis=-1, keepdims=True)

    log_reliability = _reduce_log_l2_exp(log_prob_pred, log_prob_true, axis=-1)
    log_reliability = tf.math.reduce_logsumexp(log_reliability, axis=-1,)

    # Average over the instances: subtracting log(n) divides by n.
    num_samples = tf.cast(tf.shape(logits)[-2], logits.dtype)
    reliability = tf.exp(log_reliability - tf.math.log(num_samples))

    return uncertainty, resolution, reliability
def _fn(*fargs, **fkwargs):
  """Calls `fn` and returns its result paired with a tensor conversion.

  `fn` is resolved from the enclosing scope. The object it returns is given
  the `shape` and `get_shape` attributes of its tensor conversion.

  Args:
    *fargs: Positional arguments forwarded to `fn`.
    **fkwargs: Keyword arguments forwarded to `fn`.

  Returns:
    A pair `(result, tensor)`: the object returned by `fn` and the value of
    `tf.convert_to_tensor(result)`.
  """
  result = fn(*fargs, **fkwargs)
  as_tensor = tf.convert_to_tensor(result)
  # Copy the shape accessors from the tensor onto the returned object.
  result.shape, result.get_shape = as_tensor.shape, as_tensor.get_shape
  return result, as_tensor
def find_root_chandrupatla(objective_fn,
                           low,
                           high,
                           position_tolerance=1e-8,
                           value_tolerance=0.,
                           max_iterations=50,
                           stopping_policy_fn=tf.reduce_all,
                           validate_args=False,
                           name='find_root_chandrupatla'):
  r"""Finds root(s) of a scalar function using Chandrupatla's method.

  Chandrupatla's method [1, 2] is a root-finding algorithm that is guaranteed
  to converge if a root lies within the given bounds. It generalizes the
  [bisection method](https://en.wikipedia.org/wiki/Bisection_method); at each
  step it chooses to perform either bisection or inverse quadratic
  interpolation. This makes it similar in spirit to [Brent's method](
  https://en.wikipedia.org/wiki/Brent%27s_method), which also considers steps
  that use the secant method, but Chandrupatla's method is simpler and often
  converges at least as quickly [3].

  Args:
    objective_fn: Python callable for which roots are searched. It must be a
      callable of a single variable. `objective_fn` must return a `Tensor` with
      shape `batch_shape` and dtype matching `low` and `high`.
    low: Float `Tensor` of shape `batch_shape` representing a lower
      bound(s) on the value of a root(s).
    high: Float `Tensor` of shape `batch_shape` representing an upper
      bound(s) on the value of a root(s).
    position_tolerance: Optional `Tensor` representing the maximum absolute
      error in the positions of the estimated roots. Shape must broadcast with
      `batch_shape`.
      Default value: `1e-8`.
    value_tolerance: Optional `Tensor` representing the absolute error allowed
      in the value of the objective function. If the absolute value of
      `objective_fn` is smaller than
      `value_tolerance` at a given position, then that position is considered a
      root for the function. Shape must broadcast with `batch_shape`.
      Default value: `0.`.
    max_iterations: Optional `Tensor` or Python integer specifying the maximum
      number of steps to perform. Shape must broadcast with `batch_shape`.
      Default value: `50`.
    stopping_policy_fn: Python `callable` controlling the algorithm
      termination. It must be a callable accepting a `Tensor` of booleans with
      the same shape as `low` and `high` (denoting whether each search is
      finished), and returning a scalar boolean `Tensor` indicating
      whether the overall search should stop. Typical values are
      `tf.reduce_all` (which returns only when the search is finished for all
      points), and `tf.reduce_any` (which returns as soon as the search is
      finished for any point).
      Default value: `tf.reduce_all` (returns only when the search is finished
        for all points).
    validate_args: Python `bool` indicating whether to validate arguments.
      When `True`, asserts that the objective has different signs at `low` and
      `high`.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: 'find_root_chandrupatla'.

  Returns:
    root_search_results: A Python `namedtuple` containing the following items:
      estimated_root: `Tensor` containing the last position explored. If the
        search was successful within the specified tolerance, this position is
        a root of the objective function.
      objective_at_estimated_root: `Tensor` containing the value of the
        objective function at `position`. If the search was successful within
        the specified tolerance, then this is close to 0.
      num_iterations: The number of iterations performed.

  #### References

  [1] Tirupathi R. Chandrupatla. A new hybrid quadratic/bisection algorithm for
      finding the zero of a nonlinear function without using derivatives.
      _Advances in Engineering Software_, 28.3:145-149, 1997.
  [2] Philipp OJ Scherer. Computational Physics. _Springer Berlin_,
      Heidelberg, 2010.
      Section 6.1.7.3 https://books.google.com/books?id=cC-8BAAAQBAJ&pg=PA95
  [3] Jason Sachs. Ten Little Algorithms, Part 5: Quadratic Extremum
      Interpolation and Chandrupatla's Method (2015).
      https://www.embeddedrelated.com/showarticle/855.php
  """

  ################################################
  # Loop variables used by Chandrupatla's method:
  #
  #  a: endpoint of an interval `[min(a, b), max(a, b)]` containing the
  #    root. There is no guarantee as to which of `a` and `b` is larger.
  #  b: endpoint of an interval `[min(a, b), max(a, b)]` containing the
  #    root. There is no guarantee as to which of `a` and `b` is larger.
  #  f_a: value of the objective at `a`.
  #  f_b: value of the objective at `b`.
  #  t: the next position to be evaluated as the coefficient of a convex
  #    combination of `a` and `b` (i.e., a value in the unit interval).
  #  num_iterations: integer number of steps taken so far.
  #  converged: boolean indicating whether each batch element has converged.
  #
  # All variables have the same shape `batch_shape`.

  def _should_continue(a, b, f_a, f_b, t, num_iterations, converged):
    del a, b, f_a, f_b, t  # Unused.
    all_converged = stopping_policy_fn(
        tf.logical_or(converged,
                      num_iterations >= max_iterations))
    # `stopping_policy_fn` is user supplied and may hand back a Python bool.
    # On a Python bool `~` is a bitwise complement (~True = -2), which is not a
    # valid loop condition, so the negation is done with `tf.logical_not`.
    return tf.logical_not(all_converged)

  def _body(a, b, f_a, f_b, t, num_iterations, converged):
    """One step of Chandrupatla's method for root finding."""
    previous_loop_vars = (a, b, f_a, f_b, t, num_iterations, converged)
    finalized_elements = tf.logical_or(converged,
                                       num_iterations >= max_iterations)

    # Evaluate the new point.
    x_new = (1 - t) * a + t * b
    f_new = objective_fn(x_new)
    # If we've bisected (t==0.5) and the new float value for `a` is identical to
    # that from the previous iteration, then we'll keep bisecting (the
    # logic below will set t==0.5 for the next step), and nothing further will
    # change.
    at_fixed_point = tf.equal(x_new, a) & tf.equal(t, 0.5)
    # Otherwise, tighten the bounds.
    a, b, c, f_a, f_b, f_c = _structure_broadcasting_where(
        tf.equal(tf.math.sign(f_new), tf.math.sign(f_a)),
        (x_new, b, a, f_new, f_b, f_a),
        (x_new, a, b, f_new, f_a, f_b))

    # Check for convergence.
    f_best = tf.where(tf.abs(f_a) < tf.abs(f_b), f_a, f_b)
    interval_tolerance = position_tolerance / (tf.abs(b - c))
    converged = tf.logical_or(
        interval_tolerance > 0.5,
        tf.logical_or(
            tf.math.abs(f_best) <= value_tolerance,
            at_fixed_point))

    # Propose next point to evaluate.
    xi = (a - b) / (c - b)
    phi = (f_a - f_b) / (f_c - f_b)
    t = tf.where(
        # Condition for inverse quadratic interpolation.
        tf.logical_and(1 - tf.math.sqrt(1 - xi) < phi,
                       tf.math.sqrt(xi) > phi),
        # Propose a point by inverse quadratic interpolation.
        (f_a / (f_b - f_a) * f_c / (f_b - f_c) +
         (c - a) / (b - a) * f_a / (f_c - f_a) * f_b / (f_c - f_b)),
        # Otherwise, just cut the interval in half (bisection).
        0.5)
    # Constrain the proposal to the current interval (0 < t < 1).
    t = tf.minimum(tf.maximum(t, interval_tolerance),
                   1 - interval_tolerance)

    # Update elements that haven't converged.
    return _structure_broadcasting_where(
        finalized_elements,
        previous_loop_vars,
        (a, b, f_a, f_b, t, num_iterations + 1, converged))

  with tf.name_scope(name):
    max_iterations = tf.convert_to_tensor(
        max_iterations, name='max_iterations', dtype_hint=tf.int32)
    a = tf.convert_to_tensor(low, name='lower_bound')
    b = tf.convert_to_tensor(high, name='upper_bound')
    f_a, f_b = objective_fn(a), objective_fn(b)
    batch_shape = ps.broadcast_shape(ps.shape(f_a), ps.shape(f_b))

    assertions = []
    if validate_args:
      assertions += [
          assert_util.assert_none_equal(
              tf.math.sign(f_a), tf.math.sign(f_b),
              message='Bounds must be on different sides of a root.')
      ]

    with tf.control_dependencies(assertions):
      # The first proposal is a plain bisection (t = 0.5); every loop variable
      # is broadcast to the common batch shape before entering the loop.
      initial_loop_vars = [
          a,
          b,
          f_a,
          f_b,
          tf.cast(0.5, dtype=f_a.dtype),
          tf.cast(0, dtype=max_iterations.dtype),
          False
      ]
      a, b, f_a, f_b, _, num_iterations, _ = tf.while_loop(
          _should_continue,
          _body,
          loop_vars=tf.nest.map_structure(
              lambda x: tf.broadcast_to(x, batch_shape),
              initial_loop_vars))

    # Report whichever end point has the smaller absolute objective value.
    x_best, f_best = _structure_broadcasting_where(
        tf.abs(f_a) < tf.abs(f_b),
        (a, f_a),
        (b, f_b))
  return RootSearchResults(estimated_root=x_best,
                           objective_at_estimated_root=f_best,
                           num_iterations=num_iterations)
def test_enables_nontensor_plumbing(self):
  """Checks that a non-`Tensor` object can be plumbed through a Keras model.

  A local `Foo` type is registered with a tensor conversion function, is
  produced by a `Lambda`-derived layer, and is then asserted to be what both
  the model call and a custom loss function receive (rather than a `Tensor`).
  The test is skipped when executing eagerly.
  """
  if tf.executing_eagerly():
    self.skipTest('`compile` functionality changed.')

  # Setup.

  # Minimal stand-in for a tensor-coercible object: it remembers its input and
  # exposes a fixed `[[42.]]` tensor as `value`.
  class Foo:

    def __init__(self, input_):
      self._input = input_
      self.value = tf.convert_to_tensor([[42.]])

    @property
    def dtype(self):
      return self.value.dtype

  # `tf.convert_to_tensor(foo)` now yields `foo.value`.
  tf.register_tensor_conversion_function(
      Foo, lambda x, *args, **kwargs: x.value)
  # NOTE(review): presumably makes Keras accept `Foo` wherever it expects a
  # symbolic tensor -- confirm against `tf_utils`.
  tf_utils.register_symbolic_tensor_type(Foo)

  class PlumbingLayer(keras.layers.Lambda):

    def __init__(self, fn, **kwargs):
      # Wrap `fn` so the layer function returns both the object `d` and its
      # tensor conversion `x`; the tensor's shape accessors are copied onto
      # the object.
      def _fn(*fargs, **fkwargs):
        d = fn(*fargs, **fkwargs)
        x = tf.convert_to_tensor(d)
        d.shape = x.shape
        d.get_shape = x.get_shape
        return d, x
      super(PlumbingLayer, self).__init__(_fn, **kwargs)
      # Flag telling `call` whether it was reached through `__call__`.
      self._enter_dunder_call = False

    def __call__(self, inputs, *args, **kwargs):
      # While the flag is set, `call` returns the `(object, tensor)` pair;
      # only the object is handed back to the caller of `__call__`.
      self._enter_dunder_call = True
      d, _ = super(PlumbingLayer, self).__call__(inputs, *args, **kwargs)
      self._enter_dunder_call = False
      return d

    def call(self, inputs, *args, **kwargs):
      d, v = super(PlumbingLayer, self).call(inputs, *args, **kwargs)
      if self._enter_dunder_call:
        return d, v
      # Invoked directly (not via `__call__`): return just the object.
      return d

  # User-land.
  model = keras.Sequential([
      keras.layers.InputLayer((1, )),
      PlumbingLayer(Foo),  # Makes a `Foo` object.
  ])

  # Let's ensure Keras graph history is preserved by composing the models.
  model = keras.Model(model.inputs, model(model.outputs))
  # Now we instantiate the model and verify we have a `Foo` object, not a
  # `Tensor`.
  y = model(tf.convert_to_tensor([[7.]]))
  self.assertIsInstance(y, Foo)

  # Confirm that (custom) loss sees `Foo` instance, not Tensor.
  # A one-element list is used as a mutable box so the nested function can
  # record the prediction it was given.
  obtained_prediction_box = [None]

  def custom_loss(y_obs, y_pred):
    del y_obs
    obtained_prediction_box[0] = y_pred
    return y_pred

  # Apparently `compile` calls the loss function enough to trigger the
  # side-effect.
  model.compile('SGD', loss=custom_loss)
  self.assertIsInstance(obtained_prediction_box[0], Foo)
def find_root_secant(objective_fn,
                     initial_position,
                     next_position=None,
                     value_at_position=None,
                     position_tolerance=1e-8,
                     value_tolerance=1e-8,
                     max_iterations=50,
                     stopping_policy_fn=tf.reduce_all,
                     validate_args=False,
                     name=None):
  r"""Finds root(s) of a function of single variable using the secant method.

  The [secant method](https://en.wikipedia.org/wiki/Secant_method) is a
  root-finding algorithm that uses a succession of roots of secant lines to
  better approximate a root of a function. The secant method can be thought of
  as a finite-difference approximation of Newton's method.

  Args:
    objective_fn: Python callable for which roots are searched. It must be a
      callable of a single variable. `objective_fn` must return a `Tensor` of
      the same shape and dtype as `initial_position`.
    initial_position: `Tensor` or Python float representing the starting
      position. The function will search for roots in the neighborhood of each
      point. The shape of `initial_position` should match that of the input to
      `objective_fn`.
    next_position: Optional `Tensor` representing the next position in the
      search. If specified, this argument must broadcast with the shape of
      `initial_position` and have the same dtype. It will be used to compute
      the first step to take when searching for roots. If not specified, a
      default value will be used instead.
      Default value: `initial_position * (1 + 1e-4) + sign(initial_position) *
        1e-4`.
    value_at_position: Optional `Tensor` or Python float representing the value
      of `objective_fn` at `initial_position`. If specified, this argument must
      have the same shape and dtype as `initial_position`. If not specified
      (i.e., `None`), the value is computed by calling `objective_fn`. Any
      non-`None` value, including zero, is used as given.
      Default value: None.
    position_tolerance: Optional `Tensor` representing the tolerance for the
      estimated roots. If specified, this argument must broadcast with the
      shape of `initial_position` and have the same dtype.
      Default value: `1e-8`.
    value_tolerance: Optional `Tensor` representing the tolerance used to check
      for roots. If the absolute value of `objective_fn` is smaller than
      `value_tolerance` at a given position, then that position is considered a
      root for the function. If specified, this argument must broadcast with
      the shape of `initial_position` and have the same dtype.
      Default value: `1e-8`.
    max_iterations: Optional `Tensor` or Python integer specifying the maximum
      number of steps to perform for each initial position. Must broadcast with
      the shape of `initial_position`.
      Default value: `50`.
    stopping_policy_fn: Python `callable` controlling the algorithm
      termination. It must be a callable accepting a `Tensor` of booleans with
      the shape of `initial_position` (each denoting whether the search is
      finished for each starting point), and returning a scalar boolean
      `Tensor` (indicating whether the overall search should stop). Typical
      values are `tf.reduce_all` (which returns only when the search is
      finished for all points), and `tf.reduce_any` (which returns as soon as
      the search is finished for any point).
      Default value: `tf.reduce_all` (returns only when the search is finished
        for all points).
    validate_args: Python `bool` indicating whether to validate arguments such
      as `position_tolerance`, `value_tolerance`, and `max_iterations`.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., `'find_root_secant'`).

  Returns:
    root_search_results: A Python `namedtuple` containing the following items:
      estimated_root: `Tensor` containing the last position explored. If the
        search was successful within the specified tolerance, this position is
        a root of the objective function.
      objective_at_estimated_root: `Tensor` containing the value of the
        objective function at `position`. If the search was successful within
        the specified tolerance, then this is close to 0.
      num_iterations: The number of iterations performed.

  Raises:
    ValueError: if a non-callable `stopping_policy_fn` is passed.

  #### Examples

  ```python
  import tensorflow as tf
  import tensorflow_probability as tfp
  tf.enable_eager_execution()

  # Example 1: Roots of a single function from two different starting points.

  f = lambda x: (63 * x**5 - 70 * x**3 + 15 * x) / 8.
  x = tf.constant([-1, 10], dtype=tf.float64)

  tfp.math.find_root_secant(objective_fn=f, initial_position=x)
  # ==> RootSearchResults(
      estimated_root=array([-0.90617985, 0.90617985]),
      objective_at_estimated_root=array([-4.81727769e-10, 7.44957651e-10]),
      num_iterations=array([ 7, 24], dtype=int32))

  tfp.math.find_root_secant(objective_fn=f,
                            initial_position=x,
                            stopping_policy_fn=tf.reduce_any)
  # ==> RootSearchResults(
      estimated_root=array([-0.90617985, 3.27379206]),
      objective_at_estimated_root=array([-4.81727769e-10, 2.66058312e+03]),
      num_iterations=array([7, 8], dtype=int32))

  # Example 2: Roots of a multiplex function from a single starting point.

  def f(x):
    return tf.constant([0., 63. / 8], dtype=tf.float64) * x**5 \
        + tf.constant([5. / 2, -70. / 8], dtype=tf.float64) * x**3 \
        + tf.constant([-3. / 2, 15. / 8], dtype=tf.float64) * x

  x = tf.constant([-1, -1], dtype=tf.float64)

  tfp.math.find_root_secant(objective_fn=f, initial_position=x)
  # ==> RootSearchResults(
      estimated_root=array([-0.77459667, -0.90617985]),
      objective_at_estimated_root=array([-7.81339438e-11, -4.81727769e-10]),
      num_iterations=array([7, 7], dtype=int32))

  # Example 3: Roots of a multiplex function from two starting points.

  def f(x):
    return tf.constant([0., 63. / 8], dtype=tf.float64) * x**5 \
        + tf.constant([5. / 2, -70. / 8], dtype=tf.float64) * x**3 \
        + tf.constant([-3. / 2, 15. / 8], dtype=tf.float64) * x

  x = tf.constant([[-1, -1], [10, 10]], dtype=tf.float64)

  tfp.math.find_root_secant(objective_fn=f, initial_position=x)
  # ==> RootSearchResults(
      estimated_root=array([
          [-0.77459667, -0.90617985],
          [ 0.77459667, 0.90617985]]),
      objective_at_estimated_root=array([
          [-7.81339438e-11, -4.81727769e-10],
          [6.66025013e-11, 7.44957651e-10]]),
      num_iterations=array([
          [7, 7],
          [16, 24]], dtype=int32))
  ```
  """
  if not callable(stopping_policy_fn):
    raise ValueError('stopping_policy_fn must be callable')

  position = tf.convert_to_tensor(
      initial_position,
      name='position',
  )
  # Test against `None` explicitly. Relying on the truth value of a
  # user-supplied value (`value_at_position or ...`) fails for non-scalar or
  # graph-mode tensors and would discard a legitimate value of exactly zero.
  if value_at_position is None:
    value_at_position = objective_fn(position)
  value_at_position = tf.convert_to_tensor(
      value_at_position,
      name='value_at_position',
      dtype=dtype_util.base_dtype(position.dtype))

  zero = tf.zeros_like(position)
  position_tolerance = tf.convert_to_tensor(position_tolerance,
                                            name='position_tolerance',
                                            dtype=position.dtype)
  value_tolerance = tf.convert_to_tensor(value_tolerance,
                                         name='value_tolerance',
                                         dtype=position.dtype)

  num_iterations = tf.zeros_like(position, dtype=tf.int32)
  max_iterations = tf.convert_to_tensor(max_iterations, dtype=tf.int32)
  max_iterations = tf.broadcast_to(max_iterations,
                                   name='max_iterations',
                                   shape=position.shape)

  # Compute the step from `next_position` if present. This covers the case where
  # a user has two starting points, which bound the root or has a specific step
  # size in mind.
  if next_position is None:
    epsilon = tf.constant(1e-4, dtype=position.dtype, shape=position.shape)
    step = position * epsilon + tf.sign(position) * epsilon
  else:
    # Subtract the converted tensor rather than the raw argument so that
    # plain Python sequences are handled too.
    step = next_position - position

  finished = tf.constant(False, shape=position.shape)

  # Negate `stopping_condition` to determine if the search should continue.
  # This means, in particular, that tf.reduce_*all* will return only when the
  # search is finished for *all* starting points.
  def _should_continue(position, value_at_position, num_iterations, step,
                       finished):
    """Indicates whether the overall search should continue.

    Args:
      position: `Tensor` containing the current root estimates.
      value_at_position: `Tensor` containing the value of `objective_fn` at
        `position`.
      num_iterations: `Tensor` containing the current iteration index for each
        point.
      step: `Tensor` containing the size of the step to take for each point.
      finished: `Tensor` indicating for which points the search is finished.

    Returns:
      A boolean value indicating whether the overall search should continue.
    """
    del position, value_at_position, num_iterations, step  # Unused
    return ~tf.convert_to_tensor(
        stopping_policy_fn(finished), name='should_stop', dtype=tf.bool)

  # For each point in `position`, the search is stopped if either:
  # (1) A root has been found
  # (2) f(position) == f(position + step)
  # (3) The maximum number of iterations has been reached
  # In case (2), the search may be stopped both before the desired tolerance is
  # achieved (or even a root is found), and the maximum number of iterations is
  # reached.
  def _body(position, value_at_position, num_iterations, step, finished):
    """Performs one iteration of the secant root-finding algorithm.

    Args:
      position: `Tensor` containing the current root estimates.
      value_at_position: `Tensor` containing the value of `objective_fn` at
        `position`.
      num_iterations: `Tensor` containing the current iteration index for each
        point.
      step: `Tensor` containing the size of the step to take for each point.
      finished: `Tensor` indicating for which points the search is finished.

    Returns:
      The `Tensor`s to use for the next iteration of the algorithm.
    """
    # True if the search was already finished, or (1) or (3) just became true.
    was_finished = finished | (num_iterations >= max_iterations) | (
        tf.abs(step) < position_tolerance) | (
            tf.abs(value_at_position) < value_tolerance)

    # Compute the next position and the value at that point.
    next_position = tf.where(was_finished, position, position + step)
    value_at_next_position = tf.where(was_finished, value_at_position,
                                      objective_fn(next_position))

    # True if the search was already finished, or (2) just became true.
    is_finished = tf.equal(value_at_position, value_at_next_position)

    # Use the mid-point between the last two positions if (2) just became true.
    next_position = tf.where(is_finished & ~was_finished,
                             (position + next_position) * 0.5, next_position)

    # Once finished, stop updating the iteration index and set the step to zero.
    num_iterations = tf.where(is_finished, num_iterations, num_iterations + 1)
    next_step = tf.where(
        is_finished, zero, step * value_at_next_position /
        (value_at_position - value_at_next_position))

    return (next_position, value_at_next_position, num_iterations, next_step,
            is_finished)

  with tf.name_scope(name or 'find_root_secant'):
    assertions = []
    if validate_args:
      assertions += [
          tf.debugging.assert_greater(
              position_tolerance,
              zero,
              message='`position_tolerance` must be greater than 0.'),
          tf.debugging.assert_greater(
              value_tolerance,
              zero,
              message='`value_tolerance` must be greater than 0.'),
          # `num_iterations` is all zeros here, so this checks non-negativity.
          tf.debugging.assert_greater_equal(
              max_iterations,
              num_iterations,
              message='`max_iterations` must be nonnegative.')
      ]

    with tf.control_dependencies(assertions):
      root, value_at_root, num_iterations, _, _ = tf.while_loop(
          cond=_should_continue,
          body=_body,
          loop_vars=(position, value_at_position, num_iterations, step,
                     finished))

  return RootSearchResults(estimated_root=root,
                           objective_at_estimated_root=value_at_root,
                           num_iterations=num_iterations)
def __init__(self,
             inner_kernel,
             num_adaptation_steps,
             target_accept_prob=0.75,
             adaptation_rate=0.01,
             step_size_setter_fn=hmc_like_step_size_setter_fn,
             step_size_getter_fn=hmc_like_step_size_getter_fn,
             log_accept_prob_getter_fn=hmc_like_log_accept_prob_getter_fn,
             validate_args=False,
             name=None):
  """Builds the step size adaptation kernel.

  The default setter and getter callbacks assume that the results produced by
  the inner kernel have the same structure as those of the
  `HamiltonianMonteCarlo` kernel.

  Args:
    inner_kernel: `TransitionKernel`-like object.
    num_adaptation_steps: Scalar `int` `Tensor` giving the number of initial
      steps during which the step size is adjusted. This may be greater than,
      less than, or equal to the number of burnin steps.
    target_accept_prob: A floating point `Tensor` representing the desired
      acceptance probability. Must be a positive number less than 1. This can
      either be a scalar, or have shape [num_chains].
      Default value: `0.75` (the [center of asymptotically optimal rate for
      HMC][1]).
    adaptation_rate: `Tensor` representing the amount by which to scale the
      current `step_size`.
    step_size_setter_fn: A callable with the signature
      `(kernel_results, new_step_size) -> new_kernel_results`, where
      `kernel_results` are the results of the `inner_kernel`, `new_step_size`
      is a `Tensor` or a nested collection of `Tensor`s with the same
      structure as returned by the `step_size_getter_fn`, and
      `new_kernel_results` are a copy of `kernel_results` with the step
      size(s) set.
    step_size_getter_fn: A callable with the signature
      `(kernel_results) -> step_size`, where `kernel_results` are the results
      of the `inner_kernel`, and `step_size` is a floating point `Tensor` or a
      nested collection of such `Tensor`s.
    log_accept_prob_getter_fn: A callable with the signature
      `(kernel_results) -> log_accept_prob`, where `kernel_results` are the
      results of the `inner_kernel`, and `log_accept_prob` is a floating point
      `Tensor`. `log_accept_prob` can either be a scalar, or have shape
      [num_chains]. If it's the latter, `step_size` should also have the same
      leading dimension.
    validate_args: Python `bool`. When `True` kernel parameters are checked
      for validity. When `False` invalid inputs may silently render incorrect
      outputs.
    name: Python `str` name prefixed to Ops created by this class.
      Default: 'simple_step_size_adaptation'.

  #### References

  [1]: Betancourt, M. J., Byrne, S., & Girolami, M. (2014). _Optimizing The
       Integrator Step Size for Hamiltonian Monte Carlo_.
       http://arxiv.org/abs/1411.6669
  """
  inner_kernel = mcmc_util.enable_store_parameters_in_results(inner_kernel)

  scope_name = mcmc_util.make_name(
      name, 'simple_step_size_adaptation', '__init__')
  with tf.name_scope(scope_name) as name:
    # Both floating point parameters share one dtype, defaulting to float32.
    float_dtype = dtype_util.common_dtype(
        [target_accept_prob, adaptation_rate], tf.float32)
    target_accept_prob = tf.convert_to_tensor(
        target_accept_prob, dtype=float_dtype, name='target_accept_prob')
    adaptation_rate = tf.convert_to_tensor(
        adaptation_rate, dtype=float_dtype, name='adaptation_rate')
    num_adaptation_steps = tf.convert_to_tensor(
        num_adaptation_steps, dtype=tf.int32, name='num_adaptation_steps')

    target_accept_prob = _maybe_validate_target_accept_prob(
        target_accept_prob, validate_args)

  # `name` now holds the value yielded by the name scope above.
  self._parameters = {
      'inner_kernel': inner_kernel,
      'num_adaptation_steps': num_adaptation_steps,
      'target_accept_prob': target_accept_prob,
      'adaptation_rate': adaptation_rate,
      'step_size_setter_fn': step_size_setter_fn,
      'step_size_getter_fn': step_size_getter_fn,
      'log_accept_prob_getter_fn': log_accept_prob_getter_fn,
      'name': name,
  }
def _sample_n(self, n, seed=None):
  """Draws `n` samples from the mixture.

  There are two code paths:

  * When `self._use_static_graph` is true, every component is sampled `n`
    times, the samples are stacked, and a one-hot mask built from the
    categorical draws selects one component per sample.
  * Otherwise, the categorical draws are used to partition the sample indices
    by component, each component is sampled only as many times as it was
    selected, and the pieces are stitched back into their original order.

  Args:
    n: Number of samples to draw. On the second path it is converted to a
      `Tensor` and replaced by a Python `int` when its value is statically
      known.
    seed: Seed passed to `self.cat.sample` and used to build the `SeedStream`
      (salt `'Mixture'`) that seeds each component's `sample` call.

  Returns:
    A `Tensor` of samples. On the second path it is reshaped to the shape of
    the categorical samples followed by the event shape.
  """
  if self._use_static_graph:
    # This sampling approach is almost the same as the approach used by
    # `MixtureSameFamily`. The differences are due to having a list of
    # `Distribution` objects rather than a single object, and maintaining
    # random seed management that is consistent with the non-static code
    # path.
    samples = []
    cat_samples = self.cat.sample(n, seed=seed)
    stream = SeedStream(seed, salt='Mixture')

    for c in range(self.num_components):
      samples.append(self.components[c].sample(n, seed=stream()))
    # Stack the components on a new axis placed just before the event dims.
    stack_axis = -1 - tensorshape_util.rank(self._static_event_shape)
    x = tf.stack(samples, axis=stack_axis)  # [n, B, k, E]
    npdt = dtype_util.as_numpy_dtype(x.dtype)
    mask = tf.one_hot(
        indices=cat_samples,  # [n, B]
        depth=self._num_components,  # == k
        on_value=npdt(1),
        off_value=npdt(0))  # [n, B, k]
    mask = distribution_util.pad_mixture_dimensions(
        mask, self, self._cat,
        tensorshape_util.rank(self._static_event_shape))  # [n, B, k, [1]*e]
    # Multiplying by the one-hot mask zeroes out all but the chosen component.
    return tf.reduce_sum(x * mask, axis=stack_axis)  # [n, B, E]

  n = tf.convert_to_tensor(n, name='n')
  static_n = tf.get_static_value(n)
  n = int(static_n) if static_n is not None else n
  cat_samples = self.cat.sample(n, seed=seed)

  # For each of the sample, batch and event shapes, prefer the static value
  # and fall back to a dynamic one only when it is not fully defined.
  static_samples_shape = cat_samples.shape
  if tensorshape_util.is_fully_defined(static_samples_shape):
    samples_shape = tensorshape_util.as_list(static_samples_shape)
    samples_size = tensorshape_util.num_elements(static_samples_shape)
  else:
    samples_shape = tf.shape(cat_samples)
    samples_size = tf.size(cat_samples)
  static_batch_shape = self.batch_shape
  if tensorshape_util.is_fully_defined(static_batch_shape):
    batch_shape = tensorshape_util.as_list(static_batch_shape)
    batch_size = tensorshape_util.num_elements(static_batch_shape)
  else:
    batch_shape = tf.shape(cat_samples)[1:]
    batch_size = tf.reduce_prod(batch_shape)
  static_event_shape = self.event_shape
  if tensorshape_util.is_fully_defined(static_event_shape):
    event_shape = np.array(
        tensorshape_util.as_list(static_event_shape), dtype=np.int32)
  else:
    # Resolved inside the component loop below, from the first component's
    # samples.
    event_shape = None

  # Get indices into the raw cat sampling tensor. We will
  # need these to stitch sample values back out after sampling
  # within the component partitions.
  samples_raw_indices = tf.reshape(tf.range(0, samples_size), samples_shape)

  # Partition the raw indices so that we can use
  # dynamic_stitch later to reconstruct the samples from the
  # known partitions.
  partitioned_samples_indices = tf.dynamic_partition(
      data=samples_raw_indices,
      partitions=cat_samples,
      num_partitions=self.num_components)

  # Copy the batch indices n times, as we will need to know
  # these to pull out the appropriate rows within the
  # component partitions.
  batch_raw_indices = tf.reshape(
      tf.tile(tf.range(0, batch_size), [n]), samples_shape)

  # Explanation of the dynamic partitioning below:
  #   batch indices are, e.g., [0, 1, 0, 1, 0, 1]
  # Suppose partitions are:
  #     [1 1 0 0 1 1]
  # After partitioning, batch indices are cut as:
  #     [batch_indices[x] for x in 2, 3]
  #     [batch_indices[x] for x in 0, 1, 4, 5]
  # i.e.
  #     [0 1] and [0 1 0 1]
  # Now we sample n=2 from part 0 and n=4 from part 1.
  # For part 0 we want samples from batch entries 0, 1 (samples 0, 1),
  # and for part 1 we want samples from batch entries 0, 1, 0, 1
  # (samples 0, 1, 2, 3).
  partitioned_batch_indices = tf.dynamic_partition(
      data=batch_raw_indices,
      partitions=cat_samples,
      num_partitions=self.num_components)
  samples_class = [None for _ in range(self.num_components)]

  stream = SeedStream(seed, salt='Mixture')

  for c in range(self.num_components):
    # Number of draws that selected component `c`.
    n_class = tf.size(partitioned_samples_indices[c])
    samples_class_c = self.components[c].sample(n_class, seed=stream())

    if event_shape is None:
      batch_ndims = prefer_static.rank_from_shape(batch_shape)
      event_shape = tf.shape(samples_class_c)[1 + batch_ndims:]

    # Pull out the correct batch entries from each index.
    # To do this, we may have to flatten the batch shape.

    # For sample s, batch element b of component c, we get the
    # partitioned batch indices from
    # partitioned_batch_indices[c]; and shift each element by
    # the sample index. The final lookup can be thought of as
    # a matrix gather along locations (s, b) in
    # samples_class_c where the n_class rows correspond to
    # samples within this component and the batch_size columns
    # correspond to batch elements within the component.
    #
    # Thus the lookup index is
    #   lookup[c, i] = batch_size * s[i] + b[c, i]
    # for i = 0 ... n_class[c] - 1.
    lookup_partitioned_batch_indices = (
        batch_size * tf.range(n_class) + partitioned_batch_indices[c])
    samples_class_c = tf.reshape(
        samples_class_c, tf.concat([[n_class * batch_size], event_shape], 0))
    samples_class_c = tf.gather(
        samples_class_c,
        lookup_partitioned_batch_indices,
        name='samples_class_c_gather')
    samples_class[c] = samples_class_c

  # Stitch back together the samples across the components.
  lhs_flat_ret = tf.dynamic_stitch(
      indices=partitioned_samples_indices, data=samples_class)
  # Reshape back to proper sample, batch, and event shape.
  ret = tf.reshape(lhs_flat_ret, tf.concat([samples_shape, event_shape], 0))
  # Attach whatever static shape information is available.
  tensorshape_util.set_shape(
      ret,
      tensorshape_util.concatenate(static_samples_shape, self.event_shape))
  return ret
def cholesky_covariance(x, sample_axis=0, keepdims=False, name=None):
  """Computes the Cholesky factor of the sample covariance of vector samples.

  One use of this function is fitting a multivariate normal to data.

  ```python
  tf.enable_eager_execution()
  import tensorflow_probability as tfp
  tfd = tfp.distributions

  # Assume data.shape = (1000, 2).  1000 samples of a random variable in R^2.
  observed_data = read_data_samples(...)

  # The mean is easy
  mu = tf.reduce_mean(observed_data, axis=0)

  # Get the scale matrix
  L = tfp.stats.cholesky_covariance(observed_data)

  # Make the best fit multivariate normal (under maximum likelihood condition).
  mvn = tfd.MultivariateNormalTriL(loc=mu, scale_tril=L)

  # Plot contours of the pdf.
  xs, ys = tf.meshgrid(
      tf.linspace(-5., 5., 50), tf.linspace(-5., 5., 50), indexing='ij')
  xy = tf.stack((tf.reshape(xs, [-1]), tf.reshape(ys, [-1])), axis=-1)
  pdf = tf.reshape(mvn.prob(xy), (50, 50))
  CS = plt.contour(xs, ys, pdf, 10)
  plt.clabel(CS, inline=1, fontsize=10)
  ```

  Why does this work?
  For a vector-variate random variable `X = (X1, ..., Xd)`, the sample
  covariance matrix lives in `R^{d x d}` (see `tfp.stats.covariance`).

  The [Cholesky factor](https://en.wikipedia.org/wiki/Cholesky_decomposition)
  of that matrix plays the role that standard deviation plays for scalar
  random variables: if `X` has covariance matrix `C` with Cholesky
  factorization `C = L L^T`, then multiplying a vector of iid random variables
  with unit variance by `L` yields a vector with covariance `L L^T`, i.e. the
  same covariance as `X`.

  ```python
  observed_data = read_data_samples(...)
  L = tfp.stats.cholesky_covariance(observed_data, sample_axis=0)

  # Make fake_data with the same covariance as observed_data.
  uncorrelated_normal = tf.random.normal(shape=(500, 10))
  fake_data = tf.linalg.matvec(L, uncorrelated_normal)
  ```

  Args:
    x:  Numeric `Tensor`.  The rightmost dimension of `x` indexes events, e.g.
      the dimensions of a random vector.
    sample_axis: Scalar or vector `Tensor` designating the axis holding
      samples. Cannot be the rightmost dimension (since this indexes events).
      Default value: `0` (leftmost dimension).
    keepdims:  Boolean.  Whether to keep the sample axis as singletons.
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., `'cholesky_covariance'`).

  Returns:
    chol:  `Tensor` of same `dtype` as `x`.  The last two dimensions hold
      lower triangular matrices (the Cholesky factors).
  """
  scope = name or 'cholesky_covariance'
  with tf.name_scope(scope):
    axis_tensor = tf.convert_to_tensor(sample_axis, dtype=tf.int32)
    # Events always live on the rightmost axis, hence `event_axis=-1`.
    sample_cov = covariance(
        x,
        sample_axis=axis_tensor,
        event_axis=-1,
        keepdims=keepdims)
    chol = tf.linalg.cholesky(sample_cov)
    return chol
def _log_prob_with_logcdf(self, y):
  """Combines `log_cdf(y)` and `log_cdf(y - 1)` into a log-probability.

  The two log-CDF values are passed, in that order, to
  `_logsum_expbig_minus_expsmall`. The bounds `self._low` and `self._high`
  are converted to tensors when they are set and forwarded as `None`
  otherwise.

  Args:
    y: Value(s) at which to evaluate.

  Returns:
    The result of `_logsum_expbig_minus_expsmall` on the two log-CDF values.
  """
  low = self._low
  if low is not None:
    low = tf.convert_to_tensor(low)

  high = self._high
  if high is not None:
    high = tf.convert_to_tensor(high)

  bounds = dict(low=low, high=high)
  log_cdf_at_y = self.log_cdf(y, **bounds)
  log_cdf_below_y = self.log_cdf(y - 1, **bounds)
  return _logsum_expbig_minus_expsmall(log_cdf_at_y, log_cdf_below_y)
def auto_correlation(x,
                     axis=-1,
                     max_lags=None,
                     center=True,
                     normalize=True,
                     name='auto_correlation'):
  """Auto correlation along one axis.

  Given a `1-D` wide sense stationary (WSS) sequence `X`, the auto correlation
  `RXX` may be defined as  (with `E` expectation and `Conj` complex conjugate)

  ```
  RXX[m] := E{ W[m] Conj(W[0]) } = E{ W[0] Conj(W[-m]) },
  W[n]   := (X[n] - MU) / S,
  MU     := E{ X[0] },
  S**2   := E{ (X[0] - MU) Conj(X[0] - MU) }.
  ```

  This function takes the viewpoint that `x` is (along one axis) a finite
  sub-sequence of a realization of (WSS) `X`, and then uses `x` to produce an
  estimate of `RXX[m]` as follows:

  After extending `x` from length `L` to `inf` by zero padding, the auto
  correlation estimate `rxx[m]` is computed for `m = 0, 1, ..., max_lags` as

  ```
  rxx[m] := (L - m)**-1 sum_n w[n + m] Conj(w[n]),
  w[n]   := (x[n] - mu) / s,
  mu     := L**-1 sum_n x[n],
  s**2   := L**-1 sum_n (x[n] - mu) Conj(x[n] - mu)
  ```

  The error in this estimate is proportional to `1 / sqrt(len(x) - m)`, so users
  often set `max_lags` small enough so that the entire output is meaningful.

  Note that since `mu` is an imperfect estimate of `E{ X[0] }`, and we divide by
  `len(x) - m` rather than `len(x) - m - 1`, our estimate of auto correlation
  contains a slight bias, which goes to zero as `len(x) - m --> infinity`.

  Args:
    x:  `float32` or `complex64` `Tensor`.
    axis:  Python `int`. The axis number along which to compute correlation.
      Other dimensions index different batch members.
    max_lags:  Positive `int` tensor.  The maximum value of `m` to consider (in
      equation above).  If `max_lags >= x.shape[axis]`, we effectively re-set
      `max_lags` to `x.shape[axis] - 1`.
    center:  Python `bool`.  If `False`, do not subtract the mean estimate `mu`
      from `x[n]` when forming `w[n]`.
    normalize:  Python `bool`.  If `False`, do not divide by the variance
      estimate `s**2` when forming `w[n]`.
    name:  `String` name to prepend to created ops. Passing `None` is
      equivalent to the default.
      Default value: `'auto_correlation'`.

  Returns:
    `rxx`: `Tensor` of same `dtype` as `x`.  `rxx.shape[i] = x.shape[i]` for
      `i != axis`, and `rxx.shape[axis] = max_lags + 1`.

  Raises:
    TypeError:  If `x` is not a supported type.
  """
  # Implementation details:
  # Extend length N / 2 1-D array x to length N by zero padding onto the end.
  # Then, set
  #   F[x]_k := sum_n x_n exp{-i 2 pi k n / N }.
  # It is not hard to see that
  #   F[x]_k Conj(F[x]_k) = F[R]_k, where
  #   R_m := sum_n x_n Conj(x_{(n - m) mod N}).
  # One can also check that R_m / (N / 2 - m) is an unbiased estimate of RXX[m].

  # Since F[x] is the DFT of x, this leads us to a zero-padding and FFT/IFFT
  # based version of estimating RXX.
  # Note that this is a special case of the Wiener-Khinchin Theorem.

  # Accept `name=None` like the other functions in this file; any other value
  # (including the empty string) is passed through unchanged.
  with tf.name_scope('auto_correlation' if name is None else name):
    x = tf.convert_to_tensor(x, name='x')

    # Rotate dimensions of x in order to put axis at the rightmost dim.
    # FFT op requires this.
    rank = ps.rank(x)
    if axis < 0:
      axis = rank + axis
    shift = rank - 1 - axis
    # Suppose x.shape[axis] = T, so there are T 'time' steps.
    #   ==> x_rotated.shape = B + [T],
    # where B is x_rotated's batch shape.
    x_rotated = distribution_util.rotate_transpose(x, shift)

    if center:
      x_rotated = x_rotated - tf.reduce_mean(
          x_rotated, axis=-1, keepdims=True)

    # x_len = N / 2 from above explanation.  The length of x along axis.
    # Get a value for x_len that works in all cases.
    x_len = ps.shape(x_rotated)[-1]

    # TODO(langmore) Investigate whether this zero padding helps or hurts.  At
    # the moment is necessary so that all FFT implementations work.
    # Zero pad to the next power of 2 greater than 2 * x_len, which equals
    # 2**(ceil(Log_2(2 * x_len))).  Note: Log_2(X) = Log_e(X) / Log_e(2).
    x_len_float64 = ps.cast(x_len, np.float64)
    target_length = ps.pow(
        np.float64(2.), ps.ceil(ps.log(x_len_float64 * 2) / np.log(2.)))
    pad_length = ps.cast(target_length - x_len_float64, np.int32)

    # We should have:
    # x_rotated_pad.shape = x_rotated.shape[:-1] + [T + pad_length]
    #                     = B + [T + pad_length]
    x_rotated_pad = distribution_util.pad(
        x_rotated, axis=-1, back=True, count=pad_length)

    dtype = x.dtype
    if not dtype_util.is_complex(dtype):
      if not dtype_util.is_floating(dtype):
        raise TypeError('Argument x must have either float or complex dtype'
                        ' found: {}'.format(dtype))
      # The FFT below operates on complex input: attach a zero imaginary part.
      x_rotated_pad = tf.complex(
          x_rotated_pad,
          dtype_util.as_numpy_dtype(dtype_util.real_dtype(dtype))(0.))

    # Autocorrelation is IFFT of power-spectral density (up to some scaling).
    fft_x_rotated_pad = tf.signal.fft(x_rotated_pad)
    spectral_density = fft_x_rotated_pad * tf.math.conj(fft_x_rotated_pad)
    # shifted_product is R[m] from above detailed explanation.
    # It is the inner product sum_n X[n] * Conj(X[n - m]).
    shifted_product = tf.signal.ifft(spectral_density)

    # Cast back to real-valued if x was real to begin with.
    shifted_product = tf.cast(shifted_product, dtype)

    # Figure out if we can deduce the final static shape, and set max_lags.
    # Use x_rotated as a reference, because it has the time dimension in the far
    # right, and was created before we performed all sorts of crazy shape
    # manipulations.
    know_static_shape = True
    if not tensorshape_util.is_fully_defined(x_rotated.shape):
      know_static_shape = False
    if max_lags is None:
      max_lags = x_len - 1
    else:
      max_lags = tf.convert_to_tensor(max_lags, name='max_lags')
      max_lags_ = tf.get_static_value(max_lags)
      if max_lags_ is None or not know_static_shape:
        know_static_shape = False
        max_lags = tf.minimum(x_len - 1, max_lags)
      else:
        max_lags = min(x_len - 1, max_lags_)

    # Chop off the padding.
    # We allow users to provide a huge max_lags, but cut it off here.
    # shifted_product_chopped.shape = x_rotated.shape[:-1] + [max_lags + 1]
    shifted_product_chopped = shifted_product[..., :max_lags + 1]

    # If possible, set shape.
    if know_static_shape:
      chopped_shape = tensorshape_util.as_list(x_rotated.shape)
      chopped_shape[-1] = min(x_len, max_lags + 1)
      tensorshape_util.set_shape(shifted_product_chopped, chopped_shape)

    # Recall R[m] is a sum of N / 2 - m nonzero terms x[n] Conj(x[n - m]).  The
    # other terms were zeros arising only due to zero padding.
    # `denominator = (N / 2 - m)` (defined below) is the proper term to
    # divide by to make this an unbiased estimate of the expectation
    # E[X[n] Conj(X[n - m])].
    x_len = ps.cast(x_len, dtype_util.real_dtype(dtype))
    max_lags = ps.cast(max_lags, dtype_util.real_dtype(dtype))
    denominator = x_len - ps.range(0., max_lags + 1.)
    denominator = ps.cast(denominator, dtype)
    shifted_product_rotated = shifted_product_chopped / denominator

    if normalize:
      # Divide by the lag-0 value so that the lag-0 entry becomes 1.
      shifted_product_rotated /= shifted_product_rotated[..., :1]

    # Transpose dimensions back to those of x.
    return distribution_util.rotate_transpose(shifted_product_rotated, -shift)
def _prob_with_cdf(self, y): low = None if self._low is None else tf.convert_to_tensor(self._low) high = None if self._low is None else tf.convert_to_tensor(self._low) return self._cdf(y, low=low, high=high) - self._cdf( y - 1, low=low, high=high)
def sqrt_with_finite_grads(x, name=None):
  """Computes `sqrt(x)` together with a gradient function finite at zero.

  The `(value, grad_fn)` pair returned here is the form consumed by
  `tf.custom_gradient`, which is the mechanism intended to install the
  overridden gradient.

  #### Motivation

  Kernel functions often need the L2 norm of the difference of two vectors
  `x` and `y`, i.e. `sqrt(sum_i((x_i - y_i) ** 2))`. When `x` and `y` are
  identical (for instance on the diagonal of a kernel matrix) the gradient
  with respect to the inputs is `NaN`. To see this, consider the forward pass:

    ```
    [x_1 ... x_N]  -->  [x_1 ** 2 ... x_N ** 2]  -->
        (x_1 ** 2 + ... + x_N ** 2)  -->  sqrt((x_1 ** 2 + ... + x_N ** 2))
    ```

  Back-propagating through `sqrt` yields `inf` at zero because
  `grad_z(sqrt(z)) = 1 / (2 * sqrt(z))`. Continuing to the `x ** 2` step
  multiplies by `2 * x`, which is `0`, and `0 * inf` is `NaN`. Such `NaN`s
  then contaminate the rest of the connected computation graph.

  When two inputs to a kernel function are equal there are two scenarios:

    1. We are really computing `k(x, x)`; `norm(x - x)` is identically zero
       regardless of `x`, so the desired gradient is zero.

    2. We are computing `k(x, y)` and `x` merely happens to equal `y`. The
       gradient is ill-defined there (the `sqrt((x - y) ** 2)` surface has a
       cusp along `x = y`), but infinitely many sub-gradients are valid and,
       by symmetry, zero is the distinguished one. Identical inputs are
       probably a pathological situation best handled at a higher level.

  The override replaces the derivative of `sqrt` at exactly `x == 0` with a
  large finite number: the square root of the largest finite value of `x`'s
  floating dtype. Because it is finite, the product with the `2 * x = 0`
  factor from the squaring step is `0` instead of `NaN`. The square root of
  the max (rather than the max itself) is used to reduce the risk of overflow
  when the value is combined with other quantities downstream.

  Args:
    x: a `Tensor` whose sqrt is to be computed.
    name: a Python `str` prefixed to all ops created by this function.
      Default `None` (i.e., "sqrt_with_finite_grads").

  Returns:
    sqrt: the square root of `x`.
    grad: a gradient function equal to the usual `sqrt` gradient,
      `0.5 * rsqrt(x)`, everywhere except at zero, where it uses the large
      finite value described above instead of `inf`.

  Raises:
    TypeError: if `tf.convert_to_tensor(x)` is not a `float` type.
  """
  with tf.name_scope(name or 'sqrt_with_finite_grads'):
    x = tf.convert_to_tensor(value=x, name='x')
    if not x.dtype.is_floating:
      raise TypeError('Input `x` must be floating type.')

    def grad(grad_ys):
      # Finite stand-in for the infinite derivative of sqrt at zero.
      np_dtype = dtype_util.as_numpy_dtype(x.dtype)
      grad_at_zero = np.sqrt(np.finfo(np_dtype).max)
      at_zero = tf.equal(x, 0)
      regular_grad = 0.5 * tf.math.rsqrt(x)
      return grad_ys * tf.where(at_zero, grad_at_zero, regular_grad)

    return tf.sqrt(x), grad
def testAssertParamsAreFloats(self):
  """Checks that `tfd.Laplace` raises `ValueError` for int32 parameters."""
  loc = tf.convert_to_tensor(0, dtype=tf.int32)
  scale = tf.convert_to_tensor(1, dtype=tf.int32)
  # `assertRaisesRegexp` is a deprecated alias that was removed in
  # Python 3.12; `assertRaisesRegex` is the supported spelling.
  with self.assertRaisesRegex(ValueError, 'Expected floating point'):
    tfd.Laplace(loc=loc, scale=scale)
def soft_threshold(x, threshold, name=None):
  """Soft Thresholding operator.

  Each coordinate of `x` is shrunk toward zero by `gamma` (the `threshold`),
  and coordinates whose magnitude does not exceed `gamma` are set to zero:

  ```none
                                { x[i] - gamma,  x[i] >  gamma
  SoftThreshold(x, gamma)[i] =  { 0,             -gamma <= x[i] <= gamma
                                { x[i] + gamma,  x[i] < -gamma
  ```

  Equivalently, `SoftThreshold(x, gamma) = sign(x) * max(|x| - gamma, 0)`,
  which is the form computed here.

  In the context of proximal gradient methods, we have

  ```none
  SoftThreshold(x, gamma) = prox_{gamma L1}(x)
  ```

  where `prox` is the proximity operator. The soft thresholding operator is
  therefore used in proximal gradient descent for optimizing a smooth function
  with (non-smooth) L1 regularization, as outlined below.

  The proximity operator is defined as:

  ```none
  prox_r(x) = argmin{ r(z) + 0.5 ||x - z||_2**2 : z },
  ```

  where `r` is a (weakly) convex function, not necessarily differentiable.
  Because the L2 norm is strictly convex, the above argmin is unique.

  One important application of the proximity operator is as follows. Let `L`
  be a convex and differentiable function with Lipschitz-continuous gradient.
  Let `R` be a convex lower semicontinuous function which is possibly
  nondifferentiable. Let `gamma` be an arbitrary positive real. Then

  ```none
  x_star = argmin{ L(x) + R(x) : x }
  ```

  if and only if the fixed-point equation is satisfied:

  ```none
  x_star = prox_{gamma R}(x_star - gamma grad L(x_star))
  ```

  Proximal gradient descent thus typically consists of choosing an initial
  value `x^{(0)}` and repeatedly applying the update

  ```none
  x^{(k+1)} = prox_{gamma^{(k)} R}(x^{(k)} - gamma^{(k)} grad L(x^{(k)}))
  ```

  where `gamma` is allowed to vary from iteration to iteration. Specializing
  to the case where `R(x) = ||x||_1`, we minimize `L(x) + ||x||_1` by
  repeatedly applying the update

  ```none
  x^{(k+1)} = SoftThreshold(x^{(k)} - gamma grad L(x^{(k)}), gamma)
  ```

  (This idea can also be extended to second-order approximations, although
  the multivariate case does not have a known closed form like above.)

  Args:
    x: `float` `Tensor` representing the input to the SoftThreshold function.
    threshold: nonnegative scalar, `float` `Tensor` representing the radius of
      the interval on which each coordinate of SoftThreshold takes the value
      zero. Denoted `gamma` above. It is converted to the dtype of `x`.
    name: Python string indicating the name of the TensorFlow operation.
      Default value: `'soft_threshold'`.

  Returns:
    softthreshold: `float` `Tensor` with the same shape and dtype as `x`,
      representing the value of the SoftThreshold function.

  #### References

  [1]: Yu, Yao-Liang. The Proximity Operator.
       https://www.cs.cmu.edu/~suvrit/teach/yaoliang_proximity.pdf

  [2]: Wikipedia Contributors. Proximal gradient methods for learning.
       _Wikipedia, The Free Encyclopedia_, 2018.
       https://en.wikipedia.org/wiki/Proximal_gradient_methods_for_learning
  """
  # https://math.stackexchange.com/questions/471339/derivation-of-soft-thresholding-operator
  with tf.name_scope(name or 'soft_threshold'):
    x = tf.convert_to_tensor(x, name='x')
    threshold = tf.convert_to_tensor(
        threshold, dtype=x.dtype, name='threshold')
    direction = tf.sign(x)
    # Magnitude left after shrinking by the threshold, floored at zero.
    shrunk_magnitude = tf.maximum(tf.abs(x) - threshold, 0.)
    return direction * shrunk_magnitude
def __init__(self,
             maturity_dates: types.DateTensor,
             discount_factors: tf.Tensor,
             valuation_date: types.DateTensor,
             interpolator: Optional[_InterpolationMethod] = None,
             interpolate_rates: Optional[bool] = True,
             daycount_convention: Optional[
                 _DayCountConventionsProtoType] = None,
             curve_type: Optional[curve_types.CurveType] = None,
             dtype: Optional[tf.DType] = None,
             name: Optional[str] = None):
  """Initializes the interest rate curve.

  Args:
    maturity_dates: A `DateTensor` containing the maturity dates on which the
      curve is specified.
    discount_factors: A `Tensor` of real dtype specifying the discount factors
      corresponding to the input maturities. The shape of this input should
      match the shape of `maturity_dates`.
    valuation_date: A scalar `DateTensor` specifying the valuation
      (or settlement) date for the curve.
    interpolator: An instance of `InterpolationMethod`. Supported values are
      `CUBIC`, `LINEAR` and `CONSTANT_FORWARD`.
      Default value: `None` in which case cubic interpolation is used.
    interpolate_rates: A boolean specifying whether the interpolation should
      be done in discount rates or discount factors space. The flag is only
      stored by the constructor.
      Default value: `True` (presumably rate space, going by the argument
      name; confirm against the code that consumes the flag).
    daycount_convention: `DayCountConventions` to use for the interpolation
      purpose.
      Default value: `None` which maps to actual/365 day count convention.
    curve_type: An instance of `CurveTypes` to mark the rate curve.
      Default value: `None` which means that the curve does not have the
      marker.
    dtype: `tf.Dtype`. Optional input specifying the dtype used to convert
      `discount_factors`.
      Default value: `None` in which case the dtype of the converted
      `discount_factors` is recorded as the curve dtype.
    name: Python str. The name to give to the ops created by this function.
      Default value: `None` which maps to 'rate_curve'.

  Raises:
    ValueError: If `interpolator` is not `None` and is not one of the
      supported interpolation methods.
  """
  self._name = name or "rate_curve"
  with tf.compat.v1.name_scope(self._name):
    self._discount_factor_nodes = tf.convert_to_tensor(
        discount_factors, dtype=dtype, name="curve_discount_factors")
    self._dtype = dtype or self._discount_factor_nodes.dtype

    # An unspecified interpolator means cubic interpolation.
    method = (_InterpolationMethod.CUBIC if interpolator is None
              else interpolator)
    if method == _InterpolationMethod.CUBIC:
      def cubic_interpolator(xi, x, y):
        spline = math.interpolation.cubic.build_spline(x, y)
        return math.interpolation.cubic.interpolate(xi, spline, dtype=dtype)
      interpolation_fn = cubic_interpolator
      self._interpolation_method = _InterpolationMethod.CUBIC
    elif method == _InterpolationMethod.LINEAR:
      def linear_interpolator(xi, x, y):
        return math.interpolation.linear.interpolate(xi, x, y, dtype=dtype)
      interpolation_fn = linear_interpolator
      self._interpolation_method = _InterpolationMethod.LINEAR
    elif method == _InterpolationMethod.CONSTANT_FORWARD:
      def constant_fwd(xi, x, y):
        return rates_lib.constant_fwd.interpolate(xi, x, y, dtype=dtype)
      interpolation_fn = constant_fwd
      self._interpolation_method = _InterpolationMethod.CONSTANT_FORWARD
    else:
      raise ValueError(
          f"Unknown interpolation method {interpolator}.")

    self._dates = dateslib.convert_to_date_tensor(maturity_dates)
    self._valuation_date = dateslib.convert_to_date_tensor(valuation_date)

    if daycount_convention:
      self._daycount_convention = daycount_convention
    else:
      self._daycount_convention = _DayCountConventions.ACTUAL_365
    self._day_count_fn = utils.get_daycount_fn(self._daycount_convention)

    # Computed only after the dates, valuation date and day count function
    # have been set, preserving the original statement order.
    self._times = self._get_time(self._dates)
    self._interpolator = interpolation_fn
    self._interpolate_rates = interpolate_rates
    self._curve_type = curve_type