def __init__(self, loc):
    self._loc = tf.convert_to_tensor(loc)
    super(StatefulNormal, self).__init__(
        dtype=tf.float32, reparameterization_type=tfd.FULLY_REPARAMETERIZED,
        validate_args=False, allow_nan_stats=False)
Example 2
def log_average_probs(logits,
                      sample_axis=0,
                      event_axis=None,
                      keepdims=False,
                      validate_args=False,
                      name=None):
    """Computes `log(average(to_probs(logits)))` in a numerically stable manner.

  The meaning of `to_probs` is controlled by the `event_axis` argument. When
  `event_axis` is `None`, `to_probs = tf.math.sigmoid` and otherwise
  `to_probs = lambda x: tf.math.softmax(x, axis=event_axis)`.

  `sample_axis` and `event_axis` should have a null intersection. This
  requirement is always verified when `validate_args` is `True`.

  Args:
    logits: A `float` `Tensor` representing logits.
    sample_axis: Scalar or vector `Tensor` designating axis holding samples, or
      `None` (meaning all axes hold samples).
      Default value: `0` (leftmost dimension).
    event_axis: Scalar or vector `Tensor` designating the axis representing
      categorical logits.
      Default value: `None` (i.e., Bernoulli logits).
    keepdims:  Boolean.  Whether to keep the sample axis as singletons.
      Default value: `False` (i.e., squeeze the reduced dimensions).
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
      Default value: `False` (i.e., do not validate args).
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., `'log_average_probs'`).

  Returns:
    log_avg_probs: The natural log of the average of probs computed from logits.
  """
    with tf.name_scope(name or 'average_sigmoid'):
        logits = tf.convert_to_tensor(logits,
                                      dtype_hint=tf.float32,
                                      name='logits')
        if sample_axis is not None:
            sample_axis = tf.convert_to_tensor(sample_axis,
                                               dtype_hint=tf.int32,
                                               name='sample_axis')
        if event_axis is not None:
            event_axis = tf.convert_to_tensor(event_axis,
                                              dtype_hint=tf.int32,
                                              name='event_axis')
        if event_axis is None:
            # log(sigmoid(x)) = log(1 / (1 + exp(-x))) = -log1p(exp(-x)) = -sp(-x)
            log_probs = -tf.math.softplus(-logits)
        else:
            sample_axis, event_axis = _log_average_probs_process_args(
                logits, validate_args, sample_axis, event_axis)
            with tf.control_dependencies(
                    _log_average_probs_maybe_check_args(
                        sample_axis, event_axis, validate_args)):
                log_probs = _log_softmax(logits, axis=event_axis)
        return reduce_logmeanexp(log_probs,
                                 axis=sample_axis,
                                 keepdims=keepdims)
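
As a quick sanity check of the Bernoulli branch (`event_axis=None`), the stable result should agree with a naive computation for moderate logits. This is a minimal sketch assuming the function above and its helper `reduce_logmeanexp` are in scope:

```python
import tensorflow as tf

# 3 samples of 2 independent Bernoulli logits.
logits = tf.constant([[-2.0, 0.5], [1.0, -0.3], [0.2, 3.0]])

# Naive: convert to probs, average over the sample axis, then take the log.
naive = tf.math.log(tf.reduce_mean(tf.math.sigmoid(logits), axis=0))

# Stable: the function above works in log-space throughout.
stable = log_average_probs(logits, sample_axis=0)
tf.debugging.assert_near(naive, stable)
```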
Example 3
def reduce_weighted_logsumexp(logx,
                              w=None,
                              axis=None,
                              keep_dims=False,
                              return_sign=False,
                              name=None):
    """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.

  If all weights `w` are known to be positive, it is more efficient to directly
  use `reduce_logsumexp`, i.e., `tf.reduce_logsumexp(logx + tf.math.log(w))` is
  more efficient than `du.reduce_weighted_logsumexp(logx, w)`.

  Reduces `logx` along the dimensions given in `axis`.
  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
  entry in `axis`. If `keep_dims` is true, the reduced dimensions
  are retained with length 1.

  If `axis` is `None`, all dimensions are reduced, and a
  tensor with a single element is returned.

  This function is more numerically stable than `log(sum(w * exp(logx)))`. It
  avoids overflows caused by taking the exp of large inputs and underflows
  caused by taking the log of small inputs.

  For example:

  ```python
  x = tf.constant([[0., 0, 0],
                   [0, 0, 0]])

  w = tf.constant([[-1., 1, 1],
                   [1, 1, 1]])

  du.reduce_weighted_logsumexp(x, w)
  # ==> log(-1*1 + 1*1 + 1*1 + 1*1 + 1*1 + 1*1) = log(4)

  du.reduce_weighted_logsumexp(x, w, axis=0)
  # ==> [log(-1+1), log(1+1), log(1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1)
  # ==> [log(-1+1+1), log(1+1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)
  # ==> [[log(-1+1+1)], [log(1+1+1)]]

  du.reduce_weighted_logsumexp(x, w, axis=[0, 1])
  # ==> log(-1+5)
  ```

  Args:
    logx: The tensor to reduce. Should have numeric type.
    w: The weight tensor. Should have numeric type identical to `logx`.
    axis: The dimensions to reduce. If `None` (the default), reduces all
      dimensions. Must be in the range `[-rank(input_tensor),
      rank(input_tensor))`.
    keep_dims: If true, retains reduced dimensions with length 1.
    return_sign: If `True`, returns the sign of the result.
    name: A name for the operation (optional).

  Returns:
    lswe: The `log(abs(sum(weight * exp(x))))` reduced tensor.
    sign: (Optional) The sign of `sum(weight * exp(x))`.
  """
    with tf.name_scope(name or 'reduce_weighted_logsumexp'):
        logx = tf.convert_to_tensor(logx, name='logx')
        if w is None:
            lswe = tf.reduce_logsumexp(logx, axis=axis, keepdims=keep_dims)
            if return_sign:
                sgn = tf.ones_like(lswe)
                return lswe, sgn
            return lswe
        w = tf.convert_to_tensor(w, dtype=logx.dtype, name='w')
        log_absw_x = logx + tf.math.log(tf.abs(w))
        max_log_absw_x = tf.reduce_max(log_absw_x, axis=axis, keepdims=True)
        # If the largest element is `-inf` or `inf` then we don't bother subtracting
        # off the max. We do this because otherwise we'd get `inf - inf = NaN`. That
        # this is ok follows from the fact that we're actually free to subtract any
        # value we like, so long as we add it back after taking the `log(sum(...))`.
        max_log_absw_x = tf.where(tf.math.is_inf(max_log_absw_x),
                                  tf.zeros([], max_log_absw_x.dtype),
                                  max_log_absw_x)
        wx_over_max_absw_x = (tf.sign(w) * tf.exp(log_absw_x - max_log_absw_x))
        sum_wx_over_max_absw_x = tf.reduce_sum(wx_over_max_absw_x,
                                               axis=axis,
                                               keepdims=keep_dims)
        if not keep_dims:
            max_log_absw_x = tf.squeeze(max_log_absw_x, axis)
        sgn = tf.sign(sum_wx_over_max_absw_x)
        lswe = max_log_absw_x + tf.math.log(sgn * sum_wx_over_max_absw_x)
        if return_sign:
            return lswe, sgn
        return lswe
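
The first docstring example can be checked directly; a minimal sketch assuming the function above is in scope (for all-zero `logx`, `exp(logx) = 1`, so the weighted sum is simply `sum(w) = 4`):

```python
import tensorflow as tf

logx = tf.constant([[0., 0., 0.],
                    [0., 0., 0.]])
w = tf.constant([[-1., 1., 1.],
                 [1., 1., 1.]])

lswe, sign = reduce_weighted_logsumexp(logx, w, return_sign=True)
tf.debugging.assert_near(lswe, tf.math.log(4.))  # log(sum(w * exp(logx))) = log(4)
print(sign)  # ==> 1.0, the sign of sum(w * exp(logx))
```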
Example 4
  def __init__(self,
               num_timesteps,
               design_matrix,
               drift_scale,
               initial_state_prior,
               observation_noise_scale=0.,
               initial_step=0,
               validate_args=False,
               allow_nan_stats=True,
               name=None):
    """State space model for a dynamic linear regression.

    Args:
      num_timesteps: Scalar `int` `Tensor` number of timesteps to model
        with this distribution.
      design_matrix: float `Tensor` of shape `concat([batch_shape,
        [num_timesteps, num_features]])`.
      drift_scale: Scalar (any additional dimensions are treated as batch
        dimensions) `float` `Tensor` indicating the standard deviation of the
        latent state transitions.
      initial_state_prior: instance of `tfd.MultivariateNormal`
        representing the prior distribution on latent states.  Must have
        event shape `[num_features]`.
      observation_noise_scale: Scalar (any additional dimensions are
        treated as batch dimensions) `float` `Tensor` indicating the standard
        deviation of the observation noise.
        Default value: `0.`.
      initial_step: scalar `int` `Tensor` specifying the starting timestep.
        Default value: `0`.
      validate_args: Python `bool`. Whether to validate input with asserts. If
        `validate_args` is `False`, and the inputs are invalid, correct behavior
        is not guaranteed.
        Default value: `False`.
      allow_nan_stats: Python `bool`. If `False`, raise an
        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
        batch member. If `True`, batch members with valid parameters leading to
        undefined statistics will return NaN for this statistic.
        Default value: `True`.
      name: Python `str` name prefixed to ops created by this class.
        Default value: 'DynamicLinearRegressionStateSpaceModel'.

    """

    with tf.name_scope(
        name or 'DynamicLinearRegressionStateSpaceModel') as name:
      dtype = dtype_util.common_dtype(
          [design_matrix, drift_scale, initial_state_prior])

      design_matrix = tf.convert_to_tensor(
          value=design_matrix, name='design_matrix', dtype=dtype)
      design_matrix_with_time_in_first_dim = distribution_util.move_dimension(
          design_matrix, -2, 0)

      drift_scale = tf.convert_to_tensor(
          value=drift_scale, name='drift_scale', dtype=dtype)

      observation_noise_scale = tf.convert_to_tensor(
          value=observation_noise_scale,
          name='observation_noise_scale',
          dtype=dtype)

      num_features = prefer_static.shape(design_matrix)[-1]

      def observation_matrix_fn(t):
        observation_matrix = tf.linalg.LinearOperatorFullMatrix(
            tf.gather(design_matrix_with_time_in_first_dim,
                      t)[..., tf.newaxis, :], name='observation_matrix')
        return observation_matrix

      self._drift_scale = drift_scale
      self._observation_noise_scale = observation_noise_scale

      super(DynamicLinearRegressionStateSpaceModel, self).__init__(
          num_timesteps=num_timesteps,
          transition_matrix=tf.linalg.LinearOperatorIdentity(
              num_rows=num_features,
              dtype=dtype,
              name='transition_matrix'),
          transition_noise=tfd.MultivariateNormalDiag(
              scale_diag=(drift_scale[..., tf.newaxis] *
                          tf.ones([num_features], dtype=dtype)),
              name='transition_noise'),
          observation_matrix=observation_matrix_fn,
          observation_noise=tfd.MultivariateNormalDiag(
              scale_diag=observation_noise_scale[..., tf.newaxis],
              name='observation_noise'),
          initial_state_prior=initial_state_prior,
          initial_step=initial_step,
          allow_nan_stats=allow_nan_stats,
          validate_args=validate_args,
          name=name)
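
A hedged construction sketch, assuming this constructor is the one exposed as `tfp.sts.DynamicLinearRegressionStateSpaceModel` (the public path and the parameter values below are illustrative, not taken from the snippet):

```python
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions
num_timesteps, num_features = 30, 2
design_matrix = tf.random.normal([num_timesteps, num_features])

# Regression weights follow a random walk with scale 0.1; observations add
# Gaussian noise with scale 0.5.
ssm = tfp.sts.DynamicLinearRegressionStateSpaceModel(
    num_timesteps=num_timesteps,
    design_matrix=design_matrix,
    drift_scale=0.1,
    initial_state_prior=tfd.MultivariateNormalDiag(
        scale_diag=tf.ones([num_features])),
    observation_noise_scale=0.5)

y = ssm.sample(seed=42)   # shape [num_timesteps, 1]
print(ssm.log_prob(y))    # scalar log-density of the sampled series
```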
Example 5
def covariance(x,
               y=None,
               sample_axis=0,
               event_axis=-1,
               keepdims=False,
               name=None):
    """Sample covariance between observations indexed by `event_axis`.

  Given `N` samples of scalar random variables `X` and `Y`, covariance may be
  estimated as

  ```none
  Cov[X, Y] := N^{-1} sum_{n=1}^N (X_n - Xbar) Conj{(Y_n - Ybar)}
  Xbar := N^{-1} sum_{n=1}^N X_n
  Ybar := N^{-1} sum_{n=1}^N Y_n
  ```

  For vector-variate random variables `X = (X1, ..., Xd)`, `Y = (Y1, ..., Yd)`,
  one is often interested in the covariance matrix, `C_{ij} := Cov[Xi, Yj]`.

  ```python
  x = tf.random.normal(shape=(100, 2, 3))
  y = tf.random.normal(shape=(100, 2, 3))

  # cov[i, j] is the sample covariance between x[:, i, j] and y[:, i, j].
  cov = tfp.stats.covariance(x, y, sample_axis=0, event_axis=None)

  # cov_matrix[i, m, n] is the sample covariance of x[:, i, m] and y[:, i, n]
  cov_matrix = tfp.stats.covariance(x, y, sample_axis=0, event_axis=-1)
  ```

  Notice we divide by `N`, which does not create `NaN` when `N = 1`, but is
  slightly biased.

  Args:
    x:  A numeric `Tensor` holding samples.
    y:  Optional `Tensor` with same `dtype` and `shape` as `x`.
      Default value: `None` (`y` is effectively set to `x`).
    sample_axis: Scalar or vector `Tensor` designating axis holding samples, or
      `None` (meaning all axes hold samples).
      Default value: `0` (leftmost dimension).
    event_axis:  Scalar or vector `Tensor`, or `None` (scalar events).
      Axis indexing random events, whose covariance we are interested in.
      If a vector, entries must form a contiguous block of dims. `sample_axis`
      and `event_axis` should not intersect.
      Default value: `-1` (rightmost axis holds events).
    keepdims:  Boolean.  Whether to keep the sample axis as singletons.
    name: Python `str` name prefixed to Ops created by this function.
          Default value: `None` (i.e., `'covariance'`).

  Returns:
    cov: A `Tensor` of same `dtype` as the `x`, and rank equal to
      `rank(x) - len(sample_axis) + 2 * len(event_axis)`.

  Raises:
    AssertionError:  If `x` and `y` are found to have different shape.
    ValueError:  If `sample_axis` and `event_axis` are found to overlap.
    ValueError:  If `event_axis` is found to not be contiguous.
  """

    with tf.name_scope(name or 'covariance'):
        x = tf.convert_to_tensor(x, name='x')
        # Covariance *only* uses the centered versions of x (and y).
        x = x - tf.reduce_mean(x, axis=sample_axis, keepdims=True)

        if y is None:
            y = x
        else:
            y = tf.convert_to_tensor(y, name='y', dtype=x.dtype)
            # If x and y have different shape, sample_axis and event_axis will likely
            # be wrong for one of them!
            tensorshape_util.assert_is_compatible_with(x.shape, y.shape)
            y = y - tf.reduce_mean(y, axis=sample_axis, keepdims=True)

        if event_axis is None:
            return tf.reduce_mean(x * tf.math.conj(y),
                                  axis=sample_axis,
                                  keepdims=keepdims)

        if sample_axis is None:
            raise ValueError(
                'sample_axis was None, which means all axes hold samples, and this '
                'overlaps with event_axis ({})'.format(event_axis))

        event_axis = _make_positive_axis(event_axis, ps.rank(x))
        sample_axis = _make_positive_axis(sample_axis, ps.rank(x))

        # If we get lucky and axis is statically defined, we can do some checks.
        if _is_list_like(event_axis) and _is_list_like(sample_axis):
            event_axis = tuple(map(int, event_axis))
            sample_axis = tuple(map(int, sample_axis))
            if set(event_axis).intersection(sample_axis):
                raise ValueError(
                    'sample_axis ({}) and event_axis ({}) overlapped'.format(
                        sample_axis, event_axis))
            if (np.diff(np.array(sorted(event_axis))) > 1).any():
                raise ValueError(
                    'event_axis must be contiguous. Found: {}'.format(
                        event_axis))
            batch_axis = list(
                sorted(
                    set(range(tensorshape_util.rank(
                        x.shape))).difference(sample_axis + event_axis)))
        else:
            batch_axis = ps.setdiff1d(ps.range(0, ps.rank(x)),
                                      ps.concat((sample_axis, event_axis), 0))

        event_axis = ps.cast(event_axis, dtype=tf.int32)
        sample_axis = ps.cast(sample_axis, dtype=tf.int32)
        batch_axis = ps.cast(batch_axis, dtype=tf.int32)

        # Permute x/y until shape = B + E + S
        perm_for_xy = ps.concat((batch_axis, event_axis, sample_axis), 0)
        x_permed = tf.transpose(a=x, perm=perm_for_xy)
        y_permed = tf.transpose(a=y, perm=perm_for_xy)

        batch_ndims = ps.size(batch_axis)
        batch_shape = ps.shape(x_permed)[:batch_ndims]
        event_ndims = ps.size(event_axis)
        event_shape = ps.shape(x_permed)[batch_ndims:batch_ndims + event_ndims]
        sample_shape = ps.shape(x_permed)[batch_ndims + event_ndims:]
        sample_ndims = ps.size(sample_shape)
        n_samples = ps.reduce_prod(sample_shape)
        n_events = ps.reduce_prod(event_shape)

        # Flatten sample_axis into one long dim.
        x_permed_flat = tf.reshape(
            x_permed, ps.concat((batch_shape, event_shape, [n_samples]), 0))
        y_permed_flat = tf.reshape(
            y_permed, ps.concat((batch_shape, event_shape, [n_samples]), 0))
        # Do the same for event_axis.
        x_permed_flat = tf.reshape(
            x_permed, ps.concat((batch_shape, [n_events], [n_samples]), 0))
        y_permed_flat = tf.reshape(
            y_permed, ps.concat((batch_shape, [n_events], [n_samples]), 0))

        # After matmul, cov.shape = batch_shape + [n_events, n_events]
        cov = tf.matmul(x_permed_flat, y_permed_flat,
                        adjoint_b=True) / ps.cast(n_samples, x.dtype)

        # Insert some singletons to make
        # cov.shape = batch_shape + event_shape**2 + [1,...,1]
        # This is just like x_permed.shape, except the sample_axis is all 1's, and
        # the [n_events] became event_shape**2.
        cov = tf.reshape(
            cov,
            ps.concat(
                (
                    batch_shape,
                    # event_shape**2 used here because it is the same length as
                    # event_shape, and has the same number of elements as one
                    # batch of covariance.
                    event_shape**2,
                    ps.ones([sample_ndims], tf.int32)),
                0))
        # Permuting by the argsort inverts the permutation, making
        # cov.shape have ones in the position where there were samples, and
        # [n_events * n_events] in the event position.
        cov = tf.transpose(a=cov, perm=ps.invert_permutation(perm_for_xy))

        # Now expand event_shape**2 into event_shape + event_shape.
        # We here use (for the first time) the fact that we require event_axis to be
        # contiguous.
        e_start = event_axis[0]
        e_len = 1 + event_axis[-1] - event_axis[0]
        cov = tf.reshape(
            cov,
            ps.concat((ps.shape(cov)[:e_start], event_shape, event_shape,
                       ps.shape(cov)[e_start + e_len:]), 0))

        # tf.squeeze requires python ints for axis, not Tensor.  This is enough to
        # require our axis args to be constants.
        if not keepdims:
            squeeze_axis = ps.where(sample_axis < e_start, sample_axis,
                                    sample_axis + e_len)
            cov = _squeeze(cov, axis=squeeze_axis)

        return cov
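
The docstring example above extends naturally to a shape check; this sketch uses the same `tfp.stats.covariance` call pattern shown there:

```python
import tensorflow as tf
import tensorflow_probability as tfp

x = tf.random.normal(shape=(100, 2, 3))
y = tf.random.normal(shape=(100, 2, 3))

# Scalar events: one covariance per (i, j) position.
cov_elementwise = tfp.stats.covariance(x, y, sample_axis=0, event_axis=None)
print(cov_elementwise.shape)  # ==> (2, 3)

# Vector events along the last axis: a 3x3 covariance matrix per batch member.
cov_matrix = tfp.stats.covariance(x, y, sample_axis=0, event_axis=-1)
print(cov_matrix.shape)       # ==> (2, 3, 3)
```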
Example 6
def value(self):
    return tf.convert_to_tensor(42.)
Example 7
def __init__(self, input_):
    self._input = input_
    self.value = tf.convert_to_tensor([[42.]])
Example 8
    def __init__(self,
                 leverage_fn,
                 variance_process,
                 risk_free_rate=None,
                 dividend_yield=None,
                 rho=None,
                 dtype=None,
                 name=None):
        """Initializes the Local stochastic volatility model.

    Args:
      leverage_fn: A Python callable which returns the Leverage function
        `L(t, S(t))` as a function of state and time. The function must accept
        a scalar `Tensor` corresponding to time 't' and a real `Tensor` of shape
        `[num_samples, 1]` corresponding to the underlying price (S) as
        inputs and return a real `Tensor` containing the leverage function
        computed at (S,t).
      variance_process: An instance of `ItoProcess` specifying the
        dynamics of the variance process of the LSV model. The
        `variance_process` should implement a one-factor stochastic process.
        For the common Heston-like variance model, use `LSVVarianceModel`.
      risk_free_rate: An optional scalar real `Tensor` specifying the
        (continuously compounded) risk free interest rate. If the underlying is
        an FX rate, then use this input to specify the domestic interest rate.
        Note that the current implementation supports constant interest rates
        and dividend yield.
        Default value: `None` in which case the input is set to zero.
      dividend_yield: An optional real scalar `Tensor` specifying the
        (continuously compounded) dividend yield. If the underlying is an FX
        rate, then use this input to specify the foreign interest rate.
        Note that the current implementation supports constant interest rates
        and dividend yield.
        Default value: `None` in which case the input is set to zero.
      rho: A real scalar `Tensor` specifying the correlation between the
        underlying spot price and the variance process.
        Default value: `None` in which case cross correlations are assumed
        to be zero.
      dtype: The default dtype to use when converting values to `Tensor`s.
        Default value: `None` which means that default dtypes inferred by
        TensorFlow are used.
      name: Python string. The name to give to the ops created by this class.
        Default value: `None` which maps to the default name
        `local_stochastic_volatility_model`.
    """
        self._name = name or "local_stochastic_volatility_model"
        with tf.name_scope(self._name):
            if risk_free_rate is None:
                risk_free_rate = 0.0
            if dividend_yield is None:
                dividend_yield = 0.0
            self._risk_free_rate = tf.convert_to_tensor(risk_free_rate,
                                                        dtype=dtype)
            self._dtype = dtype or self._risk_free_rate.dtype
            self._dividend_yield = tf.convert_to_tensor(dividend_yield,
                                                        dtype=dtype)
            self._leverage_fn = leverage_fn
            self._variance_process = variance_process
            dim = 1 + variance_process.dim()
            rho = rho or 0.0
            self._rho = _create_corr_matrix(rho, self._dtype)
            self._sqrt_rho = tf.linalg.cholesky(self._rho)

            def _vol_fn(t, state):
                """Volatility function of LSV model."""
                num_samples = state.shape.as_list()[0]
                broadcasted_t = tf.broadcast_to(t, [1, num_samples])
                spot_prices = state[:, 0]
                variance = state[:, 1:]
                level_fun = self._leverage_fn(
                    broadcasted_t, tf.expand_dims(spot_prices, axis=0))
                spot_diffusion = tf.expand_dims(
                    level_fun[0, :], axis=-1) * tf.expand_dims(
                        spot_prices, axis=-1) * tf.math.sqrt(variance)
                variance_diffusion = self._variance_process.volatility_fn()(
                    t, variance)
                diffusion = tf.concat([spot_diffusion, variance_diffusion],
                                      axis=1)
                diffusion = tf.expand_dims(diffusion, axis=-2)
                return diffusion * self._sqrt_rho

            # Drift function
            def _drift_fn(t, state):
                """Drift function of LSV model."""
                spot_drift = (self._risk_free_rate -
                              self._dividend_yield) * state[:, :1]
                variance_drift = self._variance_process.drift_fn()(t,
                                                                   state[:,
                                                                         1:])
                return tf.concat([spot_drift, variance_drift], axis=1)

            super(LocalStochasticVolatilityModel,
                  self).__init__(dim, _drift_fn, _vol_fn, self._dtype,
                                 self._name)
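
A minimal sketch of a `leverage_fn` matching the call signature documented above (the name is illustrative). With a constant leverage of one, the spot diffusion in `_vol_fn` reduces to `S * sqrt(v)`, i.e. the pure stochastic volatility dynamics of `variance_process`:

```python
import tensorflow as tf

def constant_leverage_fn(t, spot):
  """Constant leverage L(t, S) = 1; the output shape follows the `spot` input."""
  del t  # A constant leverage function ignores time.
  return tf.ones_like(spot)
```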
Example 9
def expected_calibration_error(num_bins,
                               logits=None,
                               labels_true=None,
                               labels_predicted=None,
                               name=None):
    """Compute the Expected Calibration Error (ECE).

  This method implements equation (3) in [1].  In this equation the probability
  of the decided label being correct is used to estimate the calibration
  property of the predictor.

  Note: a trade-off exists between using a small number of `num_bins` and the
  estimation reliability of the ECE.  In particular, this method may produce
  unreliable ECE estimates in case there are few samples available in some bins.
  As an alternative to this method, consider also using
  `bayesian_expected_calibration_error`.

  #### References
  [1]: Chuan Guo, Geoff Pleiss, Yu Sun, Kilian Q. Weinberger,
       On Calibration of Modern Neural Networks.
       Proceedings of the 34th International Conference on Machine Learning
       (ICML 2017).
       arXiv:1706.04599
       https://arxiv.org/pdf/1706.04599.pdf

  Args:
    num_bins: int, number of probability bins, e.g. 10.
    logits: Tensor, (n, nlabels), with logits for n instances and nlabels
      classes.
    labels_true: Tensor, (n,), with tf.int32 or tf.int64 elements containing
      ground truth class labels in the range [0, nlabels).
    labels_predicted: Tensor, (n,), with tf.int32 or tf.int64 elements
      containing decisions of the predictive system.  If `None`, we will use
      the argmax decision using the `logits`.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    ece: Tensor, scalar, tf.float32.
  """
    with tf.name_scope(name or 'expected_calibration_error'):
        logits = tf.convert_to_tensor(logits)
        labels_true = tf.convert_to_tensor(labels_true)
        if labels_predicted is not None:
            labels_predicted = tf.convert_to_tensor(labels_predicted)

        # Compute empirical counts over the events defined by the sets
        # {incorrect,correct}x{0,1,..,num_bins-1}, as well as the empirical averages
        # of predicted probabilities in each probability bin.
        event_bin_counts, pmean_observed = _compute_calibration_bin_statistics(
            num_bins,
            logits=logits,
            labels_true=labels_true,
            labels_predicted=labels_predicted)

        # Compute the marginal probability of observing a probability bin.
        event_bin_counts = tf.cast(event_bin_counts, tf.float32)
        bin_n = tf.reduce_sum(event_bin_counts, axis=0)
        pbins = bin_n / tf.reduce_sum(
            bin_n)  # Compute the marginal bin probability

        # Compute the marginal probability of making a correct decision given an
        # observed probability bin.
        tiny = np.finfo(np.float32).tiny
        pcorrect = event_bin_counts[1, :] / (bin_n + tiny)

        # Compute the ECE statistic as defined in reference [1].
        ece = tf.reduce_sum(pbins * tf.abs(pcorrect - pmean_observed))

    return ece
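
A small sanity check, assuming the function above and its `_compute_calibration_bin_statistics` helper are in scope: a classifier that is always confident and always correct should have an ECE close to zero.

```python
import tensorflow as tf

# Three instances, two classes; near-deterministic logits, all predictions correct.
logits = tf.constant([[10., -10.],
                      [-10., 10.],
                      [10., -10.]])
labels_true = tf.constant([0, 1, 0], dtype=tf.int32)

ece = expected_calibration_error(num_bins=10, logits=logits, labels_true=labels_true)
print(ece)  # ==> ~0.0 (confidence matches accuracy)
```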
Example 10
    def from_volatility_surface(cls,
                                implied_volatility_surface,
                                variance_process,
                                initial_spot,
                                initial_variance,
                                rho=None,
                                risk_free_rate=None,
                                dividend_yield=None,
                                time_step=None,
                                num_grid_points=None,
                                grid_minimums=None,
                                grid_maximums=None,
                                dtype=None):
        """Creates a `LocalStochasticVolatilityModel` from volatility surface.

    This function computes the leverage function for the LSV model by first
    computing the joint probability density function `p(t, X(t), v(t))` where
    `X(t)` is the log of the spot price and `v(t)` is the variance at time `t`.
    The joint probability density is computed using the Fokker-Planck equation of
    the LSV model (see 6.8.2 in Ref [1]):
    ```None
    dp/dt = 1/2 d^2 [v L(t,X)^2 p]/dX^2 + 1/2 d^2 [b(v)^2 p]/dv^2 +
            rho d^2 [sqrt(v)L(t,X)b(v) p]/dXdv -
            d[(r - d - 1/2 v L(t,X)^2)p]/dX -
            d[a(v) p]/dv
    ```

    where `a(v)` and `b(v)` are the drift and diffusion functions for the
    variance process. Defining

    ```None
    I_n(k,t) = int v^n p(t, k, v) dv
    ```

    we can calculate the leverage function as follows:
    ```None
    L(k, t) = sigma(exp(k), t) sqrt(I_0(k, t)/I_1(k, t)).
    ```

    Args:
      implied_volatility_surface: Either an instance of
        `processed_market_data.VolatilitySurface` or a Python object containing
        the implied volatility market data. If the input is a Python object,
        then the object must implement a function `volatility(strike,
        expiry_times)` which takes real `Tensor`s corresponding to option
        strikes and time to expiry and returns a real `Tensor` containing the
        corresponding market implied volatility.
      variance_process: An instance of `LSVVarianceModel` or
        `ItoProcess` specifying the dynamics of the variance process of
        the LSV model.
      initial_spot: A real scalar `Tensor` specifying the underlying spot price
        on the valuation date.
      initial_variance: A real scalar `Tensor` specifying the initial variance
        on the valuation date.
      rho: A real scalar `Tensor` specifying the correlation between spot price
        and the stochastic variance.
      risk_free_rate: A real scalar `Tensor` specifying the (continuously
        compounded) risk free interest rate. If the underlying is an FX rate,
        then use this input to specify the domestic interest rate.
        Default value: `None` in which case the input is set to zero.
      dividend_yield: A real scalar `Tensor` specifying the (continuously
        compounded) dividend yield. If the underlying is an FX rate, then use
        this input to specify the foreign interest rate.
        Default value: `None` in which case the input is set to zero.
      time_step: An optional real scalar `Tensor` specifying the time step
        during the numerical solution of the Fokker-Planck PDE.
        Default value: None, in which case `time_step` corresponding to 100 time
          steps is used.
      num_grid_points: A scalar integer `Tensor` specifying the number of
        discretization points for each spatial dimension.
        Default value: None, in which case number of grid points is set to 100.
      grid_minimums: An optional `Tensor` of size 2 containing the minimum grid
        points for PDE spatial discretization. `grid_minimums[0]` corresponds
        to the minimum spot price in the spatial grid and `grid_minimums[1]`
        corresponds to the minimum variance value.
      grid_maximums: An optional `Tensor` of size 2 containing the maximum grid
        points for PDE spatial discretization. `grid_maximums[0]` corresponds
        to the maximum spot price in the spatial grid and `grid_maximums[1]`
        corresponds to the maximum variance value.
      dtype: The default dtype to use when converting values to `Tensor`s.
        Default value: `None` which means that default dtypes inferred by
          TensorFlow are used.

    Returns:
      An instance of `LocalStochasticVolatilityModel` constructed using the
      input data.
    """

        if risk_free_rate is None:
            discount_factor_fn = lambda t: tf.ones_like(t, dtype=dtype)
        else:
            r = tf.convert_to_tensor(risk_free_rate, dtype=dtype)
            discount_factor_fn = lambda t: tf.math.exp(-r * t)
        lv_model = lvm.LocalVolatilityModel.from_volatility_surface(
            dim=1,
            spot=initial_spot,
            implied_volatility_surface=implied_volatility_surface,
            discount_factor_fn=discount_factor_fn,
            dividend_yield=dividend_yield,
            dtype=dtype)

        dtype = dtype or lv_model.dtype()
        day_count_fn = utils.get_daycount_fn(
            implied_volatility_surface.daycount_convention)
        max_time = tf.math.reduce_max(
            day_count_fn(
                start_date=implied_volatility_surface.settlement_date(),
                end_date=implied_volatility_surface.node_expiries()))
        if time_step is None:
            time_step = max_time / 100.0

        rho = rho or 0.0
        num_grid_points = num_grid_points or 100

        leverage_fn = _leverage_function_using_pde(
            risk_free_rate=risk_free_rate,
            dividend_yield=dividend_yield,
            lv_model=lv_model,
            variance_model=variance_process,
            rho=[rho],
            initial_spot=initial_spot,
            initial_variance=initial_variance,
            time_step=time_step,
            max_time=max_time,
            num_grid_points=num_grid_points,
            grid_minimums=grid_minimums,
            grid_maximums=grid_maximums,
            dtype=dtype)
        return LocalStochasticVolatilityModel(leverage_fn,
                                              variance_process,
                                              risk_free_rate=risk_free_rate,
                                              dividend_yield=dividend_yield,
                                              rho=rho,
                                              dtype=dtype)
Example 11
def _leverage_function_using_pde(*, risk_free_rate, dividend_yield, lv_model,
                                 variance_model, rho, initial_spot,
                                 initial_variance, max_time, time_step,
                                 num_grid_points, grid_minimums, grid_maximums,
                                 dtype):
    """Computes Leverage function using Fokker-Planck PDE for joint density.

  This function computes the leverage function for the LSV model by first
  computing the joint probability density function `p(t, X(t), v(t))` where
  `X(t)` is the log of the spot price and `v(t)` is the variance at time `t`.
  The joint probability density is computed using the Fokker-Planck equation of
  the LSV model (see 6.8.2 in Ref [1]):
  ```None
  dp/dt = 1/2 d^2 [v L(t,X)^2 p]/dX^2 + 1/2 d^2 [b(v)^2 p]/dv^2 +
          rho d^2 [sqrt(v)L(t,X)b(v) p]/dXdv - d[(r - d - 1/2 v L(t,X)^2)p]/dX -
          d[a(v) p]/dv
  ```

  where `a(v)` and `b(v)` are the drift and diffusion functions for the
  variance process. Defining

  ```None
  I_n(k,t) = int v^n p(t, k, v) dv
  ```

  we can calculate the leverage function as follows:
  ```None
  L(k, t) = sigma(exp(k), t) sqrt(I_0(k, t)/I_1(k, t)).
  ```

  Args:
    risk_free_rate: A scalar real `Tensor` specifying the (continuously
      compounded) risk free interest rate. If the underlying is an FX rate, then
      use this input to specify the domestic interest rate.
    dividend_yield: A real scalar `Tensor` specifying the (continuously
      compounded) dividend yield. If the underlying is an FX rate, then use this
      input to specify the foreign interest rate.
    lv_model: An instance of `LocalVolatilityModel` specifying the local
      volatility for the spot price.
    variance_model: An instance of `LSVVarianceModel` specifying the dynamics of
      the variance process of the LSV model.
    rho: A real scalar `Tensor` specifying the correlation between spot price
      and the stochastic variance.
    initial_spot: A real scalar `Tensor` specifying the underlying spot price on
      the valuation date.
    initial_variance: A real scalar `Tensor` specifying the initial variance on
      the valuation date.
    max_time: A real scalar `Tensor` specifying the maximum time to which the
      Fokker-Planck PDE is evolved.
    time_step: A real scalar `Tensor` specifying the time step during the
      numerical solution of the Fokker-Planck PDE.
    num_grid_points: A scalar integer `Tensor` specifying the number of
      discretization points for each spatial dimension.
    grid_minimums: An optional `Tensor` of size 2 containing the minimum grid
      points for PDE spatial discretization. `grid_minimums[0]` corresponds
      to the minimum spot price in the spatial grid and `grid_minimums[1]`
      corresponds to the minimum variance value.
    grid_maximums: An optional `Tensor` of size 2 containing the maximum grid
      points for PDE spatial discretization. `grid_maximums[0]` corresponds
      to the maximum spot price in the spatial grid and `grid_maximums[1]`
      corresponds to the maximum variance value.
    dtype: The default dtype to use when converting values to `Tensor`s.

  Returns:
    A Python callable which computes the Leverage function `L(t, S(t))`. The
    function accepts a scalar `Tensor` corresponding to time 't' and a real
    `Tensor` of shape `[num_samples, 1]` corresponding to the spot price (S) as
    inputs and returns a real `Tensor` corresponding to the leverage function
    computed at (S,t).

  """
    if variance_model.dim() > 1:
        raise ValueError(
            "The default model of Leverage function doesn\'t support "
            "the variance process with more than 1 factor.")

    pde_grid_tol = _machine_eps(dtype)
    rho = tf.convert_to_tensor(rho, dtype=dtype)
    initial_spot = tf.convert_to_tensor(initial_spot, dtype=dtype)
    initial_log_spot = tf.math.log(
        tf.convert_to_tensor(initial_spot, dtype=dtype))
    initial_variance = tf.convert_to_tensor(initial_variance, dtype=dtype)
    risk_free_rate = tf.convert_to_tensor(risk_free_rate, dtype=dtype)
    dividend_yield = tf.convert_to_tensor(dividend_yield, dtype=dtype)
    rho = tf.convert_to_tensor(rho, dtype=dtype)

    x_scale = initial_log_spot
    y_scale = initial_variance
    # scaled log spot = log(spot/initial_spot)
    # scaled variance = variance / initial_variance
    scaled_initial_point = tf.convert_to_tensor([0.0, 1.0], dtype=dtype)

    # These are minimums and maximums for scaled log spot and scaled variance
    if grid_minimums is None:
        grid_minimums = [0.01, 0.0001]
    else:
        grid_minimums = tf.convert_to_tensor(grid_minimums, dtype=dtype)
        grid_minimums = [
            grid_minimums[0] / initial_spot,
            grid_minimums[1] / initial_variance
        ]
    if grid_maximums is None:
        grid_maximums = [10.0, 5.0]
    else:
        grid_maximums = tf.convert_to_tensor(grid_maximums, dtype=dtype)
        grid_maximums = [
            grid_maximums[0] / initial_spot,
            grid_maximums[1] / initial_variance
        ]

    log_spot_min = tf.math.log(
        tf.convert_to_tensor([grid_minimums[0]], dtype=dtype))
    log_spot_max = tf.math.log(
        tf.convert_to_tensor([grid_maximums[0]], dtype=dtype))
    variance_min = tf.convert_to_tensor([grid_minimums[1]], dtype=dtype)
    variance_max = tf.convert_to_tensor([grid_maximums[1]], dtype=dtype)

    grid_minimums = tf.concat([log_spot_min, variance_min], axis=0)
    grid_maximums = tf.concat([log_spot_max, variance_max], axis=0)

    grid = _tavella_randell_nonuniform_grid(grid_minimums, grid_maximums,
                                            scaled_initial_point,
                                            num_grid_points, 0.3, dtype)
    grid = [tf.expand_dims(grid[0], axis=0), tf.expand_dims(grid[1], axis=0)]

    delta_x = tf.math.reduce_min(grid[0][0, 1:] - grid[0][0, :-1])
    delta_y = tf.math.reduce_min(grid[1][0, 1:] - grid[1][0, :-1])
    # Initialize leverage function L(t=0, S) = 1
    leverage_fn = functools.partial(linear.interpolate,
                                    x_data=[[0.0, 1.0]],
                                    y_data=[[1.0, 1.0]],
                                    dtype=dtype)

    def _initial_value():
        """Computes initial value as a delta function delta(log_spot(t), var(0))."""
        log_spot, variance = tf.meshgrid(*grid)

        init_value = tf.where(
            tf.math.logical_and(
                tf.math.abs(log_spot - scaled_initial_point[0]) <
                delta_x + pde_grid_tol,
                tf.math.abs(variance - scaled_initial_point[1]) <
                delta_y + pde_grid_tol), 1.0 / (delta_x * delta_y * 4), 0.0)
        # initial_value.shape = (1, num_grid_x, num_grid_y)
        return tf.expand_dims(init_value, axis=0)

    def _second_order_coeff_fn(t, grid):
        log_spot = grid[0] + x_scale
        variance = grid[1] * y_scale
        leverage_fn_t_x = leverage_fn(log_spot)
        val_xx = 0.5 * variance * leverage_fn_t_x**2
        val_xy = 0.5 * (rho * tf.math.sqrt(variance) * leverage_fn_t_x *
                        variance_model.volatility_fn()(t, variance)) / y_scale
        val_yx = val_xy
        val_yy = 0.5 * variance_model.volatility_fn()(t,
                                                      variance)**2 / y_scale**2
        # return list of shape = (2,2). Each element has shape = grid.shape
        return [[-val_yy, -val_yx], [-val_xy, -val_xx]]

    def _first_order_coeff_fn(t, grid):
        log_spot = grid[0] + x_scale
        variance = grid[1] * y_scale
        leverage_fn_t_x = leverage_fn(log_spot)
        val_x = (risk_free_rate - dividend_yield -
                 0.5 * variance * leverage_fn_t_x**2)
        val_y = variance_model.drift_fn()(t, variance)
        # return list of shape = (2,). Each element has shape = grid.shape
        return [val_y / y_scale, val_x]

    def _compute_leverage_fn(t, coord_grid, value_grid):
        log_spot = tf.expand_dims(coord_grid[0], axis=-1) + x_scale
        local_volatility_values = lv_model.local_volatility_fn()(
            t, tf.math.exp(log_spot))
        # TODO(b/176826650): Large values represent instability. Eventually this
        # should be addressed inside local vol model.
        local_volatility_values = tf.where(
            tf.math.abs(local_volatility_values) > 1e4, 0.0,
            local_volatility_values)
        # variance_given_logspot.shape = (num_grid_x, 1)
        variance_given_logspot = _conditional_expected_variance_from_pde_solution(
            [coord_grid[0] + x_scale, coord_grid[1] * y_scale], value_grid,
            dtype)(log_spot)

        leverage_fn_values = tf.math.divide_no_nan(
            local_volatility_values, tf.math.sqrt(variance_given_logspot))

        leverage_fn = functools.partial(
            linear.interpolate,
            x_data=grid[0] + x_scale,
            y_data=tf.transpose(leverage_fn_values),
            dtype=dtype)

        return leverage_fn

    @pde.boundary_conditions.neumann
    def _trivial_neumann_boundary(t, location_grid):
        del t, location_grid
        return 0.0

    leverage_fn_values = []
    leverage_fn_values.append(leverage_fn(grid[0][0])[0])
    # joint_density.shape = (1, num_grid_x, num_grid_y)
    joint_density = _initial_value()

    for tstart in np.arange(0.0, max_time, time_step):
        joint_density, coord_grid, _, _ = pde.fd_solvers.solve_forward(
            tstart,
            tstart + time_step,
            coord_grid=[grid[0][0], grid[1][0]],
            values_grid=joint_density,
            time_step=time_step / 10.0,
            values_transform_fn=None,
            inner_second_order_coeff_fn=_second_order_coeff_fn,
            inner_first_order_coeff_fn=_first_order_coeff_fn,
            zeroth_order_coeff_fn=None,
            boundary_conditions=[[
                _trivial_neumann_boundary, _trivial_neumann_boundary
            ], [_trivial_neumann_boundary, _trivial_neumann_boundary]],
            dtype=dtype)
        joint_density = tf.math.maximum(joint_density, 0.0)
        area_under_joint_density = _two_d_integration(
            [grid[0][0, :], grid[1][0, :]], joint_density)
        joint_density = joint_density / area_under_joint_density

        # TODO(b/176826743): Perform fixed point iteration instead of one step
        # update
        leverage_fn = _compute_leverage_fn(
            tf.convert_to_tensor(tstart + time_step), coord_grid,
            joint_density)
        leverage_fn_values.append(leverage_fn(grid[0][0, :] + x_scale)[0, :])

    # leverage_fn_values.shape = (num_pde_timesteps, num_grid_x,)
    leverage_fn_values = tf.convert_to_tensor(leverage_fn_values, dtype=dtype)
    times = tf.range(0.0, max_time + time_step, time_step, dtype=dtype)

    def _return_fn(t, spot):
        leverage_fn_interpolator = (
            math.interpolation.interpolation_2d.Interpolation2D(
                x_data=[times],
                y_data=tf.expand_dims(tf.repeat(grid[0] + x_scale,
                                                times.shape[0],
                                                axis=0),
                                      axis=0),
                z_data=tf.expand_dims(leverage_fn_values, axis=0),
                dtype=dtype))
        return leverage_fn_interpolator.interpolate(t, tf.math.log(spot))

    return _return_fn
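
For intuition on the moments `I_0` and `I_1` used in the leverage formula of the docstring, here is a hedged numerical sketch with a placeholder density slice and a hand-rolled trapezoidal rule; all names and values are illustrative:

```python
import tensorflow as tf

def _trapz(y, x):
  """Trapezoidal rule for a 1-D integrand `y` sampled at points `x`."""
  dx = x[1:] - x[:-1]
  return tf.reduce_sum(0.5 * (y[1:] + y[:-1]) * dx)

v = tf.linspace(0.01, 2.0, 200)          # variance grid
p_kv = tf.exp(-(v - 0.5)**2 / 0.02)      # placeholder density slice p(t, k, .)

i0 = _trapz(p_kv, v)                     # I_0(k, t) = int p(t, k, v) dv
i1 = _trapz(v * p_kv, v)                 # I_1(k, t) = int v p(t, k, v) dv

local_vol_at_k = 0.2                     # placeholder sigma(exp(k), t)
leverage_at_k = local_vol_at_k * tf.sqrt(i0 / i1)
```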
Example 12
def update(value_and_gradients_function,
           val_left,
           val_right,
           val_trial,
           f_lim,
           active=None):
    """Squeezes a bracketing interval containing the minimum.

  Given an interval which brackets a minimum and a point in that interval,
  finds a smaller nested interval which also brackets the minimum. If the
  supplied point does not lie in the bracketing interval, the current interval
  is returned.

  The following description is given in terms of individual points evaluated on
  a line function to be minimized. Note, however, that the implementation also
  accepts batches of points, allowing multiple line functions to be minimized at
  once. See details in the docstring of `value_and_gradients_function` below.

  The requirement of the interval bracketing a minimum is expressed through the
  opposite slope conditions. Assume the left end point is 'a', the right
  end point is 'b', the function to be minimized is 'f' and the derivative is
  'df'. The update procedure relies on the following conditions being satisfied:

  ```None
    f(a) <= f(0) + epsilon   (1)
    df(a) < 0                (2)
    df(b) > 0                (3)
  ```

  In the first condition, epsilon is a small positive constant. The condition
  demands that the function at the left end point be not much bigger than the
  starting point (i.e. 0). This is an easy to satisfy condition because by
  assumption, we are in a direction where the function value is decreasing.
  The second and third conditions together demand that there is at least one
  zero of the derivative in between a and b.

  In addition to the interval, the update algorithm requires a third point to
  be supplied. Usually, this point would lie within the interval [a, b]. If the
  point is outside this interval, the current interval is returned. If the
  point lies within the interval, the behaviour of the function and derivative
  value at this point is used to squeeze the original interval in a manner that
  preserves the opposite slope conditions.

  For further details of this component, see the procedure U0-U3 on page 123 of
  the [Hager and Zhang (2006)][2] article.

  Note that this function does not explicitly verify whether the opposite slope
  conditions are satisfied for the supplied interval. It is assumed that this
  is so.

  Args:
    value_and_gradients_function: A Python callable that accepts a real scalar
      tensor and returns an object that can be converted to a namedtuple.
      The namedtuple should have fields 'f' and 'df' that correspond to scalar
      tensors of real dtype containing the value of the function and its
      derivative at that point. The other namedtuple fields, if present,
      should be tensors or sequences (possibly nested) of tensors.
      In usual optimization application, this function would be generated by
      projecting the multivariate objective function along some specific
      direction. The direction is determined by some other procedure but should
      be a descent direction (i.e. the derivative of the projected univariate
      function must be negative at 0.).
      Alternatively, the function may represent the batching of `n` such line
      functions (e.g. projecting a single multivariate objective function along
      `n` distinct directions at once) accepting n points as input, i.e. a
      tensor of shape [n], and the fields 'f' and 'df' in the returned
      namedtuple should each be a tensor of shape [n], with the corresponding
      function values and derivatives at the input points.
    val_left: Return value of value_and_gradients_function at the left
      end point of the bracketing interval (labelled 'a' above).
    val_right: Return value of value_and_gradients_function at the right
      end point of the bracketing interval (labelled 'b' above).
    val_trial: Return value of value_and_gradients_function at the trial point
      to be used to shrink the interval (labelled 'c' above).
    f_lim: real `Tensor` of shape [n]. The function value threshold for
      the approximate Wolfe conditions to be checked for each batch member.
    active: optional boolean `Tensor` of shape [n]. Relevant in batching mode
      only, indicates batch members on which the update procedure should be
      applied. On non-active members the current left/right interval is returned
      unmodified.

  Returns:
    A namedtuple containing the following fields:
      iteration: An int32 scalar `Tensor`. The number of iterations performed
        by the bisect algorithm.
      stopped: A boolean `Tensor` of shape [n]. True for those batch members
        where the bisection algorithm terminated.
      failed: A boolean `Tensor` of shape [n]. True for those batch members
        where an error was encountered.
      num_evals: An int32 scalar `Tensor`. The number of times the objective
        function was evaluated.
      left: Return value of value_and_gradients_function at the updated left
        end point of the interval found.
      right: Return value of value_and_gradients_function at the updated right
        end point of the interval found.
  """
    # We should only update if the trial point is within the interval.
    within_range = (val_left.x < val_trial.x) & (val_trial.x < val_right.x)
    if active is not None:
        within_range = within_range & active

    # The new point is a valid left end point if it has negative slope
    # and the value at the point is not too large.
    valid_left = (val_trial.df < 0) & (val_trial.f <= f_lim)

    # If the trial point has a negative slope but the value at that point
    # is too high, bisect can narrow down an interval between the current left
    # and the trial point.
    needs_bisect = within_range & (val_trial.df < 0) & (val_trial.f > f_lim)

    # Note that if `~valid_left` it is because either:
    # - the slope at the trial point is positive, so it is a valid right
    #   point, or
    # - the needs_bisect condition is true.
    # In both cases we want to keep the current left and replace right
    # with the trial point.
    left = val_where(within_range & valid_left, val_trial, val_left)
    right = val_where(within_range & ~valid_left, val_trial, val_right)

    bisect_args = _IntermediateResult(
        iteration=tf.convert_to_tensor(0),
        stopped=~needs_bisect,
        failed=tf.zeros_like(within_range),  # i.e. all false.
        num_evals=tf.convert_to_tensor(0),
        left=left,
        right=right)
    return _bisect(value_and_gradients_function, bisect_args, f_lim)
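
The snippet below is a hypothetical illustration of the interface `update` expects from `value_and_gradients_function`: a callable returning a namedtuple with fields `x`, `f` and `df` (field names follow the docstring and the attribute accesses in the body above).

```python
import collections
import tensorflow as tf

ValueAndGradient = collections.namedtuple('ValueAndGradient', ['x', 'f', 'df'])

def quadratic_line(t):
  """f(t) = (t - 1)^2 along the search direction, with derivative 2 * (t - 1)."""
  t = tf.convert_to_tensor(t, dtype=tf.float32)
  return ValueAndGradient(x=t, f=(t - 1.)**2, df=2. * (t - 1.))

# The interval [0, 2] brackets the minimum at t = 1 and satisfies the opposite
# slope conditions: df(0) = -2 < 0 and df(2) = 2 > 0.
val_left, val_right = quadratic_line(0.), quadratic_line(2.)
```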
Example 13
def minimize(objective_function,
             initial_simplex=None,
             initial_vertex=None,
             step_sizes=None,
             objective_at_initial_simplex=None,
             objective_at_initial_vertex=None,
             batch_evaluate_objective=False,
             func_tolerance=1e-8,
             position_tolerance=1e-8,
             parallel_iterations=1,
             max_iterations=None,
             reflection=None,
             expansion=None,
             contraction=None,
             shrinkage=None,
             name=None):
    """Minimum of the objective function using the Nelder Mead simplex algorithm.

  Performs an unconstrained minimization of a (possibly non-smooth) function
  using the Nelder Mead simplex method. The Nelder Mead method does not support
  univariate functions, so the dimension of the domain must be 2 or greater.
  For details of the algorithm, see
  [Press, Teukolsky, Vetterling and Flannery(2007)][1].

  Points in the domain of the objective function may be represented as a
  `Tensor` of general shape but with rank at least 1. The algorithm proceeds
  by modifying a full rank simplex in the domain. The initial simplex may
  either be specified by the user or can be constructed using a single vertex
  supplied by the user. In the latter case, if `v0` is the supplied vertex,
  the simplex is the convex hull of the set:

  ```None
  S = {v0} + {v0 + step_i * e_i}
  ```

  Here `e_i` is a vector which is `1` along the `i`-th axis and zero elsewhere
  and `step_i` is a characteristic length scale along the `i`-th axis. If the
  step size is not supplied by the user, a unit step size is used in every axis.
  Alternately, a single step size may be specified which is used for every
  axis. The most flexible option is to supply a bespoke step size for every
  axis.

  ### Usage:

  The following example demonstrates the usage of the Nelder Mead minimization
  on a two dimensional problem with the minimum located at a non-differentiable
  point.

  ```python
    # The objective function
    def sqrt_quadratic(x):
      return tf.sqrt(tf.reduce_sum(x ** 2, axis=-1))

    start = tf.constant([6.0, -21.0])  # Starting point for the search.
    optim_results = tfp.optimizer.nelder_mead_minimize(
        sqrt_quadratic, initial_vertex=start, func_tolerance=1e-8,
        batch_evaluate_objective=True)

    # Check that the search converged
    assert(optim_results.converged)
    # Check that the argmin is close to the actual value.
    np.testing.assert_allclose(optim_results.position, np.array([0.0, 0.0]),
                                atol=1e-7)
    # Print out the total number of function evaluations it took.
    print("Function evaluations: %d" % optim_results.num_objective_evaluations)
  ```

  ### References:
  [1]: William Press, Saul Teukolsky, William Vetterling and Brian Flannery.
    Numerical Recipes in C++, third edition. pp. 502-507. (2007).
    http://numerical.recipes/cpppages/chap0sel.pdf

  [2]: Jeffrey Lagarias, James Reeds, Margaret Wright and Paul Wright.
    Convergence properties of the Nelder-Mead simplex method in low dimensions,
    Siam J. Optim., Vol 9, No. 1, pp. 112-147. (1998).
    http://www.math.kent.edu/~reichel/courses/Opt/reading.material.2/nelder.mead.pdf

  [3]: Fuchang Gao and Lixing Han. Implementing the Nelder-Mead simplex
    algorithm with adaptive parameters. Computational Optimization and
    Applications, Vol 51, Issue 1, pp 259-277. (2012).
    https://pdfs.semanticscholar.org/15b4/c4aa7437df4d032c6ee6ce98d6030dd627be.pdf

  Args:
    objective_function:  A Python callable that accepts a point as a
      real `Tensor` and returns a `Tensor` of real dtype containing
      the value of the function at that point. The function
      to be minimized. If `batch_evaluate_objective` is `True`, the callable
      may be evaluated on a `Tensor` of shape `[n+1] + s ` where `n` is
      the dimension of the problem and `s` is the shape of a single point
      in the domain (so `n` is the size of a `Tensor` representing a
      single point).
      In this case, the expected return value is a `Tensor` of shape `[n+1]`.
      Note that this method does not support univariate functions so the problem
      dimension `n` must be strictly greater than 1.
    initial_simplex: (Optional) `Tensor` of real dtype. The initial simplex to
      start the search. If supplied, should be a `Tensor` of shape `[n+1] + s`
      where `n` is the dimension of the problem and `s` is the shape of a
      single point in the domain. Each row (i.e. the `Tensor` with a given
      value of the first index) is interpreted as a vertex of a simplex and
      hence the rows must be affinely independent. If not supplied, an axes
      aligned simplex is constructed using the `initial_vertex` and
      `step_sizes`. Exactly one of `initial_simplex` and
      `initial_vertex` must be supplied.
    initial_vertex: (Optional) `Tensor` of real dtype and any shape that can
      be consumed by the `objective_function`. A single point in the domain that
      will be used to construct an axes aligned initial simplex.
    step_sizes: (Optional) `Tensor` of real dtype and shape broadcasting
      compatible with `initial_vertex`. Supplies the simplex scale along each
      axis. Only used if `initial_simplex` is not supplied. See description
      above for details on how step sizes and initial vertex are used to
      construct the initial simplex.
    objective_at_initial_simplex: (Optional) Rank `1` `Tensor` of real dtype.
      The value of the objective function at the vertices of the
      initial simplex. May be supplied only if `initial_simplex` is
      supplied. If not supplied, it will be computed.
    objective_at_initial_vertex: (Optional) Scalar `Tensor` of real dtype. The
      value of the objective function at the initial vertex. May be supplied
      only if the `initial_vertex` is also supplied.
    batch_evaluate_objective: (Optional) Python `bool`. If True, the objective
      function will be evaluated on all the vertices of the simplex packed
      into a single tensor. If False, the objective will be mapped across each
      vertex separately. Evaluating the objective function in a batch allows
      use of vectorization and should be preferred if the objective function
      allows it.
    func_tolerance: (Optional) Scalar `Tensor` of real dtype. The algorithm
      stops if the absolute difference between the largest and the smallest
      function value on the vertices of the simplex is below this number.
    position_tolerance: (Optional) Scalar `Tensor` of real dtype. The
      algorithm stops if the largest absolute difference between the
      coordinates of the vertices is below this threshold.
    parallel_iterations: (Optional) Positive integer. The number of iterations
      allowed to run in parallel.
    max_iterations: (Optional) Scalar positive `Tensor` of dtype `int32`.
      The maximum number of iterations allowed. If `None` then no limit is
      applied.
    reflection: (Optional) Positive Scalar `Tensor` of same dtype as
      `initial_vertex`. This parameter controls the scaling of the reflected
      vertex. See, [Press et al(2007)][1] for details. If not specified,
      uses the dimension dependent prescription of [Gao and Han(2012)][3].
    expansion: (Optional) Positive Scalar `Tensor` of same dtype as
      `initial_vertex`. Should be greater than `1` and `reflection`. This
      parameter controls the expanded scaling of a reflected vertex.
      See, [Press et al(2007)][1] for details. If not specified, uses the
      dimension dependent prescription of [Gao and Han(2012)][3].
    contraction: (Optional) Positive scalar `Tensor` of same dtype as
      `initial_vertex`. Must be between `0` and `1`. This parameter controls
      the contraction of the reflected vertex when the objective function at
      the reflected point fails to show sufficient decrease.
      See, [Press et al(2007)][1] for more details. If not specified, uses
      the dimension dependent prescription of [Gao and Han(2012)][3].
    shrinkage: (Optional) Positive scalar `Tensor` of same dtype as
      `initial_vertex`. Must be between `0` and `1`. This parameter is the scale
      by which the simplex is shrunk around the best point when the other
      steps fail to produce improvements.
      See, [Press et al(2007)][1] for more details. If not specified, uses
      the dimension dependent prescription of [Gao and Han(2012)][3].
    name: (Optional) Python str. The name prefixed to the ops created by this
      function. If not supplied, the default name 'minimize' is used.

  Returns:
    optimizer_results: A namedtuple containing the following items:
      converged: Scalar boolean tensor indicating whether the minimum was
        found within tolerance.
      num_objective_evaluations: The total number of objective
        evaluations performed.
      position: A `Tensor` containing the last argument value found
        during the search. If the search converged, then
        this value is the argmin of the objective function.
      objective_value: A tensor containing the value of the objective
        function at the `position`. If the search
        converged, then this is the (local) minimum of
        the objective function.
      final_simplex: The last simplex constructed before stopping.
      final_objective_values: The objective function evaluated at the
        vertices of the final simplex.
      initial_simplex: The starting simplex.
      initial_objective_values: The objective function evaluated at the
        vertices of the initial simplex.
      num_iterations: The number of iterations of the main algorithm body.

  Raises:
    ValueError: If any of the following conditions hold
      1. If none or more than one of `initial_simplex` and `initial_vertex` are
        supplied.
      2. If `initial_simplex` and `step_sizes` are both specified.
  """
    with tf1.name_scope(name, 'minimize', [
            initial_simplex, initial_vertex, step_sizes,
            objective_at_initial_simplex, objective_at_initial_vertex,
            func_tolerance, position_tolerance
    ]):
        (dim, _, simplex, objective_at_simplex,
         num_evaluations) = _prepare_args(objective_function, initial_simplex,
                                          initial_vertex, step_sizes,
                                          objective_at_initial_simplex,
                                          objective_at_initial_vertex,
                                          batch_evaluate_objective)
        domain_dtype = simplex.dtype
        (reflection, expansion, contraction,
         shrinkage) = _resolve_parameters(dim, reflection, expansion,
                                          contraction, shrinkage, domain_dtype)

        closure_kwargs = dict(
            objective_function=objective_function,
            dim=dim,
            func_tolerance=func_tolerance,
            position_tolerance=position_tolerance,
            batch_evaluate_objective=batch_evaluate_objective,
            reflection=reflection,
            expansion=expansion,
            contraction=contraction,
            shrinkage=shrinkage)

        def _loop_body(_, iterations, simplex, objective_at_simplex,
                       num_evaluations):
            (converged, next_simplex, next_objective,
             evaluations) = nelder_mead_one_step(simplex, objective_at_simplex,
                                                 **closure_kwargs)

            return (converged, iterations + 1, next_simplex, next_objective,
                    num_evaluations + evaluations)

        initial_args = (False, 0, simplex, objective_at_simplex,
                        num_evaluations)

        # Loop until convergence. If `max_iterations` is supplied, also stop
        # once the iteration budget is exhausted.
        def _is_converged(converged, num_iterations, *ignored_args):  # pylint:disable=unused-argument
            # It is important to ensure that not_converged is a tensor. If
            # converged is not a tensor but a Python bool, then the overloaded
            # op '~' acts as bitwise complement so ~True = -2 and ~False = -1.
            # In that case, the loop will never terminate.
            not_converged = tf.logical_not(converged)
            return (not_converged if max_iterations is None else
                    (not_converged & (num_iterations < max_iterations)))

        (converged, num_iterations, final_simplex, final_objective_values,
         final_evaluations) = tf.while_loop(
             cond=_is_converged,
             body=_loop_body,
             loop_vars=initial_args,
             parallel_iterations=parallel_iterations)
        order = tf.argsort(final_objective_values,
                           direction='ASCENDING',
                           stable=True)
        best_index = order[0]
        # The explicit cast to Tensor below is done to avoid returning a mixture
        # of Python types and Tensors which cause problems with session.run.
        # In the eager mode, converged may remain a Python bool. Trying to evaluate
        # the whole tuple in one evaluate call will raise an exception because
        # of the presence of non-tensors. This is very annoying so we explicitly
        # cast those arguments to Tensors.
        return NelderMeadOptimizerResults(
            converged=tf.convert_to_tensor(value=converged),
            num_objective_evaluations=final_evaluations,
            position=final_simplex[best_index],
            objective_value=final_objective_values[best_index],
            final_simplex=final_simplex,
            final_objective_values=final_objective_values,
            num_iterations=tf.convert_to_tensor(value=num_iterations),
            initial_simplex=simplex,
            initial_objective_values=objective_at_simplex)
Example No. 14
0
 def __init__(self, logits):
   self._logits = tf.convert_to_tensor(logits)
   super(StatefulCategorical, self).__init__(
       dtype=tf.int32, reparameterization_type=tfd.NOT_REPARAMETERIZED,
       validate_args=False, allow_nan_stats=False)
Example No. 15
0
 def _logits_parameter_no_checks(self):
     if self._logits is None:
         probs = tf.convert_to_tensor(self._probs)
         return tf.math.log(probs) - tf.math.log1p(-probs)
     return tf.identity(self._logits)
Example No. 16
0
def brier_decomposition(labels, logits, name=None):
    r"""Decompose the Brier score into uncertainty, resolution, and reliability.

  [Proper scoring rules][1] measure the quality of probabilistic predictions;
  any proper scoring rule admits a [unique decomposition][2] as
  `Score = Uncertainty - Resolution + Reliability`, where:

  * `Uncertainty` is a generalized entropy of the average predictive
    distribution; it can be positive or negative.
  * `Resolution` is a generalized variance of the individual predictive
    distributions; it is always non-negative.  Differences in predictions
    reveal information, which is why a larger resolution improves the
    predictive score.
  * `Reliability`, a measure of calibration of predictions against the true
    frequency of events.  It is always non-negative and a lower value here
    indicates better calibration.

  This method estimates the above decomposition for the case of the Brier
  scoring rule for discrete outcomes.  For this, we need to discretize the space
  of probability distributions; we choose a simple partition of the space into
  `nlabels` events: given a distribution `p` over `nlabels` outcomes, the index
  `k` for which `p_k > p_i` for all `i != k` determines the discretization
  outcome; that is, `p in M_k`, where `M_k` is the set of all distributions for
  which `p_k` is the largest value among all probabilities.

  The estimation error of each component is O(k/n), where n is the number
  of instances and k is the number of labels.  There may be an error of this
  order when compared to `brier_score`.

  #### References
  [1]: Tilmann Gneiting, Adrian E. Raftery.
       Strictly Proper Scoring Rules, Prediction, and Estimation.
       Journal of the American Statistical Association, Vol. 102, 2007.
       https://www.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf
  [2]: Jochen Broecker.  Reliability, sufficiency, and the decomposition of
       proper scores.
       Quarterly Journal of the Royal Meteorological Society, Vol. 135, 2009.
       https://rmets.onlinelibrary.wiley.com/doi/epdf/10.1002/qj.456

  Args:
    labels: Tensor, (n,), with tf.int32 or tf.int64 elements containing ground
      truth class labels in the range [0, nlabels).
    logits: Tensor, (n, nlabels), with logits for n instances and nlabels.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    uncertainty: Tensor, scalar, the uncertainty component of the
      decomposition.
    resolution: Tensor, scalar, the resolution component of the decomposition.
    reliability: Tensor, scalar, the reliability component of the
      decomposition.
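
  #### Example

  A hedged usage sketch (this assumes the function is exported as
  `tfp.stats.brier_decomposition`; adjust the import path to your setup):

  ```python
  import tensorflow as tf
  import tensorflow_probability as tfp

  labels = tf.constant([0, 1, 2, 1], dtype=tf.int32)  # shape (n,)
  logits = tf.random.normal([4, 3])                   # shape (n, nlabels)
  uncertainty, resolution, reliability = tfp.stats.brier_decomposition(
      labels=labels, logits=logits)
  # Up to O(k/n) estimation error, the Brier score decomposes as
  # uncertainty - resolution + reliability.
  ```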
  """
    with tf.name_scope(name or 'brier_decomposition'):
        labels = tf.convert_to_tensor(labels)
        logits = tf.convert_to_tensor(logits)
        num_classes = logits.shape[-1]

        # Compute pbar, the average distribution
        pred_class = tf.argmax(logits, axis=-1, output_type=labels.dtype)

        if tensorshape_util.rank(logits.shape) > 2:
            flatten, unflatten = _make_flatten_unflatten_fns(logits.shape[:-2])

            def fn_to_map(args):
                yhat, y = args
                return tf.math.confusion_matrix(yhat,
                                                y,
                                                num_classes=num_classes,
                                                dtype=logits.dtype)

            confusion_matrix = tf.map_fn(
                fn_to_map,
                [flatten(pred_class), flatten(labels)],
                dtype=logits.dtype)
            confusion_matrix = unflatten(confusion_matrix)
        else:
            confusion_matrix = tf.math.confusion_matrix(
                pred_class,
                labels,
                num_classes=num_classes,
                dtype=logits.dtype)

        dist_weights = tf.reduce_sum(confusion_matrix, axis=-1)
        dist_weights /= tf.reduce_sum(dist_weights, axis=-1, keepdims=True)
        pbar = tf.reduce_sum(confusion_matrix, axis=-2)
        pbar /= tf.reduce_sum(pbar, axis=-1, keepdims=True)

        eps = np.finfo(dtype_util.as_numpy_dtype(confusion_matrix.dtype)).eps
        # dist_mean[k,:] contains the empirical distribution for the set M_k
        # Some outcomes may not realize, corresponding to dist_weights[k] = 0
        dist_mean = confusion_matrix / (
            eps + tf.reduce_sum(confusion_matrix, axis=-1, keepdims=True))

        # Uncertainty: quadratic entropy of the average label distribution
        uncertainty = -tf.reduce_sum(tf.square(pbar), axis=-1)

        # Resolution: expected quadratic divergence of predictive to mean
        resolution = tf.square(tf.expand_dims(pbar, -1) - dist_mean)
        resolution = tf.reduce_sum(dist_weights *
                                   tf.reduce_sum(resolution, axis=-1),
                                   axis=-1)

        # Reliability: expected quadratic divergence of predictive to true
        if tensorshape_util.rank(logits.shape) > 2:
            # TODO(b/139094519): Avoid using tf.map_fn here.
            prob_true = tf.map_fn(
                lambda args: tf.gather(args[0], args[1]),
                [flatten(dist_mean), flatten(pred_class)],
                dtype=dist_mean.dtype)
            prob_true = unflatten(prob_true)
        else:
            prob_true = tf.gather(dist_mean, pred_class, axis=0)
        log_prob_true = tf.math.log(prob_true)

        log_prob_pred = logits - tf.math.reduce_logsumexp(
            logits, axis=-1, keepdims=True)

        log_reliability = _reduce_log_l2_exp(log_prob_pred,
                                             log_prob_true,
                                             axis=-1)
        log_reliability = tf.math.reduce_logsumexp(
            log_reliability,
            axis=-1,
        )

        num_samples = tf.cast(tf.shape(logits)[-2], logits.dtype)
        reliability = tf.exp(log_reliability - tf.math.log(num_samples))

        return uncertainty, resolution, reliability
Example No. 17
0
 def _fn(*fargs, **fkwargs):
     d = fn(*fargs, **fkwargs)
     x = tf.convert_to_tensor(d)
     d.shape = x.shape
     d.get_shape = x.get_shape
     return d, x
Example No. 18
0
def find_root_chandrupatla(objective_fn,
                           low,
                           high,
                           position_tolerance=1e-8,
                           value_tolerance=0.,
                           max_iterations=50,
                           stopping_policy_fn=tf.reduce_all,
                           validate_args=False,
                           name='find_root_chandrupatla'):
    r"""Finds root(s) of a scalar function using Chandrupatla's method.

  Chandrupatla's method [1, 2] is a root-finding algorithm that is guaranteed
  to converge if a root lies within the given bounds. It generalizes the
  [bisection method](https://en.wikipedia.org/wiki/Bisection_method); at each
  step it chooses to perform either bisection or inverse quadratic
  interpolation. This makes it similar in spirit to [Brent's method](
  https://en.wikipedia.org/wiki/Brent%27s_method), which also considers steps
  that use the secant method, but Chandrupatla's method is simpler and often
  converges at least as quickly [3].

  Args:
    objective_fn: Python callable for which roots are searched. It must be a
      callable of a single variable. `objective_fn` must return a `Tensor` with
      shape `batch_shape` and dtype matching `low` and `high`.
    low: Float `Tensor` of shape `batch_shape` representing a lower
      bound(s) on the value of a root(s).
    high: Float `Tensor` of shape `batch_shape` representing an upper
      bound(s) on the value of a root(s).
    position_tolerance: Optional `Tensor` representing the maximum absolute
      error in the positions of the estimated roots. Shape must broadcast with
      `batch_shape`.
      Default value: `1e-8`.
    value_tolerance: Optional `Tensor` representing the absolute error allowed
      in the value of the objective function. If the absolute value of
      `objective_fn` is smaller than
      `value_tolerance` at a given position, then that position is considered a
      root for the function. Shape must broadcast with `batch_shape`.
      Default value: `0.`.
    max_iterations: Optional `Tensor` or Python integer specifying the maximum
      number of steps to perform. Shape must broadcast with `batch_shape`.
      Default value: `50`.
    stopping_policy_fn: Python `callable` controlling the algorithm termination.
      It must be a callable accepting a `Tensor` of booleans with the same shape
      as `low` and `high` (denoting whether each search is
      finished), and returning a scalar boolean `Tensor` indicating
      whether the overall search should stop. Typical values are
      `tf.reduce_all` (which returns only when the search is finished for all
      points), and `tf.reduce_any` (which returns as soon as the search is
      finished for any point).
      Default value: `tf.reduce_all` (returns only when the search is finished
        for all points).
    validate_args: Python `bool` indicating whether to validate arguments.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: 'find_root_chandrupatla'.

  Returns:
    root_search_results: A Python `namedtuple` containing the following items:
      estimated_root: `Tensor` containing the last position explored. If the
        search was successful within the specified tolerance, this position is
        a root of the objective function.
      objective_at_estimated_root: `Tensor` containing the value of the
        objective function at `position`. If the search was successful within
        the specified tolerance, then this is close to 0.
      num_iterations: The number of iterations performed.

  #### References

  [1] Tirupathi R. Chandrupatla. A new hybrid quadratic/bisection algorithm for
      finding the zero of a nonlinear function without using derivatives.
      _Advances in Engineering Software_, 28.3:145-149, 1997.
  [2] Philipp OJ Scherer. Computational Physics. _Springer Berlin_,
      Heidelberg, 2010.
      Section 6.1.7.3 https://books.google.com/books?id=cC-8BAAAQBAJ&pg=PA95
  [3] Jason Sachs. Ten Little Algorithms, Part 5: Quadratic Extremum
      Interpolation and Chandrupatla's Method (2015).
      https://www.embeddedrelated.com/showarticle/855.php
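
  #### Example

  A minimal usage sketch (assuming this function is exposed as
  `tfp.math.find_root_chandrupatla`):

  ```python
  import tensorflow as tf
  import tensorflow_probability as tfp

  f = lambda x: x**3 - x - 2.
  results = tfp.math.find_root_chandrupatla(
      f, low=tf.constant(1.), high=tf.constant(2.))
  # results.estimated_root is approximately 1.5213797, where f crosses zero.
  ```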
  """

    ################################################
    # Loop variables used by Chandrupatla's method:
    #
    #  a: endpoint of an interval `[min(a, b), max(a, b)]` containing the
    #     root. There is no guarantee as to which of `a` and `b` is larger.
    #  b: endpoint of an interval `[min(a, b), max(a, b)]` containing the
    #     root. There is no guarantee as to which of `a` and `b` is larger.
    #  f_a: value of the objective at `a`.
    #  f_b: value of the objective at `b`.
    #  t: the next position to be evaluated as the coefficient of a convex
    #    combination of `a` and `b` (i.e., a value in the unit interval).
    #  num_iterations: integer number of steps taken so far.
    #  converged: boolean indicating whether each batch element has converged.
    #
    # All variables have the same shape `batch_shape`.

    def _should_continue(a, b, f_a, f_b, t, num_iterations, converged):
        del a, b, f_a, f_b, t  # Unused.
        all_converged = stopping_policy_fn(
            tf.logical_or(converged, num_iterations >= max_iterations))
        return ~all_converged

    def _body(a, b, f_a, f_b, t, num_iterations, converged):
        """One step of Chandrupatla's method for root finding."""
        previous_loop_vars = (a, b, f_a, f_b, t, num_iterations, converged)
        finalized_elements = tf.logical_or(converged,
                                           num_iterations >= max_iterations)

        # Evaluate the new point.
        x_new = (1 - t) * a + t * b
        f_new = objective_fn(x_new)
        # If we've bisected (t==0.5) and the new float value for `a` is identical to
        # that from the previous iteration, then we'll keep bisecting (the
        # logic below will set t==0.5 for the next step), and nothing further will
        # change.
        at_fixed_point = tf.equal(x_new, a) & tf.equal(t, 0.5)
        # Otherwise, tighten the bounds.
        a, b, c, f_a, f_b, f_c = _structure_broadcasting_where(
            tf.equal(tf.math.sign(f_new), tf.math.sign(f_a)),
            (x_new, b, a, f_new, f_b, f_a), (x_new, a, b, f_new, f_a, f_b))

        # Check for convergence.
        f_best = tf.where(tf.abs(f_a) < tf.abs(f_b), f_a, f_b)
        interval_tolerance = position_tolerance / (tf.abs(b - c))
        converged = tf.logical_or(
            interval_tolerance > 0.5,
            tf.logical_or(
                tf.math.abs(f_best) <= value_tolerance, at_fixed_point))

        # Propose next point to evaluate.
        xi = (a - b) / (c - b)
        phi = (f_a - f_b) / (f_c - f_b)
        t = tf.where(
            # Condition for inverse quadratic interpolation.
            tf.logical_and(1 - tf.math.sqrt(1 - xi) < phi,
                           tf.math.sqrt(xi) > phi),
            # Propose a point by inverse quadratic interpolation.
            (f_a / (f_b - f_a) * f_c / (f_b - f_c) + (c - a) / (b - a) * f_a /
             (f_c - f_a) * f_b / (f_c - f_b)),
            # Otherwise, just cut the interval in half (bisection).
            0.5)
        # Constrain the proposal to the current interval (0 < t < 1).
        t = tf.minimum(tf.maximum(t, interval_tolerance),
                       1 - interval_tolerance)

        # Update elements that haven't converged.
        return _structure_broadcasting_where(
            finalized_elements, previous_loop_vars,
            (a, b, f_a, f_b, t, num_iterations + 1, converged))

    with tf.name_scope(name):
        max_iterations = tf.convert_to_tensor(max_iterations,
                                              name='max_iterations',
                                              dtype_hint=tf.int32)
        a = tf.convert_to_tensor(low, name='lower_bound')
        b = tf.convert_to_tensor(high, name='upper_bound')
        f_a, f_b = objective_fn(a), objective_fn(b)
        batch_shape = ps.broadcast_shape(ps.shape(f_a), ps.shape(f_b))

        assertions = []
        if validate_args:
            assertions += [
                assert_util.assert_none_equal(
                    tf.math.sign(f_a),
                    tf.math.sign(f_b),
                    message='Bounds must be on different sides of a root.')
            ]

        with tf.control_dependencies(assertions):
            initial_loop_vars = [
                a, b, f_a, f_b,
                tf.cast(0.5, dtype=f_a.dtype),
                tf.cast(0, dtype=max_iterations.dtype), False
            ]
            a, b, f_a, f_b, _, num_iterations, _ = tf.while_loop(
                _should_continue,
                _body,
                loop_vars=tf.nest.map_structure(
                    lambda x: tf.broadcast_to(x, batch_shape),
                    initial_loop_vars))

        x_best, f_best = _structure_broadcasting_where(
            tf.abs(f_a) < tf.abs(f_b), (a, f_a), (b, f_b))
    return RootSearchResults(estimated_root=x_best,
                             objective_at_estimated_root=f_best,
                             num_iterations=num_iterations)
Ejemplo n.º 19
0
    def test_enables_nontensor_plumbing(self):
        if tf.executing_eagerly():
            self.skipTest('`compile` functionality changed.')
        # Setup.

        class Foo:
            def __init__(self, input_):
                self._input = input_
                self.value = tf.convert_to_tensor([[42.]])

            @property
            def dtype(self):
                return self.value.dtype

        tf.register_tensor_conversion_function(
            Foo, lambda x, *args, **kwargs: x.value)
        tf_utils.register_symbolic_tensor_type(Foo)

        class PlumbingLayer(keras.layers.Lambda):
            def __init__(self, fn, **kwargs):
                def _fn(*fargs, **fkwargs):
                    d = fn(*fargs, **fkwargs)
                    x = tf.convert_to_tensor(d)
                    d.shape = x.shape
                    d.get_shape = x.get_shape
                    return d, x

                super(PlumbingLayer, self).__init__(_fn, **kwargs)
                self._enter_dunder_call = False

            def __call__(self, inputs, *args, **kwargs):
                self._enter_dunder_call = True
                d, _ = super(PlumbingLayer,
                             self).__call__(inputs, *args, **kwargs)
                self._enter_dunder_call = False
                return d

            def call(self, inputs, *args, **kwargs):
                d, v = super(PlumbingLayer, self).call(inputs, *args, **kwargs)
                if self._enter_dunder_call:
                    return d, v
                return d

        # User-land.
        model = keras.Sequential([
            keras.layers.InputLayer((1, )),
            PlumbingLayer(Foo),  # Makes a `Foo` object.
        ])
        # Let's ensure Keras graph history is preserved by composing the models.
        model = keras.Model(model.inputs, model(model.outputs))
        # Now we instantiate the model and verify we have a `Foo` object, not a
        # `Tensor`.
        y = model(tf.convert_to_tensor([[7.]]))
        self.assertIsInstance(y, Foo)
        # Confirm that (custom) loss sees `Foo` instance, not Tensor.
        obtained_prediction_box = [None]

        def custom_loss(y_obs, y_pred):
            del y_obs
            obtained_prediction_box[0] = y_pred
            return y_pred

        # Apparently `compile` calls the loss function enough to trigger the
        # side-effect.
        model.compile('SGD', loss=custom_loss)
        self.assertIsInstance(obtained_prediction_box[0], Foo)
Example No. 20
0
def find_root_secant(objective_fn,
                     initial_position,
                     next_position=None,
                     value_at_position=None,
                     position_tolerance=1e-8,
                     value_tolerance=1e-8,
                     max_iterations=50,
                     stopping_policy_fn=tf.reduce_all,
                     validate_args=False,
                     name=None):
    r"""Finds root(s) of a function of single variable using the secant method.

  The [secant method](https://en.wikipedia.org/wiki/Secant_method) is a
  root-finding algorithm that uses a succession of roots of secant lines to
  better approximate a root of a function. The secant method can be thought of
  as a finite-difference approximation of Newton's method.
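
  Concretely, given two iterates `x[n-1]` and `x[n]`, the next position is the
  root of the secant line through those points (a sketch of the standard
  recurrence, not a quote of the implementation below):

  ```
  x[n+1] = x[n] - f(x[n]) * (x[n] - x[n-1]) / (f(x[n]) - f(x[n-1]))
  ```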

  Args:
    objective_fn: Python callable for which roots are searched. It must be a
      callable of a single variable. `objective_fn` must return a `Tensor` of
      the same shape and dtype as `initial_position`.
    initial_position: `Tensor` or Python float representing the starting
      position. The function will search for roots in the neighborhood of each
      point. The shape of `initial_position` should match that of the input to
      `objective_fn`.
    next_position: Optional `Tensor` representing the next position in the
      search. If specified, this argument must broadcast with the shape of
      `initial_position` and have the same dtype. It will be used to compute the
      first step to take when searching for roots. If not specified, a default
      value will be used instead.
      Default value: `initial_position * (1 + 1e-4) + sign(initial_position) *
        1e-4`.
    value_at_position: Optional `Tensor` or Python float representing the value
      of `objective_fn` at `initial_position`. If specified, this argument must
      have the same shape and dtype as `initial_position`. If not specified, the
      value will be evaluated during the search.
      Default value: None.
    position_tolerance: Optional `Tensor` representing the tolerance for the
      estimated roots. If specified, this argument must broadcast with the shape
      of `initial_position` and have the same dtype.
      Default value: `1e-8`.
    value_tolerance: Optional `Tensor` representing the tolerance used to check
      for roots. If the absolute value of `objective_fn` is smaller than
      `value_tolerance` at a given position, then that position is considered a
      root for the function. If specified, this argument must broadcast with the
      shape of `initial_position` and have the same dtype.
      Default value: `1e-8`.
    max_iterations: Optional `Tensor` or Python integer specifying the maximum
      number of steps to perform for each initial position. Must broadcast with
      the shape of `initial_position`.
      Default value: `50`.
    stopping_policy_fn: Python `callable` controlling the algorithm termination.
      It must be a callable accepting a `Tensor` of booleans with the shape of
      `initial_position` (each denoting whether the search is finished for each
      starting point), and returning a scalar boolean `Tensor` (indicating
      whether the overall search should stop). Typical values are
      `tf.reduce_all` (which returns only when the search is finished for all
      points), and `tf.reduce_any` (which returns as soon as the search is
      finished for any point).
      Default value: `tf.reduce_all` (returns only when the search is finished
        for all points).
    validate_args: Python `bool` indicating whether to validate arguments such
      as `position_tolerance`, `value_tolerance`, and `max_iterations`.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.

  Returns:
    root_search_results: A Python `namedtuple` containing the following items:
      estimated_root: `Tensor` containing the last position explored. If the
        search was successful within the specified tolerance, this position is
        a root of the objective function.
      objective_at_estimated_root: `Tensor` containing the value of the
        objective function at `position`. If the search was successful within
        the specified tolerance, then this is close to 0.
      num_iterations: The number of iterations performed.

  Raises:
    ValueError: if a non-callable `stopping_policy_fn` is passed.

  #### Examples

  ```python
  import tensorflow as tf
  import tensorflow_probability as tfp
  tf.enable_eager_execution()

  # Example 1: Roots of a single function from two different starting points.

  f = lambda x: (63 * x**5 - 70 * x**3 + 15 * x) / 8.
  x = tf.constant([-1, 10], dtype=tf.float64)

  tfp.math.secant_root(objective_fn=f, initial_position=x)
  # ==> RootSearchResults(
      estimated_root=array([-0.90617985, 0.90617985]),
      objective_at_estimated_root=array([-4.81727769e-10, 7.44957651e-10]),
      num_iterations=array([ 7, 24], dtype=int32))

  tfp.math.secant_root(objective_fn=f,
                       initial_position=x,
                       stopping_policy_fn=tf.reduce_any)
  # ==> RootSearchResults(
      estimated_root=array([-0.90617985, 3.27379206]),
      objective_at_estimated_root=array([-4.81727769e-10, 2.66058312e+03]),
      num_iterations=array([7, 8], dtype=int32))

  # Example 2: Roots of a multiplex function from a single starting point.

  def f(x):
    return tf.constant([0., 63. / 8], dtype=tf.float64) * x**5 \
        + tf.constant([5. / 2, -70. / 8], dtype=tf.float64) * x**3 \
        + tf.constant([-3. / 2, 15. / 8], dtype=tf.float64) * x

  x = tf.constant([-1, -1], dtype=tf.float64)

  tfp.math.secant_root(objective_fn=f, initial_position=x)
  # ==> RootSearchResults(
      estimated_root=array([-0.77459667, -0.90617985]),
      objective_at_estimated_root=array([-7.81339438e-11, -4.81727769e-10]),
      num_iterations=array([7, 7], dtype=int32))

  # Example 3: Roots of a multiplex function from two starting points.

  def f(x):
    return tf.constant([0., 63. / 8], dtype=tf.float64) * x**5 \
        + tf.constant([5. / 2, -70. / 8], dtype=tf.float64) * x**3 \
        + tf.constant([-3. / 2, 15. / 8], dtype=tf.float64) * x

  x = tf.constant([[-1, -1], [10, 10]], dtype=tf.float64)

  tfp.math.secant_root(objective_fn=f, initial_position=x)
  # ==> RootSearchResults(
      estimated_root=array([
          [-0.77459667, -0.90617985],
          [ 0.77459667, 0.90617985]]),
      objective_at_estimated_root=array([
          [-7.81339438e-11, -4.81727769e-10],
          [6.66025013e-11, 7.44957651e-10]]),
      num_iterations=array([
          [7, 7],
          [16, 24]], dtype=int32))
  ```
  """
    if not callable(stopping_policy_fn):
        raise ValueError('stopping_policy_fn must be callable')

    position = tf.convert_to_tensor(
        initial_position,
        name='position',
    )
    value_at_position = tf.convert_to_tensor(
        value_at_position or objective_fn(position),
        name='value_at_position',
        dtype=dtype_util.base_dtype(position.dtype))

    zero = tf.zeros_like(position)
    position_tolerance = tf.convert_to_tensor(position_tolerance,
                                              name='position_tolerance',
                                              dtype=position.dtype)
    value_tolerance = tf.convert_to_tensor(value_tolerance,
                                           name='value_tolerance',
                                           dtype=position.dtype)

    num_iterations = tf.zeros_like(position, dtype=tf.int32)
    max_iterations = tf.convert_to_tensor(max_iterations, dtype=tf.int32)
    max_iterations = tf.broadcast_to(max_iterations,
                                     name='max_iterations',
                                     shape=position.shape)

    # Compute the step from `next_position` if present. This covers the case
    # where a user has two starting points that bound the root, or has a
    # specific step size in mind.
    if next_position is None:
        epsilon = tf.constant(1e-4, dtype=position.dtype, shape=position.shape)
        step = position * epsilon + tf.sign(position) * epsilon
    else:
        step = next_position - initial_position

    finished = tf.constant(False, shape=position.shape)

    # Negate `stopping_condition` to determine if the search should continue.
    # This means, in particular, that tf.reduce_*all* will return only when the
    # search is finished for *all* starting points.
    def _should_continue(position, value_at_position, num_iterations, step,
                         finished):
        """Indicates whether the overall search should continue.

    Args:
      position: `Tensor` containing the current root estimates.
      value_at_position: `Tensor` containing the value of `objective_fn` at
        `position`.
      num_iterations: `Tensor` containing the current iteration index for each
        point.
      step: `Tensor` containing the size of the step to take for each point.
      finished: `Tensor` indicating for which points the search is finished.

    Returns:
      A boolean value indicating whether the overall search should continue.
    """
        del position, value_at_position, num_iterations, step  # Unused
        return ~tf.convert_to_tensor(
            stopping_policy_fn(finished), name='should_stop', dtype=tf.bool)

    # For each point in `position`, the search is stopped if either:
    # (1) A root has been found
    # (2) f(position) == f(position + step)
    # (3) The maximum number of iterations has been reached
    # In case (2), the search may be stopped both before the desired tolerance is
    # achieved (or even a root is found), and the maximum number of iterations is
    # reached.
    def _body(position, value_at_position, num_iterations, step, finished):
        """Performs one iteration of the secant root-finding algorithm.

    Args:
      position: `Tensor` containing the current root estimates.
      value_at_position: `Tensor` containing the value of `objective_fn` at
        `position`.
      num_iterations: `Tensor` containing the current iteration index for each
        point.
      step: `Tensor` containing the size of the step to take for each point.
      finished: `Tensor` indicating for which points the search is finished.

    Returns:
      The `Tensor`s to use for the next iteration of the algorithm.
    """

        # True if the search was already finished, or (1) or (3) just became true.
        was_finished = finished | (num_iterations >= max_iterations) | (
            tf.abs(step) < position_tolerance) | (tf.abs(value_at_position) <
                                                  value_tolerance)

        # Compute the next position and the value at that point.
        next_position = tf.where(was_finished, position, position + step)
        value_at_next_position = tf.where(was_finished, value_at_position,
                                          objective_fn(next_position))

        # True if the search was already finished, or (2) just became true.
        is_finished = tf.equal(value_at_position, value_at_next_position)

        # Use the mid-point between the last two positions if (2) just became true.
        next_position = tf.where(is_finished & ~was_finished,
                                 (position + next_position) * 0.5,
                                 next_position)

        # Once finished, stop updating the iteration index and set the step to zero.
        num_iterations = tf.where(is_finished, num_iterations,
                                  num_iterations + 1)
        next_step = tf.where(
            is_finished, zero, step * value_at_next_position /
            (value_at_position - value_at_next_position))

        return (next_position, value_at_next_position, num_iterations,
                next_step, is_finished)

    with tf.name_scope(name or 'find_root_secant'):

        assertions = []
        if validate_args:
            assertions += [
                tf.debugging.assert_greater(
                    position_tolerance,
                    zero,
                    message='`position_tolerance` must be greater than 0.'),
                tf.debugging.assert_greater(
                    value_tolerance,
                    zero,
                    message='`value_tolerance` must be greater than 0.'),
                tf.debugging.assert_greater_equal(
                    max_iterations,
                    num_iterations,
                    message='`max_iterations` must be nonnegative.')
            ]

        with tf.control_dependencies(assertions):
            root, value_at_root, num_iterations, _, _ = tf.while_loop(
                cond=_should_continue,
                body=_body,
                loop_vars=(position, value_at_position, num_iterations, step,
                           finished))

    return RootSearchResults(estimated_root=root,
                             objective_at_estimated_root=value_at_root,
                             num_iterations=num_iterations)
    def __init__(self,
                 inner_kernel,
                 num_adaptation_steps,
                 target_accept_prob=0.75,
                 adaptation_rate=0.01,
                 step_size_setter_fn=hmc_like_step_size_setter_fn,
                 step_size_getter_fn=hmc_like_step_size_getter_fn,
                 log_accept_prob_getter_fn=hmc_like_log_accept_prob_getter_fn,
                 validate_args=False,
                 name=None):
        """Creates the step size adaptation kernel.

    The default setter_fn and the getter_fn callbacks assume that the inner
    kernel produces kernel results structurally the same as the
    `HamiltonianMonteCarlo` kernel.

    Args:
      inner_kernel: `TransitionKernel`-like object.
      num_adaptation_steps: Scalar `int` `Tensor` number of initial steps
        during which to adjust the step size. This may be greater than, less
        than, or equal to the number of burnin steps.
      target_accept_prob: A floating point `Tensor` representing desired
        acceptance probability. Must be a positive number less than 1. This can
        either be a scalar, or have shape [num_chains]. Default value: `0.75`
        (the [center of asymptotically optimal rate for HMC][1]).
      adaptation_rate: `Tensor` representing amount to scale the current
        `step_size`.
      step_size_setter_fn: A callable with the signature
        `(kernel_results, new_step_size) -> new_kernel_results` where
        `kernel_results` are the results of the `inner_kernel`, `new_step_size`
        is a `Tensor` or a nested collection of `Tensor`s with the same
        structure as returned by the `step_size_getter_fn`, and
        `new_kernel_results` are a copy of `kernel_results` with the step
        size(s) set.
      step_size_getter_fn: A callable with the signature
        `(kernel_results) -> step_size` where `kernel_results` are the results
        of the `inner_kernel`, and `step_size` is a floating point `Tensor` or a
        nested collection of such `Tensor`s.
      log_accept_prob_getter_fn: A callable with the signature
        `(kernel_results) -> log_accept_prob` where `kernel_results` are the
        results of the `inner_kernel`, and `log_accept_prob` is a floating point
        `Tensor`. `log_accept_prob` can either be a scalar, or have shape
        [num_chains]. If it's the latter, `step_size` should also have the same
        leading dimension.
      validate_args: Python `bool`. When `True` kernel parameters are checked
        for validity. When `False` invalid inputs may silently render incorrect
        outputs.
      name: Python `str` name prefixed to Ops created by this class. Default:
        'simple_step_size_adaptation'.

    #### References

    [1]: Betancourt, M. J., Byrne, S., & Girolami, M. (2014). _Optimizing The
         Integrator Step Size for Hamiltonian Monte Carlo_.
         http://arxiv.org/abs/1411.6669
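
    #### Example

    A hedged sketch of wrapping an HMC transition kernel (assuming this class
    is exposed as `tfp.mcmc.SimpleStepSizeAdaptation`):

    ```python
    import tensorflow_probability as tfp

    hmc = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=lambda x: -x**2 / 2.,
        step_size=0.1,
        num_leapfrog_steps=3)
    adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation(
        inner_kernel=hmc,
        num_adaptation_steps=400,
        target_accept_prob=0.75)
    ```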
    """

        inner_kernel = mcmc_util.enable_store_parameters_in_results(
            inner_kernel)

        with tf.name_scope(
                mcmc_util.make_name(name, 'simple_step_size_adaptation',
                                    '__init__')) as name:
            dtype = dtype_util.common_dtype(
                [target_accept_prob, adaptation_rate], tf.float32)
            target_accept_prob = tf.convert_to_tensor(
                target_accept_prob, dtype=dtype, name='target_accept_prob')
            adaptation_rate = tf.convert_to_tensor(adaptation_rate,
                                                   dtype=dtype,
                                                   name='adaptation_rate')
            num_adaptation_steps = tf.convert_to_tensor(
                num_adaptation_steps,
                dtype=tf.int32,
                name='num_adaptation_steps')

            target_accept_prob = _maybe_validate_target_accept_prob(
                target_accept_prob, validate_args)

        self._parameters = dict(
            inner_kernel=inner_kernel,
            num_adaptation_steps=num_adaptation_steps,
            target_accept_prob=target_accept_prob,
            adaptation_rate=adaptation_rate,
            step_size_setter_fn=step_size_setter_fn,
            step_size_getter_fn=step_size_getter_fn,
            log_accept_prob_getter_fn=log_accept_prob_getter_fn,
            name=name,
        )
Example No. 22
0
    def _sample_n(self, n, seed=None):
        if self._use_static_graph:
            # This sampling approach is almost the same as the approach used by
            # `MixtureSameFamily`. The differences are due to having a list of
            # `Distribution` objects rather than a single object, and maintaining
            # random seed management that is consistent with the non-static code
            # path.
            samples = []
            cat_samples = self.cat.sample(n, seed=seed)
            stream = SeedStream(seed, salt='Mixture')

            for c in range(self.num_components):
                samples.append(self.components[c].sample(n, seed=stream()))
            stack_axis = -1 - tensorshape_util.rank(self._static_event_shape)
            x = tf.stack(samples, axis=stack_axis)  # [n, B, k, E]
            npdt = dtype_util.as_numpy_dtype(x.dtype)
            mask = tf.one_hot(
                indices=cat_samples,  # [n, B]
                depth=self._num_components,  # == k
                on_value=npdt(1),
                off_value=npdt(0))  # [n, B, k]
            mask = distribution_util.pad_mixture_dimensions(
                mask, self, self._cat,
                tensorshape_util.rank(
                    self._static_event_shape))  # [n, B, k, [1]*e]
            return tf.reduce_sum(x * mask, axis=stack_axis)  # [n, B, E]

        n = tf.convert_to_tensor(n, name='n')
        static_n = tf.get_static_value(n)
        n = int(static_n) if static_n is not None else n
        cat_samples = self.cat.sample(n, seed=seed)

        static_samples_shape = cat_samples.shape
        if tensorshape_util.is_fully_defined(static_samples_shape):
            samples_shape = tensorshape_util.as_list(static_samples_shape)
            samples_size = tensorshape_util.num_elements(static_samples_shape)
        else:
            samples_shape = tf.shape(cat_samples)
            samples_size = tf.size(cat_samples)
        static_batch_shape = self.batch_shape
        if tensorshape_util.is_fully_defined(static_batch_shape):
            batch_shape = tensorshape_util.as_list(static_batch_shape)
            batch_size = tensorshape_util.num_elements(static_batch_shape)
        else:
            batch_shape = tf.shape(cat_samples)[1:]
            batch_size = tf.reduce_prod(batch_shape)
        static_event_shape = self.event_shape
        if tensorshape_util.is_fully_defined(static_event_shape):
            event_shape = np.array(
                tensorshape_util.as_list(static_event_shape), dtype=np.int32)
        else:
            event_shape = None

        # Get indices into the raw cat sampling tensor. We will
        # need these to stitch sample values back out after sampling
        # within the component partitions.
        samples_raw_indices = tf.reshape(tf.range(0, samples_size),
                                         samples_shape)

        # Partition the raw indices so that we can use
        # dynamic_stitch later to reconstruct the samples from the
        # known partitions.
        partitioned_samples_indices = tf.dynamic_partition(
            data=samples_raw_indices,
            partitions=cat_samples,
            num_partitions=self.num_components)

        # Copy the batch indices n times, as we will need to know
        # these to pull out the appropriate rows within the
        # component partitions.
        batch_raw_indices = tf.reshape(tf.tile(tf.range(0, batch_size), [n]),
                                       samples_shape)

        # Explanation of the dynamic partitioning below:
        #   batch indices are, e.g., [0, 1, 0, 1, 0, 1]
        # Suppose partitions are:
        #     [1 1 0 0 1 1]
        # After partitioning, batch indices are cut as:
        #     [batch_indices[x] for x in 2, 3]
        #     [batch_indices[x] for x in 0, 1, 4, 5]
        # i.e.
        #     [0 1] and [0 1 0 1]
        # Now we sample n=2 from part 0 and n=4 from part 1.
        # For part 0 we want samples from batch entries 0, 1 (samples 0, 1),
        # and for part 1 we want samples from batch entries 0, 1, 0, 1
        #   (samples 0, 1, 2, 3).
        partitioned_batch_indices = tf.dynamic_partition(
            data=batch_raw_indices,
            partitions=cat_samples,
            num_partitions=self.num_components)
        samples_class = [None for _ in range(self.num_components)]

        stream = SeedStream(seed, salt='Mixture')

        for c in range(self.num_components):
            n_class = tf.size(partitioned_samples_indices[c])
            samples_class_c = self.components[c].sample(n_class, seed=stream())

            if event_shape is None:
                batch_ndims = prefer_static.rank_from_shape(batch_shape)
                event_shape = tf.shape(samples_class_c)[1 + batch_ndims:]

            # Pull out the correct batch entries from each index.
            # To do this, we may have to flatten the batch shape.

            # For sample s, batch element b of component c, we get the
            # partitioned batch indices from
            # partitioned_batch_indices[c]; and shift each element by
            # the sample index. The final lookup can be thought of as
            # a matrix gather along locations (s, b) in
            # samples_class_c where the n_class rows correspond to
            # samples within this component and the batch_size columns
            # correspond to batch elements within the component.
            #
            # Thus the lookup index is
            #   lookup[c, i] = batch_size * s[i] + b[c, i]
            # for i = 0 ... n_class[c] - 1.
            lookup_partitioned_batch_indices = (
                batch_size * tf.range(n_class) + partitioned_batch_indices[c])
            samples_class_c = tf.reshape(
                samples_class_c,
                tf.concat([[n_class * batch_size], event_shape], 0))
            samples_class_c = tf.gather(samples_class_c,
                                        lookup_partitioned_batch_indices,
                                        name='samples_class_c_gather')
            samples_class[c] = samples_class_c

        # Stitch back together the samples across the components.
        lhs_flat_ret = tf.dynamic_stitch(indices=partitioned_samples_indices,
                                         data=samples_class)
        # Reshape back to proper sample, batch, and event shape.
        ret = tf.reshape(lhs_flat_ret,
                         tf.concat([samples_shape, event_shape], 0))
        tensorshape_util.set_shape(
            ret,
            tensorshape_util.concatenate(static_samples_shape,
                                         self.event_shape))
        return ret
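
# A minimal round-trip sketch of the `tf.dynamic_partition` /
# `tf.dynamic_stitch` mechanism used above (illustrative only; not part of
# the sampling code).
import tensorflow as tf

data = tf.constant([10, 11, 12, 13, 14, 15])
partitions = tf.constant([1, 1, 0, 0, 1, 1])
parts = tf.dynamic_partition(data, partitions, num_partitions=2)
# parts[0] == [12, 13]; parts[1] == [10, 11, 14, 15]
indices = tf.dynamic_partition(tf.range(6), partitions, num_partitions=2)
restored = tf.dynamic_stitch(indices, parts)
# restored == [10, 11, 12, 13, 14, 15]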
Example No. 23
0
def cholesky_covariance(x, sample_axis=0, keepdims=False, name=None):
    """Cholesky factor of the covariance matrix of vector-variate random samples.

  This function can be used to fit a multivariate normal to data.

  ```python
  tf.enable_eager_execution()
  import tensorflow_probability as tfp
  tfd = tfp.distributions

  # Assume data.shape = (1000, 2).  1000 samples of a random variable in R^2.
  observed_data = read_data_samples(...)

  # The mean is easy
  mu = tf.reduce_mean(observed_data, axis=0)

  # Get the scale matrix
  L = tfp.stats.cholesky_covariance(observed_data)

  # Make the best fit multivariate normal (under maximum likelihood condition).
  mvn = tfd.MultivariateNormalTriL(loc=mu, scale_tril=L)

  # Plot contours of the pdf.
  xs, ys = tf.meshgrid(
      tf.linspace(-5., 5., 50), tf.linspace(-5., 5., 50), indexing='ij')
  xy = tf.stack((tf.reshape(xs, [-1]), tf.reshape(ys, [-1])), axis=-1)
  pdf = tf.reshape(mvn.prob(xy), (50, 50))
  CS = plt.contour(xs, ys, pdf, 10)
  plt.clabel(CS, inline=1, fontsize=10)
  ```

  Why does this work?
  Given vector-variate random variables `X = (X1, ..., Xd)`, one may obtain the
  sample covariance matrix in `R^{d x d}` (see `tfp.stats.covariance`).

  The [Cholesky factor](https://en.wikipedia.org/wiki/Cholesky_decomposition)
  of this matrix is analogous to standard deviation for scalar random variables:
  Suppose `X` has covariance matrix `C`, with Cholesky factorization
  `C = L L^T`. Then multiplying a vector of iid random variables with unit
  variance by `L` produces a vector with covariance `L L^T`, which is the same
  as the covariance of `X`.

  ```python
  observed_data = read_data_samples(...)
  L = tfp.stats.cholesky_covariance(observed_data, sample_axis=0)

  # Make fake_data with the same covariance as observed_data.
  uncorrelated_normal = tf.random.normal(shape=(500, 10))
  fake_data = tf.linalg.matvec(L, uncorrelated_normal)
  ```

  Args:
    x:  Numeric `Tensor`.  The rightmost dimension of `x` indexes events. E.g.
      dimensions of a random vector.
    sample_axis: Scalar or vector `Tensor` designating axis holding samples.
      Default value: `0` (leftmost dimension). Cannot be the rightmost dimension
        (since this indexes events).
    keepdims:  Boolean.  Whether to keep the sample axis as singletons.
    name: Python `str` name prefixed to Ops created by this function.
          Default value: `None` (i.e., `'cholesky_covariance'`).

  Returns:
    chol:  `Tensor` of same `dtype` as `x`.  The last two dimensions hold
      lower triangular matrices (the Cholesky factors).
  """
    with tf.name_scope(name or 'cholesky_covariance'):
        sample_axis = tf.convert_to_tensor(sample_axis, dtype=tf.int32)
        cov = covariance(x,
                         sample_axis=sample_axis,
                         event_axis=-1,
                         keepdims=keepdims)
        return tf.linalg.cholesky(cov)
Example No. 24
0
 def _log_prob_with_logcdf(self, y):
     low = None if self._low is None else tf.convert_to_tensor(self._low)
     high = None if self._high is None else tf.convert_to_tensor(self._high)
     return _logsum_expbig_minus_expsmall(
         self.log_cdf(y, low=low, high=high),
         self.log_cdf(y - 1, low=low, high=high))
Example No. 25
0
def auto_correlation(x,
                     axis=-1,
                     max_lags=None,
                     center=True,
                     normalize=True,
                     name='auto_correlation'):
    """Auto correlation along one axis.

  Given a `1-D` wide sense stationary (WSS) sequence `X`, the auto correlation
  `RXX` may be defined as  (with `E` expectation and `Conj` complex conjugate)

  ```
  RXX[m] := E{ W[m] Conj(W[0]) } = E{ W[0] Conj(W[-m]) },
  W[n]   := (X[n] - MU) / S,
  MU     := E{ X[0] },
  S**2   := E{ (X[0] - MU) Conj(X[0] - MU) }.
  ```

  This function takes the viewpoint that `x` is (along one axis) a finite
  sub-sequence of a realization of (WSS) `X`, and then uses `x` to produce an
  estimate of `RXX[m]` as follows:

  After extending `x` from length `L` to `inf` by zero padding, the auto
  correlation estimate `rxx[m]` is computed for `m = 0, 1, ..., max_lags` as

  ```
  rxx[m] := (L - m)**-1 sum_n w[n + m] Conj(w[n]),
  w[n]   := (x[n] - mu) / s,
  mu     := L**-1 sum_n x[n],
  s**2   := L**-1 sum_n (x[n] - mu) Conj(x[n] - mu)
  ```

  The error in this estimate is proportional to `1 / sqrt(len(x) - m)`, so users
  often set `max_lags` small enough so that the entire output is meaningful.

  Note that since `mu` is an imperfect estimate of `E{ X[0] }`, and we divide by
  `len(x) - m` rather than `len(x) - m - 1`, our estimate of auto correlation
  contains a slight bias, which goes to zero as `len(x) - m --> infinity`.
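
  For intuition, `rxx` can also be computed directly, without the FFT
  machinery used in the implementation below (an illustrative sketch for a
  real-valued 1-D `x`, not the library code):

  ```python
  import tensorflow as tf

  def naive_auto_correlation(x, max_lags):
    # Direct O(L * max_lags) evaluation of rxx[m] as defined above.
    x = tf.convert_to_tensor(x, dtype=tf.float32)
    length = tf.cast(tf.size(x), x.dtype)
    w = (x - tf.reduce_mean(x)) / tf.math.reduce_std(x)
    rxx = []
    for m in range(max_lags + 1):
      rxx.append(
          tf.reduce_sum(w[m:] * w[:tf.shape(w)[0] - m]) / (length - m))
    return tf.stack(rxx)
  ```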

  Args:
    x:  `float32` or `complex64` `Tensor`.
    axis:  Python `int`. The axis number along which to compute correlation.
      Other dimensions index different batch members.
    max_lags:  Positive `int` tensor.  The maximum value of `m` to consider (in
      equation above).  If `max_lags >= x.shape[axis]`, we effectively re-set
      `max_lags` to `x.shape[axis] - 1`.
    center:  Python `bool`.  If `False`, do not subtract the mean estimate `mu`
      from `x[n]` when forming `w[n]`.
    normalize:  Python `bool`.  If `False`, do not divide by the variance
      estimate `s**2` when forming `w[n]`.
    name:  `String` name to prepend to created ops.

  Returns:
    `rxx`: `Tensor` of same `dtype` as `x`.  `rxx.shape[i] = x.shape[i]` for
      `i != axis`, and `rxx.shape[axis] = max_lags + 1`.

  Raises:
    TypeError:  If `x` is not a supported type.
  """
    # Implementation details:
    # Extend length N / 2 1-D array x to length N by zero padding onto the end.
    # Then, set
    #   F[x]_k := sum_n x_n exp{-i 2 pi k n / N }.
    # It is not hard to see that
    #   F[x]_k Conj(F[x]_k) = F[R]_k, where
    #   R_m := sum_n x_n Conj(x_{(n - m) mod N}).
    # One can also check that R_m / (N / 2 - m) is an unbiased estimate of RXX[m].

    # Since F[x] is the DFT of x, this leads us to a zero-padding and FFT/IFFT
    # based version of estimating RXX.
    # Note that this is a special case of the Wiener-Khinchin Theorem.
    with tf.name_scope(name):
        x = tf.convert_to_tensor(x, name='x')

        # Rotate dimensions of x in order to put axis at the rightmost dim.
        # FFT op requires this.
        rank = ps.rank(x)
        if axis < 0:
            axis = rank + axis
        shift = rank - 1 - axis
        # Suppose x.shape[axis] = T, so there are T 'time' steps.
        #   ==> x_rotated.shape = B + [T],
        # where B is x_rotated's batch shape.
        x_rotated = distribution_util.rotate_transpose(x, shift)

        if center:
            x_rotated = x_rotated - tf.reduce_mean(
                x_rotated, axis=-1, keepdims=True)

        # x_len = N / 2 from above explanation.  The length of x along axis.
        # Get a value for x_len that works in all cases.
        x_len = ps.shape(x_rotated)[-1]

        # TODO(langmore) Investigate whether this zero padding helps or hurts.  At
        # the moment it is necessary so that all FFT implementations work.
        # Zero pad to the next power of 2 greater than 2 * x_len, which equals
        # 2**(ceil(Log_2(2 * x_len))).  Note: Log_2(X) = Log_e(X) / Log_e(2).
        x_len_float64 = ps.cast(x_len, np.float64)
        target_length = ps.pow(np.float64(2.),
                               ps.ceil(ps.log(x_len_float64 * 2) / np.log(2.)))
        pad_length = ps.cast(target_length - x_len_float64, np.int32)

        # We should have:
        # x_rotated_pad.shape = x_rotated.shape[:-1] + [T + pad_length]
        #                     = B + [T + pad_length]
        x_rotated_pad = distribution_util.pad(x_rotated,
                                              axis=-1,
                                              back=True,
                                              count=pad_length)

        dtype = x.dtype
        if not dtype_util.is_complex(dtype):
            if not dtype_util.is_floating(dtype):
                raise TypeError(
                    'Argument x must have either float or complex dtype'
                    ' found: {}'.format(dtype))
            x_rotated_pad = tf.complex(
                x_rotated_pad,
                dtype_util.as_numpy_dtype(dtype_util.real_dtype(dtype))(0.))

        # Autocorrelation is IFFT of power-spectral density (up to some scaling).
        fft_x_rotated_pad = tf.signal.fft(x_rotated_pad)
        spectral_density = fft_x_rotated_pad * tf.math.conj(fft_x_rotated_pad)
        # shifted_product is R[m] from above detailed explanation.
        # It is the inner product sum_n X[n] * Conj(X[n - m]).
        shifted_product = tf.signal.ifft(spectral_density)

        # Cast back to real-valued if x was real to begin with.
        shifted_product = tf.cast(shifted_product, dtype)

        # Figure out if we can deduce the final static shape, and set max_lags.
        # Use x_rotated as a reference, because it has the time dimension in the far
        # right, and was created before we performed all sorts of crazy shape
        # manipulations.
        know_static_shape = True
        if not tensorshape_util.is_fully_defined(x_rotated.shape):
            know_static_shape = False
        if max_lags is None:
            max_lags = x_len - 1
        else:
            max_lags = tf.convert_to_tensor(max_lags, name='max_lags')
            max_lags_ = tf.get_static_value(max_lags)
            if max_lags_ is None or not know_static_shape:
                know_static_shape = False
                max_lags = tf.minimum(x_len - 1, max_lags)
            else:
                max_lags = min(x_len - 1, max_lags_)

        # Chop off the padding.
        # We allow users to provide a huge max_lags, but cut it off here.
        # shifted_product_chopped.shape = x_rotated.shape[:-1] + [max_lags]
        shifted_product_chopped = shifted_product[..., :max_lags + 1]

        # If possible, set shape.
        if know_static_shape:
            chopped_shape = tensorshape_util.as_list(x_rotated.shape)
            chopped_shape[-1] = min(x_len, max_lags + 1)
            tensorshape_util.set_shape(shifted_product_chopped, chopped_shape)

        # Recall R[m] is a sum of N / 2 - m nonzero terms x[n] Conj(x[n - m]).  The
        # other terms were zeros arising only due to zero padding.
        # `denominator = (N / 2 - m)` (defined below) is the proper term to
        # divide by to make this an unbiased estimate of the expectation
        # E[X[n] Conj(X[n - m])].
        x_len = ps.cast(x_len, dtype_util.real_dtype(dtype))
        max_lags = ps.cast(max_lags, dtype_util.real_dtype(dtype))
        denominator = x_len - ps.range(0., max_lags + 1.)
        denominator = ps.cast(denominator, dtype)
        shifted_product_rotated = shifted_product_chopped / denominator

        if normalize:
            shifted_product_rotated /= shifted_product_rotated[..., :1]

        # Transpose dimensions back to those of x.
        return distribution_util.rotate_transpose(shifted_product_rotated,
                                                  -shift)
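
For intuition, the FFT-based computation above matches (up to numerics) the direct
estimator described in the docstring. A brute-force sketch for a `1-D` input, using
NumPy purely for illustration:

```python
import numpy as np

def auto_correlation_naive(x, max_lags):
    # Direct rxx[m] = (L - m)**-1 sum_n w[n + m] * conj(w[n]), with
    # w[n] = (x[n] - mu) / s as in the docstring (center=True, normalize=True).
    x = np.asarray(x)
    length = x.shape[-1]
    w = (x - x.mean()) / x.std()  # population std, so rxx[0] == 1 exactly
    return np.array([np.sum(w[m:] * np.conj(w[:length - m])) / (length - m)
                     for m in range(max_lags + 1)])
```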
Ejemplo n.º 26
0
 def _prob_with_cdf(self, y):
     low = None if self._low is None else tf.convert_to_tensor(self._low)
     high = None if self._high is None else tf.convert_to_tensor(self._high)
     return self._cdf(y, low=low, high=high) - self._cdf(
         y - 1, low=low, high=high)
Ejemplo n.º 27
0
def sqrt_with_finite_grads(x, name=None):
    """A sqrt function whose gradient at zero is very large but finite.

  Args:
    x: a `Tensor` whose sqrt is to be computed.
    name: a Python `str` prefixed to all ops created by this function.
      Default `None` (i.e., "sqrt_with_finite_grads").

  Returns:
    sqrt: the square root of `x`, with an overridden gradient at zero
    grad: a gradient function, which is the same as sqrt's gradient everywhere
      except at zero, where it is given a large finite value, instead of `inf`.

  Raises:
    TypeError: if `tf.convert_to_tensor(x)` is not a `float` type.

  Often in kernel functions, we need to compute the L2 norm of the difference
  between two vectors, `x` and `y`: `sqrt(sum_i((x_i - y_i) ** 2))`. In the
  case where `x` and `y` are identical, e.g., on the diagonal of a kernel
  matrix, we get `NaN`s when we take gradients with respect to the inputs. To
  see this, consider the forward pass:

    ```
    [x_1 ... x_N]  -->  [x_1 ** 2 ... x_N ** 2]  -->
        (x_1 ** 2 + ... + x_N ** 2)  -->  sqrt((x_1 ** 2 + ... + x_N ** 2))
    ```

  When we backprop through this forward pass, the `sqrt` yields an `inf` because
  `grad_z(sqrt(z)) = 1 / (2 * sqrt(z))`. Continuing the backprop to the left, at
  the `x ** 2` term, we pick up a `2 * x`, and when `x` is zero, we get
  `0 * inf`, which is `NaN`.

  We'd like to avoid these `NaN`s, since they infect the rest of the connected
  computation graph. Practically, when two inputs to a kernel function are
  equal, we are in one of two scenarios:
    1. We are actually computing k(x, x), in which case norm(x - x) is
       identically zero, independent of x. In this case, we'd like the
       gradient to reflect this independence: it should be zero.
    2. We are computing k(x, y), and x just *happens* to have the same value
       as y. The gradient at such inputs is in fact ill-defined (there is a
       cusp in the sqrt((x - y) ** 2) surface along the line x = y). There are,
       however, an infinite number of sub-gradients, all of which are valid at
       all such inputs. By symmetry, there is exactly one which is "special":
       zero, and we elect to use that value here. In practice, having two
       identical inputs to a kernel matrix is probably a pathological
       situation to be avoided, but that is better resolved at a higher level
       than this.

  To avoid the infinite gradient at zero, we use tf.custom_gradient to redefine
  the gradient at zero. We assign it to be a very large value, specifically
  the sqrt of the max value of the floating point dtype of the input. We use
  the sqrt (as opposed to just using the max floating point value) to avoid
  potential overflow when combining this value with others downstream.
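
  For example (a sketch; the gradient at zero below equals the square root of
  the `float32` max, roughly `1.84e19`):

  ```python
  x = tf.constant([0., 4.])
  with tf.GradientTape() as tape:
    tape.watch(x)
    y = sqrt_with_finite_grads(x)
  grads = tape.gradient(y, x)
  # grads == [~1.84e19, 0.25]: finite at zero, the usual 1 / (2 * sqrt(x)) elsewhere.
  ```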
  """
    with tf.name_scope(name or 'sqrt_with_finite_grads'):
        x = tf.convert_to_tensor(value=x, name='x')
        if not x.dtype.is_floating:
            raise TypeError('Input `x` must be floating type.')

        def grad(grad_ys):
            large_float_like_x = np.sqrt(
                np.finfo(dtype_util.as_numpy_dtype(x.dtype)).max)
            safe_grads = tf.where(tf.equal(x, 0), large_float_like_x,
                                  0.5 * tf.math.rsqrt(x))
            return grad_ys * safe_grads

        return tf.sqrt(x), grad
Ejemplo n.º 28
0
 def testAssertParamsAreFloats(self):
     loc = tf.convert_to_tensor(0, dtype=tf.int32)
     scale = tf.convert_to_tensor(1, dtype=tf.int32)
     with self.assertRaisesRegex(ValueError, 'Expected floating point'):
         tfd.Laplace(loc=loc, scale=scale)
Ejemplo n.º 29
0
def soft_threshold(x, threshold, name=None):
    """Soft Thresholding operator.

  This operator is defined by the equations

  ```none
                                { x[i] - gamma,  x[i] >   gamma
  SoftThreshold(x, gamma)[i] =  { 0,             -gamma <= x[i] <= gamma
                                { x[i] + gamma,  x[i] <  -gamma
  ```

  In the context of proximal gradient methods, we have

  ```none
  SoftThreshold(x, gamma) = prox_{gamma L1}(x)
  ```

  where `prox` is the proximity operator.  Thus the soft thresholding operator
  is used in proximal gradient descent for optimizing a smooth function with
  (non-smooth) L1 regularization, as outlined below.

  The proximity operator is defined as:

  ```none
  prox_r(x) = argmin{ r(z) + 0.5 ||x - z||_2**2 : z },
  ```

  where `r` is a (weakly) convex function, not necessarily differentiable.
  Because the squared L2 norm is strictly convex, the above argmin is unique.

  One important application of the proximity operator is as follows.  Let `L` be
  a convex and differentiable function with Lipschitz-continuous gradient.  Let
  `R` be a convex lower semicontinuous function which is possibly
  nondifferentiable.  Let `gamma` be an arbitrary positive real.  Then

  ```none
  x_star = argmin{ L(x) + R(x) : x }
  ```

  if and only if the fixed-point equation is satisfied:

  ```none
  x_star = prox_{gamma R}(x_star - gamma grad L(x_star))
  ```

  Proximal gradient descent thus typically consists of choosing an initial value
  `x^{(0)}` and repeatedly applying the update

  ```none
  x^{(k+1)} = prox_{gamma^{(k)} R}(x^{(k)} - gamma^{(k)} grad L(x^{(k)}))
  ```

  where `gamma` is allowed to vary from iteration to iteration.  Specializing to
  the case where `R(x) = ||x||_1`, we minimize `L(x) + ||x||_1` by repeatedly
  applying the update

  ```none
  x^{(k+1)} = SoftThreshold(x^{(k)} - gamma grad L(x^{(k)}), gamma)
  ```

  (This idea can also be extended to second-order approximations, although the
  multivariate case does not have a known closed form like above.)

  Args:
    x: `float` `Tensor` representing the input to the SoftThreshold function.
    threshold: nonnegative scalar, `float` `Tensor` representing the radius of
      the interval on which each coordinate of SoftThreshold takes the value
      zero.  Denoted `gamma` above.
    name: Python string indicating the name of the TensorFlow operation.
      Default value: `'soft_threshold'`.

  Returns:
    softthreshold: `float` `Tensor` with the same shape and dtype as `x`,
      representing the value of the SoftThreshold function.

  #### References

  [1]: Yu, Yao-Liang. The Proximity Operator.
       https://www.cs.cmu.edu/~suvrit/teach/yaoliang_proximity.pdf

  [2]: Wikipedia Contributors. Proximal gradient methods for learning.
       _Wikipedia, The Free Encyclopedia_, 2018.
       https://en.wikipedia.org/wiki/Proximal_gradient_methods_for_learning

  """
    # https://math.stackexchange.com/questions/471339/derivation-of-soft-thresholding-operator
    with tf.name_scope(name or 'soft_threshold'):
        x = tf.convert_to_tensor(x, name='x')
        threshold = tf.convert_to_tensor(threshold,
                                         dtype=x.dtype,
                                         name='threshold')
        return tf.sign(x) * tf.maximum(tf.abs(x) - threshold, 0.)
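
A quick numeric check of the definition above (a sketch):

```python
import tensorflow as tf

soft_threshold(tf.constant([-3., 0.5, 2.]), threshold=1.)
# ==> [-2., 0., 1.]
```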
Ejemplo n.º 30
0
    def __init__(self,
                 maturity_dates: types.DateTensor,
                 discount_factors: tf.Tensor,
                 valuation_date: types.DateTensor,
                 interpolator: Optional[_InterpolationMethod] = None,
                 interpolate_rates: Optional[bool] = True,
                 daycount_convention: Optional[
                     _DayCountConventionsProtoType] = None,
                 curve_type: Optional[curve_types.CurveType] = None,
                 dtype: Optional[tf.DType] = None,
                 name: Optional[str] = None):
        """Initializes the interest rate curve.

    Args:
      maturity_dates: A `DateTensor` containing the maturity dates on which the
        curve is specified.
      discount_factors: A `Tensor` of real dtype specifying the discount factors
        corresponding to the input maturities. The shape of this input should
        match the shape of `maturity_dates`.
      valuation_date: A scalar `DateTensor` specifying the valuation (or
        settlement) date for the curve.
      interpolator: An instance of `InterpolationMethod`.
        Default value: `None` in which case cubic interpolation is used.
      interpolate_rates: A boolean specifying whether the interpolation should
        be done in discount rates or discount factors space.
        Default value: `True`, i.e., interpolation is done in the discount
        rates space.
      daycount_convention: `DayCountConventions` to use for the interpolation
        purpose.
        Default value: `None` which maps to actual/365 day count convention.
      curve_type: An instance of `CurveTypes` to mark the rate curve.
        Default value: `None` which means that the curve does not have the
          marker.
      dtype: `tf.DType`. Optional input specifying the dtype of the
        `discount_factors` input.
      name: Python str. The name to give to the ops created by this function.
        Default value: `None` which maps to 'rate_curve'.
    """
        self._name = name or "rate_curve"
        with tf.compat.v1.name_scope(self._name):
            self._discount_factor_nodes = tf.convert_to_tensor(
                discount_factors, dtype=dtype, name="curve_discount_factors")
            self._dtype = dtype or self._discount_factor_nodes.dtype
            if interpolator is None or interpolator == _InterpolationMethod.CUBIC:

                def cubic_interpolator(xi, x, y):
                    spline_coeffs = math.interpolation.cubic.build_spline(x, y)
                    return math.interpolation.cubic.interpolate(xi,
                                                                spline_coeffs,
                                                                dtype=dtype)

                interpolator = cubic_interpolator
                self._interpolation_method = _InterpolationMethod.CUBIC
            elif interpolator == _InterpolationMethod.LINEAR:

                def linear_interpolator(xi, x, y):
                    return math.interpolation.linear.interpolate(xi,
                                                                 x,
                                                                 y,
                                                                 dtype=dtype)

                interpolator = linear_interpolator
                self._interpolation_method = _InterpolationMethod.LINEAR
            elif interpolator == _InterpolationMethod.CONSTANT_FORWARD:

                def constant_fwd(xi, x, y):
                    return rates_lib.constant_fwd.interpolate(xi,
                                                              x,
                                                              y,
                                                              dtype=dtype)

                interpolator = constant_fwd
                self._interpolation_method = _InterpolationMethod.CONSTANT_FORWARD
            else:
                raise ValueError(
                    f"Unknown interpolation method {interpolator}.")
            self._dates = dateslib.convert_to_date_tensor(maturity_dates)
            self._valuation_date = dateslib.convert_to_date_tensor(
                valuation_date)

            self._daycount_convention = (daycount_convention
                                         or _DayCountConventions.ACTUAL_365)
            self._day_count_fn = utils.get_daycount_fn(
                self._daycount_convention)
            self._times = self._get_time(self._dates)
            self._interpolator = interpolator
            self._interpolate_rates = interpolate_rates
            # Precompute discount rates:
            self._curve_type = curve_type