# Inner helper of a coordinate-descent update loop. The names `coord`, `delta`,
# `hessian_unregularized_loss_outer`, `hessian_unregularized_loss_middle` and
# `l2_regularizer` are free variables closed over from the enclosing scope.
def _do_update(x_update_diff_norm_sq, x_update,
               hess_matmul_x_update):  # pylint: disable=missing-docstring
  hessian_column_with_l2 = sparse_or_dense_matvecmul(
      hessian_unregularized_loss_outer,
      hessian_unregularized_loss_middle *
      _sparse_or_dense_matmul_onehot(
          hessian_unregularized_loss_outer, coord),
      adjoint_a=True)

  if l2_regularizer is not None:
    hessian_column_with_l2 += _one_hot_like(
        hessian_column_with_l2, coord, on_value=2. * l2_regularizer)

  # Move the batch dimensions of `hessian_column_with_l2` to rightmost in
  # order to conform to `hess_matmul_x_update`.
  n = tf.rank(hessian_column_with_l2)
  perm = tf.roll(tf.range(n), shift=1, axis=0)
  hessian_column_with_l2 = tf.transpose(a=hessian_column_with_l2, perm=perm)

  # Update the entire batch at `coord` even if `delta` may be 0 at some
  # batch coordinates. In those cases, adding `delta` is a no-op.
  x_update = tf.tensor_scatter_nd_add(x_update, [[coord]], [delta])

  with tf.control_dependencies([x_update]):
    x_update_diff_norm_sq_ = x_update_diff_norm_sq + delta**2
    hess_matmul_x_update_ = (hess_matmul_x_update +
                             delta * hessian_column_with_l2)

    # Hint that the loop variables retain the same shape across iterations.
    x_update_diff_norm_sq_.set_shape(
        x_update_diff_norm_sq_.shape.merge_with(x_update_diff_norm_sq.shape))
    hess_matmul_x_update_.set_shape(
        hess_matmul_x_update_.shape.merge_with(hess_matmul_x_update.shape))

    return [x_update_diff_norm_sq_, x_update, hess_matmul_x_update_]
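
The `tf.roll` / `tf.transpose` pair above rotates the trailing (feature) axis to the front so that any batch dimensions end up rightmost. A minimal standalone sketch of just that permutation on a toy tensor (shapes and names here are illustrative only):

import tensorflow as tf

# Toy tensor with two batch dimensions and a trailing feature dimension.
x = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), [2, 3, 4])  # [b1, b2, n]

n = tf.rank(x)                                # 3
perm = tf.roll(tf.range(n), shift=1, axis=0)  # [2, 0, 1]
y = tf.transpose(a=x, perm=perm)              # shape [4, 2, 3]

print(y.shape)  # (4, 2, 3): feature axis first, batch axes now rightmost
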
Example 2
def _grad_neg_log_likelihood_and_fim(model_matrix, linear_response, response,
                                     model):
    """Computes the neg-log-likelihood gradient and Fisher information for a GLM.

  Note that Fisher information is related to the Hessian of the log-likelihood
  by the equation

  ```none
  FisherInfo = E[Hessian with respect to model_coefficients of -LogLikelihood(
      Y | model_matrix, model_coefficients)]
  ```

  where `LogLikelihood` is the log-likelihood of a generalized linear model
  parameterized by `model_matrix` and `model_coefficients`, and the expectation
  is taken over Y, distributed according to the same GLM with the same parameter
  values.

  Args:
    model_matrix: (Batch of) matrix-shaped, `float` `Tensor` or `SparseTensor`
      where each row represents a sample's features.  Has shape `[N, n]` where
      `N` is the number of data samples and `n` is the number of features per
      sample.
    linear_response: (Batch of) vector-shaped `Tensor` with the same dtype as
      `model_matrix`, equal to `model_matix @ model_coefficients` where
      `model_coefficients` are the coefficients of the linear component of the
      GLM.
    response: (Batch of) vector-shaped `Tensor` with the same dtype as
      `model_matrix` where each element represents a sample's observed response
      (to the corresponding row of features).
    model: `tfp.glm.ExponentialFamily`-like instance, which specifies the link
      function and distribution of the GLM, and thus characterizes the negative
      log-likelihood. Must have sufficient statistic equal to the response, that
      is, `T(y) = y`.

  Returns:
    grad_neg_log_likelihood: (Batch of) vector-shaped `Tensor` with the same
      shape and dtype as a single row of `model_matrix`, representing the
      gradient of the negative log likelihood of `response` given linear
      response `linear_response`.
    fim_middle: (Batch of) vector-shaped `Tensor` with the same shape and dtype
      as a single column of `model_matrix`, satisfying the equation
      `Fisher information =
      Transpose(model_matrix)
      @ diag(fim_middle)
      @ model_matrix`.
  """
    # TODO(b/111926503): Determine whether there are some practical cases where it
    # is computationally favorable to compute the full FIM.
    mean, variance, grad_mean = model(linear_response)

    is_valid = (tf.math.is_finite(grad_mean) & tf.not_equal(grad_mean, 0.)
                & tf.math.is_finite(variance) & (variance > 0.))

    def _mask_if_invalid(x, mask):
        return tf.where(is_valid, x,
                        np.array(mask, dtype_util.as_numpy_dtype(x.dtype)))

    # TODO(b/111923449): Link to derivation once it's available.
    v = (response - mean) * _mask_if_invalid(grad_mean, 1) / _mask_if_invalid(
        variance, np.inf)
    grad_log_likelihood = sparse_or_dense_matvecmul(model_matrix,
                                                    v,
                                                    adjoint_a=True)
    fim_middle = _mask_if_invalid(grad_mean, 0.)**2 / _mask_if_invalid(
        variance, np.inf)
    return -grad_log_likelihood, fim_middle
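
Since the docstring states `Fisher information = Transpose(model_matrix) @ diag(fim_middle) @ model_matrix`, the full FIM can be reassembled from the returned `fim_middle` when needed. A hypothetical usage sketch on toy logistic-regression data, assuming the function above and its module-level helpers (`sparse_or_dense_matvecmul`, `dtype_util`, `np`) are in scope; all data values are illustrative:

import tensorflow as tf
import tensorflow_probability as tfp

x = tf.constant([[1., 0.5], [1., -1.2], [1., 2.0]])  # model_matrix, shape [N=3, n=2]
coefficients = tf.constant([0.3, -0.7])
eta = tf.linalg.matvec(x, coefficients)              # linear_response
y = tf.constant([1., 0., 1.])                        # observed response

grad_nll, fim_middle = _grad_neg_log_likelihood_and_fim(
    model_matrix=x, linear_response=eta, response=y, model=tfp.glm.Bernoulli())

# Reassemble the full Fisher information matrix from its diagonal "middle" factor.
fim = tf.matmul(x, fim_middle[:, tf.newaxis] * x, transpose_a=True)  # shape [2, 2]
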
Example 3
# Closure over `model_matrix`, `response` and `model` from the enclosing scope.
def _neg_log_likelihood(x):
  predicted_linear_response = sparse_or_dense_matvecmul(model_matrix, x)
  log_probs = model.log_prob(response, predicted_linear_response)
  return -log_probs
Example 4
def _grad_neg_log_likelihood_and_fim_fn(x):
  predicted_linear_response = sparse_or_dense_matvecmul(model_matrix, x)
  g, h_middle = _grad_neg_log_likelihood_and_fim(
      model_matrix, predicted_linear_response, response, model)
  return g, model_matrix, h_middle
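
Both closures capture `model_matrix`, `response` and `model` from an enclosing scope, and the triple returned by `_grad_neg_log_likelihood_and_fim_fn` lines up with the `gradient_unregularized_loss` / `hessian_unregularized_loss_outer` / `hessian_unregularized_loss_middle` arguments used in Example 5 below. A hypothetical evaluation with the captured names bound to toy values (assumes the helpers from the earlier examples are in scope):

import tensorflow as tf
import tensorflow_probability as tfp

# Illustrative bindings for the names the closures capture.
model_matrix = tf.constant([[1., 0.5], [1., -1.2], [1., 2.0]])
response = tf.constant([1., 0., 1.])
model = tfp.glm.Bernoulli()

coefficients = tf.zeros([2])
loss = _neg_log_likelihood(coefficients)  # per-sample negative log-likelihoods
g, outer, middle = _grad_neg_log_likelihood_and_fim_fn(coefficients)
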
Example 5
def fit_sparse_one_step(model_matrix,
                        response,
                        model,
                        model_coefficients_start,
                        tolerance,
                        l1_regularizer,
                        l2_regularizer=None,
                        maximum_full_sweeps=None,
                        learning_rate=None,
                        name=None):
  """One step of (the outer loop of) the GLM fitting algorithm.

  This function returns a new value of `model_coefficients`, equal to
  `model_coefficients_start + model_coefficients_update`.  The increment
  `model_coefficients_update in R^n` is computed by a coordinate descent method,
  that is, by a loop in which each iteration updates exactly one coordinate of
  `model_coefficients_update`.  (Some updates may leave the value of the
  coordinate unchanged.)

  The particular update method used is to apply an L1-based proximity operator,
  "soft threshold", whose fixed point `model_coefficients_update^*` is the
  desired minimizer

  ```none
  model_coefficients_update^* = argmin{
      -LogLikelihood(model_coefficients_start + model_coefficients_update')
        + l1_regularizer *
            ||model_coefficients_start + model_coefficients_update'||_1
        + l2_regularizer *
            ||model_coefficients_start + model_coefficients_update'||_2**2
      : model_coefficients_update' }
  ```

  where in each iteration `model_coefficients_update'` has at most one nonzero
  coordinate.

  This update method preserves sparsity, i.e., tends to find sparse solutions if
  `model_coefficients_start` is sparse.  Additionally, the choice of step size
  is based on curvature (Fisher information matrix), which significantly speeds
  up convergence.

  Args:
    model_matrix: (Batch of) matrix-shaped, `float` `Tensor` or `SparseTensor`
      where each row represents a sample's features.  Has shape `[N, n]` where
      `N` is the number of data samples and `n` is the number of features per
      sample.
    response: (Batch of) vector-shaped `Tensor` with the same dtype as
      `model_matrix` where each element represents a sample's observed response
      (to the corresponding row of features).
    model: `tfp.glm.ExponentialFamily`-like instance, which specifies the link
      function and distribution of the GLM, and thus characterizes the negative
      log-likelihood which will be minimized. Must have sufficient statistic
      equal to the response, that is, `T(y) = y`.
    model_coefficients_start: (Batch of) vector-shaped, `float` `Tensor` with
      the same dtype as `model_matrix`, representing the initial values of the
      coefficients for the GLM regression.  Has shape `[n]` where `model_matrix`
      has shape `[N, n]`.
    tolerance: scalar, `float` `Tensor` representing the convergence threshold.
      The optimization step will terminate early, returning its current value of
      `model_coefficients_start + model_coefficients_update`, once the following
      condition is met:
      `||model_coefficients_update_end - model_coefficients_update_start||_2
         / (1 + ||model_coefficients_start||_2)
       < sqrt(tolerance)`,
      where `model_coefficients_update_end` is the value of
      `model_coefficients_update` at the end of a sweep and
      `model_coefficients_update_start` is the value of
      `model_coefficients_update` at the beginning of that sweep.
    l1_regularizer: scalar, `float` `Tensor` representing the weight of the L1
      regularization term (see equation above).
    l2_regularizer: scalar, `float` `Tensor` representing the weight of the L2
      regularization term (see equation above).
      Default value: `None` (i.e., no L2 regularization).
    maximum_full_sweeps: Python integer specifying maximum number of sweeps to
      run.  A "sweep" consists of an iteration of coordinate descent on each
      coordinate. After this many sweeps, the algorithm will terminate even if
      convergence has not been reached.
      Default value: `None` (i.e., `1` sweep).
    learning_rate: scalar, `float` `Tensor` representing a multiplicative factor
      used to dampen the proximal gradient descent steps.
      Default value: `None` (i.e., factor is conceptually `1`).
    name: Python string representing the name of the TensorFlow operation. The
      default name is `"fit_sparse_one_step"`.

  Returns:
    model_coefficients: (Batch of) `Tensor` having the same shape and dtype as
      `model_coefficients_start`, representing the updated value of
      `model_coefficients`, that is, `model_coefficients_start +
      model_coefficients_update`.
    is_converged: scalar, `bool` `Tensor` indicating whether convergence
      occurred across all batches within the specified number of sweeps.
    iter: scalar, `int` `Tensor` representing the actual number of coordinate
      updates made (before achieving convergence).  Since each sweep consists of
      `tf.size(model_coefficients_start)` iterations, the maximum number of
      updates is `maximum_full_sweeps * tf.size(model_coefficients_start)`.
  """
  graph_deps = [
      model_matrix,
      response,
      model_coefficients_start,
      l1_regularizer,
      l2_regularizer,
      maximum_full_sweeps,
      tolerance,
      learning_rate,
  ]
  # Note: the three-argument `name_scope(name, default_name, values)` form is
  # the TF1 signature (`tf.compat.v1.name_scope`); TF2's `tf.name_scope` takes
  # only a name.
  with tf.name_scope(name, 'fit_sparse_one_step', graph_deps):
    predicted_linear_response = sparse_or_dense_matvecmul(
        model_matrix, model_coefficients_start)
    g, h_middle = _grad_neg_log_likelihood_and_fim(
        model_matrix, predicted_linear_response, response, model)

    return tfp.optimizer.proximal_hessian_sparse_one_step(
        gradient_unregularized_loss=g,
        hessian_unregularized_loss_outer=model_matrix,
        hessian_unregularized_loss_middle=h_middle,
        x_start=model_coefficients_start,
        l1_regularizer=l1_regularizer,
        l2_regularizer=l2_regularizer,
        maximum_full_sweeps=maximum_full_sweeps,
        tolerance=tolerance,
        learning_rate=learning_rate,
        name=name)
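
A hypothetical end-to-end call on toy data, assuming the helpers defined earlier (`sparse_or_dense_matvecmul`, `_grad_neg_log_likelihood_and_fim`) are in scope and that the TF/TFP versions in use support the API calls made in the body (including the three-argument `tf.name_scope`); data and hyperparameters are illustrative only:

import tensorflow as tf
import tensorflow_probability as tfp

model_matrix = tf.constant([[1., 0.5, 0.0],
                            [1., -1.2, 3.1],
                            [1., 2.0, -0.4]])   # shape [N=3, n=3]
response = tf.constant([1., 0., 1.])
model_coefficients_start = tf.zeros([3])

model_coefficients, is_converged, num_iters = fit_sparse_one_step(
    model_matrix=model_matrix,
    response=response,
    model=tfp.glm.Bernoulli(),
    model_coefficients_start=model_coefficients_start,
    tolerance=1e-6,
    l1_regularizer=0.1,
    l2_regularizer=None,
    maximum_full_sweeps=1)
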