Example #1
0
def _is_finite(arg1, *args):
  """Reports whether every supplied tensor holds only finite values.

  Args:
    arg1: A numeric `Tensor`.
    *args: (Optional) Additional `Tensor`s to test for finiteness.

  Returns:
    is_finite: Scalar boolean `Tensor` that is the logical AND of the
      per-tensor "all elements finite" checks.
  """
  all_finite = tf.reduce_all(tf.is_finite(arg1))
  for extra in args:
    extra_finite = tf.reduce_all(tf.is_finite(extra))
    all_finite = all_finite & extra_finite
  return all_finite
Example #2
0
def aggregate_single_gradient(grad_and_vars, use_mean, check_inf_nan):
    """Combine the per-tower gradients of one shared variable.

    Note that this function provides a synchronization point across all
    towers.

    Args:
      grad_and_vars: A list or tuple of (gradient, variable) tuples, one per
        tower, all referring to the same shared variable.
      use_mean: If True (and there is more than one tower) the summed gradient
        is scaled by 1 / number_of_towers; otherwise the plain sum is used.
      check_inf_nan: If True, also build a flag telling whether any of the
        per-tower gradients contains a NaN or Inf.

    Returns:
      A pair ((gradient, variable), has_nan_or_inf). The variable is taken
      from the first tower. has_nan_or_inf is a boolean tensor when
      check_inf_nan is True and None otherwise.
    """
    tower_grads = [tower_grad for tower_grad, _ in grad_and_vars]
    num_towers = len(tower_grads)

    combined = tf.add_n(tower_grads)
    if use_mean and num_towers > 1:
        combined = tf.multiply(combined, 1.0 / num_towers)

    shared_var = grad_and_vars[0][1]
    if not check_inf_nan:
        return (combined, shared_var), None

    # The check runs on the individual tower gradients, not on the combined one.
    every_value_finite = tf.reduce_all(tf.is_finite(tower_grads))
    return (combined, shared_var), tf.logical_not(every_value_finite)
Example #3
0
def correlation_loss(source_samples, target_samples, weight, scope=None):
  """Adds a similarity loss term, the correlation between two representations.

  Each input is centred by subtracting its mean over axis 0 and then
  L2-normalised along axis 1. The loss is the mean squared difference between
  the two `transpose(x) @ x` matrices, multiplied by `weight`. The value is
  also written as a scalar summary and registered with `tf.losses`.

  Args:
    source_samples: a tensor of shape [num_samples, num_features]
    target_samples: a tensor of shape [num_samples, num_features]
    weight: a scalar weight for the loss.
    scope: optional prefix prepended to the summary tag.

  Returns:
    a scalar tensor representing the correlation loss value.
  """
  with tf.name_scope('corr_loss'):
    source_samples -= tf.reduce_mean(source_samples, 0)
    target_samples -= tf.reduce_mean(target_samples, 0)

    source_unit = tf.nn.l2_normalize(source_samples, 1)
    target_unit = tf.nn.l2_normalize(target_samples, 1)

    source_gram = tf.matmul(tf.transpose(source_unit), source_unit)
    target_gram = tf.matmul(tf.transpose(target_unit), target_unit)

    squared_gap = tf.square(source_gram - target_gram)
    corr_loss = tf.reduce_mean(squared_gap) * weight

  finite_check = tf.Assert(tf.is_finite(corr_loss), [corr_loss])
  with tf.control_dependencies([finite_check]):
    base_tag = 'Correlation Loss'
    summary_tag = scope + base_tag if scope else base_tag
    tf.summary.scalar(summary_tag, corr_loss)
    tf.losses.add_loss(corr_loss)

  return corr_loss
Example #4
0
def difference_loss(private_samples, shared_samples, weight=1.0, name=''):
  """Adds the difference loss between the private and shared representations.

  Both inputs are centred over axis 0 and L2-normalised along axis 1. The cost
  is the mean squared entry of `transpose(private) @ shared`, multiplied by
  `weight`. The cost is written as a scalar summary and registered with
  `tf.losses`; nothing is returned.

  Args:
    private_samples: a tensor of shape [num_samples, num_features].
    shared_samples: a tensor of shape [num_samples, num_features].
    weight: the weight of the incoherence loss.
    name: the name of the tf summary.
  """
  private_samples -= tf.reduce_mean(private_samples, 0)
  shared_samples -= tf.reduce_mean(shared_samples, 0)

  private_unit = tf.nn.l2_normalize(private_samples, 1)
  shared_unit = tf.nn.l2_normalize(shared_samples, 1)

  cross_correlation = tf.matmul(private_unit, shared_unit, transpose_a=True)
  mean_squared_corr = tf.reduce_mean(tf.square(cross_correlation))

  cost = mean_squared_corr * weight
  # Any value that is not strictly positive is replaced by 0.
  cost = tf.where(cost > 0, cost, 0, name='value')

  summary_tag = 'losses/Difference Loss {}'.format(name)
  tf.summary.scalar(summary_tag, cost)

  finite_check = tf.Assert(tf.is_finite(cost), [cost])
  with tf.control_dependencies([finite_check]):
    tf.losses.add_loss(cost)
Example #5
0
def mmd_loss(source_samples, target_samples, weight, scope=None):
  """Adds a similarity loss term, the MMD between two representations.

  The Maximum Mean Discrepancy (MMD) is evaluated with a Gaussian kernel built
  from a fixed list of sigma values. The result is floored at 1e-4 and then
  multiplied by `weight`. The value is written as a scalar summary and
  registered with `tf.losses`.

  Args:
    source_samples: a tensor of shape [num_samples, num_features].
    target_samples: a tensor of shape [num_samples, num_features].
    weight: the weight of the MMD loss.
    scope: optional prefix prepended to the summary tag.

  Returns:
    a scalar tensor representing the MMD loss value.
  """
  kernel_sigmas = [
      1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 15, 20, 25, 30, 35, 100,
      1e3, 1e4, 1e5, 1e6
  ]
  multi_scale_kernel = partial(
      utils.gaussian_kernel_matrix, sigmas=tf.constant(kernel_sigmas))

  raw_mmd = maximum_mean_discrepancy(
      source_samples, target_samples, kernel=multi_scale_kernel)
  loss_value = tf.maximum(1e-4, raw_mmd) * weight

  finite_check = tf.Assert(tf.is_finite(loss_value), [loss_value])
  with tf.control_dependencies([finite_check]):
    base_tag = 'MMD Loss'
    summary_tag = scope + base_tag if scope else base_tag
    tf.summary.scalar(summary_tag, loss_value)
    tf.losses.add_loss(loss_value)

  return loss_value
Example #6
0
def _quaternion_loss(labels, predictions, weight, batch_size, domain,
                     add_summaries):
  """Creates a weighted log-quaternion loss.

  Args:
    labels: The true quaternions.
    predictions: The predicted quaternions.
    weight: A scalar weight.
    batch_size: The size of the batches.
    domain: The name of the domain from which the labels were taken; must be
      'Source' or 'Transferred'.
    add_summaries: Whether or not to add summaries for the losses.

  Returns:
    A `Tensor` representing the loss.
  """
  assert domain in ['Source', 'Transferred']

  loss_params = {'use_logging': False, 'batch_size': batch_size}
  loss = weight * log_quaternion_loss(labels, predictions, loss_params)

  if not add_summaries:
    return loss

  finite_check = tf.Assert(tf.is_finite(loss), [loss])
  with tf.control_dependencies([finite_check]):
    # NOTE(review): `collections` is documented as a list of collection keys,
    # but a bare string is passed here. Confirm the intended collection before
    # changing it.
    histogram_tag = 'Log_Quaternion_Loss_%s' % domain
    tf.summary.histogram(histogram_tag, loss, collections='losses')
    scalar_tag = 'Task_Quaternion_Loss_%s' % domain
    tf.summary.scalar(scalar_tag, loss, collections='losses')

  return loss
Example #7
0
def safe_sum(x, alt_value=-np.inf, name=None):
  """Elementwise adds list members, replacing non-finite results with alt_value.

  The members are stacked along a new trailing axis and summed over it. Every
  position whose sum is not finite (NaN, +Inf or -Inf) is then replaced by
  exactly `alt_value`.

  Args:
    x: Python `list` of `Tensors` to elementwise add.
    alt_value: Python scalar used to replace any elementwise sums which would
      otherwise be non-finite.
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., "safe_sum").

  Returns:
    safe_sum: `Tensor` representing the elementwise sum of list of `Tensor`s
      `x` or `alt_value` where sums are non-finite.

  Raises:
    TypeError: if `x` is not list-like.
    ValueError: if `x` is empty.
  """
  with tf.name_scope(name, 'safe_sum', [x, alt_value]):
    if not is_list_like(x):
      raise TypeError('Expected list input.')
    if not x:
      raise ValueError('Input should not be empty.')
    in_shape = x[0].shape
    x = tf.stack(x, axis=-1)
    x = tf.reduce_sum(x, axis=-1)
    # Replace AFTER summing. Substituting `alt_value` into the addends and then
    # summing would yield `len(x) * alt_value` for a finite `alt_value`, and
    # would miss finite addends whose sum overflows to +/-Inf.
    alt_value = np.array(alt_value, x.dtype.as_numpy_dtype)
    alt_fill = tf.fill(tf.shape(x), value=alt_value)
    x = tf.where(tf.is_finite(x), x, alt_fill)
    # Restore the static shape information of the first list member.
    x.set_shape(x.shape.merge_with(in_shape))
    return x
Example #8
0
  def exponential_moving_average(self,
                                 var,
                                 avg_var=None,
                                 decay=0.999,
                                 ignore_nan=False):
    """Calculates the exponential moving average.

    TODO(): check if this implementation of moving average can now
    be replaced by tensorflows implementation.

    Adds a variable to keep track of the exponential moving average and adds an
    update operation to the bookkeeper. The name of the variable is
    '%s_average' % name prefixed with the current variable scope.

    Args:
       var: The variable for which a moving average should be computed.
       avg_var: The variable to set the average into. If None, one is created:
         initialized from `var` when `var` is a `tf.Variable`, otherwise from
         a zero constant.
       decay: How much history to use in the moving average.
         Higher, means more history values [0, 1) accepted.
       ignore_nan: If the value is NaN or Inf, skip it.
    Returns:
       The result of the assign op that updates the average variable.
    Raises:
      ValueError: if decay is not in [0, 1).
    """
    with self._g.as_default():
      if decay < 0 or decay >= 1.0:
        raise ValueError('Decay is %5.2f, but has to be in [0, 1).' % decay)
      if avg_var is None:
        avg_name = '%s_average' % _bare_var_name(var)
        # Clear any active control dependencies while building the initializer.
        with tf.control_dependencies(None):
          with tf.name_scope(avg_name + '/Initializer/'):
            if isinstance(var, tf.Variable):
              init_val = var.initialized_value()
            elif var.get_shape().is_fully_defined():
              init_val = tf.constant(0,
                                     shape=var.get_shape(),
                                     dtype=var.dtype.base_dtype)
            else:
              # Static shape unknown: start from a scalar zero.
              init_val = tf.constant(0, dtype=var.dtype.base_dtype)
          avg_var = tf.Variable(init_val, name=avg_name, trainable=False)

      # The effective decay is min(decay, max(0.9, (1 + n) / (10 + n))) where
      # n is the current global step.
      num_updates = tf.cast(self.global_step, tf.float32)
      decay = tf.minimum(decay, tf.maximum(0.9, (1.0 + num_updates) /
                                           (10.0 + num_updates)))
      with tf.device(avg_var.device):
        if ignore_nan:
          # Non-finite entries are replaced by the current average, so they
          # contribute a zero update below.
          var = tf.where(tf.is_finite(var), var, avg_var)
        if var.get_shape().is_fully_defined():
          avg_update = tf.assign_sub(avg_var, (1 - decay) * (avg_var - var))
        else:
          # Shape validation is disabled because the static shape of `var` is
          # not fully defined.
          avg_update = tf.assign(avg_var,
                                 avg_var - (1 - decay) * (avg_var - var),
                                 validate_shape=False)
      self._g.add_to_collection(GraphKeys.UPDATE_OPS, avg_update)
      return avg_update
Example #9
0
  def _compute_accept_prob(self, position, momentum, position_post,
                           momentum_post, sumlogdet):
    """Compute the prob of accepting the proposed state given old state.

    The probability is exp(min(H_old - H_new + sumlogdet, 0)), where H is
    `self.hamiltonian`. Non-finite entries are replaced by zero.
    """
    hamil_before = self.hamiltonian(position, momentum)
    hamil_after = self.hamiltonian(position_post, momentum_post)

    log_ratio = hamil_before - hamil_after + sumlogdet
    accept_prob = tf.exp(tf.minimum(log_ratio, 0.))

    # Ensure numerical stability as well as correct gradients
    finite_mask = tf.is_finite(accept_prob)
    return tf.where(finite_mask, accept_prob, tf.zeros_like(accept_prob))
Example #10
0
 def _compare(self, x, use_gpu):
     """Checks tf.is_finite / is_inf / is_nan against the NumPy equivalents.

     Both the evaluated values and the static shapes of the ops are compared.
     """
     np_finite = np.isfinite(x)
     np_inf = np.isinf(x)
     np_nan = np.isnan(x)
     with self.test_session(use_gpu=use_gpu) as sess:
         inx = tf.convert_to_tensor(x)
         ofinite = tf.is_finite(inx)
         oinf = tf.is_inf(inx)
         onan = tf.is_nan(inx)
         tf_finite, tf_inf, tf_nan = sess.run([ofinite, oinf, onan])
     # (expected NumPy result, evaluated TF result, TF op), in assertion order.
     comparisons = (
         (np_inf, tf_inf, oinf),
         (np_nan, tf_nan, onan),
         (np_finite, tf_finite, ofinite),
     )
     for expected, actual, _ in comparisons:
         self.assertAllEqual(expected, actual)
     for expected, _, op in comparisons:
         self.assertShapeEqual(expected, op)
Example #11
0
 def _loss(self, predictions):
     """Mean squared error between `predictions` and the normalised target.

     While training, the target `self.y_norm` is cropped on axes 1 and 2 by
     `(sum(filter_sizes) - len(filter_sizes)) // 2` on each side; otherwise
     the full target is used. The squared error is summed after
     `utils.fill_na(err, 0)` and divided by the number of finite error
     entries.

     Args:
       predictions: Tensor compared against the (possibly cropped) target.

     Returns:
       Scalar tensor holding the mean squared error.
     """
     with tf.name_scope("loss"):
         # if training then crop center of y, else, padding was applied
         # Slice bounds must be integers, so use floor division; `/` produces
         # a float under Python 3.
         slice_amt = int(np.sum(self.filter_sizes) - len(self.filter_sizes)) // 2
         # With no cropping, a stop bound of `-0` would select nothing, so
         # fall back to an open-ended slice.
         crop = slice(slice_amt, -slice_amt or None)
         slice_y = self.y_norm[:, crop, crop]
         _y = tf.cond(self.is_training, lambda: slice_y, lambda: self.y_norm)
         err = tf.square(predictions - _y)
         # presumably fill_na replaces non-finite entries with 0 - verify
         # against utils.fill_na.
         err_filled = utils.fill_na(err, 0)
         finite_count = tf.reduce_sum(tf.cast(tf.is_finite(err), tf.float32))
         mse = tf.reduce_sum(err_filled) / finite_count
         return mse
Example #12
0
def dann_loss(source_samples, target_samples, weight, scope=None):
  """Adds the domain adversarial (DANN) loss.

  Source and target samples are concatenated, passed through a gradient
  reversal op and a two-layer domain classifier. The classifier is trained
  with a log loss to output 0 for source rows and 1 for target rows. Loss and
  accuracy are written as scalar summaries.

  Args:
    source_samples: a tensor of shape [num_samples, num_features].
    target_samples: a tensor of shape [num_samples, num_features].
    weight: the weight of the loss.
    scope: optional prefix prepended to the summary tags.

  Returns:
    a scalar tensor representing the domain classification loss value.
  """
  with tf.variable_scope('dann'):
    # The label mask below reuses the source batch size for the target rows.
    batch_size = tf.shape(source_samples)[0]
    stacked = tf.concat(axis=0, values=[source_samples, target_samples])
    stacked = slim.flatten(stacked)

    source_labels = tf.zeros((batch_size, 1))
    target_labels = tf.ones((batch_size, 1))
    domain_selection_mask = tf.concat(
        axis=0, values=[source_labels, target_labels])

    # Perform the gradient reversal and be careful with the shape.
    reversed_feats = grl_ops.gradient_reversal(stacked)
    num_features = stacked.get_shape().as_list()[1]
    reversed_feats = tf.reshape(reversed_feats, (-1, num_features))

    hidden = slim.fully_connected(reversed_feats, 100, scope='fc1')
    logits = slim.fully_connected(hidden, 1, activation_fn=None, scope='fc2')

    domain_predictions = tf.sigmoid(logits)

  domain_loss = tf.losses.log_loss(
      domain_selection_mask, domain_predictions, weights=weight)

  domain_accuracy = utils.accuracy(
      tf.round(domain_predictions), domain_selection_mask)

  finite_check = tf.Assert(tf.is_finite(domain_loss), [domain_loss])
  with tf.control_dependencies([finite_check]):
    loss_tag = 'losses/domain_loss'
    accuracy_tag = 'losses/domain_accuracy'
    if scope:
      loss_tag, accuracy_tag = scope + loss_tag, scope + accuracy_tag

    tf.summary.scalar(loss_tag, domain_loss)
    tf.summary.scalar(accuracy_tag, domain_accuracy)

  return domain_loss
def _create_autosummary_var(name, value_expr):
    """Create a [numerator, denominator] accumulator variable for `name`.

    `value_expr` is cast to float32 and reduced to a (sum, element count)
    pair. If the sum is not finite the pair is replaced by zeros, so the
    sample contributes nothing. The new variable is appended to
    `_autosummary_vars[name]`.

    Args:
        name: Key under which the accumulator variable is registered.
        value_expr: Value (scalar or tensor) to accumulate.

    Returns:
        An op that adds the pair to the variable, or assigns it when the
        variable is not yet initialized.
    """
    assert not _autosummary_finalized
    v = tf.cast(value_expr, tf.float32)
    # Compare the rank by value: `is` against an int literal only works through
    # CPython's small-int caching and raises SyntaxWarning on Python 3.8+.
    if v.shape.ndims == 0:
        v = [v, np.float32(1.0)]
    elif v.shape.ndims == 1:
        v = [tf.reduce_sum(v), tf.cast(tf.shape(v)[0], tf.float32)]
    else:
        v = [tf.reduce_sum(v), tf.reduce_prod(tf.cast(tf.shape(v), tf.float32))]
    v = tf.cond(tf.is_finite(v[0]), lambda: tf.stack(v), lambda: tf.zeros(2))
    # Create the variable outside of any active control dependencies.
    with tf.control_dependencies(None):
        var = tf.Variable(tf.zeros(2)) # [numerator, denominator]
    update_op = tf.cond(tf.is_variable_initialized(var), lambda: tf.assign_add(var, v), lambda: tf.assign(var, v))
    if name in _autosummary_vars:
        _autosummary_vars[name].append(var)
    else:
        _autosummary_vars[name] = [var]
    return update_op
Example #14
0
def calc_loss(predictions, true_y, additional_mask=None):
    """
    Calculates losses, ignoring non-finite true values (they get zero weight)
    :param predictions: Predicted values
    :param true_y: True values
    :param additional_mask: Optional extra weights; a leading axis is added
        before they are multiplied into the finite-value weights
    :return: MAE loss, differentiable SMAPE loss, competition SMAPE loss,
        size of true_y
    """
    # Positions holding a usable (finite) true value
    finite_mask = tf.is_finite(true_y)
    # Non-finite entries get zero weight, so the fill value is arbitrary
    true_y = tf.where(finite_mask, true_y, tf.zeros_like(true_y))
    weights = tf.to_float(finite_mask)
    if additional_mask is not None:
        extra_weights = tf.expand_dims(additional_mask, axis=0)
        weights = weights * extra_weights

    mae_loss = tf.losses.absolute_difference(labels=true_y, predictions=predictions, weights=weights)
    smape = smape_loss(true_y, predictions, weights)
    smape_rounded = calc_smape_rounded(true_y, predictions, weights)
    return mae_loss, smape, smape_rounded, tf.size(true_y)
Example #15
0
def log_blend(inputs, weights):
  """Blends state in the log space.

  Computes log(sum_k weights[i, j, k] * exp(inputs[k, j] - max_j)) + max_j,
  where max_j is the maximum of `inputs` over axis 0.

  Args:
    inputs: A set of scalar states, one for each particle in each particle filter.
      Should be [num_samples, batch_size].
    weights: A set of weights used to blend the state. Each set of weights
      should be of dimension [num_samples] (one weight for each previous particle).
      There should be one set of weights for each new particle in each particle filter.
      Thus the shape should be [num_samples, batch_size, num_samples] where
      the first axis indexes new particle and the last axis indexes old particles.
  Returns:
    blended: The blended states, a tensor of shape [num_samples, batch_size].
  """
  raw_max = tf.reduce_max(inputs, axis=0, keepdims=True)
  finite_max = tf.where(
      tf.is_finite(raw_max), raw_max, tf.zeros_like(raw_max))
  my_max = tf.stop_gradient(finite_max)
  # NOTE(review): the shift inside exp uses `raw_max`, not the finite-guarded,
  # gradient-stopped `my_max` that is added back afterwards. If a column
  # maximum is -inf, `inputs - raw_max` is NaN. Confirm whether this asymmetry
  # is intended before changing it.
  shifted = tf.exp(inputs - raw_max)
  mixed = tf.einsum("ijk,kj->ij", weights, shifted)
  return tf.log(mixed) + my_max
Example #16
0
    def loop_cond(itr, obj_accum, unused_params, unused_attend_params,
                  unused_flattened_states, unused_global_state, all_obj,
                  init_obj, *args):
      """Decides whether the sub-problem optimization loop keeps running.

      The loop continues while fewer than `num_iter` iterations have run and
      the accumulated objective is finite. When
      `self.obj_train_max_multiplier` is positive, the objective at the
      current iteration must also stay below a bound derived from `init_obj`.
      """
      del args  # unused

      under_iteration_limit = tf.less(itr, num_iter)
      objective_is_finite = tf.is_finite(obj_accum)

      if self.obj_train_max_multiplier > 0:
        current_obj = tf.gather(all_obj, itr)
        # abs() keeps the allowed slack positive for a negative init_obj.
        allowed_increase = (
            (self.obj_train_max_multiplier - 1) * tf.abs(init_obj))
        objective_ceiling = init_obj + allowed_increase
        below_ceiling = tf.less(current_obj, objective_ceiling)

        basic_conditions = tf.logical_and(
            under_iteration_limit, objective_is_finite)
        return tf.logical_and(basic_conditions, below_ceiling,
                              name="training_loop_cond")

      return tf.logical_and(under_iteration_limit, objective_is_finite,
                            name="training_loop_cond")
Example #17
0
  def exponential_moving_average(
      self, var, avg_var=None, decay=0.999, ignore_nan=False):
    """Calculates the exponential moving average.

    Adds a variable to keep track of the exponential moving average and adds an
    update operation to the bookkeeper. The name of the variable is
    '%s_average' % name prefixed with the current variable scope.

    Args:
       var: The variable for which a moving average should be computed.
       avg_var: The variable to set the average into, if None create a zero
         initialized one.
       decay: How much history to use in the moving average.
         Higher, means more history values [0, 1) accepted.
       ignore_nan: If the value is NaN or Inf, skip it.
    Returns:
       The averaged variable.
    Raises:
      ValueError: if decay is not in [0, 1).
    """
    # NOTE(review): the graph is read as `self.g` here and as `self._g` below;
    # confirm both names refer to the same graph.
    with self.g.as_default():
      if decay < 0 or decay >= 1.0:
        raise ValueError('Decay is %5.2f, but has to be in [0, 1).' % decay)
      # Test identity against None: truth-testing a tensor-like object is not
      # a reliable "was an argument supplied" check.
      if avg_var is None:
        shape = var.get_shape()
        avg_name = '%s_average' % _bare_var_name(var)
        avg_var = tf.Variable(
            tf.zeros_initializer(shape=shape, dtype=var.dtype),
            name=avg_name,
            trainable=False)
      # The effective decay is max(0.9, min(decay, (1 + n) / (10 + n))) where
      # n is the current global step.
      num_updates = tf.cast(self.global_step, tf.float32)
      decay = tf.maximum(
          0.9, tf.minimum(decay, (1.0 + num_updates) / (10.0 + num_updates)))
      with tf.device(avg_var.device):
        if ignore_nan:
          # Non-finite entries are replaced by the current average, so they
          # contribute a zero update below.
          var = tf.select(tf.is_finite(var), var, avg_var)
        avg_update = tf.assign_sub(avg_var, (1 - decay) * (avg_var - var))
      self._g.add_to_collection(GraphKeys.UPDATE_OPS, avg_update)
      return avg_var
    def apply_updates(self):
        """Build and return the single op that applies all registered gradients.

        May only be called once per instance. Per device, the registered
        gradients are cast to FP32 and summed. With more than one device the
        sums are combined across devices. Each device then scales its
        gradients, checks them for non-finite values, and applies them only
        when all are finite. With loss scaling enabled, the loss-scaling
        variable is raised after a clean step and lowered after an overflow.

        Returns:
            A grouped op named 'TrainingOp' containing all per-device update
            and statistics ops.
        """
        assert not self._updates_applied
        self._updates_applied = True
        devices = list(self._dev_grads.keys())
        total_grads = sum(len(grads) for grads in self._dev_grads.values())
        assert len(devices) >= 1 and total_grads >= 1
        ops = []
        with absolute_name_scope(self.scope):

            # Cast gradients to FP32 and calculate partial sum within each device.
            dev_grads = OrderedDict() # device => [(grad, var), ...]
            for dev_idx, dev in enumerate(devices):
                with tf.name_scope('ProcessGrads%d' % dev_idx), tf.device(dev):
                    sums = []
                    # zip(*) regroups the per-call gradient lists so that each
                    # `gv` holds all (grad, var) pairs of one variable.
                    for gv in zip(*self._dev_grads[dev]):
                        assert all(v is gv[0][1] for g, v in gv)
                        g = [tf.cast(g, tf.float32) for g, v in gv]
                        g = g[0] if len(g) == 1 else tf.add_n(g)
                        sums.append((g, gv[0][1]))
                    dev_grads[dev] = sums

            # Sum gradients across devices.
            if len(devices) > 1:
                with tf.name_scope('SumAcrossGPUs'), tf.device(None):
                    for var_idx, grad_shape in enumerate(self._grad_shapes):
                        g = [dev_grads[dev][var_idx][0] for dev in devices]
                        if np.prod(grad_shape): # nccl does not support zero-sized tensors
                            g = tf.contrib.nccl.all_sum(g)
                        for dev, gg in zip(devices, g):
                            dev_grads[dev][var_idx] = (gg, dev_grads[dev][var_idx][1])

            # Apply updates separately on each device.
            for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
                with tf.name_scope('ApplyGrads%d' % dev_idx), tf.device(dev):

                    # Scale gradients as needed.
                    if self.use_loss_scaling or total_grads > 1:
                        with tf.name_scope('Scale'):
                            # 1 / total_grads turns the summed gradient into a
                            # mean; undo_loss_scaling then adjusts the coefficient.
                            coef = tf.constant(np.float32(1.0 / total_grads), name='coef')
                            coef = self.undo_loss_scaling(coef)
                            grads = [(g * coef, v) for g, v in grads]

                    # Check for overflows.
                    with tf.name_scope('CheckOverflow'):
                        grad_ok = tf.reduce_all(tf.stack([tf.reduce_all(tf.is_finite(g)) for g, v in grads]))

                    # Update weights and adjust loss scaling.
                    with tf.name_scope('UpdateWeights'):
                        opt = self._dev_opt[dev]
                        ls_var = self.get_loss_scaling_var(dev)
                        if not self.use_loss_scaling:
                            # Skip the step entirely when any gradient is non-finite.
                            ops.append(tf.cond(grad_ok, lambda: opt.apply_gradients(grads), tf.no_op))
                        else:
                            # Clean step: raise the loss scale and apply the update.
                            # Overflow: only lower the loss scale.
                            ops.append(tf.cond(grad_ok,
                                lambda: tf.group(tf.assign_add(ls_var, self.loss_scaling_inc), opt.apply_gradients(grads)),
                                lambda: tf.group(tf.assign_sub(ls_var, self.loss_scaling_dec))))

                    # Report statistics on the last device.
                    if dev == devices[-1]:
                        with tf.name_scope('Statistics'):
                            ops.append(autosummary(self.id + '/learning_rate', self.learning_rate))
                            ops.append(autosummary(self.id + '/overflow_frequency', tf.where(grad_ok, 0, 1)))
                            if self.use_loss_scaling:
                                ops.append(autosummary(self.id + '/loss_scaling_log2', ls_var))

            # Initialize variables and group everything into a single op.
            self.reset_optimizer_state()
            init_uninited_vars(list(self._dev_ls_var.values()))
            return tf.group(*ops, name='TrainingOp')
Example #19
0
 def whether_proceed(grads):
     """Returns a boolean tensor that is True when every gradient is finite."""
     def _all_finite(grad):
         return tf.reduce_all(tf.is_finite(grad))

     per_grad_flags = _map(_all_finite, grads)
     return tf.reduce_all(per_grad_flags)
Example #20
0
 def _process(self, grads):
     """Builds a finiteness assert per variable and returns `grads` unchanged.

     The assert ops are created but their results are not used by anything
     in this function.
     """
     # TODO make assert work
     for _, variable in grads:
         all_finite = tf.reduce_all(tf.is_finite(variable))
         tf.Assert(all_finite, [variable])
     return grads
Example #21
0
def make_finite(t, replacement):
  """Keeps the finite entries of `t` and takes the rest from `replacement`."""
  finite_mask = tf.is_finite(t)
  return tf.where(finite_mask, t, replacement)
Example #22
0
def minimize(value_and_gradients_function,
             initial_position,
             tolerance=1e-8,
             x_tolerance=0,
             f_relative_tolerance=0,
             initial_inverse_hessian_estimate=None,
             max_iterations=50,
             parallel_iterations=1,
             name=None):
  """Applies the BFGS algorithm to minimize a differentiable function.

  Performs unconstrained minimization of a differentiable function using the
  BFGS scheme. For details of the algorithm, see [Nocedal and Wright(2006)][1].

  ### Usage:

  The following example demonstrates the BFGS optimizer attempting to find the
  minimum for a simple two dimensional quadratic objective function.

  ```python
    minimum = np.array([1.0, 1.0])  # The center of the quadratic bowl.
    scales = np.array([2.0, 3.0])  # The scales along the two axes.

    # The objective function and the gradient.
    def quadratic(x):
      value = tf.reduce_sum(scales * (x - minimum) ** 2)
      return value, tf.gradients(value, x)[0]

    start = tf.constant([0.6, 0.8])  # Starting point for the search.
    optim_results = tfp.optimizer.bfgs_minimize(
        quadratic, initial_position=start, tolerance=1e-8)

    with tf.Session() as session:
      results = session.run(optim_results)
      # Check that the search converged
      assert(results.converged)
      # Check that the argmin is close to the actual value.
      np.testing.assert_allclose(results.position, minimum)
      # Print out the total number of function evaluations it took. Should be 6.
      print ("Function evaluations: %d" % results.num_objective_evaluations)
  ```

  ### References:
  [1]: Jorge Nocedal, Stephen Wright. Numerical Optimization. Springer Series in
    Operations Research. pp 136-140. 2006
    http://pages.mtu.edu/~struther/Courses/OLD/Sp2013/5630/Jorge_Nocedal_Numerical_optimization_267490.pdf

  Args:
    value_and_gradients_function:  A Python callable that accepts a point as a
      real `Tensor` and returns a tuple of `Tensor`s of real dtype containing
      the value of the function and its gradient at that point. The function
      to be minimized. The first component of the return value should be a
      real scalar `Tensor`. The second component (the gradient) should have the
      same shape as the input value to the function.
    initial_position: `Tensor` of real dtype. The starting point of the search
      procedure. Should be a point at which the function value and the gradient
      norm are finite.
    tolerance: Scalar `Tensor` of real dtype. Specifies the gradient tolerance
      for the procedure. If the supremum norm of the gradient vector is below
      this number, the algorithm is stopped.
    x_tolerance: Scalar `Tensor` of real dtype. If the absolute change in the
      position between one iteration and the next is smaller than this number,
      the algorithm is stopped.
    f_relative_tolerance: Scalar `Tensor` of real dtype. If the relative change
      in the objective value between one iteration and the next is smaller
      than this value, the algorithm is stopped.
    initial_inverse_hessian_estimate: Optional `Tensor` of the same dtype
      as the components of the output of the `value_and_gradients_function`.
      If specified, the shape should be `initial_position.shape` * 2.
      For example, if the shape of `initial_position` is `[n]`, then the
      acceptable shape of `initial_inverse_hessian_estimate` is as a square
      matrix of shape `[n, n]`.
      If the shape of `initial_position` is `[n, m]`, then the required shape
      is `[n, m, n, m]`.
      For the correctness of the algorithm, it is required that this parameter
      be symmetric and positive definite. Specifies the starting estimate for
      the inverse of the Hessian at the initial point. If not specified,
      the identity matrix is used as the starting estimate for the
      inverse Hessian.
    max_iterations: Scalar positive int32 `Tensor`. The maximum number of
      iterations for BFGS updates.
    parallel_iterations: Positive integer. The number of iterations allowed to
      run in parallel.
    name: (Optional) Python str. The name prefixed to the ops created by this
      function. If not supplied, the default name 'minimize' is used.

  Returns:
    optimizer_results: A namedtuple containing the following items:
      converged: Scalar boolean tensor indicating whether the minimum was
        found within tolerance.
      failed:  Scalar boolean tensor indicating whether a line search
        step failed to find a suitable step size satisfying Wolfe
        conditions. In the absence of any constraints on the
        number of objective evaluations permitted, this value will
        be the complement of `converged`. However, if there is
        a constraint and the search stopped due to available
        evaluations being exhausted, both `failed` and `converged`
        will be simultaneously False.
      num_objective_evaluations: The total number of objective
        evaluations performed.
      position: A tensor containing the last argument value found
        during the search. If the search converged, then
        this value is the argmin of the objective function.
      objective_value: A tensor containing the value of the objective
        function at the `position`. If the search converged, then this is
        the (local) minimum of the objective function.
      objective_gradient: A tensor containing the gradient of the objective
        function at the `position`. If the search converged the
        max-norm of this tensor should be below the tolerance.
      inverse_hessian_estimate: A tensor containing the inverse of the
        estimated Hessian.
  """
  with tf.name_scope(name, 'minimize', [initial_position,
                                        tolerance,
                                        initial_inverse_hessian_estimate]):
    initial_position = tf.convert_to_tensor(initial_position,
                                            name='initial_position')
    dtype = initial_position.dtype.base_dtype
    tolerance = tf.convert_to_tensor(tolerance, dtype=dtype,
                                     name='grad_tolerance')
    f_relative_tolerance = tf.convert_to_tensor(f_relative_tolerance,
                                                dtype=dtype,
                                                name='f_relative_tolerance')
    x_tolerance = tf.convert_to_tensor(x_tolerance,
                                       dtype=dtype,
                                       name='x_tolerance')
    max_iterations = tf.convert_to_tensor(max_iterations, name='max_iterations')

    domain_shape = distribution_util.prefer_static_shape(initial_position)

    if initial_inverse_hessian_estimate is None:
      inv_hessian_shape = tf.concat([domain_shape, domain_shape], 0)
      initial_inv_hessian = tf.eye(tf.size(initial_position), dtype=dtype)
      initial_inv_hessian = tf.reshape(initial_inv_hessian,
                                       inv_hessian_shape,
                                       name='initial_inv_hessian')
    else:
      initial_inv_hessian = tf.convert_to_tensor(
          initial_inverse_hessian_estimate,
          dtype=dtype,
          name='initial_inv_hessian')

    # If an initial inverse Hessian is supplied, ensure that it is positive
    # definite. The easiest way to validate this is to compute the Cholesky
    # decomposition. However, it seems that simply adding a control dependency
    # on the decomposition result is not enough to trigger it. We need to
    # add an assert on the result.
    if initial_inverse_hessian_estimate is not None:
      # The supplied Hessian may not be of rank 2. Reshape it so it is.
      initial_inv_hessian_sqr_mat = tf.reshape(
          initial_inverse_hessian_estimate,
          tf.stack([tf.size(initial_position),
                    tf.size(initial_position)], axis=0))
      # If the matrix is not positive definite, the Cholesky decomposition will
      # fail. Adding an assert on it ensures it will be triggered.
      cholesky_factor = tf.cholesky(initial_inv_hessian_sqr_mat)
      is_positive_definite = tf.reduce_all(tf.is_finite(cholesky_factor))
      asymmetry = tf.norm(initial_inv_hessian_sqr_mat -
                          tf.transpose(initial_inv_hessian_sqr_mat), np.inf)
      is_symmetric = tf.equal(asymmetry, 0)
      with tf.control_dependencies(
          [tf.Assert(is_positive_definite,
                     ['Initial inverse Hessian is not positive definite.',
                      initial_inverse_hessian_estimate]),
           tf.Assert(is_symmetric,
                     ['Initial inverse Hessian is not symmetric',
                      initial_inverse_hessian_estimate])]):
        f0, df0 = value_and_gradients_function(initial_position)
    else:
      f0, df0 = value_and_gradients_function(initial_position)

    initial_convergence = _initial_convergence_test(df0, tolerance)

    # The `state` here is a BfgsOptimizerResults tuple with values for the
    # current state of the algorithm computation.
    def _cond(state):
      """Stopping condition for the algorithm."""
      keep_going = tf.logical_not(state.converged | state.failed |
                                  (state.num_iterations >= max_iterations))
      return keep_going

    def _body(state):
      """Main optimization loop."""

      search_direction = _get_search_direction(state.inverse_hessian_estimate,
                                               state.objective_gradient)
      derivative_at_start_pt = tf.reduce_sum(state.objective_gradient *
                                             search_direction)
      # If the derivative at the start point is not negative, reset the
      # Hessian estimate and recompute the search direction.
      needs_reset = derivative_at_start_pt >= 0
      def _reset_search_dirn():
        search_direction = _get_search_direction(initial_inv_hessian,
                                                 state.objective_gradient)
        return search_direction, initial_inv_hessian

      search_direction, inv_hessian_estimate = tf.contrib.framework.smart_cond(
          needs_reset,
          true_fn=_reset_search_dirn,
          false_fn=lambda: (search_direction, state.inverse_hessian_estimate))
      line_search_value_grad_func = _restrict_along_direction(
          value_and_gradients_function, state.position, search_direction)
      derivative_at_start_pt = tf.reduce_sum(state.objective_gradient *
                                             search_direction)

      ls_result = linesearch.hager_zhang(
          line_search_value_grad_func,
          initial_step_size=tf.convert_to_tensor(1, dtype=dtype),
          objective_at_zero=state.objective_value,
          grad_objective_at_zero=derivative_at_start_pt)

      state_after_ls = _update_state(
          state,
          failed=~ls_result.converged,  # Fail if line search failed.
          num_iterations=state.num_iterations + 1,
          num_objective_evaluations=(
              state.num_objective_evaluations + ls_result.func_evals),
          inverse_hessian_estimate=inv_hessian_estimate)

      def _do_bfgs_update():
        state_updated = _update_position(
            value_and_gradients_function,
            state_after_ls,
            search_direction * ls_result.left_pt,
            tolerance, f_relative_tolerance, x_tolerance)

        # If not converged, update the Hessian.
        return tf.contrib.framework.smart_cond(
            state_updated.converged,
            lambda: state_updated,
            lambda: _update_inv_hessian(state_after_ls, state_updated))

      next_state = tf.contrib.framework.smart_cond(
          state_after_ls.failed,
          true_fn=lambda: state_after_ls,
          false_fn=_do_bfgs_update)
      return [next_state]

    initial_state = BfgsOptimizerResults(
        converged=initial_convergence,
        failed=False,
        num_iterations=tf.convert_to_tensor(0),
        num_objective_evaluations=1,
        position=initial_position,
        objective_value=f0,
        objective_gradient=df0,
        inverse_hessian_estimate=initial_inv_hessian)

    return tf.while_loop(_cond, _body, [initial_state],
                         parallel_iterations=parallel_iterations)[0]
Example #23
0
def fit_one_step(
    model_matrix,
    response,
    model,
    model_coefficients_start=None,
    predicted_linear_response_start=None,
    l2_regularizer=None,
    dispersion=None,
    offset=None,
    learning_rate=None,
    fast_unsafe_numerics=True,
    name=None):
  """Runs one step of Fisher scoring.

  The step is carried out as one iteration of iteratively reweighted
  least-squares: an adjusted response `z` and per-sample weights `w` are built
  from the model's mean, variance and mean-gradient, and the weighted
  least-squares problem is solved with `tf.matrix_solve_ls`.

  Args:
    model_matrix: (Batch of) `float`-like, matrix-shaped `Tensor` where each row
      represents a sample's features.
    response: (Batch of) vector-shaped `Tensor` where each element represents a
      sample's observed response (to the corresponding row of features). Must
      have same `dtype` as `model_matrix`.
    model: `tfp.glm.ExponentialFamily`-like instance used to construct the
      negative log-likelihood loss, gradient, and expected Hessian (i.e., the
      Fisher information matrix).
    model_coefficients_start: Optional (batch of) vector-shaped `Tensor`
      representing the initial model coefficients, one for each column in
      `model_matrix`. Must have same `dtype` as `model_matrix`.
      Default value: Zeros.
    predicted_linear_response_start: Optional `Tensor` with `shape`, `dtype`
      matching `response`; represents `offset` shifted initial linear
      predictions based on `model_coefficients_start`.
      Default value: `offset` if `model_coefficients_start is None`, and
      `tf.linalg.matvec(model_matrix, model_coefficients_start) + offset`
      otherwise.
    l2_regularizer: Optional scalar `Tensor` representing L2 regularization
      penalty, i.e.,
      `loss(w) = sum{-log p(y[i]|x[i],w) : i=1..n} + l2_regularizer ||w||_2^2`.
      Default value: `None` (i.e., no L2 regularization).
    dispersion: Optional (batch of) `Tensor` representing `response` dispersion,
      i.e., as in, `p(y|theta) := exp((y theta - A(theta)) / dispersion)`.
      Must broadcast with rows of `model_matrix`.
      Default value: `None` (i.e., "no dispersion").
    offset: Optional `Tensor` representing constant shift applied to
      `predicted_linear_response`.  Must broadcast to `response`.
      Default value: `None` (i.e., `tf.zeros_like(response)`).
    learning_rate: Optional (batch of) scalar `Tensor` used to dampen iterative
      progress. Typically only needed if optimization diverges, should be no
      larger than `1` and typically very close to `1`.
      Default value: `None` (i.e., `1`).
    fast_unsafe_numerics: Optional Python `bool` indicating if solve should be
      based on Cholesky or QR decomposition.
      Default value: `True` (i.e., "prefer speed via Cholesky decomposition").
    name: Python `str` used as name prefix to ops created by this function.
      Default value: `"fit_one_step"`.

  Returns:
    model_coefficients: (Batch of) vector-shaped `Tensor`; represents the
      next estimate of the model coefficients, one for each column in
      `model_matrix`.
    predicted_linear_response: `response`-shaped `Tensor` representing linear
      predictions based on new `model_coefficients`, i.e.,
      `tf.linalg.matvec(model_matrix, model_coefficients_next) + offset`.
  """
  graph_deps = [model_matrix, response, model_coefficients_start,
                predicted_linear_response_start, dispersion, learning_rate]
  with tf.name_scope(name, 'fit_one_step', graph_deps):

    # Normalize the inputs. `prepare_args` is defined elsewhere; the defaults
    # described in the docstring are presumably filled in there -- TODO confirm.
    [
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset,
    ] = prepare_args(
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset)

    # Compute: mean, grad(mean, predicted_linear_response_start), and variance.
    mean, variance, grad_mean = model(predicted_linear_response_start)

    # If either `grad_mean` or `variance` is non-finite or zero, then we'll
    # replace it with a value such that the row is zeroed out. Although this
    # procedure may seem circuitous, it is necessary to ensure this algorithm is
    # itself differentiable.
    # Note: validity is decided here, before `variance` is scaled by
    # `dispersion` further below.
    is_valid = (tf.is_finite(grad_mean) & tf.not_equal(grad_mean, 0.) &
                tf.is_finite(variance) & (variance > 0.))
    # Returns `x` where `is_valid` holds and the scalar `mask` (broadcast to
    # the shape and dtype of `x`) everywhere else.
    def mask_if_invalid(x, mask):
      mask = tf.fill(tf.shape(x), value=np.array(mask, x.dtype.as_numpy_dtype))
      return tf.where(is_valid, x, mask)

    # Run one step of iteratively reweighted least-squares.
    # Compute "`z`", the adjusted predicted linear response.
    # z = predicted_linear_response_start
    #     + learning_rate * (response - mean) / grad_mean
    # Invalid `grad_mean` entries are replaced by 1 so the division is safe;
    # those rows get zero weight below, so their `z` value does not matter.
    z = (response - mean) / mask_if_invalid(grad_mean, 1.)
    # TODO(jvdillon): Rather than use learning rate, we should consider using
    # backtracking line search.
    if learning_rate is not None:
      # The trailing axis lets a (batch of) scalar learning rate broadcast
      # across the per-sample axis of `z`.
      z *= learning_rate[..., tf.newaxis]
    z += predicted_linear_response_start

    # Compute "`w`", the per-sample weight.
    if dispersion is not None:
      # For convenience, we'll now scale the variance by the dispersion factor.
      variance *= dispersion
    # w = grad_mean / sqrt(variance). For invalid entries this becomes
    # 0 * rsqrt(inf) = 0, which zeroes out the corresponding row.
    w = (mask_if_invalid(grad_mean, 0.) *
         tf.rsqrt(mask_if_invalid(variance, np.inf)))

    # Scale each row of the model matrix and each element of `z` by its weight.
    a = model_matrix * w[..., tf.newaxis]
    b = z * w
    # Solve `min{ || A @ model_coefficients - b ||_2**2 : model_coefficients }`
    # where `@` denotes `matmul`.

    if l2_regularizer is None:
      l2_regularizer = np.array(0, a.dtype.as_numpy_dtype)
    else:
      # Prefer a statically known value when one can be extracted; otherwise
      # keep the original (possibly dynamic) `l2_regularizer`.
      l2_regularizer_ = distribution_util.maybe_get_static_value(
          l2_regularizer, a.dtype.as_numpy_dtype)
      if l2_regularizer_ is not None:
        l2_regularizer = l2_regularizer_

    # NOTE: this closure reads `a`, `b` and `l2_regularizer` from the enclosing
    # scope; it is handed to `smart_cond` below before those names are rebound
    # by the assignment of `smart_cond`'s result.
    def _embed_l2_regularization():
      """Adds synthetic observations to implement L2 regularization."""
      # `tf.matrix_solve_ls` does not respect the `l2_regularization` argument
      # when `fast_unsafe_numerics` is `False`. This function adds synthetic
      # observations to the data to implement the regularization instead.
      # Adding observations `sqrt(l2_regularizer) * I` is mathematically
      # equivalent to adding the term
      # `-l2_regularizer ||coefficients||_2**2` to the log-likelihood.
      num_model_coefficients = num_cols(model_matrix)
      batch_shape = tf.shape(model_matrix)[:-2]
      eye = tf.eye(
          num_model_coefficients, batch_shape=batch_shape, dtype=a.dtype)
      # Append the scaled identity as extra rows of `a` and extend `b` by the
      # same number of trailing entries.
      a_ = tf.concat([a, tf.sqrt(l2_regularizer) * eye], axis=-2)
      b_ = distribution_util.pad(
          b, count=num_model_coefficients, axis=-1, back=True)
      # Return l2_regularizer=0 since it's now embedded.
      l2_regularizer_ = np.array(0, a.dtype.as_numpy_dtype)
      return a_, b_, l2_regularizer_

    # Embed the regularization only when the QR-based (non-fast) solve is
    # requested and the regularizer is positive; otherwise pass the system
    # through unchanged.
    a, b, l2_regularizer = tf.contrib.framework.smart_cond(
        smart_reduce_all([not(fast_unsafe_numerics),
                          l2_regularizer > 0.]),
        _embed_l2_regularization,
        lambda: (a, b, l2_regularizer))

    # `b` is given a trailing axis to form a single-column right-hand side.
    model_coefficients_next = tf.matrix_solve_ls(
        a, b[..., tf.newaxis],
        fast=fast_unsafe_numerics,
        l2_regularizer=l2_regularizer,
        name='model_coefficients_next')
    # Drop the trailing single-column axis again.
    model_coefficients_next = model_coefficients_next[..., 0]

    # TODO(b/79122261): The approach used in `matrix_solve_ls` could be made
    # faster by avoiding explicitly forming Q and instead keeping the
    # factorization in 'implicit' form with stacked (rescaled) Householder
    # vectors underneath the 'R' and then applying the (accumulated)
    # reflectors in the appropriate order to apply Q'. However, we don't
    # presently do this because we lack core TF functionality. For reference,
    # the vanilla QR approach is:
    #   q, r = tf.linalg.qr(a)
    #   c = tf.matmul(q, b, adjoint_a=True)
    #   model_coefficients_next = tf.matrix_triangular_solve(
    #       r, c, lower=False, name='model_coefficients_next')

    # Linear predictions for the updated coefficients (uses the original,
    # unweighted `model_matrix`).
    predicted_linear_response_next = calculate_linear_predictor(
        model_matrix,
        model_coefficients_next,
        offset,
        name='predicted_linear_response_next')

    return model_coefficients_next, predicted_linear_response_next
Example #24
0
def mask_nans(x):
  """Returns `x` with every non-finite entry replaced by zero.

  Despite the name, the mask is built with `tf.is_finite`, so infinities are
  zeroed as well as NaNs.

  Args:
    x: Numeric `Tensor`.

  Returns:
    A `Tensor` equal to `x` where `x` is finite and `0` everywhere else.
  """
  zeros = tf.zeros_like(x)
  return tf.where(tf.is_finite(x), x, zeros)
Example #25
0
 def process(self, grads):
     """Sanity-check a sequence of (gradient, variable) pairs.

     Every gradient must be non-None. A finiteness `tf.Assert` op is also
     built for each variable, but its result is discarded here, so nothing in
     this method forces that op to run (see the TODO below).

     Args:
         grads: iterable of `(gradient, variable)` pairs.

     Returns:
         The `grads` argument itself, unmodified.

     Raises:
         AssertionError: if any gradient in `grads` is None.
     """
     for gradient, variable in grads:
         assert gradient is not None, "Grad is None for variable {}".format(
             variable.name)
         # TODO make assert work
         all_finite = tf.reduce_all(tf.is_finite(variable))
         tf.Assert(all_finite, [variable])
     return grads
Example #26
0
    def apply_updates(self, allow_no_op: bool = False) -> tf.Operation:
        """Construct training op to update the registered variables based on their gradients.

        The graph is built in several stages: per-device gradient clean-up and
        scaling, cross-device summation, optional accumulation over several
        minibatches, the overflow-guarded optimizer update, optional loss
        scaling adjustment, and statistics reporting. This method may only be
        called once per instance (guarded by `self._updates_applied`).

        Args:
            allow_no_op: If True and no devices have been registered, return
                a `tf.no_op` instead of building the full update graph.

        Returns:
            A single op named "TrainingOp" grouping every op built here.
        """
        tfutil.assert_tf_initialized()
        assert not self._updates_applied
        self._updates_applied = True
        all_ops = []

        # Check for no-op.
        if allow_no_op and len(self._devices) == 0:
            with tfutil.absolute_name_scope(self.scope):
                return tf.no_op(name='TrainingOp')

        # Clean up gradients.
        for device_idx, device in enumerate(self._devices.values()):
            with tfutil.absolute_name_scope(self.scope + "/Clean%d" %
                                            device_idx), tf.device(
                                                device.name):
                for var, grad in device.grad_raw.items():

                    # Filter out disconnected gradients and convert to float32.
                    grad = [g for g in grad if g is not None]
                    grad = [tf.cast(g, tf.float32) for g in grad]

                    # Sum within the device.
                    if len(grad) == 0:
                        grad = tf.zeros(var.shape)  # No gradients => zero.
                    elif len(grad) == 1:
                        grad = grad[0]  # Single gradient => use as is.
                    else:
                        grad = tf.add_n(grad)  # Multiple gradients => sum.

                    # Scale as needed.
                    # The divisor uses the raw gradient list, so entries that
                    # were None (filtered out above) still count towards it.
                    scale = 1.0 / len(device.grad_raw[var]) / len(
                        self._devices)
                    scale = tf.constant(scale, dtype=tf.float32, name="scale")
                    if self.minibatch_multiplier is not None:
                        scale /= tf.cast(self.minibatch_multiplier, tf.float32)
                    scale = self.undo_loss_scaling(scale)
                    device.grad_clean[var] = grad * scale

        # Sum gradients across devices.
        if len(self._devices) > 1:
            with tfutil.absolute_name_scope(self.scope +
                                            "/Broadcast"), tf.device(None):
                # Each `all_vars` tuple holds the i-th variable of every
                # device. NOTE(review): this relies on all devices listing
                # corresponding variables in the same order -- confirm.
                for all_vars in zip(*[
                        device.grad_clean.keys()
                        for device in self._devices.values()
                ]):
                    if len(all_vars) > 0 and all(
                            dim > 0 for dim in all_vars[0].shape.as_list()
                    ):  # NCCL does not support zero-sized tensors.
                        all_grads = [
                            device.grad_clean[var] for device, var in zip(
                                self._devices.values(), all_vars)
                        ]
                        all_grads = nccl_ops.all_sum(all_grads)
                        # Every device's clean gradient is replaced by its
                        # output of the all-sum.
                        for device, var, grad in zip(self._devices.values(),
                                                     all_vars, all_grads):
                            device.grad_clean[var] = grad

        # Apply updates separately on each device.
        for device_idx, device in enumerate(self._devices.values()):
            with tfutil.absolute_name_scope(self.scope + "/Apply%d" %
                                            device_idx), tf.device(
                                                device.name):
                # pylint: disable=cell-var-from-loop

                # Accumulate gradients over time.
                if self.minibatch_multiplier is None:
                    # No accumulation: every call is a full step.
                    acc_ok = tf.constant(True, name='acc_ok')
                    device.grad_acc = OrderedDict(device.grad_clean)
                else:
                    # Create variables.
                    # `control_dependencies(None)` clears any inherited
                    # control dependencies while the variables are created.
                    with tf.control_dependencies(None):
                        for var in device.grad_clean.keys():
                            device.grad_acc_vars[var] = tf.Variable(
                                tf.zeros(var.shape),
                                trainable=False,
                                name="grad_acc_var")
                        device.grad_acc_count = tf.Variable(
                            tf.zeros([]),
                            trainable=False,
                            name="grad_acc_count")

                    # Track counter.
                    # `acc_ok` becomes true once the counter, including the
                    # current call, reaches `minibatch_multiplier`; the
                    # counter is then reset, otherwise it is incremented.
                    count_cur = device.grad_acc_count + 1.0
                    count_inc_op = lambda: tf.assign(device.grad_acc_count,
                                                     count_cur)
                    count_reset_op = lambda: tf.assign(device.grad_acc_count,
                                                       tf.zeros([]))
                    acc_ok = (count_cur >= tf.cast(self.minibatch_multiplier,
                                                   tf.float32))
                    all_ops.append(
                        tf.cond(acc_ok, count_reset_op, count_inc_op))

                    # Track gradients.
                    for var, grad in device.grad_clean.items():
                        acc_var = device.grad_acc_vars[var]
                        acc_cur = acc_var + grad
                        device.grad_acc[var] = acc_cur
                        # NOTE(review): the dependency on `acc_cur` is
                        # presumably meant to make sure the running sum is
                        # read before the accumulator is overwritten.
                        with tf.control_dependencies([acc_cur]):
                            # These lambdas capture loop variables, but they
                            # are handed to `tf.cond` within the same
                            # iteration in which they are defined.
                            acc_inc_op = lambda: tf.assign(acc_var, acc_cur)
                            acc_reset_op = lambda: tf.assign(
                                acc_var, tf.zeros(var.shape))
                            all_ops.append(
                                tf.cond(acc_ok, acc_reset_op, acc_inc_op))

                # No overflow => apply gradients.
                # `all_ok` requires both a completed accumulation cycle and
                # every accumulated gradient to be finite.
                all_ok = tf.reduce_all(
                    tf.stack([acc_ok] + [
                        tf.reduce_all(tf.is_finite(g))
                        for g in device.grad_acc.values()
                    ]))
                # Gradients are cast back to each variable's dtype before
                # being passed to the optimizer.
                apply_op = lambda: device.optimizer.apply_gradients(
                    [(tf.cast(grad, var.dtype), var)
                     for var, grad in device.grad_acc.items()])
                all_ops.append(tf.cond(all_ok, apply_op, tf.no_op))

                # Adjust loss scaling.
                # Only touched when an accumulation cycle completes: raised
                # by `loss_scaling_inc` if the step was applied, lowered by
                # `loss_scaling_dec` otherwise.
                if self.use_loss_scaling:
                    ls_inc_op = lambda: tf.assign_add(device.loss_scaling_var,
                                                      self.loss_scaling_inc)
                    ls_dec_op = lambda: tf.assign_sub(device.loss_scaling_var,
                                                      self.loss_scaling_dec)
                    ls_update_op = lambda: tf.group(
                        tf.cond(all_ok, ls_inc_op, ls_dec_op))
                    all_ops.append(tf.cond(acc_ok, ls_update_op, tf.no_op))

                # Last device => report statistics.
                if device_idx == len(self._devices) - 1:
                    all_ops.append(
                        autosummary.autosummary(self.id + "/learning_rate",
                                                self.learning_rate))
                    # Reported value is 0 when the step was applied and 1
                    # otherwise; presumably only recorded when `acc_ok`
                    # holds -- TODO confirm against `autosummary`.
                    all_ops.append(
                        autosummary.autosummary(self.id +
                                                "/overflow_frequency",
                                                tf.where(all_ok, 0, 1),
                                                condition=acc_ok))
                    if self.use_loss_scaling:
                        all_ops.append(
                            autosummary.autosummary(
                                self.id + "/loss_scaling_log2",
                                device.loss_scaling_var))

        # Initialize variables.
        self.reset_optimizer_state()
        if self.use_loss_scaling:
            tfutil.init_uninitialized_vars(
                [device.loss_scaling_var for device in self._devices.values()])
        if self.minibatch_multiplier is not None:
            # Run the initializers of the accumulator variables and counters
            # created above.
            tfutil.run([
                var.initializer for device in self._devices.values()
                for var in list(device.grad_acc_vars.values()) +
                [device.grad_acc_count]
            ])

        # Group everything into a single op.
        with tfutil.absolute_name_scope(self.scope):
            return tf.group(*all_ops, name="TrainingOp")