Example #1
0
  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    """Builds the RMSprop update for the rows of `var` selected by `indices`.

    Args:
      grad: Gradient values for the selected rows.
      var: Resource variable to update.
      indices: Row indices of `var` that `grad` refers to.
      apply_state: Optional dict keyed by `(device, dtype)` holding
        precomputed coefficients. When the lookup yields nothing, the
        coefficients come from `self._fallback_apply_state`.

    Returns:
      With momentum enabled, the result of the fused
      `ResourceSparseApplyRMSProp` / `ResourceSparseApplyCenteredRMSProp` raw
      op. Otherwise a `tf.group` of the variable update and the slot updates.
    """
    device, dtype = var.device, var.dtype.base_dtype
    coefficients = (apply_state or {}).get((device, dtype))
    if not coefficients:
      coefficients = self._fallback_apply_state(device, dtype)

    rms = self.get_slot(var, "rms")

    if self._momentum:
      # Momentum path: delegate everything to the fused kernels.
      mom = self.get_slot(var, "momentum")
      if not self.centered:
        return tf.raw_ops.ResourceSparseApplyRMSProp(
            var=var.handle,
            ms=rms.handle,
            mom=mom.handle,
            lr=coefficients["lr_t"],
            rho=coefficients["rho"],
            momentum=coefficients["momentum"],
            epsilon=coefficients["epsilon"],
            grad=grad,
            indices=indices,
            use_locking=self._use_locking)
      mg = self.get_slot(var, "mg")
      return tf.raw_ops.ResourceSparseApplyCenteredRMSProp(
          var=var.handle,
          mg=mg.handle,
          ms=rms.handle,
          mom=mom.handle,
          lr=coefficients["lr_t"],
          rho=coefficients["rho"],
          momentum=coefficients["momentum"],
          epsilon=coefficients["epsilon"],
          grad=grad,
          indices=indices,
          use_locking=self._use_locking)

    # Momentum-free path: decay the whole accumulator, then add the new
    # contribution only at `indices`.
    squared_grad_term = (grad * grad) * coefficients["one_minus_rho"]
    rms_op = tf.compat.v1.assign(
        rms, rms * coefficients["rho"], use_locking=self._use_locking)
    with tf.control_dependencies([rms_op]):
      rms_op = self._resource_scatter_add(rms, indices, squared_grad_term)
      rms_rows = tf.gather(rms_op, indices)

    slot_ops = [rms_op]
    denom_rows = rms_rows
    if self.centered:
      # Centered variant: subtract the squared running mean of the gradient.
      mg = self.get_slot(var, "mg")
      grad_term = grad * coefficients["one_minus_rho"]
      mg_op = tf.compat.v1.assign(
          mg, mg * coefficients["rho"], use_locking=self._use_locking)
      with tf.control_dependencies([mg_op]):
        mg_op = self._resource_scatter_add(mg, indices, grad_term)
        mg_rows = tf.gather(mg_op, indices)
        denom_rows = rms_rows - tf.square(mg_rows)
      slot_ops.append(mg_op)

    step = coefficients["neg_lr_t"] * grad / (
        tf.sqrt(denom_rows) + coefficients["epsilon"])
    var_update = self._resource_scatter_add(var, indices, step)
    return tf.group(var_update, *slot_ops)
Example #2
0
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        """Builds the Adam update for a sparse gradient.

        Args:
          grad: Gradient values for the rows named by `indices`.
          var: Resource variable to update.
          indices: Row indices that `grad` refers to.
          apply_state: Optional dict keyed by `(device, dtype)` holding
            precomputed coefficients; `self._fallback_apply_state` is used
            when the lookup yields nothing.

        Returns:
          A `tf.group` of the variable update and the moment updates (plus the
          `vhat` update when `self.amsgrad` is set).
        """
        device, dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get((device, dtype))
        if not coefficients:
            coefficients = self._fallback_apply_state(device, dtype)

        # First moment: m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_increment = grad * coefficients["one_minus_beta_1_t"]
        m_op = tf.compat.v1.assign(
            m, m * coefficients["beta_1_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([m_op]):
            m_op = self._resource_scatter_add(m, indices, m_increment)

        # Second moment: v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_increment = (grad * grad) * coefficients["one_minus_beta_2_t"]
        v_op = tf.compat.v1.assign(
            v, v * coefficients["beta_2_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([v_op]):
            v_op = self._resource_scatter_add(v, indices, v_increment)

        slot_ops = [m_op, v_op]
        if self.amsgrad:
            # AMSGrad keeps the running maximum of the second moment and
            # normalizes by that instead of by v_t.
            v_hat = self.get_slot(var, "vhat")
            v_hat_op = tf.maximum(v_hat, v_op)
            with tf.control_dependencies([v_hat_op]):
                v_hat_op = tf.compat.v1.assign(
                    v_hat, v_hat_op, use_locking=self._use_locking
                )
            slot_ops.append(v_hat_op)
            denom_source = v_hat_op
        else:
            denom_source = v_op

        root = tf.sqrt(denom_source)
        var_update = tf.compat.v1.assign_sub(
            var,
            coefficients["lr"] * m_op / (root + coefficients["epsilon"]),
            use_locking=self._use_locking,
        )
        return tf.group(var_update, *slot_ops)
    def __init__(self,
                 feature_ndims,
                 dtype=None,
                 name=None,
                 validate_args=False,
                 parameters=None):
        """Construct a PositiveSemidefiniteKernel (subclass) instance.

        Inputs to PositiveSemidefiniteKernel methods partition into 3 pieces:

        ```none
        [b1, ..., bB, e1, ..., eE, f1, ..., fF]
        '----------'  '---------'  '---------'
             |             |            '-- Feature dimensions
             |             '-- Example dimensions
             '-- Batch dimensions
        ```

        The `feature_ndims` argument declares how many of the right-most shape
        dimensions belong to the feature dimensions. This enables us to
        predict which shape dimensions will be 'reduced' away during kernel
        computation.

        Args:
          feature_ndims: Python `integer` indicating the number of dims (the
            rank) of the feature space this kernel acts on.
          dtype: `DType` on which this kernel operates.
          name: Python `str` name prefixed to Ops created by this class.
            Default: subclass name.
          validate_args: Python `bool`, default `False`. When `True` kernel
            parameters are checked for validity despite possibly degrading
            runtime performance. When `False` invalid inputs may silently
            render incorrect outputs.
          parameters: Python `dict` of constructor arguments.

        Raises:
          ValueError: if `feature_ndims` is not an integer greater than 0.
        """
        ndims_ok = isinstance(feature_ndims, int) and feature_ndims > 0
        if not ndims_ok:
            message = (
                '`feature_ndims` must be a Python `integer` greater than zero. '
                + 'Got: {}'.format(feature_ndims))
            raise ValueError(message)
        self._feature_ndims = feature_ndims
        self._dtype = dtype

        # A trailing '/' marks `name` as an already-resolved name scope;
        # anything else is resolved through `tf.name_scope`.
        already_scoped = bool(name) and name[-1] == '/'
        if not already_scoped:
            name = tf.name_scope(name or type(self).__name__).name
        self._name = name
        self._validate_args = validate_args

        if parameters is not None:
            # Drop `self` references and dunder-prefixed keys.
            cleaned = {}
            for key, value in parameters.items():
                if value is self or key.startswith('__'):
                    continue
                cleaned[key] = value
            parameters = cleaned
        self._parameters = self._no_dependency(parameters)

        init_deps = tuple(
            dep
            for dep in self._parameter_control_dependencies(is_init=True)
            if dep is not None)
        self._initial_parameter_control_dependencies = init_deps
        if init_deps:
            # Collapse all initial dependencies into a single grouped op.
            self._initial_parameter_control_dependencies = (
                tf.group(*init_deps), )
Example #4
0
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        """Builds the Adamax update for the rows of `var` named by `indices`.

        Args:
          grad: Gradient values for the selected rows.
          var: Resource variable to update.
          indices: Row indices that `grad` refers to.
          apply_state: Optional dict keyed by `(device, dtype)` holding
            precomputed coefficients; `self._fallback_apply_state` is used
            when the lookup yields nothing.

        Returns:
          A `tf.group` of the variable update and both slot updates.
        """
        device, dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get((device, dtype))
        if not coefficients:
            coefficients = self._fallback_apply_state(device, dtype)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, 'm')
        m_rows = tf.gather(m, indices, axis=coefficients['zero'])
        decayed_m = m_rows * coefficients['beta_1_t']
        new_m_rows = decayed_m + grad * coefficients['one_minus_beta_1_t']
        with tf.control_dependencies([new_m_rows]):
            m_op = self._resource_scatter_update(m, indices, new_m_rows)

        # u_t = max(beta2 * u, abs(g_t))
        v = self.get_slot(var, 'v')
        v_rows = tf.gather(v, indices, axis=coefficients['zero'])
        new_v_rows = tf.maximum(
            v_rows * coefficients['beta_2_t'], tf.abs(grad))
        with tf.control_dependencies([new_v_rows]):
            v_op = self._resource_scatter_update(v, indices, new_v_rows)

        # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t
        ratio = new_m_rows / (new_v_rows + coefficients['epsilon'])
        delta_rows = coefficients['neg_scaled_lr'] * ratio
        with tf.control_dependencies([delta_rows]):
            var_update = self._resource_scatter_add(var, indices, delta_rows)
        return tf.group(var_update, m_op, v_op)
  def _apply_gradients_cross_replica(self, distribution, grads_and_vars, name,
                                     experimental_aggregate_gradients):
    """Applies gradients conditionally, from a cross-replica context.

    Args:
      distribution: Distribution strategy used to run the per-replica apply.
      grads_and_vars: Re-iterable collection of `(gradient, variable)` pairs.
      name: Forwarded to `self._apply_gradients`.
      experimental_aggregate_gradients: Forwarded to `self._apply_gradients`.

    Returns:
      A `tf.group` of the conditional apply op and the loss-scale update op.
    """
    grads = [grad for grad, _ in grads_and_vars]

    if not isinstance(self._loss_scale, _DynamicLossScaleState):
      # Without a dynamic loss scale there is nothing to update and the
      # gradients are always applied.
      loss_scale_update_op, should_apply_grads = tf.no_op(), True
    else:
      loss_scale_update_op, should_apply_grads = self._loss_scale.update(grads)

    def _skip_step():
      # Normally self._optimizer.iterations is incremented in
      # self._optimizer.apply_gradients(). Since that is not called in this
      # branch, we increment it here instead.
      return self._optimizer.iterations.assign_add(1, read_value=False)

    def _apply_step():
      # We do not want DistributionStrategy to unwrap any MirroredVariables in
      # grads_and_vars, because even in a replica context, the wrapped
      # optimizer expects mirrored variables. Wrapping the variables in an
      # _UnwrapPreventer stops DistributionStrategy from unwrapping them.
      variables = [var for _, var in grads_and_vars]
      wrapped_vars = _UnwrapPreventer(variables)
      replica_args = (grads, wrapped_vars, name,
                      experimental_aggregate_gradients)
      return distribution.extended.call_for_each_replica(
          self._apply_gradients, args=replica_args)

    # Note: We must call this cond() in a cross-replica context.
    # DistributionStrategy does not support having a cond in a replica context
    # with a branch that calls `merge_call`, and self._optimizer.apply_gradients
    # calls `merge_call`.
    maybe_apply_op = smart_cond.smart_cond(
        should_apply_grads, _apply_step, _skip_step)
    return tf.group(maybe_apply_op, loss_scale_update_op)
Example #6
0
  def _train_v1(self, experience, weights):
    """Builds one training step: loss, gradients, apply, target update.

    Args:
      experience: Passed through to `self._loss`.
      weights: Passed through to `self._loss` as `weights`.

    Returns:
      A `(train_op, loss_info)` tuple, where `train_op` groups the optimizer
      step with the op returned by `self._update_target()`.
    """
    with tf.GradientTape() as tape:
      loss_info = self._loss(
          experience,
          td_errors_loss_fn=self._td_errors_loss_fn,
          gamma=self._gamma,
          reward_scale_factor=self._reward_scale_factor,
          weights=weights)
    tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')

    trainable = self._q_network.trainable_weights
    assert list(trainable), "No variables in the agent's q_network."
    gradients = tape.gradient(loss_info.loss, trainable)

    # Materialize the pairs: in py3 `zip` is a one-shot iterator and the pairs
    # are consumed more than once below.
    grads_and_vars = tuple(zip(gradients, trainable))

    clip_norm = self._gradient_clipping
    if clip_norm is not None:
      grads_and_vars = eager_utils.clip_gradient_norms(
          grads_and_vars, clip_norm)

    if self._summarize_grads_and_vars:
      eager_utils.add_variables_summaries(
          grads_and_vars, self.train_step_counter)
      eager_utils.add_gradients_summaries(
          grads_and_vars, self.train_step_counter)

    apply_op = self._optimizer.apply_gradients(
        grads_and_vars, global_step=self.train_step_counter)
    target_update_op = self._update_target()

    return tf.group(apply_op, target_update_op), loss_info
Example #7
0
    def _apply_dense(self, grad, var):
        """Builds the Adam step with decoupled weight decay for a dense grad.

        Args:
          grad: Dense gradient for `var`.
          var: Variable to update.

        Returns:
          A `tf.group` of the assignments to `var` and to its `m`/`v` slots.
        """
        param_name = self._get_variable_name(var.name)
        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

        # Standard Adam moment estimates.
        decayed_m = tf.multiply(self.beta_1, m)
        next_m = decayed_m + tf.multiply(1.0 - self.beta_1, grad)
        decayed_v = tf.multiply(self.beta_2, v)
        next_v = decayed_v + tf.multiply(1.0 - self.beta_2, tf.square(grad))

        update = next_m / (tf.sqrt(next_v) + self.epsilon)

        # Adding the squared weights to the loss is *not* the right way to do
        # L2 regularization/weight decay with Adam, because that term would
        # then flow through the m and v estimates.
        #
        # The decay is applied directly to the update instead, so it does not
        # interact with m/v. This matches adding the squared weights to the
        # loss under plain (non-momentum) SGD.
        if self._do_use_weight_decay(param_name):
            update = update + self.weight_decay_rate * var

        next_param = var - self.learning_rate * update

        assignments = [
            var.assign(next_param),
            m.assign(next_m),
            v.assign(next_v),
        ]
        return tf.group(assignments)
        def update_if_not_finite_grads():
            """Builds the update used when the gradients are nonfinite.

            Divides the current loss scale by `self.multiplier` (never going
            below 1) and resets `self.counter` to 0.
            """
            reduced_scale = self.current_loss_scale / self.multiplier
            new_loss_scale = tf.maximum(reduced_scale, 1)
            reset_counter = self.counter.assign(0)
            store_scale = self.current_loss_scale.assign(new_loss_scale)
            return tf.group(reset_counter, store_scale)
def visualize_tfrecords(path_to_tfrecord, num_vids, num_skip_frames):
    """Visualizes TFRecords in given path.

    Args:
      path_to_tfrecord: string, glob search pattern matching the TFRecord
        files to read.
      num_vids: integer, Number of videos to visualize.
      num_skip_frames: integer, Stride between displayed frames (used as the
        step of the frame index range).
    """
    tfrecord_files = glob.glob(path_to_tfrecord)
    tfrecord_files.sort()

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    dataset = tf.data.TFRecordDataset(tfrecord_files)
    dataset = dataset.map(decode)
    dataset = dataset.batch(1)

    iterator = dataset.make_one_shot_iterator()
    next_batch = iterator.get_next()

    # Scope the session with `with` so it is closed even if fetching a batch
    # or plotting raises; previously the session was never closed.
    with tf.Session() as sess:
        sess.run(init_op)

        for _ in range(num_vids):

            # Fetch a new batch from the dataset
            batch_videos, batch_names = sess.run(next_batch)
            tf.logging.info('Class label = %d', batch_names[0])
            for frame_idx in range(0, len(batch_videos[0]), num_skip_frames):
                plt.imshow(batch_videos[0, frame_idx])
                plt.pause(0.1)
                plt.clf()
Example #10
0
    def add_step(self, env_step: EnvStep):
        """Appends `env_step` to the dataset tables.

        Args:
          env_step: Step to add; must have the same structure as `self._spec`.

        Returns:
          A grouped op covering the episode-info write and the step write,
          plus the valid-step write when the step is not a last step.

        Raises:
          ValueError: If the step counter has reached `self._capacity - 1`, or
            if the episode counter is still negative after processing this
            step's type.
        """
        tf.nest.assert_same_structure(env_step, self._spec)

        with tf.device(self._device):
            if self._last_step_id >= self._capacity - 1:
                # TODO(ofirnachum): implement circular dataset.
                raise ValueError('Dataset is over capacity.')
            self._last_step_id.assign_add(1)

            step_type = env_step.step_type
            if StepType.is_first(step_type):
                # A FIRST step opens a new episode.
                self._last_episode_id.assign_add(1)

            if self._last_episode_id < 0:
                raise ValueError(
                    'First added step must have type StepType.FIRST.')

            episode_id = self._last_episode_id
            step_id = self._last_step_id
            previous_info = self._episode_info_table.read(episode_id)

            if not StepType.is_first(step_type):
                # Keep the recorded start; this step becomes the new end.
                start_id = previous_info.episode_start_id
                start_type = previous_info.episode_start_type
            else:
                # The episode so far is just this single step.
                start_id, start_type = step_id, step_type
            episode_info = EpisodeInfo(start_id, step_id, start_type,
                                       step_type)

            episode_write = self._episode_info_table.write(
                episode_id, episode_info)
            step_write = self._data_table.write(step_id, env_step)
            ret_op = tf.group(episode_write, step_write)

            if StepType.is_last(step_type):
                return ret_op

            # Non-last steps are valid for sampling; record them.
            self._last_valid_steps_id.assign_add(1)
            valid_write = self._valid_steps_table.write(
                self._last_valid_steps_id, step_id)
            return tf.group(ret_op, valid_write)
 def step_fn(ctx, inputs):
   """Runs `model_fn` on every replica and groups the local results.

   When `update_ops_in_cross_replica_mode` is set, the ops in the
   `UPDATE_OPS` collection are appended to the grouped fetches.
   """
   del ctx  # Unused
   per_replica = distribution.extended.call_for_each_replica(
       model_fn, args=(inputs,))
   fetches = distribution.experimental_local_results(per_replica)
   if update_ops_in_cross_replica_mode:
     update_ops = tf.compat.v1.get_collection(
         tf.compat.v1.GraphKeys.UPDATE_OPS)
     fetches += tuple(update_ops)
   return tf.group(fetches)
 def metric_fn():
     """Returns a metric whose update op is a grouped op, not a Tensor."""
     v1 = tf_compat.v1
     var = v1.get_variable(
         "metric_var",
         shape=[],
         trainable=False,
         initializer=v1.zeros_initializer(),
         collections=[v1.GraphKeys.LOCAL_VARIABLES])
     # Wrapping the increment in `tf.group` yields an op that doesn't return
     # a Tensor.
     increment = v1.assign_add(var, 1)
     op = tf.group(increment)
     return {"operation_metric": (var, op)}
Example #13
0
def variables_load(filename, variables):
    """Assigns values to structure of `tf.Variable`s from `filename`.

    Args:
      filename: Path passed to `np.load`.
      variables: Nested structure of variables; it is flattened and paired
        positionally with the entries stored in the file.

    Returns:
      A `tf.group` of the per-variable assignments.

    Raises:
      ValueError: If the file holds a different number of entries than there
        are flattened variables.
    """
    with np.load(filename) as data:
        flat_vars = tf.nest.flatten(variables)
        if len(flat_vars) != len(data):
            raise ValueError('File "{}" has incorrect number of variables '
                             '(saw: {}, expected: {}).'.format(
                                 filename, len(data), len(flat_vars)))
        assignments = []
        for variable, (_, value) in zip(flat_vars, list(data.items())):
            assignments.append(variable.assign(value))
        return tf.group(assignments)
    def _resource_apply_dense(self, grad, param, apply_state=None):
        """Builds the momentum update with an optional layer-wise trust ratio.

        Args:
          grad: Dense gradient for `param`, or None.
          param: Variable to update, or None.
          apply_state: Optional dict keyed by `(device, dtype)` holding
            precomputed coefficients; `self._fallback_apply_state` is used
            when the lookup yields nothing.

        Returns:
          `tf.no_op()` when `grad` or `param` is None; otherwise a `tf.group`
          of the assignments to `param` and to its "Momentum" slot.
        """
        if grad is None or param is None:
            return tf.no_op()

        device, dtype = param.device, param.dtype.base_dtype
        coefficients = (apply_state or {}).get((device, dtype))
        if not coefficients:
            coefficients = self._fallback_apply_state(device, dtype)
        learning_rate = coefficients["lr_t"]

        param_name = param.name
        v = self.get_slot(param, "Momentum")

        if self._use_weight_decay(param_name):
            grad += self.weight_decay * param

        def _trust_ratio(direction):
            # Ratio eeta * ||param|| / ||direction||, falling back to 1.0 when
            # either norm is zero or layer adaptation is off for this param.
            if not self._do_layer_adaptation(param_name):
                return 1.0
            w_norm = tf.norm(param, ord=2)
            d_norm = tf.norm(direction, ord=2)
            return tf.where(
                tf.greater(w_norm, 0),
                tf.where(tf.greater(d_norm, 0),
                         (self.eeta * w_norm / d_norm), 1.0),
                1.0)

        if self.classic_momentum:
            # The trust ratio is measured on the gradient and folded into the
            # learning rate before the momentum accumulation.
            scaled_lr = learning_rate * _trust_ratio(grad)
            next_v = tf.multiply(self.momentum, v) + scaled_lr * grad
            if self.use_nesterov:
                update = tf.multiply(self.momentum, next_v) + scaled_lr * grad
            else:
                update = next_v
            next_param = param - update
        else:
            # The trust ratio is measured on the accumulated update instead.
            next_v = tf.multiply(self.momentum, v) + grad
            if self.use_nesterov:
                update = tf.multiply(self.momentum, next_v) + grad
            else:
                update = next_v
            scaled_lr = _trust_ratio(update) * learning_rate
            next_param = param - scaled_lr * update

        param_assign = param.assign(next_param, use_locking=False)
        slot_assign = v.assign(next_v, use_locking=False)
        return tf.group(param_assign, slot_assign)
Example #15
0
 def _draw_bijector(self, bijector_name, data,
                    batch_shape=None, allowed_bijectors=None,
                    validate_args=True):
   """Draws an event size and a bijector, then initializes its variables.

   Returns:
     A `(bijector, event_dim)` tuple; `event_dim` is drawn from the integers
     in [2, 6].
   """
   event_dim = data.draw(hps.integers(min_value=2, max_value=6))
   bijector_strategy = bijectors(
       bijector_name=bijector_name,
       event_dim=event_dim,
       enable_vars=True,
       batch_shape=batch_shape,
       allowed_bijectors=allowed_bijectors,
       validate_args=validate_args)
   bijector = data.draw(bijector_strategy)
   initializers = [variable.initializer for variable in bijector.variables]
   self.evaluate(tf.group(*initializers))
   return bijector, event_dim
Example #16
0
 def _validate(self):
     """Groups the parameter assertions into one op named "ValidationOps".

     Checks that the scale and `high - low` are positive and that low, high,
     loc and scale are all finite.
     """
     scale_positive = assert_util.assert_positive(self._scale)
     width_positive = assert_util.assert_positive(self._high - self._low)
     low_finite = assert_util.assert_finite(
         self._low, message="Lower bound not finite")
     high_finite = assert_util.assert_finite(
         self._high, message="Upper bound not finite")
     loc_finite = assert_util.assert_finite(
         self._loc, message="Loc not finite")
     scale_finite = assert_util.assert_finite(
         self._scale, message="scale not finite")
     return tf.group(
         scale_positive,
         width_positive,
         low_finite,
         high_finite,
         loc_finite,
         scale_finite,
         name="ValidationOps")
Example #17
0
    def _apply_sparse_shared(self, grad, indices, var, scatter_update_fn):
        """Applies sparse gradients to a variable.

        Args:
          grad: A tensor for the `values` of `tf.IndexedSlices`.
          indices: A tensor for the `indices` of `tf.IndexedSlices`.
          var: A `tf.Variable` object.
          scatter_update_fn: A function which performs scattered update to
            a `tf.Variable` object. It takes tuple of (x, i, v) where:
              * x: A `tf.Variable` object which is updated by `i` and `v`,
              * i: A tensor for the `indices` of `tf.IndexedSlices`,
              * v: A tensor for the `values` of `tf.IndexedSlices`,
            and returns a tensor after updating `x`.

        Returns:
          An op which updates `var` with `grad` and `indices`.
        """
        param_name = self._get_variable_name(var.name)
        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

        # m_t = beta1 * m + (1 - beta1) * g_t
        # Decay the whole slot, then add the gradient term at `indices` only.
        m_grad_term = tf.multiply(1.0 - self.beta_1, grad)
        m_op = m.assign(m * self.beta_1)
        with tf.control_dependencies([m_op]):
            m_rows = tf.gather(m, indices) + m_grad_term
            m_op = scatter_update_fn(m, indices, m_rows)

        # v_t = beta2 * v + (1 - beta2) * g_t^2
        v_grad_term = tf.multiply(1.0 - self.beta_2, tf.square(grad))
        v_op = v.assign(v * self.beta_2)
        with tf.control_dependencies([v_op]):
            v_rows = tf.gather(v, indices) + v_grad_term
            v_op = scatter_update_fn(v, indices, v_rows)

        update = m_op / (tf.sqrt(v_op) + self.epsilon)

        # Adding the squared weights to the loss is *not* the right way to do
        # L2 regularization/weight decay with Adam, because that term would
        # then flow through the m and v estimates.
        #
        # The decay is applied directly to the update instead, so it does not
        # interact with m/v. This matches adding the squared weights to the
        # loss under plain (non-momentum) SGD.
        if self._do_use_weight_decay(param_name):
            update = update + self.weight_decay_rate * var

        next_param = var - self.learning_rate * update

        grouped = [var.assign(next_param), m_op, v_op]
        return tf.group(grouped)
        def _best_eval_metrics_fn(*args):
            """Returns the best eval metrics.

            Args:
              *args: Flat positional arguments. The last one is popped and
                averaged with `metrics.mean` to obtain the index of the best
                candidate. Of the remainder, the leading
                `len(candidate_args)` entries are packed as candidate metric
                arguments and the trailing `len(subnetwork_args)` entries as
                subnetwork metric arguments.

            Returns:
              A dict mapping each metric name to a `(value, update_op)` pair,
              where `value` is the selected candidate's metric value. It also
              contains an "iteration" entry and never contains a "loss" key.
            """

            with tf_compat.v1.variable_scope("best_eval_metrics"):
                # Copy to a list so the trailing index argument can be popped.
                args = list(args)
                idx, idx_update_op = tf_compat.v1.metrics.mean(args.pop())
                # The mean is cast to int32 so it can index the stacked values.
                idx = tf.cast(idx, tf.int32)
                metric_fns = self._candidates_eval_metrics_store.metric_fns
                metric_fn_args = self._candidates_eval_metrics_store.pack_args(
                    args[:len(candidate_args)])
                candidate_grouped_metrics = self._group_metric_ops(
                    metric_fns, metric_fn_args)

                metric_fns = self._subnetworks_eval_metrics_store.metric_fns
                metric_fn_args = self._subnetworks_eval_metrics_store.pack_args(
                    args[(len(args) - len(subnetwork_args)):])
                subnetwork_grouped_metrics = self._group_metric_ops(
                    metric_fns, metric_fn_args)

                eval_metric_ops = {}
                for metric_name in sorted(candidate_grouped_metrics):
                    metric_ops = candidate_grouped_metrics[metric_name]
                    # Skip metrics that do not have one entry per candidate.
                    if len(metric_ops) != len(self._candidates):
                        continue
                    # "loss" is never exported as its own metric; its ops are
                    # bundled into every other metric's update op below.
                    if metric_name == "loss":
                        continue
                    # Transpose the (value, op) pairs into values and ops.
                    values, ops = list(six.moves.zip(*metric_ops))
                    best_value = tf.stack(values)[idx]
                    # All tensors in this function have been outfed from the TPU, so we
                    # must update them manually, otherwise the TPU will hang indefinitely
                    # for the value of idx to update.
                    ops = list(ops)
                    ops.append(idx_update_op)
                    # Bundle subnetwork eval metric ops and ensemble "loss" ops (which
                    # is a restricted Estimator keyword) into other metric ops so that
                    # they are computed.
                    ensemble_loss_ops = candidate_grouped_metrics.get(
                        "loss", tf.no_op())
                    all_ops = tf.group(ops, ensemble_loss_ops,
                                       subnetwork_grouped_metrics)
                    eval_metric_ops[metric_name] = (best_value, all_ops)
                # The iteration number is a constant, so the same tensor is
                # used as both the value and the update op.
                iteration_number = tf.constant(self._iteration_number)
                eval_metric_ops["iteration"] = (iteration_number,
                                                iteration_number)

                if self._replay_indices_for_all:
                    _replay_eval_metrics(idx, eval_metric_ops)

                # tf.estimator.Estimator does not allow a "loss" key to be present in
                # its eval_metrics.
                assert "loss" not in eval_metric_ops
                return eval_metric_ops
Example #19
0
def _merge_train_op_list(train_op_list, no_train_speedup):
    """Merges a train_op list into one train_op.

    Args:
      train_op_list: Collection of train ops; may be empty.
      no_train_speedup: Amount added to the global step when there is nothing
        to train.

    Returns:
      The single op when the list has exactly one entry, a `tf.group` of all
      ops when it has several, or an assignment that advances the global step
      by `no_train_speedup` when it is empty.
    """
    if train_op_list:
        if len(train_op_list) == 1:
            return train_op_list[0]
        return tf.group(*train_op_list)

    # No parameters to train here, so only the global step moves forward,
    # by `no_train_speedup` per call.
    global_step = tf.compat.v1.train.get_or_create_global_step()
    return tf.compat.v1.assign(global_step, global_step + no_train_speedup)
Example #20
0
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        """Builds the Nadam update for the rows of `var` named by `indices`.

        Args:
          grad: Gradient values for the selected rows.
          var: Resource variable to update.
          indices: Row indices that `grad` refers to.
          apply_state: Optional dict keyed by `(device, dtype)` holding
            precomputed coefficients; `self._fallback_apply_state` is used
            when the lookup yields nothing.

        Returns:
          A `tf.group` of the variable update, the blended first-moment
          tensor and the second-moment update.
        """
        device, dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get((device, dtype))
        if not coefficients:
            coefficients = self._fallback_apply_state(device, dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

        g_prime = grad / coefficients["one_minus_m_schedule_new"]

        # m_t = beta1 * m + (1 - beta1) * g_t
        m_increment = grad * coefficients["one_minus_beta_1_t"]
        m_op = tf.compat.v1.assign(
            m, m * coefficients["beta_1_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([m_op]):
            m_op = self._resource_scatter_add(m, indices, m_increment)
            m_rows = tf.gather(m_op, indices)

        # Blend the rescaled gradient with the rescaled first moment.
        m_rows_prime = m_rows / coefficients["one_minus_m_schedule_next"]
        grad_part = coefficients["one_minus_m_t"] * g_prime
        m_bar = grad_part + coefficients["m_t_1"] * m_rows_prime

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v_increment = (grad * grad) * coefficients["one_minus_beta_2_t"]
        v_op = tf.compat.v1.assign(
            v, v * coefficients["beta_2_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([v_op]):
            v_op = self._resource_scatter_add(v, indices, v_increment)
            v_rows = tf.gather(v_op, indices)

        v_rows_prime = v_rows / coefficients["v_t_prime_denominator"]
        denom = tf.sqrt(v_rows_prime) + coefficients["epsilon"]

        step = coefficients["neg_lr_t"] * m_bar / denom
        var_update = self._resource_scatter_add(var, indices, step)
        return tf.group(var_update, m_bar, v_op)
Example #21
0
    def _resource_apply_dense(self, grad, handle):
        """Builds a momentum update that only rescales `handle`.

        The variable is multiplied by `new_scale / scale`, where `scale` is
        the square root of the sum of its squared entries and `new_scale`
        adds the momentum slot's new value to it. The slot moves by the sign
        of `sum(var * grad) / scale`, scaled by the learning rate.

        Args:
          grad: Dense gradient for the variable.
          handle: The variable to update.

        Returns:
          A grouped op. For variables with fewer than `self._mindim`
          dimensions it groups the variable and its slot without assigning
          anything; otherwise it groups the variable and slot assignments.
        """
        var = handle
        m = self.get_slot(var, "m")

        if len(var.shape) < self._mindim:
            # No assignment for low-rank variables. Uses the same `tf1` alias
            # as the rest of this method (was `tf.group`).
            return tf1.group(*[var, m])
        lr_t = tf1.cast(self._lr_t, var.dtype.base_dtype)
        momentum_t = tf1.cast(self._momentum_t, var.dtype.base_dtype)

        # The 1e-12 terms guard the divisions against a zero scale.
        scale = tf1.sqrt(tf1.reduce_sum(var**2))
        dscale = tf1.sign(tf1.reduce_sum(var * grad) / (scale + 1e-12))

        m_t = m.assign(momentum_t * m - lr_t * dscale)

        new_scale = scale + m_t
        var_update = tf1.assign(var, var * new_scale / (scale + 1e-12))
        return tf1.group(*[var_update, m_t])
            def _apply_gradients_cross_replica(distribution, grads,
                                               wrapped_vars, name):
                """Applies `grads` conditionally from a cross-replica context.

                Returns a `tf.group` of the conditional apply op and the
                loss-scale update op.
                """
                scale_state = _if_should_apply_grads(grads)
                loss_scale_update_op, should_apply_grads = scale_state

                def _apply_on_replicas():
                    replica_args = (grads, wrapped_vars, name)
                    return distribution.extended.call_for_each_replica(
                        self._apply_gradients, args=replica_args)

                # Note: We must call this cond() in a cross-replica context.
                # DistributionStrategy does not support having a cond in a
                # replica context with a branch that calls `merge_call`, and
                # self._optimizer.apply_gradients calls `merge_call`.
                maybe_apply_op = tf.__internal__.smart_cond.smart_cond(
                    should_apply_grads, _apply_on_replicas, do_not_apply_fn)
                return tf.group(maybe_apply_op, loss_scale_update_op)
Example #23
0
    def train_alpha(self, initial_env_step: dataset_lib.EnvStep,
                    experience: dataset_lib.EnvStep,
                    target_policy: tf_policy.TFPolicy):
        """Takes one optimizer step on `self._alpha`.

        Args:
          initial_env_step: Initial step.
          experience: Experience step; index 0 along axis 1 is used as the
            current step and index 1 as the next step.
          target_policy: The policy whose value we want to estimate.

        Returns:
          A pair `((mean nu loss, alpha loss, divergence), train_op)`, where
          `train_op` groups the alpha optimizer's apply op.
        """
        def _first(tensor):
            return tensor[:, 0, ...]

        def _second(tensor):
            return tensor[:, 1, ...]

        env_step = tf.nest.map_structure(_first, experience)
        next_env_step = tf.nest.map_structure(_second, experience)

        alpha_vars = [self._alpha]
        with tf.GradientTape(watch_accessed_variables=False,
                             persistent=True) as tape:
            # Only alpha is watched, so gradients flow to nothing else.
            tape.watch(alpha_vars)
            nu_loss = self._get_nu_loss(initial_env_step, env_step,
                                        next_env_step, target_policy)
            weights, log_weights = self._get_weights(nu_loss)
            divergence = self._compute_divergence(weights, log_weights)
            divergence_violation = divergence - self._two_sided_limit

            # The violation is treated as a constant w.r.t. alpha.
            frozen_violation = tf.stop_gradient(divergence_violation)
            alpha_loss = -tf.exp(self._alpha) * frozen_violation

        alpha_grads = tape.gradient(alpha_loss, [self._alpha])
        alpha_grad_op = self._alpha_optimizer.apply_gradients(
            zip(alpha_grads, [self._alpha]))

        for idx in range(self._num_limits):
            tf.summary.scalar('exp_alpha%d' % idx, tf.exp(self._alpha[idx]))

        losses = (tf.reduce_mean(nu_loss, 0), alpha_loss, divergence)
        return (losses, tf.group(alpha_grad_op))
Example #24
0
  def __init__(self,
               graph_parents=None,
               is_constant_jacobian=False,
               validate_args=False,
               dtype=None,
               forward_min_event_ndims=None,
               inverse_min_event_ndims=None,
               parameters=None,
               name=None):
    """Constructs Bijector.

    A `Bijector` transforms random variables into new random variables.

    Examples:

    ```python
    # Create the Y = g(X) = X transform.
    identity = Identity()

    # Create the Y = g(X) = exp(X) transform.
    exp = Exp()
    ```

    See `Bijector` subclass docstring for more details and specific examples.

    Args:
      graph_parents: Python list of graph prerequisites of this `Bijector`.
        Every item must be a `Tensor`.
      is_constant_jacobian: Python `bool` indicating that the Jacobian matrix is
        not a function of the input.
      validate_args: Python `bool`, default `False`. Whether to validate input
        with asserts. If `validate_args` is `False`, and the inputs are invalid,
        correct behavior is not guaranteed.
      dtype: `tf.dtype` supported by this `Bijector`. `None` means dtype is not
        enforced.
      forward_min_event_ndims: Python `integer` indicating the minimum number of
        dimensions `forward` operates on. Defaults to
        `inverse_min_event_ndims` when only that one is given.
      inverse_min_event_ndims: Python `integer` indicating the minimum number of
        dimensions `inverse` operates on. Defaults to
        `forward_min_event_ndims` when only that one is given.
      parameters: Python `dict` of parameters used to instantiate this
        `Bijector`.
      name: The name to give Ops created by the initializer. When empty, the
        snake-cased class name is used.

    Raises:
      ValueError:  If both `forward_min_event_ndims` and
        `inverse_min_event_ndims` are unspecified, or if either of them is
        negative.
      TypeError:  If `forward_min_event_ndims` or `inverse_min_event_ndims` is
        not an `int`.
      ValueError:  If a member of `graph_parents` is not a `Tensor`.
    """
    if not name:
      name = name_util.camel_to_lower_snake(type(self).__name__)
    name = name_util.strip_invalid_chars(name_util.get_name_scope_name(name))
    super(Bijector, self).__init__(name=name)
    self._name = name
    self._parameters = self._no_dependency(parameters)

    self._graph_parents = self._no_dependency(graph_parents or [])

    self._is_constant_jacobian = is_constant_jacobian
    self._validate_args = validate_args
    self._dtype = dtype

    # Collapse all non-None init-time dependencies into a single grouped op.
    init_deps = tuple(
        dep for dep in self._parameter_control_dependencies(is_init=True)
        if dep is not None)
    if init_deps:
      init_deps = (tf.group(*init_deps),)
    self._initial_parameter_control_dependencies = init_deps

    # Whichever of the two ndims arguments is missing mirrors the other one.
    if forward_min_event_ndims is None:
      if inverse_min_event_ndims is None:
        raise ValueError(
            'Must specify at least one of `forward_min_event_ndims` '
            'and `inverse_min_event_ndims`.')
      forward_min_event_ndims = inverse_min_event_ndims
    elif inverse_min_event_ndims is None:
      inverse_min_event_ndims = forward_min_event_ndims

    # Both type checks run before either sign check.
    if not isinstance(forward_min_event_ndims, int):
      raise TypeError(
          'Expected forward_min_event_ndims to be of type int, got {}'.format(
              type(forward_min_event_ndims).__name__))
    if not isinstance(inverse_min_event_ndims, int):
      raise TypeError(
          'Expected inverse_min_event_ndims to be of type int, got {}'.format(
              type(inverse_min_event_ndims).__name__))

    if forward_min_event_ndims < 0:
      raise ValueError(
          'forward_min_event_ndims must be a non-negative integer.')
    if inverse_min_event_ndims < 0:
      raise ValueError(
          'inverse_min_event_ndims must be a non-negative integer.')

    self._forward_min_event_ndims = forward_min_event_ndims
    self._inverse_min_event_ndims = inverse_min_event_ndims

    for index, parent in enumerate(self._graph_parents):
      if not (parent is not None and tf.is_tensor(parent)):
        raise ValueError(
            'Graph parent item %d is not a Tensor; %s.' % (index, parent))

    # Setup caching after everything else is done.
    self._cache = self._setup_cache()
Example #25
0
def update_confusion_matrix_variables(
    variables_to_update,
    y_true,
    y_pred,
    thresholds,
    top_k=None,
    class_id=None,
    sample_weight=None,
    multi_label=False,
    label_weights=None,
    thresholds_distributed_evenly=False,
):
    """Returns op to update the given confusion matrix variables.

    For every pair of values in y_true and y_pred:

    true_positive: y_true == True and y_pred > thresholds
    false_negatives: y_true == True and y_pred <= thresholds
    true_negatives: y_true == False and y_pred <= thresholds
    false_positive: y_true == False and y_pred > thresholds

    The results will be weighted and added together. When multiple thresholds are
    provided, we will repeat the same for every threshold.

    For estimation of these metrics over a stream of data, the function creates an
    `update_op` operation that updates the given variables.

    If `sample_weight` is `None`, weights default to 1.
    Use weights of 0 to mask values.

    Args:
      variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
        and corresponding variables to update as values. Keys are validated
        against the members of `ConfusionMatrix`. The dtype of the first
        variable is used for all casts.
      y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
      y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
        the range `[0, 1]`.
      thresholds: A float value, float tensor, python list, or tuple of float
        thresholds in `[0, 1]`, or NEG_INF (used when top_k is set).
      top_k: Optional int, indicates that the positive labels should be limited to
        the top k predictions.
      class_id: Optional int, limits the prediction and labels to the class
        specified by this argument.
      sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
        `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
        be either `1`, or the same as the corresponding `y_true` dimension).
      multi_label: Optional boolean indicating whether multidimensional
        prediction/labels should be treated as multilabel responses, or flattened
        into a single label. When True, the values of `variables_to_update` must
        have a second dimension equal to the number of labels in y_true and
        y_pred, and those tensors must not be RaggedTensors.
      label_weights: (optional) tensor of non-negative weights for multilabel
        data. The weights are applied when calculating TP, FP, FN, and TN without
        explicit multilabel handling (i.e. when the data is to be flattened).
      thresholds_distributed_evenly: Boolean, whether the thresholds are evenly
        distributed within the list. An optimized method will be used if this is
        the case. See _update_confusion_matrix_variables_optimized() for more
        details.

    Returns:
      Update op, or `None` when `variables_to_update` is `None`.

    Raises:
      ValueError: If `multi_label` is True and `label_weights` is given, if
        `y_pred` and `y_true` have mismatched shapes, if `sample_weight` is not
        `None` and its shape doesn't match `y_pred`, or if
        `variables_to_update` contains no valid key or any invalid key.
    """
    if multi_label and label_weights is not None:
        raise ValueError(
            "`label_weights` for multilabel data should be handled "
            "outside of `update_confusion_matrix_variables` when "
            "`multi_label` is True.")
    if variables_to_update is None:
        return

    # Materialize the enum members once; every key check and error message
    # below reuses this list instead of rebuilding it per key.
    valid_keys = list(ConfusionMatrix)
    if not any(key in valid_keys for key in variables_to_update):
        raise ValueError(
            "Please provide at least one valid confusion matrix "
            "variable to update. Valid variable key options are: "
            f'"{valid_keys}". Received: "{variables_to_update.keys()}"'
        )

    # The dict is non-empty here (the check above rejects an empty one), so
    # the first variable always exists; its dtype drives every cast below.
    variable_dtype = next(iter(variables_to_update.values())).dtype

    y_true = tf.cast(y_true, dtype=variable_dtype)
    y_pred = tf.cast(y_pred, dtype=variable_dtype)

    if thresholds_distributed_evenly:
        # Check whether the thresholds have any leading or trailing epsilon
        # added for floating point imprecision. The leading and trailing
        # thresholds will be handled a bit differently as the corner case.
        # At this point, thresholds should be a list/array with more than 2
        # items, and ranged between [0, 1]. See
        # is_evenly_distributed_thresholds() for more details.
        # This must run before `thresholds` is converted to a tensor below.
        thresholds_with_epsilon = thresholds[0] < 0.0 or thresholds[-1] > 1.0

    thresholds = tf.convert_to_tensor(thresholds, dtype=variable_dtype)
    # NOTE(review): this indexes the first static dimension, so a rank-0
    # `thresholds` would fail here -- confirm callers always pass a sequence.
    num_thresholds = thresholds.shape.as_list()[0]

    if multi_label:
        # True when a single (rank-1) set of thresholds is shared by all labels.
        one_thresh = tf.equal(
            tf.cast(1, dtype=tf.int32),
            tf.rank(thresholds),
            name="one_set_of_thresholds_cond",
        )
    else:
        [y_pred, y_true
         ], _ = ragged_assert_compatible_and_get_flat_values([y_pred, y_true],
                                                             sample_weight)
        one_thresh = tf.cast(True, dtype=tf.bool)

    invalid_keys = [
        key for key in variables_to_update if key not in valid_keys
    ]
    if invalid_keys:
        raise ValueError(
            f'Invalid keys: "{invalid_keys}". '
            f'Valid variable key options are: "{valid_keys}"')

    if sample_weight is None:
        y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(
            y_pred, y_true)
    else:
        sample_weight = tf.cast(sample_weight, dtype=variable_dtype)
        (
            y_pred,
            y_true,
            sample_weight,
        ) = losses_utils.squeeze_or_expand_dimensions(
            y_pred, y_true, sample_weight=sample_weight)
    y_pred.shape.assert_is_compatible_with(y_true.shape)

    if top_k is not None:
        y_pred = _filter_top_k(y_pred, top_k)
    if class_id is not None:
        y_true = y_true[..., class_id]
        y_pred = y_pred[..., class_id]

    if thresholds_distributed_evenly:
        return _update_confusion_matrix_variables_optimized(
            variables_to_update,
            y_true,
            y_pred,
            thresholds,
            multi_label=multi_label,
            sample_weights=sample_weight,
            label_weights=label_weights,
            thresholds_with_epsilon=thresholds_with_epsilon,
        )

    pred_shape = tf.shape(y_pred)
    num_predictions = pred_shape[0]
    if y_pred.shape.ndims == 1:
        num_labels = 1
    else:
        num_labels = tf.math.reduce_prod(pred_shape[1:], axis=0)
    thresh_label_tile = tf.where(one_thresh, num_labels,
                                 tf.ones([], dtype=tf.int32))

    # Reshape predictions and labels, adding a dim for thresholding.
    if multi_label:
        predictions_extra_dim = tf.expand_dims(y_pred, 0)
        labels_extra_dim = tf.expand_dims(tf.cast(y_true, dtype=tf.bool), 0)
    else:
        # Flatten predictions and labels when not multilabel.
        predictions_extra_dim = tf.reshape(y_pred, [1, -1])
        labels_extra_dim = tf.reshape(tf.cast(y_true, dtype=tf.bool), [1, -1])

    # Tile the thresholds for every prediction.
    if multi_label:
        thresh_pretile_shape = [num_thresholds, 1, -1]
        thresh_tiles = [1, num_predictions, thresh_label_tile]
        data_tiles = [num_thresholds, 1, 1]
    else:
        thresh_pretile_shape = [num_thresholds, -1]
        thresh_tiles = [1, num_predictions * num_labels]
        data_tiles = [num_thresholds, 1]

    thresh_tiled = tf.tile(tf.reshape(thresholds, thresh_pretile_shape),
                           tf.stack(thresh_tiles))

    # Tile the predictions for every threshold.
    preds_tiled = tf.tile(predictions_extra_dim, data_tiles)

    # Compare predictions and threshold.
    pred_is_pos = tf.greater(preds_tiled, thresh_tiled)

    # Tile labels by number of thresholds
    label_is_pos = tf.tile(labels_extra_dim, data_tiles)

    if sample_weight is not None:
        sample_weight = tf.__internal__.ops.broadcast_weights(
            tf.cast(sample_weight, dtype=variable_dtype), y_pred)
        weights_tiled = tf.tile(tf.reshape(sample_weight, thresh_tiles),
                                data_tiles)
    else:
        weights_tiled = None

    if label_weights is not None and not multi_label:
        label_weights = tf.expand_dims(label_weights, 0)
        label_weights = tf.__internal__.ops.broadcast_weights(
            label_weights, y_pred)
        label_weights_tiled = tf.tile(tf.reshape(label_weights, thresh_tiles),
                                      data_tiles)
        # Label weights combine multiplicatively with any sample weights.
        if weights_tiled is None:
            weights_tiled = label_weights_tiled
        else:
            weights_tiled = tf.multiply(weights_tiled, label_weights_tiled)

    update_ops = []

    def weighted_assign_add(label, pred, weights, var):
        # Adds the (optionally weighted) count of label-and-pred matches,
        # summed over axis 1, to `var`.
        label_and_pred = tf.cast(tf.logical_and(label, pred), dtype=var.dtype)
        if weights is not None:
            label_and_pred *= tf.cast(weights, dtype=var.dtype)
        return var.assign_add(tf.reduce_sum(label_and_pred, 1))

    loop_vars = {
        ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
    }
    update_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
    update_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update
    update_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update

    # The negated masks are only built when some requested variable needs
    # them; true negatives need both.
    if update_fn or update_tn:
        pred_is_neg = tf.logical_not(pred_is_pos)
        loop_vars[ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos,
                                                      pred_is_neg)

    if update_fp or update_tn:
        label_is_neg = tf.logical_not(label_is_pos)
        loop_vars[ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg,
                                                      pred_is_pos)
        if update_tn:
            loop_vars[ConfusionMatrix.TRUE_NEGATIVES] = (
                label_is_neg,
                pred_is_neg,
            )

    for matrix_cond, (label, pred) in loop_vars.items():

        # `loop_vars` may hold entries that were only needed as intermediates;
        # update just the variables the caller actually supplied.
        if matrix_cond in variables_to_update:
            update_ops.append(
                weighted_assign_add(label, pred, weights_tiled,
                                    variables_to_update[matrix_cond]))

    return tf.group(update_ops)
Example #26
0
    def testParameterProperties(self, bijector_name, data):
        # Rebuilds the drawn bijector from parameters generated through its
        # `parameter_properties()` and checks that the result can run
        # `forward`. Also checks each parameter's `shape_fn` against its
        # declared `event_ndims`.
        functions_eager = tf.config.functions_run_eagerly()
        if functions_eager or not tf.executing_eagerly():
            self.skipTest(
                'To reduce test weight, parameter properties tests run in '
                'eager mode only.')

        # Parameters that are copied verbatim from the drawn bijector instead
        # of being regenerated.
        passthrough_names = (
            'bijector',  # Several.
            'forward_fn',  # Inline.
            'inverse_fn',  # Inline.
            'forward_min_event_ndims',  # Inline.
            'inverse_min_event_ndims',  # Inline.
            'event_shape_out',  # Reshape.
            'event_shape_in',  # Reshape.
            'perm',  # Transpose.
            'rightmost_transposed_ndims',  # Transpose.
            'diag_bijector',  # TransformDiagonal.
            'diag_shift'  # FillScaleTriL (doesn't support batch shape).
        )
        bijector, event_dim = self._draw_bijector(
            bijector_name,
            data,
            validate_args=True,
            allowed_bijectors=bhps.INSTANTIABLE_BIJECTORS)
        bijector_cls = type(bijector)

        # Extract the full shape of an output from this bijector.
        domain_sample = self._draw_domain_tensor(bijector, data, event_dim)
        output_shape = ps.shape(bijector.forward(domain_sample))
        output_rank = ps.rank_from_shape(output_shape)
        sample_and_batch_ndims = output_rank - bijector.inverse_min_event_ndims

        try:
            params = bijector_cls.parameter_properties()
            params64 = bijector_cls.parameter_properties(dtype=tf.float64)
        except NotImplementedError as e:
            self.skipTest(str(e))

        seeds = samplers.split_seed(test_util.test_seed(), n=len(params))
        rebuilt_kwargs = {}
        for position, (param_name, properties) in enumerate(params.items()):
            if param_name in passthrough_names:
                continue

            # Check that the shape_fn is consistent with event_ndims.
            try:
                param_shape = properties.shape_fn(sample_shape=output_shape)
            except NotImplementedError:
                self.skipTest('No shape function implemented for bijector {} '
                              'parameter {}.'.format(bijector_name,
                                                     param_name))
            self.assertGreaterEqual(
                properties.event_ndims,
                ps.rank_from_shape(param_shape) - sample_and_batch_ndims)

            # Only preferred parameters are regenerated.
            if not properties.is_preferred:
                continue

            try:
                constrainer = properties.default_constraining_bijector_fn()
            except NotImplementedError:
                self.skipTest(
                    'No constraining bijector implemented for {} '
                    'parameter {}.'.format(bijector_name, param_name))
            unconstrained_shape = constrainer.inverse_event_shape_tensor(
                param_shape)
            unconstrained_value = samplers.normal(unconstrained_shape,
                                                  seed=seeds[position])
            rebuilt_kwargs[param_name] = constrainer.forward(
                unconstrained_value)

            # Check that passing a float64 `eps` works with float64 parameters.
            constrainer64 = (
                params64[param_name].default_constraining_bijector_fn())
            constrainer64(tf.cast(unconstrained_value, tf.float64))

        # Copy over any non-trainable parameters.
        for key, value in bijector.parameters.items():
            if key in passthrough_names:
                rebuilt_kwargs[key] = value

        # Sanity check that we got valid parameters.
        rebuilt_kwargs['validate_args'] = True
        rebuilt = bijector_cls(**rebuilt_kwargs)
        initializers = [
            variable.initializer for variable in rebuilt.variables
        ]
        self.evaluate(tf.group(*initializers))
        domain_sample = self._draw_domain_tensor(rebuilt, data, event_dim)
        self.evaluate(rebuilt.forward(domain_sample))
Example #27
0
    def testNamingWithOptimizer(self):
        # Serializes the object graph of a checkpoint holding a model, an
        # optimizer and a step counter, then checks how many variables and
        # slot variables were recorded and spot-checks a few names.
        input_value = tf.constant([[3.]])
        model = MyModel()
        # A nuisance Model using the same optimizer. Its slot variables should not
        # go in the checkpoint, since it is never depended on.
        other_model = MyModel()
        optimizer = adam.Adam(0.001)
        step = tf.compat.v1.train.get_or_create_global_step()
        root_trackable = tf.train.Checkpoint(optimizer=optimizer,
                                             model=model,
                                             step=step)

        def apply_one_step(network):
            # One forward/backward pass followed by an optimizer update.
            with tf.GradientTape() as tape:
                loss = network(input_value)
            weights = network.trainable_variables
            gradients = tape.gradient(loss, weights)
            return optimizer.apply_gradients(zip(gradients, weights))

        train_op = tf.group(apply_one_step(model), step.assign_add(1))
        apply_one_step(other_model)

        self.evaluate(trackable_utils.gather_initializers(root_trackable))
        self.evaluate(train_op)
        named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
            root_trackable).serialize_object_graph()

        suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
        expected_slot_keys = (
            "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
            "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
            "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
            "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
            "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
            "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
        )
        unsuffixed_names = (
            # Created in the root node, so no prefix.
            "step",
            "model/_second/kernel",
            "model/_named_dense/kernel",
            "model/_named_dense/bias",
            # non-Layer dependency of the model
            "model/_non_layer/a_variable",
            "optimizer/learning_rate",
            "optimizer/beta_1",
            "optimizer/beta_2",
            "optimizer/iter",
            "optimizer/decay",
        ) + expected_slot_keys
        expected_checkpoint_names = [
            checkpoint_name + suffix for checkpoint_name in unsuffixed_names
        ]
        variables_by_key = {
            variable.name: variable for variable in named_variables
        }
        self.assertEqual(len(expected_checkpoint_names),
                         len(variables_by_key))

        # Check that we've mapped to the right variable objects (not exhaustive)
        full_name_checks = (
            ("global_step", "step"),
            ("my_model/dense_1/kernel", "model/_second/kernel"),
            ("my_model/dense/kernel", "model/_named_dense/kernel"),
            ("Adam/beta_1", "optimizer/beta_1"),
            ("Adam/beta_2", "optimizer/beta_2"),
        )
        for expected_full_name, checkpoint_key in full_name_checks:
            self.assertEqual(
                expected_full_name,
                variables_by_key[checkpoint_key + suffix].full_name)

        # Spot check the generated protocol buffers.
        optimizer_reference = serialized_graph.nodes[0].children[1]
        self.assertEqual("optimizer", optimizer_reference.local_name)
        optimizer_node = serialized_graph.nodes[optimizer_reference.node_id]
        children = [child.local_name for child in optimizer_node.children]
        # hyper variable dependencies
        hyper_names = ["beta_1", "beta_2", "iter", "decay", "learning_rate"]
        self.assertEqual(len(hyper_names), len(children))

        serialized_slot_keys = [
            attribute.checkpoint_key
            for slot in optimizer_node.slot_variables
            for attribute in serialized_graph.nodes[
                slot.slot_variable_node_id].attributes
        ]
        self.assertEqual(len(expected_slot_keys), len(serialized_slot_keys))
 def incr_loss_scale():
     # Closure over `self` from an enclosing scope that is not shown here.
     # Multiplies the current loss scale by the multiplier and resets the
     # counter to 0, returning both updates as one grouped op.
     # NOTE(review): `_assign_if_finite` is defined elsewhere; presumably it
     # skips the assignment for a non-finite value -- confirm.
     scale_update = _assign_if_finite(
         self.current_loss_scale,
         self.current_loss_scale * self.multiplier)
     counter_reset = self.counter.assign(0)
     return tf.group(scale_update, counter_reset)
    def apply_gradients(self,
                        grads_and_vars,
                        name=None,
                        experimental_aggregate_gradients=True):
        """Applies gradients, or only bumps the iteration counter.

        The loss-scale state decides whether the step is applied. When the
        step is skipped, only `self._optimizer.iterations` is incremented.

        Args:
          grads_and_vars: Iterable of (gradient, variable) pairs. Pairs are
            passed through `optimizer_utils.filter_empty_gradients` first.
          name: Forwarded to `self._apply_gradients`.
          experimental_aggregate_gradients: When true, gradients are
            transformed and aggregated here through the wrapped optimizer's
            private hooks before the apply/skip decision is made.

        Returns:
          A grouped op holding the conditional apply op and the loss-scale
          update op. Under strategies that need `merge_call`, this is the
          result of that `merge_call`.

        Raises:
          ValueError: If called in a cross-replica context.
        """
        if tf.distribute.in_cross_replica_context():
            raise ValueError(
                'apply_gradients() must be called in a replica context.')
        # We check for the strategy here despite already checking in the constructor
        # as frequently the optimizer is created outside the strategy's scope.
        self._raise_if_strategy_unsupported()

        grads_and_vars = optimizer_utils.filter_empty_gradients(grads_and_vars)
        if experimental_aggregate_gradients:
            # We must aggregate the gradients here instead of in
            # self.optimizer.apply_gradients, so that any NaN or Inf gradients are
            # propagated to each replica. If any replica has a NaN or Inf gradient,
            # they must all have a NaN or Inf gradient so that they all skip the step.
            inner_optimizer = self._optimizer
            # pylint: disable=protected-access
            grads_and_vars = inner_optimizer._transform_unaggregated_gradients(
                grads_and_vars)
            grads_and_vars = inner_optimizer._aggregate_gradients(
                grads_and_vars)
            # pylint: enable=protected-access

        grads_and_vars = tuple(grads_and_vars)
        grads = [grad for grad, _ in grads_and_vars]
        # We do not want DistributionStrategy to unwrap any MirroredVariables in
        # grads_and_vars, because even in a replica context, the wrapped
        # optimizer expects mirrored variables. So we wrap the variables with an
        # _UnwrapPreventer, preventing DistributionStrategy from unwrapping the
        # MirroredVariables.
        wrapped_vars = _UnwrapPreventer([var for _, var in grads_and_vars])

        def do_not_apply_fn():
            # Normally self._optimizer.iterations is incremented in
            # self._optimizer.apply_gradients(). Since that is not called in this
            # branch, we increment it here instead.
            return self._optimizer.iterations.assign_add(1, read_value=False)

        def _loss_scale_update(candidate_grads):
            # Returns (update_op, should_apply); a non-dynamic loss scale
            # never skips a step.
            if not isinstance(self._loss_scale, _DynamicLossScaleState):
                return tf.no_op(), True
            return self._loss_scale.update(candidate_grads)

        def _apply_or_skip(should_apply, apply_fn, update_op):
            # Conditionally applies the step and groups in the loss-scale update.
            maybe_apply_op = tf.__internal__.smart_cond.smart_cond(
                should_apply, apply_fn, do_not_apply_fn)
            return tf.group(maybe_apply_op, update_op)

        if not tf.__internal__.distribute.strategy_supports_no_merge_call():

            def _apply_gradients_cross_replica(distribution, merged_grads,
                                               merged_vars, op_name):
                loss_scale_update_op, should_apply_grads = _loss_scale_update(
                    merged_grads)

                def apply_fn():
                    return distribution.extended.call_for_each_replica(
                        self._apply_gradients,
                        args=(merged_grads, merged_vars, op_name))

                # Note: We must call this cond() in a cross-replica context.
                # DistributionStrategy does not support having a cond in a replica
                # context with a branch that calls `merge_call`, and
                # self._optimizer.apply_gradients calls `merge_call`.
                return _apply_or_skip(should_apply_grads, apply_fn,
                                      loss_scale_update_op)

            return tf.distribute.get_replica_context().merge_call(
                _apply_gradients_cross_replica,
                args=(grads, wrapped_vars, name))

        loss_scale_update_op, should_apply_grads = _loss_scale_update(grads)

        def apply_fn():
            return self._apply_gradients(grads, wrapped_vars, name)

        return _apply_or_skip(should_apply_grads, apply_fn,
                              loss_scale_update_op)
Example #30
0
def _update_confusion_matrix_variables_optimized(
    variables_to_update,
    y_true,
    y_pred,
    thresholds,
    multi_label=False,
    sample_weights=None,
    label_weights=None,
    thresholds_with_epsilon=False,
):
    """Accumulate confusion-matrix counts using a bucketed, single-pass scheme.

    The thresholds are required to be evenly spaced (every pair of neighbours
    is separated by the same distance).

    Background. For each threshold `t` we evaluate the binary classifier
      C(t) = (predictions > t)
    and want
      TP(t) = sum( C(t) * true_labels )
      FP(t) = sum( C(t) * false_labels )

    Evaluating C(t) separately for every threshold is wasteful. Instead, with
      thresholds = [t_0, ..., t_{n-1}],  t_0 < ... < t_{n-1},  n = num_thresholds
    every prediction is dropped into the bucket
      B(i) = { p : t_i < p <= t_{i+1} }
    and the (weighted) labels are summed per bucket. The count for threshold
    t_i is then the sum over all buckets j >= i, i.e. a reversed cumulative
    sum (`tf.cumsum(..., reverse=True)`).

    Because the thresholds are evenly spaced with
      width = 1.0 / (num_thresholds - 1)
      thresholds = [0.0, 1*width, 2*width, ..., 1.0]
    the bucket of a prediction `p` is available in closed form:
      bucket_index(p) = ceil( p * (num_thresholds - 1) ) - 1
    The `ceil(...) - 1` form (rather than `floor`) places a prediction that
    sits exactly on a threshold into the bucket below it, so it is not counted
    as exceeding that threshold. All buckets are filled in one pass with
    `tf.math.unsorted_segment_sum()`.

    Worked example:
      y_true     = [0, 0, 1, 1]
      y_pred     = [0.1, 0.5, 0.3, 0.9]
      thresholds = [0.0, 0.5, 1.0]

      bucket_index(y_pred) = ceil([0.2, 1.0, 0.6, 1.8]) - 1
                           = [0, 0, 0, 1]
      # 0.5 lies exactly on the second threshold, so it stays in bucket 0.

      tp_bucket_value = unsorted_segment_sum(true_labels, bucket_index,
                                             num_segments=num_thresholds)
                      = [1, 1, 0]
      # Bucket values [a, b, c] aggregate to [a + b + c, b + c, c], because
      # anything above a larger threshold is also above every smaller one.
      true_positive = cumsum(tp_bucket_value, reverse=True)
                    = [2, 1, 0]

    Time and space cost is O(T + N), with T the number of thresholds and N
    the number of predictions; the standard implementation is O(T * N).

    Args:
      variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
        and corresponding variables to update as values.
      y_true: A floating point `Tensor` whose shape matches `y_pred`. Will be
        cast to `bool` (and back to its own dtype).
      y_pred: A floating point `Tensor` of arbitrary shape and whose values are
        in the range `[0, 1]`. Values outside that range are clipped.
      thresholds: A sorted floating point `Tensor` with values in `[0, 1]`.
        It needs to be evenly distributed (the difference between consecutive
        elements must be constant).
      multi_label: Optional boolean indicating whether multidimensional
        prediction/labels should be treated as multilabel responses, or
        flattened into a single label. When True, the values of
        `variables_to_update` must have a second dimension equal to the number
        of labels in y_true and y_pred, and those tensors must not be
        RaggedTensors.
      sample_weights: Optional `Tensor` whose rank is either 0, or the same
        rank as `y_true`, and must be broadcastable to `y_true` (i.e., all
        dimensions must be either `1`, or the same as the corresponding
        `y_true` dimension).
      label_weights: Optional tensor of non-negative weights for multilabel
        data. The weights are applied when calculating TP, FP, FN, and TN
        without explicit multilabel handling (i.e. when the data is to be
        flattened).
      thresholds_with_epsilon: Optional boolean indicating whether the leading
        and trailing thresholds had an epsilon added to guard against floating
        point imprecision. When True, bucket index -1 is remapped to 0.

    Returns:
      Update op.
    """
    num_thresholds = thresholds.shape.as_list()[0]

    def segment_totals(values, segment_ids):
        # Sum `values` per bucket; one output slot per threshold.
        return tf.math.unsorted_segment_sum(
            data=values,
            segment_ids=segment_ids,
            num_segments=num_thresholds,
        )

    # --- Combined per-element weight -------------------------------------
    if sample_weights is not None:
        sample_weights = tf.__internal__.ops.broadcast_weights(
            tf.cast(sample_weights, dtype=y_pred.dtype), y_pred)
        if not multi_label:
            sample_weights = tf.reshape(sample_weights, [-1])
    else:
        sample_weights = 1.0

    if label_weights is not None:
        label_weights = tf.expand_dims(label_weights, 0)
        label_weights = tf.__internal__.ops.broadcast_weights(
            label_weights, y_pred)
        if not multi_label:
            label_weights = tf.reshape(label_weights, [-1])
    else:
        label_weights = 1.0

    weights = tf.multiply(sample_weights, label_weights)

    # --- Normalise predictions and labels --------------------------------
    # Predictions are expected to be in [0.0, 1.0] already; clip defensively.
    y_pred = tf.clip_by_value(y_pred, clip_value_min=0.0, clip_value_max=1.0)

    # Round-trip through bool so every non-zero label becomes exactly 1.
    y_true = tf.cast(tf.cast(y_true, tf.bool), y_true.dtype)
    if not multi_label:
        y_true = tf.reshape(y_true, [-1])
        y_pred = tf.reshape(y_pred, [-1])

    true_labels = tf.multiply(y_true, weights)
    false_labels = tf.multiply((1.0 - y_true), weights)

    # --- Bucket assignment -----------------------------------------------
    # A prediction must be strictly greater than a threshold to count for it,
    # so buckets look like [0, 0.5], (0.5, 1] and 0.5 falls in the first one.
    # ceil(val) - 1 yields exactly that assignment.
    raw_buckets = tf.math.ceil(y_pred * (num_thresholds - 1)) - 1

    if thresholds_with_epsilon:
        # With epsilon-adjusted end thresholds every prediction in [0.0, 1.0]
        # is above the first threshold, so index -1 is folded into bucket 0.
        raw_buckets = tf.nn.relu(raw_buckets)

    bucket_ids = tf.cast(raw_buckets, tf.int32)

    # --- Per-threshold TP / FP -------------------------------------------
    if not multi_label:
        tp_per_bucket = segment_totals(true_labels, bucket_ids)
        fp_per_bucket = segment_totals(false_labels, bucket_ids)
        tp = tf.cumsum(tp_per_bucket, reverse=True)
        fp = tf.cumsum(fp_per_bucket, reverse=True)
    else:
        # Labels are rank 2 here. Move the label axis to the front so the
        # segment sum can be mapped over each label class.
        true_labels = tf.transpose(true_labels)
        false_labels = tf.transpose(false_labels)
        bucket_ids = tf.transpose(bucket_ids)

        def per_label_totals(values_and_ids):
            return segment_totals(values_and_ids[0], values_and_ids[1])

        tp_per_bucket = tf.vectorized_map(per_label_totals,
                                          (true_labels, bucket_ids))
        fp_per_bucket = tf.vectorized_map(per_label_totals,
                                          (false_labels, bucket_ids))
        tp = tf.transpose(tf.cumsum(tp_per_bucket, reverse=True, axis=1))
        fp = tf.transpose(tf.cumsum(fp_per_bucket, reverse=True, axis=1))

    # --- Totals needed for the negatives ---------------------------------
    # fn = sum(true_labels) - tp
    # tn = sum(false_labels) - fp
    negatives_requested = any(
        key in variables_to_update
        for key in (ConfusionMatrix.TRUE_NEGATIVES,
                    ConfusionMatrix.FALSE_NEGATIVES))
    if negatives_requested:
        if multi_label:
            total_true_labels = tf.reduce_sum(true_labels, axis=1)
            total_false_labels = tf.reduce_sum(false_labels, axis=1)
        else:
            total_true_labels = tf.reduce_sum(true_labels)
            total_false_labels = tf.reduce_sum(false_labels)

    # --- Assemble the update ops -----------------------------------------
    # Increments are built lazily: the totals only exist when a negative
    # variable was requested, which is exactly when they are evaluated.
    increments = (
        (ConfusionMatrix.TRUE_POSITIVES, lambda: tp),
        (ConfusionMatrix.FALSE_POSITIVES, lambda: fp),
        (ConfusionMatrix.TRUE_NEGATIVES, lambda: total_false_labels - fp),
        (ConfusionMatrix.FALSE_NEGATIVES, lambda: total_true_labels - tp),
    )
    update_ops = []
    for key, make_increment in increments:
        if key not in variables_to_update:
            continue
        variable = variables_to_update[key]
        update_ops.append(variable.assign_add(make_increment()))
    return tf.group(update_ops)