Example #1
 def testDiscreteBottleneckVQCond(self):
   hidden_size = 60
   z_size = 4
   x = tf.zeros(shape=[100, 1, hidden_size], dtype=tf.float32)
   with tf.variable_scope("test2", reuse=tf.AUTO_REUSE):
     means = tf.get_variable("means",
                             shape=[1, 1, 2**z_size, hidden_size],
                             initializer=tf.constant_initializer(0.),
                             dtype=tf.float32)
     ema_count = []
     ema_count_i = tf.get_variable(
         "ema_count",
         [1, 2**z_size],
         initializer=tf.constant_initializer(0),
         trainable=False)
     ema_count.append(ema_count_i)
     ema_means = []
     with tf.colocate_with(means):
       ema_means_i = tf.get_variable("ema_means",
                                     initializer=means.initialized_value()[0],
                                     trainable=False)
       ema_means.append(ema_means_i)
     cond = tf.cast(0.0, tf.bool)
     x_means_dense, x_means_hot, _, _, _ = discretization.discrete_bottleneck(
         x, hidden_size, z_size, 32, means=means, num_blocks=1, cond=cond,
         ema_means=ema_means, ema_count=ema_count, name="test2")
     with self.test_session() as sess:
       sess.run(tf.global_variables_initializer())
       x_means_dense_eval, x_means_hot_eval = sess.run(
           [x_means_dense, x_means_hot])
       means_eval = sess.run(means)
     self.assertEqual(x_means_dense_eval.shape, (100, 1, hidden_size))
     self.assertEqual(x_means_hot_eval.shape, (100, 1))
     self.assertAllClose(means_eval, np.zeros((1, 1, 2**z_size,
                                               hidden_size)))
Example #2
    def preprocess_device_grads(self, device_grads):
        compact_grads = (self.benchmark_cnn.params.use_fp16 and
                         self.benchmark_cnn.params.compact_gradient_transfer)
        defer_grads = (
            self.benchmark_cnn.params.variable_consistency == 'relaxed')

        grads_to_reduce = [[g for g, _ in grad_vars]
                           for grad_vars in device_grads]
        algorithm = batch_allreduce.algorithm_from_params(
            self.benchmark_cnn.params)
        reduced_grads, self._warmup_ops = algorithm.batch_all_reduce(
            grads_to_reduce, self.benchmark_cnn.params.gradient_repacking,
            compact_grads, defer_grads, self.benchmark_cnn.params.xla_compile)
        if self.benchmark_cnn.enable_auto_loss_scale:
            # Check for infs or nans
            is_finite_list = []
            with tf.name_scope('check_for_inf_and_nan'):
                for tower_grads in reduced_grads:
                    with tf.colocate_with(tower_grads[0]):
                        # TODO(tanmingxing): Create fused op that takes in a list of tensors
                        # as input and returns scalar boolean True if there are any
                        # infs/nans.
                        is_finite_list.append(
                            tf.reduce_all([
                                tf.reduce_all(tf.is_finite(g))
                                for g in tower_grads
                            ]))
                self.grad_has_inf_nan = tf.logical_not(
                    tf.reduce_all(is_finite_list))
        reduced_device_grads = [[
            (g, v) for g, (_, v) in zip(grads, grad_vars)
        ] for grads, grad_vars in zip(reduced_grads, device_grads)]
        return self.benchmark_cnn.devices, reduced_device_grads
Example #3
def learning_rate_schedule(params, global_step):
    """Handles learning rate scaling, linear warmup, and learning rate decay.

  Args:
    params: A dictionary that defines hyperparameters of model.
    global_step: A tensor representing current global step.

  Returns:
    A tensor representing current learning rate.
  """
    base_learning_rate = params['base_learning_rate']
    lr_warmup_step = params['lr_warmup_step']
    first_lr_drop_step = params['first_lr_drop_step']
    second_lr_drop_step = params['second_lr_drop_step']
    scaling_factor = params['global_batch_size'] / constants.DEFAULT_BATCH_SIZE
    adjusted_learning_rate = base_learning_rate * scaling_factor

    with tf.colocate_with(global_step):
        learning_rate = (tf.cast(global_step, dtype=tf.float32) /
                         lr_warmup_step) * adjusted_learning_rate
        learning_rate = tf.where(global_step < lr_warmup_step,
                                 learning_rate,
                                 adjusted_learning_rate * 1.0,
                                 name="learning_rate_schedule_1")
        learning_rate = tf.where(global_step < first_lr_drop_step,
                                 learning_rate,
                                 adjusted_learning_rate * 0.1,
                                 name="learning_rate_schedule_2")
        learning_rate = tf.where(global_step < second_lr_drop_step,
                                 learning_rate,
                                 adjusted_learning_rate * 0.01,
                                 name="learning_rate")

    return learning_rate
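A minimal usage sketch for the schedule above, assuming a TF1-style runtime (graph mode) and that learning_rate_schedule and its constants module are importable; the hyperparameter values below are purely illustrative.

import tensorflow.compat.v1 as tf

# Illustrative hyperparameters; only the keys read by learning_rate_schedule matter.
params = {
    'base_learning_rate': 0.1,
    'lr_warmup_step': 500,
    'first_lr_drop_step': 10000,
    'second_lr_drop_step': 15000,
    'global_batch_size': 256,
}
global_step = tf.train.get_or_create_global_step()
lr = learning_rate_schedule(params, global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(lr))  # 0.0 at step 0; ramps linearly until lr_warmup_step.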
Example #4
    def _finish(self, update_ops, name_scope):
        """Updates beta_power variables every n batches and incrs counter."""
        iter_ = self._get_iter_variable()
        beta1_power, beta2_power = self._get_beta_accumulators()
        with tf.control_dependencies(update_ops):
            with tf.colocate_with(iter_):

                def update_beta_op():
                    update_beta1 = beta1_power.assign(
                        beta1_power * self._beta1_t,
                        use_locking=self._use_locking)
                    update_beta2 = beta2_power.assign(
                        beta2_power * self._beta2_t,
                        use_locking=self._use_locking)
                    return tf.group(update_beta1, update_beta2)

                maybe_update_beta = tf.cond(tf.equal(iter_, 0), update_beta_op,
                                            tf.no_op)
                with tf.control_dependencies([maybe_update_beta]):
                    # TODO(cuong): It is suboptimal here because we have to cast twice
                    # (float to int, and then int to float)
                    update_iter = iter_.assign(tf.cast(
                        tf.mod(tf.cast(iter_ + 1.0, tf.int32), self._n_t),
                        tf.float32),
                                               use_locking=self._use_locking)
        return tf.group(*update_ops + [update_iter, maybe_update_beta],
                        name=name_scope)
Example #5
    def __init__(self, hparams):
        self.hparams = hparams
        print("self.hparams.z_size", self.hparams.z_size)
        # Set the discretization bottleneck specific things here
        self.hparams.z_size_per_residual = self.hparams.z_size // \
                                           self.hparams.num_residuals
        print("self.hparams.num_residuals", self.hparams.num_residuals)
        self.hparams.block_dim = int(self.hparams.model_d //
                                     self.hparams.num_blocks)
        self.hparams.block_v_size = 2**(self.hparams.z_size_per_residual /
                                        self.hparams.num_blocks)
        self.hparams.block_v_size = int(self.hparams.block_v_size)
        self.means = tf.get_variable(
            name="means",
            shape=[
                self.hparams.num_blocks, self.hparams.block_v_size,
                self.hparams.block_dim
            ],
            initializer=tf.initializers.variance_scaling(
                distribution="uniform"))

        # Create the shadow variables if we are using EMA
        if self.hparams.ema:
            self.ema_count = tf.get_variable(
                "ema_count",
                [self.hparams.num_blocks, self.hparams.block_v_size],
                initializer=tf.constant_initializer(0),
                trainable=False)
            with tf.colocate_with(self.means):
                self.ema_means = tf.get_variable(
                    "ema_means",
                    initializer=self.means.initialized_value(),
                    trainable=False)
Example #6
def _apply_to_all_device_tensors(all_device_tensors,
                                 apply_func,
                                 colocate=True):
    """Applies a function to each tensor in `all_device_tensors`.

  A new list of lists of tensors is returned, where every tensor in
  `all_device_tensors` has had `apply_func` called on it. `all_device_tensors`
  is not modified.

  Args:
    all_device_tensors: A list of list of tensors. `all_device_tensors[i][j]` is
      a tensor where `i` is the device index and `j` is the tensor index.
    apply_func: A function taking in three arguments: tensor, device_index,
      tensor_index, and returning a modified tensor.
      `tensor` is `all_device_tensors[device_index][tensor_index]`.
    colocate: If True, apply_func will be run under a context manager colocated
      with its input tensor.
  Returns:
    A list in the same form as `all_device_tensors`, except each tensor has had
    `apply_func` called on it.
  """
    new_all_device_tensors = []
    for device_index, device_tensors in enumerate(all_device_tensors):
        new_device_tensors = []
        for tensor_index, t in enumerate(device_tensors):
            if colocate:
                with tf.colocate_with(t):
                    new_t = apply_func(t, device_index, tensor_index)
            else:
                new_t = apply_func(t, device_index, tensor_index)
            new_device_tensors.append(new_t)
        new_all_device_tensors.append(new_device_tensors)
    return new_all_device_tensors
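A small sketch of how the helper above could be driven, assuming a TF1-style runtime; the two-device tensor layout and the fp16-cast apply_func are invented for illustration.

import tensorflow.compat.v1 as tf

# Hypothetical layout: all_device_tensors[i][j] is tensor j on device i.
all_device_tensors = [
    [tf.ones([2, 2]), tf.ones([3])],  # device 0
    [tf.ones([2, 2]), tf.ones([3])],  # device 1
]

def cast_to_fp16(tensor, device_index, tensor_index):
    # Example transformation; the indices are unused in this sketch.
    del device_index, tensor_index
    return tf.cast(tensor, tf.float16)

# With colocate=True (the default), each cast op is placed with its input tensor.
casted = _apply_to_all_device_tensors(all_device_tensors, cast_to_fp16)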
Example #7
 def _transform_2(image):
     with tf.colocate_with(image):
         crop_default = tf.constant([0.0, 0.0, 1.0, 1.0])
     image = tf.image.decode_jpeg(image, channels=3)
     image = tf.image.convert_image_dtype(image, dtype=tf.float32)
     return tf.image.resize_bicubic([image],
                                    [height, width])[0], crop_default
Example #8
    def _get_grads_lists_exact(self, tensors):
        if self.mat_type == "Fisher":
            # pylint: disable=g-long-lambda
            mult_func = (lambda loss, index:
                         loss.multiply_fisher_factor_replicated_one_hot(index))
            inner_shape_func = lambda loss: loss.fisher_factor_inner_static_shape
        elif self.mat_type == "GGN":
            # pylint: disable=g-long-lambda
            mult_func = (lambda loss, index:
                         loss.multiply_ggn_factor_replicated_one_hot(index))
            inner_shape_func = lambda loss: loss.ggn_factor_inner_static_shape

        # Loop over all coordinates of all losses.
        grads_all = []
        for loss in self.layers.losses:
            with tf.colocate_with(self.layers.loss_colocation_ops[loss]):
                for index in np.ndindex(*inner_shape_func(loss)[1:]):
                    value = mult_func(loss, index)
                    coeff = tf.cast(self.layers.loss_coeffs[loss],
                                    dtype=value.dtype)
                    transformed_one_hot = tf.sqrt(coeff) * value
                    grads_flat = tf.gradients(
                        loss.inputs,
                        nest.flatten(tensors),
                        grad_ys=transformed_one_hot,
                        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
                    grads_all.append(nest.pack_sequence_as(
                        tensors, grads_flat))
        return tuple(zip(*grads_all))
Example #9
 def _finish(self, update_ops, name_scope):
     # Update the power accumulators.
     with tf.control_dependencies(update_ops):
         beta1_power, beta2_power = self._get_beta_accumulators()
         with tf.colocate_with(beta1_power):
             update_beta1 = beta1_power.assign(
                 beta1_power * self._beta1_t, use_locking=self._use_locking)
             update_beta2 = beta2_power.assign(
                 beta2_power * self._beta2_t, use_locking=self._use_locking)
     return tf.group(*update_ops + [update_beta1, update_beta2],
                     name=name_scope)
Example #10
 def _multiply_across_losses(self, mult_func, vecs, coeff_mode="regular"):
     products = []
     for loss, vec in zip(self._losses, vecs):
         with tf.colocate_with(self._loss_colocation_ops[loss]):
             if coeff_mode == "regular":
                 multiplier = self._get_loss_coeff(loss)
             elif coeff_mode == "sqrt":
                 multiplier = tf.sqrt(self._get_loss_coeff(loss))
             val = mult_func(loss, vec)
             products.append(tf.cast(multiplier, dtype=val.dtype) * val)
     return tuple(products)
Example #11
  def model_fn(features, labels, mode, params):
    """Model computational graph."""
    del labels
    del params

    total_loss, monitor_dict = eval(FLAGS.loss_type)(features, mode)

    #### Check model parameters
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info("#params: %d", num_params)

    if FLAGS.verbose:
      format_str = "{{:<{0}s}}\t{{}}".format(
          max([len(v.name) for v in tf.trainable_variables()]))
      for v in tf.trainable_variables():
        tf.logging.info(format_str.format(v.name, v.get_shape()))

    #### Evaluation mode
    if mode == tf.estimator.ModeKeys.EVAL:
      #### Reduce sum losses from all TPU cores
      with tf.colocate_with(total_loss):
        total_loss = tf.tpu.cross_replica_sum(total_loss)
        total_loss = total_loss / FLAGS.num_hosts / FLAGS.num_core_per_host
      metric_loss = tf.reshape(total_loss, [1])

      #### Constructing evaluation TPUEstimatorSpec with new cache.
      eval_spec = tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=(metric_fn, [metric_loss]))

      return eval_spec

    #### Get the train op
    train_op, optim_dict = optimization.get_train_op(total_loss)
    monitor_dict.update(optim_dict)

    #### Customized initial checkpoint
    scaffold_fn = model_utils.custom_initialization(FLAGS.init_global_vars)

    #### Creating host calls
    host_call = model_utils.construct_scalar_host_call(
        monitor_dict=monitor_dict,
        model_dir=FLAGS.model_dir,
        prefix="train/",
        reduce_fn=tf.reduce_mean)

    #### Constructing training TPUEstimatorSpec with new cache.
    train_spec = tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op, host_call=host_call,
        scaffold_fn=scaffold_fn)

    return train_spec
Example #12
def init_vq_bottleneck(bottleneck_size, hidden_size):
    """Get lookup table for VQ bottleneck."""
    means = tf.get_variable(name="means",
                            shape=[bottleneck_size, hidden_size],
                            initializer=tf.uniform_unit_scaling_initializer())
    ema_count = tf.get_variable(name="ema_count",
                                shape=[bottleneck_size],
                                initializer=tf.constant_initializer(0),
                                trainable=False)
    with tf.colocate_with(means):
        ema_means = tf.get_variable(name="ema_means",
                                    initializer=means.initialized_value(),
                                    trainable=False)

    return means, ema_means, ema_count
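A brief sketch of building and inspecting the VQ lookup table above, assuming a TF1 graph-mode session and that init_vq_bottleneck is in scope; the sizes are illustrative.

import tensorflow.compat.v1 as tf

with tf.variable_scope("vq_demo", reuse=tf.AUTO_REUSE):
    # A codebook of 2**10 entries, each a vector of dimension 512.
    means, ema_means, ema_count = init_vq_bottleneck(
        bottleneck_size=2**10, hidden_size=512)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(means).shape)      # (1024, 512)
    print(sess.run(ema_count).shape)  # (1024,)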
Example #13
 def _create_slots(self, var_list):
   for v in var_list:
     with tf.colocate_with(v):
       if self._momentum > 0:
         self._zeros_slot(v, "momentum", self._name)
       shape = np.array(v.get_shape())
       var_rank = len(shape)
       # We special case vectors and scalars as we can run the diagonal adagrad
       # update for those parameters.
       if var_rank > 1:
         for i, d in enumerate(shape):
           d_tensor = tf.convert_to_tensor(d)
           diag_init = tf.zeros([d_tensor])
           _ = self._get_or_make_slot(v, diag_init, "accumulator_" + str(i),
                                      self._name)
       else:
         _ = self._zeros_slot(v, "accumulator", self._name)
Example #14
    def _prepare_variables(self):
        """Prepare Variables for YellowFin.

    Returns:
      Grad**2, Norm, Norm**2, Mean(Norm**2) ops
    """
        self._moving_averager = tf.train.ExponentialMovingAverage(
            decay=self._beta, zero_debias=self._zero_debias)
        # assert self._grad is not None and len(self._grad) > 0
        # List for the returned Operations
        prepare_variables_op = []

        # Get per var g**2 and norm**2
        self._grad_squared = []
        self._grad_norm_squared = []

        # Gradient squared
        for v, g in zip(self._vars, self._grad):
            if g is None: continue
            with tf.colocate_with(v):
                self._grad_squared.append(tf.square(g))

        # Norm squared.
        self._grad_norm_squared = [
            tf.reduce_sum(g_sq) for g_sq in self._grad_squared
        ]

        if self._sparsity_debias:
            avg_op_sparsity = self._grad_sparsity()
            prepare_variables_op.append(avg_op_sparsity)

        # The following running average on squared norm of gradient
        # is shared by grad_var and dist_to_opt
        avg_op = self._moving_averager.apply(self._grad_norm_squared)

        with tf.control_dependencies([avg_op]):
            self._grad_norm_squared_avg = [
                self._moving_averager.average(val)
                for val in self._grad_norm_squared
            ]
            self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
            self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)

        prepare_variables_op.append(avg_op)
        return tf.group(*prepare_variables_op)
Example #15
    def _get_transformed_random_signs(self):
        if self.mat_type == "Fisher":
            mult_func = lambda loss, index: loss.multiply_fisher_factor(index)
            inner_shape_func = lambda loss: loss.fisher_factor_inner_shape
        elif self.mat_type == "GGN":
            mult_func = lambda loss, index: loss.multiply_ggn_factor(index)
            inner_shape_func = lambda loss: loss.ggn_factor_inner_shape

        transformed_random_signs = []
        for loss in self.layers.losses:
            with tf.colocate_with(self.layers.loss_colocation_ops[loss]):
                value = mult_func(
                    loss,
                    utils.generate_random_signs(inner_shape_func(loss),
                                                dtype=loss.dtype))
                coeff = tf.cast(self.layers.loss_coeffs[loss],
                                dtype=value.dtype)
                transformed_random_signs.append(tf.sqrt(coeff) * value)
        return transformed_random_signs
Example #16
  def _finish(self, update_ops, name_scope):
    """Updates beta_power variables every n batches and incrs counter."""
    iter_ = self._get_iter_variable()
    beta1_power, beta2_power = self._get_beta_accumulators()
    with tf.control_dependencies(update_ops):
      with tf.colocate_with(iter_):

        def update_beta_op():
          update_beta1 = beta1_power.assign(
              beta1_power * self._beta1_t,
              use_locking=self._use_locking)
          update_beta2 = beta2_power.assign(
              beta2_power * self._beta2_t,
              use_locking=self._use_locking)
          return tf.group(update_beta1, update_beta2)
        maybe_update_beta = tf.cond(
            tf.equal(iter_, 0), update_beta_op, tf.no_op)
        with tf.control_dependencies([maybe_update_beta]):
          update_iter = iter_.assign(tf.mod(iter_ + 1, self._n_t),
                                     use_locking=self._use_locking)
    return tf.group(
        *update_ops + [update_iter, maybe_update_beta], name=name_scope)
Example #17
    def __init__(self, *args, **kwargs):
        super(TransformerAE, self).__init__(*args, **kwargs)
        self.predict_mask = 1.0

        # Define bottleneck function
        self._hparams.bottleneck = functools.partial(
            discretization.discrete_bottleneck,
            hidden_size=self._hparams.hidden_size,
            z_size=self._hparams.z_size,
            filter_size=self._hparams.filter_size,
            bottleneck_kind=self._hparams.bottleneck_kind,
            num_blocks=self._hparams.num_blocks,
            num_residuals=self.hparams.num_residuals,
            reshape_method=self._hparams.reshape_method,
            beta=self._hparams.beta,
            ema=self._hparams.ema,
            epsilon=self._hparams.epsilon,
            decay=self._hparams.decay,
            random_top_k=self._hparams.random_top_k,
            soft_em=self.hparams.soft_em,
            num_samples=self.hparams.num_samples,
            softmax_k=self._hparams.softmax_k,
            temperature_warmup_steps=self._hparams.temperature_warmup_steps,
            do_hard_gumbel_softmax=self._hparams.do_hard_gumbel_softmax,
            num_flows=self._hparams.num_flows,
            approximate_gs_entropy=self._hparams.approximate_gs_entropy,
            discrete_mix=self._hparams.d_mix,
            noise_dev=self._hparams.noise_dev,
            startup_steps=self.hparams.startup_steps,
            summary=_DO_SUMMARIES)
        # Set the discretization bottleneck specific things here
        if self._hparams.bottleneck_kind in ["dvq", "gumbel-softmax-dvq"]:
            z_size_per_residual = self._hparams.z_size / self._hparams.num_residuals
            block_dim = int(self._hparams.hidden_size //
                            self._hparams.num_blocks)
            block_v_size = 2**(z_size_per_residual / self._hparams.num_blocks)
            block_v_size = int(block_v_size)

            if self._hparams.reshape_method == "project":
                tf.logging.info("Using projections for DVQ")
                tf.logging.info("Trainable projections = {}".format(
                    self._hparams.trainable_projections))

                projection_tensors = tf.get_variable(
                    name="projection",
                    shape=[
                        self._hparams.num_residuals, self._hparams.num_blocks,
                        self._hparams.hidden_size, block_dim
                    ],
                    initializer=tf.initializers.glorot_uniform(),
                    trainable=self._hparams.trainable_projections)

                self._hparams.bottleneck = functools.partial(
                    self._hparams.bottleneck,
                    projection_tensors=projection_tensors)
            elif self._hparams.reshape_method == "slice":
                tf.logging.info("Using slices for DVQ")
            else:
                raise ValueError("Unknown reshape method")

            means = tf.get_variable(
                name="means",
                shape=[
                    self._hparams.num_residuals, self._hparams.num_blocks,
                    block_v_size, block_dim
                ],
                initializer=tf.uniform_unit_scaling_initializer())

            # Create the shadow variables if we are using EMA
            ema_count = None
            ema_means = None
            if self._hparams.ema:
                ema_count = []
                for i in range(self._hparams.num_residuals):
                    ema_count_i = tf.get_variable(
                        "ema_count_{}".format(i),
                        [self._hparams.num_blocks, block_v_size],
                        initializer=tf.constant_initializer(0),
                        trainable=False)
                    ema_count.append(ema_count_i)
                with tf.colocate_with(means):
                    ema_means = []
                    for i in range(self._hparams.num_residuals):
                        ema_means_i = tf.get_variable(
                            "ema_means_{}".format(i),
                            [
                                self._hparams.num_blocks, block_v_size,
                                block_dim
                            ],
                            initializer=(
                                lambda shape, dtype=None, partition_info=None,  # pylint: disable=g-long-lambda
                                verify_shape=None: means.initialized_value()[i]
                            ),
                            trainable=False)
                        ema_means.append(ema_means_i)

            # Update bottleneck
            self._hparams.bottleneck = functools.partial(
                self._hparams.bottleneck,
                means=means,
                ema_count=ema_count,
                ema_means=ema_means)
Example #18
def _clip_by_global_norm(t_list, clip_norm, use_norm, name=None):
    """Clips values of multiple tensors by the ratio of the sum of their norms.
  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. The global
  norm is expected to be pre-computed and passed as use_norm.
  To perform the clipping, the values `t_list[i]` are set to:
      t_list[i] * clip_norm / max(global_norm, clip_norm)
  where:
      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))
  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.
  Any of the entries of `t_list` that are of type `None` are ignored.
  This is the correct way to perform gradient clipping (for example, see
  [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063)
  ([pdf](http://arxiv.org/pdf/1211.5063.pdf))).
  However, it is slower than `clip_by_norm()` because all the parameters must be
  ready before the clipping operation can be performed.
  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the norm.
    name: A name for the operation (optional).
  Returns:
    list_clipped: A list of `Tensors` of the same type as `list_t`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.
  Raises:
    TypeError: If `t_list` is not a sequence.
  """
    if not isinstance(t_list, collections.Sequence) or isinstance(
            t_list, six.string_types):
        raise TypeError('t_list should be a sequence')
    t_list = list(t_list)

    # Removed as use_norm should always be passed
    # if use_norm is None:
    #   use_norm = global_norm(t_list, name)

    with tf.name_scope(name, 'clip_by_global_norm',
                       t_list + [clip_norm]) as name:
        # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
        scale = clip_norm * tf.minimum(
            1.0 / use_norm,
            tf.ones([1], dtype=use_norm.dtype) / clip_norm)

        values = [
            tf.cast(
                tf.convert_to_tensor(
                    t.values if isinstance(t, tf.IndexedSlices) else t,
                    name='t_%d' % i,
                ),
                dtype=tf.float32,
            ) if t is not None else t for i, t in enumerate(t_list)
        ]

        values_clipped = []
        for i, v in enumerate(values):
            if v is None:
                values_clipped.append(None)
            else:
                with tf.colocate_with(v):
                    values_clipped.append(
                        tf.identity(v * scale, name='%s_%d' % (name, i)))

        list_clipped = [
            tf.IndexedSlices(c_v, t.indices, t.dense_shape) if isinstance(
                t, tf.IndexedSlices) else c_v
            for (c_v, t) in zip(values_clipped, t_list)
        ]

    return list_clipped, use_norm
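A short sketch of calling the clipping helper above in a TF1 session; as the docstring notes, the global norm is expected to be precomputed, so it is obtained here with tf.global_norm and passed in as use_norm. The gradients and clip_norm value are illustrative.

import tensorflow.compat.v1 as tf

grads = [tf.constant([3.0, 4.0]), tf.constant([[1.0, 2.0], [2.0, 1.0]])]

use_norm = tf.global_norm(grads)  # sqrt of the summed squared L2 norms
clipped, norm = _clip_by_global_norm(grads, clip_norm=1.0, use_norm=use_norm)

with tf.Session() as sess:
    print(sess.run(norm))                     # global norm before clipping
    print(sess.run(tf.global_norm(clipped)))  # ~1.0, since clip_norm < global norm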
Example #19
 def _transform_2(image):
     with tf.colocate_with(image):
         crop_default = tf.constant([0.0, 0.0, 1.0, 1.0])
     return tf.image.resize_bicubic([image],
                                    [height, width])[0], crop_default
Example #20
def assign_log_moving_mean_exp(log_mean_exp_var, log_value, decay, name=None):
    """Compute the log of the exponentially weighted moving mean of the exp.

  If `log_value` is a draw from a stationary random variable, this function
  approximates `log(E[exp(log_value)])`, i.e., a weighted log-sum-exp. More
  precisely, a `tf.Variable`, `log_mean_exp_var`, is updated by `log_value`
  using the following identity:

  ```none
  log_mean_exp_var =
  = log(decay exp(log_mean_exp_var) + (1 - decay) exp(log_value))
  = log(exp(log_mean_exp_var + log(decay)) + exp(log_value + log1p(-decay)))
  = log_mean_exp_var
    + log(  exp(log_mean_exp_var   - log_mean_exp_var + log(decay))
          + exp(log_value - log_mean_exp_var + log1p(-decay)))
  = log_mean_exp_var
    + log_sum_exp([log(decay), log_value - log_mean_exp_var + log1p(-decay)]).
  ```

  In addition to numerical stability, this formulation is advantageous because
  `log_mean_exp_var` can be updated in a lock-free manner, i.e., using
  `assign_add`. (Note: the updates are not thread-safe; it's just that the
  update to the tf.Variable is presumed efficient due to being lock-free.)

  Args:
    log_mean_exp_var: `float`-like `Variable` representing the log of the
      exponentially weighted moving mean of the exp. Same shape as `log_value`.
    log_value: `float`-like `Tensor` representing a new (streaming) observation.
      Same shape as `log_mean_exp_var`.
    decay: A `float`-like `Tensor`. The moving mean decay. Typically close to
      `1.`, e.g., `0.999`.
    name: Optional name of the returned operation.

  Returns:
    log_mean_exp_var: A reference to the input 'Variable' tensor with the
      `log_value`-updated log of the exponentially weighted moving mean of exp.

  Raises:
    TypeError: if `log_mean_exp_var` does not have float type `dtype`.
    TypeError: if `log_mean_exp_var`, `log_value`, `decay` have different
      `base_dtype`.
  """
    with tf1.name_scope(name, "assign_log_moving_mean_exp",
                        [log_mean_exp_var, log_value, decay]):
        # We want to update the variable in a numerically stable and lock-free way.
        # To do this, observe that variable `x` updated by `v` is:
        # x = log(w exp(x) + (1-w) exp(v))
        #   = log(exp(x + log(w)) + exp(v + log1p(-w)))
        #   = x + log(exp(x - x + log(w)) + exp(v - x + log1p(-w)))
        #   = x + lse([log(w), v - x + log1p(-w)])
        with tf1.colocate_with(log_mean_exp_var):
            base_dtype = log_mean_exp_var.dtype.base_dtype
            if not base_dtype.is_floating:
                raise TypeError(
                    "log_mean_exp_var.base_dtype({}) does not have float type "
                    "`dtype`.".format(base_dtype.name))
            log_value = tf.convert_to_tensor(value=log_value,
                                             dtype=base_dtype,
                                             name="log_value")
            decay = tf.convert_to_tensor(value=decay,
                                         dtype=base_dtype,
                                         name="decay")
            delta = (log_value - log_mean_exp_var)[tf.newaxis, ...]
            x = tf.concat([
                tf.math.log(decay) * tf.ones_like(delta),
                delta + tf.math.log1p(-decay)
            ],
                          axis=0)
            x = tf.reduce_logsumexp(input_tensor=x, axis=0)
            return log_mean_exp_var.assign_add(x)
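A minimal sketch of one streaming update with the helper above, assuming graph mode with tensorflow.compat.v1/v2 available under the same tf1/tf aliases used in the example, and assign_log_moving_mean_exp in scope; the shape and decay value are illustrative.

import tensorflow.compat.v1 as tf1
import tensorflow.compat.v2 as tf

log_mean_exp_var = tf1.get_variable(
    "log_mean_exp", initializer=tf1.zeros([3]), trainable=False)
log_value = tf.math.log(tf.constant([0.5, 1.0, 2.0]))

update_op = assign_log_moving_mean_exp(log_mean_exp_var, log_value, decay=0.999)

with tf1.Session() as sess:
    sess.run(tf1.global_variables_initializer())
    sess.run(update_op)  # one log-space EMA step toward log_value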
Example #21
def assign_moving_mean_variance(mean_var,
                                variance_var,
                                value,
                                decay,
                                name=None):
    """Compute exponentially weighted moving {mean,variance} of a streaming value.

  The `value` updated exponentially weighted moving `mean_var` and
  `variance_var` are given by the following recurrence relations:

  ```python
  variance_var = decay * (variance_var + (1 - decay) * (value - mean_var)**2)
  mean_var     = decay * mean_var + (1 - decay) * value
  ```

  Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var` uses
  the lag-1 mean.

  For derivation justification, see [Finch (2009; Eq. 143)][1].
  Parameterization: Finch's `alpha` is `1 - decay`.

  Args:
    mean_var: `float`-like `Variable` representing the exponentially weighted
      moving mean. Same shape as `variance_var` and `value`.
    variance_var: `float`-like `Variable` representing the
      exponentially weighted moving variance. Same shape as `mean_var` and
      `value`.
    value: `float`-like `Tensor`. Same shape as `mean_var` and `variance_var`.
    decay: A `float`-like `Tensor`. The moving mean decay. Typically close to
      `1.`, e.g., `0.999`.
    name: Optional name of the returned operation.

  Returns:
    mean_var: `Variable` representing the `value`-updated exponentially weighted
      moving mean.
    variance_var: `Variable` representing the `value`-updated
      exponentially weighted moving variance.

  Raises:
    TypeError: if `mean_var` does not have float type `dtype`.
    TypeError: if `mean_var`, `variance_var`, `value`, `decay` have different
      `base_dtype`.

  #### References

  [1]: Tony Finch. Incremental calculation of weighted mean and variance.
       _Technical Report_, 2009.
       http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf
  """
    with tf1.name_scope(name, "assign_moving_mean_variance",
                        [variance_var, mean_var, value, decay]):
        with tf1.colocate_with(variance_var):
            with tf1.colocate_with(mean_var):
                base_dtype = mean_var.dtype.base_dtype
                if not base_dtype.is_floating:
                    raise TypeError(
                        "mean_var.base_dtype({}) does not have float type "
                        "`dtype`.".format(base_dtype.name))
                if base_dtype != variance_var.dtype.base_dtype:
                    raise TypeError(
                        "mean_var.base_dtype({}) != variance_var.base_dtype({})"
                        .format(base_dtype.name,
                                variance_var.dtype.base_dtype.name))
                value = tf.convert_to_tensor(value=value,
                                             dtype=base_dtype,
                                             name="value")
                decay = tf.convert_to_tensor(value=decay,
                                             dtype=base_dtype,
                                             name="decay")
                delta = value - mean_var
                with tf.control_dependencies([delta]):
                    # We want mean_{t+1} = decay * mean_t + (1. - decay) * value
                    # We compute mean += decay * mean_t - mean_t + (1. - decay) * value =
                    #   = (1. - decay) * (value - mean_t)
                    mean_var = mean_var.assign_add((1. - decay) * delta)
                    # We want variance_{t+1} = decay * (variance_t +
                    #   + (1 - decay) * (value - mean_var)**2).
                    # We compute variance -= variance_t - decay * (variance_t +
                    #     + (1 - decay) * (value - mean_var)**2) =
                    #   = (1 - decay) * variance_t
                    #     - decay * (1 - decay) * (value - mean_var)**2
                    #   = (1 - decay) * (variance_t - decay * (value - mean_var)**2).
                    variance_var = variance_var.assign_sub(
                        (1. - decay) *
                        (variance_var - decay * tf.square(delta)))
                return mean_var, variance_var
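A minimal sketch of a single streaming update with the helper above, under the same tf1/tf aliases and graph-mode assumptions as the example; the initial values, observation, and decay are illustrative.

import tensorflow.compat.v1 as tf1
import tensorflow.compat.v2 as tf

mean_var = tf1.get_variable(
    "moving_mean", initializer=tf1.constant(0.), trainable=False)
variance_var = tf1.get_variable(
    "moving_variance", initializer=tf1.constant(1.), trainable=False)
value = tf.constant(2.5)

updated_mean, updated_variance = assign_moving_mean_variance(
    mean_var, variance_var, value, decay=0.99)

with tf1.Session() as sess:
    sess.run(tf1.global_variables_initializer())
    # variance_var is updated using the lag-1 mean, then mean_var is updated.
    print(sess.run([updated_mean, updated_variance]))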