Example 1
def moving_average_update(variable, value, momentum):
  try:
    return moving_averages.assign_moving_average(
        variable, value, momentum, zero_debias=False)
  except TypeError:
    return moving_averages.assign_moving_average(
        variable, value, momentum)
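Every snippet in this collection relies on the same exponential-moving-average update rule, variable = decay * variable + (1 - decay) * value; the try/except above appears to guard against older TensorFlow releases whose assign_moving_average has no zero_debias argument. A minimal sketch of the rule, assuming TF 1.x graph mode with moving_averages imported from tensorflow.python.training:

import tensorflow as tf
from tensorflow.python.training import moving_averages

var = tf.Variable(0.0, trainable=False)
# One update with decay 0.9 leaves 0.9 * 0.0 + 0.1 * 1.0 = 0.1 in the variable.
update = moving_averages.assign_moving_average(var, 1.0, 0.9, zero_debias=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(update))  # ~0.1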
Example 2
    def __call__(self, input_layer, epsilon=1e-5, decay=0.9, name="batch_norm",
                 in_dim=None, phase=Phase.train):
        shape = input_layer.shape
        shp = in_dim or shape[-1]
        with tf.variable_scope(name) as scope:
            self.mean = self.variable('mean', [shp], init=tf.constant_initializer(0.), train=False)
            self.variance = self.variable('variance', [shp], init=tf.constant_initializer(1.0), train=False)

            self.gamma = self.variable("gamma", [shp], init=tf.random_normal_initializer(1., 0.02))
            self.beta = self.variable("beta", [shp], init=tf.constant_initializer(0.))

            if phase == Phase.train:
                mean, variance = tf.nn.moments(input_layer.tensor, [0, 1, 2])
                mean.set_shape((shp,))
                variance.set_shape((shp,))

                update_moving_mean = moving_averages.assign_moving_average(self.mean, mean, decay)
                update_moving_variance = moving_averages.assign_moving_average(self.variance, variance, decay)

                with tf.control_dependencies([update_moving_mean, update_moving_variance]):
                    normalized_x = tf.nn.batch_norm_with_global_normalization(
                        input_layer.tensor, mean, variance, self.beta, self.gamma, epsilon,
                        scale_after_normalization=True)
            else:
                normalized_x = tf.nn.batch_norm_with_global_normalization(
                    input_layer.tensor, self.mean, self.variance,
                    self.beta, self.gamma, epsilon,
                    scale_after_normalization=True)
            return input_layer.with_tensor(normalized_x, parameters=self.vars)
Example 3
def batchnorm(inputs, scope, epsilon=1e-05, momentum=0.99, is_training=True):
    inputs_shape = inputs.get_shape().as_list()  # input shape
    params_shape = inputs_shape[-1:]  # channel dimension (parameter shape)
    axis = list(range(len(inputs_shape) - 1))

    with tf.variable_scope(scope):
        beta = create_variable("beta", params_shape,
                               initializer=tf.zeros_initializer())
        gamma = create_variable("gamma", params_shape,
                                initializer=tf.ones_initializer())
        # moving mean: not trained, used for inference
        moving_mean = create_variable("moving_mean", params_shape,
                            initializer=tf.zeros_initializer(), trainable=False)
        # moving variance: not trained, used for inference
        moving_variance = create_variable("moving_variance", params_shape,
                            initializer=tf.ones_initializer(), trainable=False)
    if is_training:
        mean, variance = tf.nn.moments(inputs, axes=axis)  # batch mean and variance
        # moving averages of mean and variance: x_t = a * x_{t-1} + (1 - a) * x_now
        update_move_mean = moving_averages.assign_moving_average(moving_mean,
                                                mean, decay=momentum)
        update_move_variance = moving_averages.assign_moving_average(moving_variance,
                                                variance, decay=momentum)
        tf.add_to_collection(UPDATE_OPS_COLLECTION, update_move_mean)
        tf.add_to_collection(UPDATE_OPS_COLLECTION, update_move_variance)
    else:
        mean, variance = moving_mean, moving_variance
    return tf.nn.batch_normalization(inputs, mean, variance, beta, gamma, epsilon)
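The function above only collects the EMA assign ops under UPDATE_OPS_COLLECTION (a key defined outside the snippet, like create_variable); nothing runs them unless they are tied to the training step. A hedged sketch of the usual wiring, assuming the key is tf.GraphKeys.UPDATE_OPS and using a throwaway loss purely for illustration; the same pattern applies to the later examples that add their updates to tf.GraphKeys.UPDATE_OPS directly:

import tensorflow as tf

UPDATE_OPS_COLLECTION = tf.GraphKeys.UPDATE_OPS  # hypothetical stand-in for the real key
loss = tf.Variable(1.0)                          # placeholder loss, illustration only
opt_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

# Group the collected EMA updates with the optimizer step so they run every iteration.
update_ops = tf.get_collection(UPDATE_OPS_COLLECTION)
train_op = tf.group(opt_step, *update_ops)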
Example 4
def bn(x, c):
    x_shape = x.get_shape()
    params_shape = x_shape[-1:]

    if c["use_bias"]:
        bias = _get_variable("bias", params_shape, initializer=tf.zeros_initializer)
        return x + bias

    axis = list(range(len(x_shape) - 1))

    beta = _get_variable("beta", params_shape, initializer=tf.zeros_initializer)
    gamma = _get_variable("gamma", params_shape, initializer=tf.ones_initializer)

    moving_mean = _get_variable("moving_mean", params_shape, initializer=tf.zeros_initializer, trainable=False)
    moving_variance = _get_variable("moving_variance", params_shape, initializer=tf.ones_initializer, trainable=False)

    # These ops will only be performed when training.
    mean, variance = tf.nn.moments(x, axis)
    update_moving_mean = moving_averages.assign_moving_average(moving_mean, mean, BN_DECAY)
    update_moving_variance = moving_averages.assign_moving_average(moving_variance, variance, BN_DECAY)
    tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_mean)
    tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_variance)

    mean, variance = control_flow_ops.cond(
        c["is_training"], lambda: (mean, variance), lambda: (moving_mean, moving_variance)
    )

    x = tf.nn.batch_normalization(x, mean, variance, beta, gamma, BN_EPSILON)
    # x.set_shape(inputs.get_shape()) ??

    return x
Example 5
 def _delay_updates():
   """Internal function that delay updates moving_vars if is_training."""
   update_moving_mean = moving_averages.assign_moving_average(
       moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
   update_moving_variance = moving_averages.assign_moving_average(
       moving_variance, variance, decay, zero_debias=False)
   return update_moving_mean, update_moving_variance
Example 6
def batch_norm(x, decay=0.999, epsilon=1e-03, is_training=True,
               scope="scope"):
    x_shape = x.get_shape()
    num_inputs = x_shape[-1]
    reduce_dims = list(range(len(x_shape) - 1))
    with tf.variable_scope(scope):
        beta = create_var("beta", [num_inputs,],
                               initializer=tf.zeros_initializer())
        gamma = create_var("gamma", [num_inputs,],
                                initializer=tf.ones_initializer())
        # for inference
        moving_mean = create_var("moving_mean", [num_inputs,],
                                 initializer=tf.zeros_initializer(),
                                 trainable=False)
        moving_variance = create_var("moving_variance", [num_inputs],
                                     initializer=tf.ones_initializer(),
                                     trainable=False)
    if is_training:
        mean, variance = tf.nn.moments(x, axes=reduce_dims)
        update_move_mean = moving_averages.assign_moving_average(moving_mean,
                                                mean, decay=decay)
        update_move_variance = moving_averages.assign_moving_average(moving_variance,
                                                variance, decay=decay)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_move_mean)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_move_variance)
    else:
        mean, variance = moving_mean, moving_variance
    return tf.nn.batch_normalization(x, mean, variance, beta, gamma, epsilon)
Example 7
		def mean_var_with_update():
			mean, variance = tf.nn.moments(x, list(range(len(x.shape) - 1)), name='moments')
			with tf.control_dependencies([
				assign_moving_average(moving_mean, mean, decay),
				assign_moving_average(moving_var, variance, decay)
			]):
				return tf.identity(mean), tf.identity(variance)
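This closure is normally handed to tf.cond so the EMA assignments only fire on the training branch. A hedged sketch of that wiring; train_phase, moving_mean, moving_var, beta, gamma, x and decay are assumed to come from the enclosing batch-norm function that is not shown here:

mean, variance = tf.cond(train_phase,
                         mean_var_with_update,
                         lambda: (moving_mean, moving_var))
y = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 1e-5)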
Example 8
		def train_phase():
			mean, variance = tf.nn.moments(inputs, axis)
			update_moving_mean = moving_averages.assign_moving_average(moving_mean, mean, decay)
			update_moving_variance = moving_averages.assign_moving_average(moving_variance, 
									variance, decay)
			with tf.control_dependencies([update_moving_mean, update_moving_variance]):
				return tf.identity(mean), tf.identity(variance)
Example 9
 def _force_updates():
   """Internal function forces updates moving_vars if is_training."""
   update_moving_mean = moving_averages.assign_moving_average(
       moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
   update_moving_variance = moving_averages.assign_moving_average(
       moving_variance, variance, decay, zero_debias=False)
   with ops.control_dependencies([update_moving_mean,
                                  update_moving_variance]):
     return array_ops.identity(mean), array_ops.identity(variance)
Example 10
    def __init__(self, value,
                 decay,
                 weight,
                 truediv=True,
                 collections=None,
                 name=None):

        """Compute the weighted moving average of `value`.
        Conceptually, the weighted moving average is:
          `moving_average(value * weight) / moving_average(weight)`,
        where a moving average updates by the rule
          `new_value = decay * old_value + (1 - decay) * update`
        Internally, this Op keeps moving average variables of both `value * weight`
        and `weight`.
        Args:
          value: A numeric `Tensor`.
          decay: A float `Tensor` or float value.  The moving average decay.
          weight:  `Tensor` that keeps the current value of a weight.
            Shape should be able to multiply `value`.
          truediv:  Boolean, if `True`, dividing by `moving_average(weight)` is
            floating point division.  If `False`, use division implied by dtypes.
          collections:  List of graph collections keys to add the internal variables
            `value * weight` and `weight` to.  Defaults to `[GraphKeys.VARIABLES]`.
          name: Optional name of the returned operation.
            Defaults to "WeightedMovingAvg".
        Returns:
          An Operation that updates and returns the weighted moving average.
        """
        # Unlike assign_moving_average, the weighted moving average doesn't modify
        # user-visible variables. It is the ratio of two internal variables, which are
        # moving averages of the updates.  Thus, the signature of this function is
        # quite different than assign_moving_average.
        if collections is None:
            collections = [ops.GraphKeys.VARIABLES]
        with variable_scope.variable_op_scope(
                [value, weight, decay], name, "WeightedMovingAvg") as scope:
            value_x_weight_var = variable_scope.get_variable(
                "value_x_weight",
                initializer=init_ops.zeros_initializer(value.get_shape(),
                                                       dtype=value.dtype),
                trainable=False,
                collections=collections)
            weight_var = variable_scope.get_variable(
                "weight",
                initializer=init_ops.zeros_initializer(weight.get_shape(),
                                                       dtype=weight.dtype),
                trainable=False,
                collections=collections)
            numerator = assign_moving_average(value_x_weight_var, value * weight, decay)
            denominator = assign_moving_average(weight_var, weight, decay)

            if truediv:
                div = math_ops.truediv
            else:
                div = math_ops.div
            self.average_with_update = div(numerator, denominator+1e-8, name=scope.name)
            self.average = div(value_x_weight_var, weight_var)
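A quick framework-free check of the docstring's ratio form: starting both internal variables at zero with decay d, one update leaves (1 - d) * value * weight in the numerator variable and (1 - d) * weight in the denominator variable, so their ratio already recovers the value without explicit zero-debiasing (the 1e-8 above only guards against a zero denominator). This sketches the arithmetic, not the class API:

d, value, weight = 0.9, 5.0, 2.0
numerator = (1 - d) * value * weight    # EMA of value * weight after one step from 0
denominator = (1 - d) * weight          # EMA of weight after one step from 0
assert abs(numerator / denominator - value) < 1e-12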
Example 11
        def update_mean_var():
            mean, variance = tf.nn.moments(x=incoming, axes=axis)
            update_moving_mean = moving_averages.assign_moving_average(
                variable=moving_mean, value=mean, decay=self.decay, zero_debias=False)
            update_moving_variance = moving_averages.assign_moving_average(
                variable=moving_variance, value=variance, decay=self.decay, zero_debias=False)

            with tf.control_dependencies([update_moving_mean, update_moving_variance]):
                return tf.identity(mean), tf.identity(variance)
Example 12
 def _batch_norm_without_layers(self, input_layer, decay, use_scale,
                                epsilon):
     """Batch normalization on `input_layer` without tf.layers."""
     shape = input_layer.shape
     num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
     beta = self.get_variable(
         'beta', [num_channels],
         tf.float32,
         tf.float32,
         initializer=tf.zeros_initializer())
     if use_scale:
         gamma = self.get_variable(
             'gamma', [num_channels],
             tf.float32,
             tf.float32,
             initializer=tf.ones_initializer())
     else:
         gamma = tf.constant(1.0, tf.float32, [num_channels])
     moving_mean = tf.get_variable(
         'moving_mean', [num_channels],
         tf.float32,
         initializer=tf.zeros_initializer(),
         trainable=False)
     moving_variance = tf.get_variable(
         'moving_variance', [num_channels],
         tf.float32,
         initializer=tf.ones_initializer(),
         trainable=False)
     if self.phase_train:
         bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
             input_layer,
             gamma,
             beta,
             epsilon=epsilon,
             data_format=self.data_format,
             is_training=True)
         mean_update = moving_averages.assign_moving_average(
             moving_mean, batch_mean, decay=decay, zero_debias=False)
         variance_update = moving_averages.assign_moving_average(
             moving_variance,
             batch_variance,
             decay=decay,
             zero_debias=False)
         tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
         tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
     else:
         bn, _, _ = tf.nn.fused_batch_norm(
             input_layer,
             gamma,
             beta,
             mean=moving_mean,
             variance=moving_variance,
             epsilon=epsilon,
             data_format=self.data_format,
             is_training=False)
     return bn
Example 13
 def _update_mean_var():
   """Internal function that updates mean and variance during training."""
   axis = [0, 1, 2] if convnet else [0]
   mean, var = nn.moments(tensor_in, axis)
   update_moving_mean = moving_averages.assign_moving_average(
       moving_mean, mean, decay)
   update_moving_var = moving_averages.assign_moving_average(
       moving_var, var, decay)
   with ops.control_dependencies([update_moving_mean, update_moving_var]):
     return array_ops_.identity(mean), array_ops_.identity(var)
Example 14
 def _update_renorm_variable(var, weight, value):
   """Updates a moving average and weight, returns the unbiased value."""
   # Update the variables without zero debiasing. The debiasing will be
   # accomplished by dividing the exponential moving average by the weight.
   # For example, after a single update, the moving average would be
   # (1-decay) * value, and the weight will be 1-decay, with their ratio
   # giving value.
   new_var = moving_averages.assign_moving_average(
       var, value, decay, zero_debias=False)
   new_weight = moving_averages.assign_moving_average(
       weight, 1., decay, zero_debias=False)
   return new_var / new_weight
Example 15
  def _batch_norm(self, name, x):
    with tf.variable_scope(name):
      # number of input channels
      params_shape = [x.get_shape()[-1]]
      # offset
      beta = tf.get_variable('beta', 
                             params_shape, 
                             tf.float32,
                             initializer=tf.constant_initializer(0.0, tf.float32))
      # scale
      gamma = tf.get_variable('gamma', 
                              params_shape, 
                              tf.float32,
                              initializer=tf.constant_initializer(1.0, tf.float32))

      if self.mode == 'train':
        # compute the per-channel mean and variance
        mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')
        # create the moving mean and variance used at test time
        moving_mean = tf.get_variable('moving_mean', 
                                      params_shape, tf.float32,
                                      initializer=tf.constant_initializer(0.0, tf.float32),
                                      trainable=False)
        moving_variance = tf.get_variable('moving_variance', 
                                          params_shape, tf.float32,
                                          initializer=tf.constant_initializer(1.0, tf.float32),
                                          trainable=False)
        # add update ops for the moving mean and variance (exponential moving average)
        # moving_mean = moving_mean * decay + mean * (1 - decay)
        # moving_variance = moving_variance * decay + variance * (1 - decay)
        self._extra_train_ops.append(moving_averages.assign_moving_average(
                                                        moving_mean, mean, 0.9))
        self._extra_train_ops.append(moving_averages.assign_moving_average(
                                                        moving_variance, variance, 0.9))
      else:
        # fetch the moving mean and variance accumulated during training
        mean = tf.get_variable('moving_mean', 
                               params_shape, tf.float32,
                               initializer=tf.constant_initializer(0.0, tf.float32),
                               trainable=False)
        variance = tf.get_variable('moving_variance', 
                                   params_shape, tf.float32,
                                   initializer=tf.constant_initializer(1.0, tf.float32),
                                   trainable=False)
        # add histogram summaries
        tf.summary.histogram(mean.op.name, mean)
        tf.summary.histogram(variance.op.name, variance)

      # BN layer: ((x - mean) / sqrt(var + eps)) * gamma + beta
      y = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001)
      y.set_shape(x.get_shape())
      return y
Example 16
 def testAssignMovingAverageNewNamingMultipleCalls(self):
   with variable_scope.variable_scope("scope1") as vs1:
     with variable_scope.variable_scope("scope2"):
       var = variables.Variable(1.0, name="Var")
       moving_averages.assign_moving_average(var, 0.0, 0.99)
       moving_averages.assign_moving_average(var, 0.0, 0.99)
   expected_names = ["scope1/scope2/Var:0",
                     "scope1/scope2/scope1/scope2/Var/biased:0",
                     "scope1/scope2/scope1/scope2/Var/local_step:0",
                     "scope1/scope2/scope1/scope2/Var/biased_1:0",
                     "scope1/scope2/scope1/scope2/Var/local_step_1:0"]
   actual_names = [v.name for v in vs1.global_variables()]
   self.assertSetEqual(set(expected_names), set(actual_names))
Example 17
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay):
    # TODO is there a way to use zero_debias in multi-GPU?
    update_op1 = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay, zero_debias=False,
        name='mean_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_var, batch_var, decay, zero_debias=False,
        name='var_ema_op')
    add_model_variable(moving_mean)
    add_model_variable(moving_var)

    # seems faster than delayed update, but might behave otherwise in distributed settings.
    with tf.control_dependencies([update_op1, update_op2]):
        return tf.identity(xn, name='output')
Example 18
 def _do_update():
   # Update the variables without zero debiasing. The debiasing will be
   # accomplished by dividing the exponential moving average by the weight.
   # For example, after a single update, the moving average would be
   # (1-decay) * value, and the weight will be 1-decay, with their ratio
   # giving the value.
   # Make sure the weight is not updated until before r and d computation.
   with ops.control_dependencies([value]):
     weight_value = array_ops.constant(1., dtype=weight.dtype)
   new_var = moving_averages.assign_moving_average(
       var, value, self.renorm_momentum, zero_debias=False)
   new_weight = moving_averages.assign_moving_average(
       weight, weight_value, self.renorm_momentum, zero_debias=False)
   return new_var / new_weight
Example 19
    def __init__(self, value, decay,
                 truediv=True,
                 collections=None,
                 reduction_indices=None,
                 name=None):
        self.value = value
        self.reduction_indices = reduction_indices or [0]

        eps = 1e-8
        if truediv:
            div = math_ops.truediv
        else:
            div = math_ops.div
        if collections is None:
            collections = [ops.GraphKeys.VARIABLES]

        value_shape = value.get_shape().as_list()
        shape = []
        for dim in range(len(value_shape)):
            if dim in self.reduction_indices:
                shape.append(1)
            else:
                shape.append(value_shape[dim])

        with variable_scope.variable_op_scope(
                [value, decay], name, "MomentTracker") as scope:

            mean_x_weight_var = variable_scope.get_variable("mean_x_weight", trainable=False, collections=collections,
                initializer=init_ops.zeros_initializer(shape, dtype=value.dtype))

            variance_x_weight_var = variable_scope.get_variable("variance_x_weight", trainable=False,
                collections=collections, initializer=init_ops.zeros_initializer(shape, dtype=value.dtype))

            weight_var = variable_scope.get_variable("weight", trainable=False, collections=collections,
                initializer=init_ops.zeros_initializer([1], dtype=tf.float32))

            self.tracked_mean = div(mean_x_weight_var, weight_var + eps)
            self.tracked_variance = div(variance_x_weight_var, weight_var + eps)

            self.batch_mean, self.batch_variance = tf.nn.moments(self.value, axes=self.reduction_indices,
                                                                 shift=self.tracked_mean, keep_dims=True)

            mean_numerator = assign_moving_average(mean_x_weight_var, self.batch_mean, decay)
            variance_numerator = assign_moving_average(variance_x_weight_var, self.batch_variance, decay)
            denominator = assign_moving_average(weight_var, 1.0, decay)

            self.update_mean = div(mean_numerator, denominator + eps, name=scope.name)
            self.update_variance = div(variance_numerator, denominator + eps, name=scope.name)
Example 20
def update_bn_ema(xn, batch_mean, batch_var,
                  moving_mean, moving_var, decay, internal_update):
    update_op1 = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay, zero_debias=False,
        name='mean_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_var, batch_var, decay, zero_debias=False,
        name='var_ema_op')

    if internal_update:
        with tf.control_dependencies([update_op1, update_op2]):
            return tf.identity(xn, name='output')
    else:
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)
        return tf.identity(xn, name='output')
Example 21
 def forward(self, inputs):
     mean, var = tf.nn.moments(inputs, self.axes)
     if self.is_train:
         # update moving_mean and moving_var
         self.moving_mean = moving_averages.assign_moving_average(
             self.moving_mean, mean, self.decay, zero_debias=False
         )
         self.moving_var = moving_averages.assign_moving_average(self.moving_var, var, self.decay, zero_debias=False)
         outputs = batch_normalization(inputs, mean, var, self.beta, self.gamma, self.epsilon, self.data_format)
     else:
         outputs = batch_normalization(
             inputs, self.moving_mean, self.moving_var, self.beta, self.gamma, self.epsilon, self.data_format
         )
     if self.act:
         outputs = self.act(outputs)
     return outputs
Example 22
 def replica_fn():
   var = variables.Variable([0.0, 0.0])
   val = constant_op.constant([1.0 + replica_id[0], 2.0 - replica_id[0]])
   replica_id[0] += 1
   decay = 0.25
   assign = moving_averages.assign_moving_average(var, val, decay)
   return var, assign.op
Example 23
  def testCrossDeviceWithoutZeroDebias(self, distribution):
    with distribution.scope(), self.cached_session() as sess:
      var = variables.Variable([10.0, 11.0])
      val = constant_op.constant([1.0, 2.0])
      decay = 0.25
      # NOTE(josh11b): We currently generate an error if val is a PerReplica
      # value.
      assign = moving_averages.assign_moving_average(
          var, val, decay, zero_debias=False)

      variables.global_variables_initializer().run()
      self.assertAllClose([10.0, 11.0], var.eval())
      sess.run(assign)
      average_val = [1.0, 2.0]
      val_weight = 1.0 - 0.25
      self.assertAllClose(
          [10.0 * 0.25 + average_val[0] * val_weight,
           11.0 * 0.25 + average_val[1] * val_weight],
          var.eval())
      # Also try assign.op.
      sess.run(assign.op)
      orig_weight = 0.25 * 0.25
      val_weight = 1.0 - orig_weight
      self.assertAllClose(
          [10.0 * orig_weight + average_val[0] * val_weight,
           11.0 * orig_weight + average_val[1] * val_weight],
          var.eval())
Example 24
def weighted_resample(inputs, weights, overall_rate, scope=None,
                      mean_decay=0.999, warmup=10, seed=None):
  """Performs an approximate weighted resampling of `inputs`.

  This method chooses elements from `inputs` where each item's rate of
  selection is proportional to its value in `weights`, and the average
  rate of selection across all inputs (and many invocations!) is
  `overall_rate`.

  Args:
    inputs: A list of tensors whose first dimension is `batch_size`.
    weights: A `[batch_size]`-shaped tensor with each batch member's weight.
    overall_rate: Desired overall rate of resampling.
    scope: Scope to use for the op.
    mean_decay: How quickly to decay the running estimate of the mean weight.
    warmup: Until the resulting tensor has been evaluated `warmup`
      times, the resampling method uses the true mean over all calls
      as its weight estimate, rather than a decayed mean.
    seed: Random seed.

  Returns:
    A list of tensors exactly like `inputs`, but with an unknown (and
      possibly zero) first dimension.
    A tensor containing the effective resampling rate used for each output.

  """
  # Algorithm: Just compute rates as weights/mean_weight *
  # overall_rate. This way the average weight corresponds to the
  # overall rate, and a weight twice the average has twice the rate,
  # etc.
  with ops.name_scope(scope, 'weighted_resample', inputs) as opscope:
    # First: Maintain a running estimated mean weight, with decay
    # adjusted (by also maintaining an invocation count) during the
    # warmup period so that at the beginning, there aren't too many
    # zeros mixed in, throwing the average off.

    with variable_scope.variable_scope(scope, 'estimate_mean', inputs):
      count_so_far = variable_scope.get_local_variable(
          'resample_count', initializer=0)

      estimated_mean = variable_scope.get_local_variable(
          'estimated_mean', initializer=0.0)

      count = count_so_far.assign_add(1)
      real_decay = math_ops.minimum(
          math_ops.truediv((count - 1), math_ops.minimum(count, warmup)),
          mean_decay)

      batch_mean = math_ops.reduce_mean(weights)
      mean = moving_averages.assign_moving_average(
          estimated_mean, batch_mean, real_decay, zero_debias=False)

    # Then, normalize the weights into rates using the mean weight and
    # overall target rate:
    rates = weights * overall_rate / mean

    results = resample_at_rate([rates] + inputs, rates,
                               scope=opscope, seed=seed, back_prop=False)

    return (results[1:], results[0])
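A hedged usage sketch, assuming weighted_resample and the resample_at_rate helper it calls are importable (they lived in tf.contrib.training in TF 1.x) and that graph mode is in use. Per the algorithm comment above, a row whose weight equals the running mean weight is kept at roughly overall_rate, and a row with twice the mean weight at roughly twice that rate:

inputs = [tf.random_normal([32, 4])]          # one batch-major tensor
weights = tf.random_uniform([32], 0.1, 1.0)   # per-row selection weights
resampled, rates = weighted_resample(inputs, weights, overall_rate=0.1)
# resampled[0] has an unknown (possibly zero) first dimension.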
Example 25
    def _batch_norm(self, name, x):
        """Batch normalization."""
        with tf.variable_scope(name):
            params_shape = [x.get_shape()[-1]]

            beta = tf.get_variable(
                'beta', params_shape, tf.float32,
                initializer=tf.constant_initializer(0.0, tf.float32))
            gamma = tf.get_variable(
                'gamma', params_shape, tf.float32,
                initializer=tf.constant_initializer(1.0, tf.float32))

            if self.mode == 'train':
                mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')

                moving_mean = tf.get_variable(
                    'moving_mean', params_shape, tf.float32,
                    initializer=tf.constant_initializer(0.0, tf.float32),
                    trainable=False)
                moving_variance = tf.get_variable(
                    'moving_variance', params_shape, tf.float32,
                    initializer=tf.constant_initializer(1.0, tf.float32),
                    trainable=False)

                self._extra_train_ops.append(
                    moving_averages.assign_moving_average(moving_mean, mean,
                                                          0.9))
                self._extra_train_ops.append(
                    moving_averages.assign_moving_average(moving_variance,
                                                          variance, 0.9))
            else:
                mean = tf.get_variable(
                    'moving_mean', params_shape, tf.float32,
                    initializer=tf.constant_initializer(0.0, tf.float32),
                    trainable=False)
                variance = tf.get_variable(
                    'moving_variance', params_shape, tf.float32,
                    initializer=tf.constant_initializer(1.0, tf.float32),
                    trainable=False)
                tf.summary.histogram(mean.op.name, mean)
                tf.summary.histogram(variance.op.name, variance)
            # epsilon used to be 1e-5. Maybe 0.001 solves the NaN problem in
            # deeper nets.
            y = tf.nn.batch_normalization(
                x, mean, variance, beta, gamma, 0.001)
            y.set_shape(x.get_shape())
            return y
Example 26
 def testAssignMovingAverageNewNamingMultipleCallsWithReuse(self):
   with variable_scope.variable_scope("scope1") as vs1:
     var = variable_scope.get_variable("Var", shape=[])
     moving_averages.assign_moving_average(var, 0.0, 0.99)
     moving_averages.assign_moving_average(var, 0.0, 0.99)
   with variable_scope.variable_scope(vs1, reuse=True):
     var = variable_scope.get_variable("Var", shape=[])
     moving_averages.assign_moving_average(var, 0.0, 0.99)
     moving_averages.assign_moving_average(var, 0.0, 0.99)
Esempio n. 27
0
  def _fused_batch_norm(self, inputs, training):
    """Returns the output of fused batch norm."""
    beta = self.beta if self.center else self._beta_const
    gamma = self.gamma if self.scale else self._gamma_const

    def _fused_batch_norm_training():
      return nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          epsilon=self.epsilon,
          data_format=self._data_format)

    def _fused_batch_norm_inference():
      return nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          mean=self.moving_mean,
          variance=self.moving_variance,
          epsilon=self.epsilon,
          is_training=False,
          data_format=self._data_format)

    output, mean, variance = utils.smart_cond(
        training, _fused_batch_norm_training, _fused_batch_norm_inference)
    if not self._bessels_correction_test_only:
      # Remove Bessel's correction to be consistent with non-fused batch norm.
      # Note that the variance computed by fused batch norm is
      # with Bessel's correction.
      sample_size = math_ops.cast(
          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
      variance *= factor

    training_value = utils.constant_value(training)
    if training_value is not False:
      decay = _smart_select(training, lambda: self.momentum, lambda: 1.)
      mean_update = moving_averages.assign_moving_average(
          self.moving_mean, mean, decay, zero_debias=False)
      variance_update = moving_averages.assign_moving_average(
          self.moving_variance, variance, decay, zero_debias=False)
      self.add_update(mean_update, inputs=inputs)
      self.add_update(variance_update, inputs=inputs)

    return output
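A framework-free check of the Bessel's-correction removal above: per the comment, fused batch norm reports the unbiased variance (sum of squared deviations divided by n - 1), and multiplying by (n - 1) / n converts it to the population variance that non-fused batch norm uses. Plain arithmetic, no TF required:

n = 4.0
ssd = 10.0                    # hypothetical sum of squared deviations
unbiased = ssd / (n - 1)      # what fused batch norm reports (per the comment above)
population = ssd / n          # what non-fused batch norm uses
assert abs(unbiased * (n - 1) / n - population) < 1e-12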
Example 28
 def replica_fn():
   var = variables.Variable([10.0, 11.0])
   val = constant_op.constant([1.0 + replica_id[0], 2.0 - replica_id[0]])
   replica_id[0] += 1
   decay = 0.25
   assign = moving_averages.assign_moving_average(
       var, val, decay, zero_debias=False)
   return var, assign
Example 29
 def replica_fn():
   var = variables.Variable([10.0, 11.0])
   # Here we expect to check the case when input value are variable.
   val = variables.Variable([1., 2.])
   decay = 0.25
   assign = moving_averages.assign_moving_average(
       var, val, decay, zero_debias=False)
   return var, assign
Example 30
 def moving_average(name, value, decay):
   moving_average_variable = vs.get_variable(
       name,
       shape=value.get_shape(),
       dtype=value.dtype,
       initializer=init_ops.zeros_initializer(),
       trainable=False)
   return moving_averages.assign_moving_average(
       moving_average_variable, value, decay, zero_debias=False)
Example 31
def MovingAvgQuantize(inputs,
                      per_channel=False,
                      init_min=-6.0,
                      init_max=6.0,
                      ema_decay=0.999,
                      vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                      name_prefix='MovingAvgQuantize',
                      reuse=None,
                      is_training=True,
                      num_bits=8,
                      narrow_range=False,
                      symmetric=False):
    """Adds a layer that collects quantization ranges as EMAs of input ranges.

  MovingAvgQuantize creates variables called 'min' and 'max', representing the
  interval used for quantization and clamping.

  Args:
    inputs: a tensor containing values to be quantized.
    per_channel: (default False) a boolean specifying whether to use different
      quantization ranges per output channel.
    init_min: a float scalar, the initial value for variable min.
    init_max: a float scalar, the initial value for variable max.
    ema_decay: EMA decay parameter.
    vars_collection: (Optional) collection where to store variables for
      quantization interval ends.
    name_prefix: name_prefix for created nodes.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    is_training: Whether the op is applied to a training or eval graph.
    num_bits: Number of bits to use for quantization, must be between 2 and 8.
    narrow_range: Whether to use the narrow quantization range
      [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
    symmetric: If true, use symmetric quantization limits instead of training
      the minimum and maximum of each quantization range separately.
  Returns:
    a tensor containing quantized values.
  """
    with variable_scope.variable_scope(None,
                                       default_name=name_prefix,
                                       values=[inputs],
                                       reuse=reuse) as scope:
        scope.set_partitioner(None)
        input_shape = inputs.get_shape()
        if per_channel:
            input_dim = len(input_shape)
            # Only support quantizing 1-, 2- and 4-dimensional tensors.
            assert input_dim in [1, 2, 4
                                 ], ('Expected 1D, 2D or 4D input, was: %s in '
                                     ' scope: %s' % (input_shape, name_prefix))
            min_max_shape = [input_shape[-1]]
        else:
            min_max_shape = []

        vars_collections = [vars_collection] if vars_collection else []
        min_var = _ModelVariable(
            'min',
            shape=min_max_shape,
            initializer=init_ops.constant_initializer(init_min),
            collections=vars_collections,
            trainable=False)
        max_var = _ModelVariable(
            'max',
            shape=min_max_shape,
            initializer=init_ops.constant_initializer(init_max),
            collections=vars_collections,
            trainable=False)
        if not is_training:
            return _FakeQuantWithMinMaxVars(inputs,
                                            min_var,
                                            max_var,
                                            per_channel=per_channel,
                                            num_bits=num_bits,
                                            narrow_range=narrow_range)
        if per_channel:
            if input_dim == 2:
                reduce_dims = [0]
            elif input_dim == 4:
                reduce_dims = [0, 1, 2]

        if per_channel:
            if input_dim >= 2:
                batch_min = math_ops.reduce_min(inputs,
                                                axis=reduce_dims,
                                                name='BatchMin')
            else:
                batch_min = inputs
        else:
            batch_min = math_ops.reduce_min(inputs, name='BatchMin')

        if per_channel:
            if input_dim >= 2:
                batch_max = math_ops.reduce_max(inputs,
                                                axis=reduce_dims,
                                                name='BatchMax')
            else:
                batch_max = inputs
        else:
            batch_max = math_ops.reduce_max(inputs, name='BatchMax')

        if symmetric:
            if narrow_range:
                min_max_ratio = -1
            else:
                # In two's complement notation, the negative range is slightly larger
                # than the positive range.
                min_max_ratio = -((1 << num_bits) - 2) / (1 << num_bits)

            # TFLite requires that 0.0 is always in the [min; max] range. Because
            # batch_min <= batch_max, it follows that range_min <= 0 <= range_max.
            range_min = math_ops.minimum(batch_min, batch_max / min_max_ratio)
            range_max = math_ops.maximum(batch_max, batch_min * min_max_ratio)
        else:
            # TFLite requires that 0.0 is always in the [min; max] range.
            range_min = math_ops.minimum(batch_min, 0.0)
            range_max = math_ops.maximum(batch_max, 0.0)

        assign_min = moving_averages.assign_moving_average(min_var,
                                                           range_min,
                                                           ema_decay,
                                                           name='AssignMinEma')
        assign_max = moving_averages.assign_moving_average(max_var,
                                                           range_max,
                                                           ema_decay,
                                                           name='AssignMaxEma')

        return _FakeQuantWithMinMaxVars(inputs,
                                        assign_min,
                                        assign_max,
                                        per_channel=per_channel,
                                        num_bits=num_bits,
                                        narrow_range=narrow_range)
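A hedged usage sketch; it assumes the module's private helpers (_ModelVariable, _FakeQuantWithMinMaxVars) are in scope, as they are where this function is defined. For reference, with symmetric=True and num_bits=8 the code above computes min_max_ratio = -((1 << 8) - 2) / (1 << 8) = -0.9921875:

acts = tf.random_normal([8, 16])
quantized = MovingAvgQuantize(acts, per_channel=False, ema_decay=0.999,
                              is_training=True, num_bits=8)
# At training time the returned tensor is fake-quantized against EMA-tracked min/max;
# at inference time (is_training=False) the stored min/max variables are used as-is.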
Example 32
def add_moving_summary(*args, **kwargs):
    """
    Enable moving average summary for some tensors.
    It's only effective in the main training tower, otherwise calling this
    function is a no-op.

    Args:
        args: tensors to summarize
        decay (float): the decay rate. Defaults to 0.95.
        collection (str or None): the name of the collection to add EMA-maintaining ops.
            The default will work together with the default
            :class:`MovingAverageSummary` callback.

    Returns:
        [tf.Tensor]: list of tensors returned by assign_moving_average,
            which can be used to maintain the EMA.
    """
    decay = kwargs.pop('decay', 0.95)
    coll = kwargs.pop('collection', MOVING_SUMMARY_OPS_KEY)
    assert len(kwargs) == 0, "Unknown arguments: " + str(kwargs)

    ctx = get_current_tower_context()
    # allow ctx to be none
    if ctx is not None and not ctx.is_main_training_tower:
        return

    if not isinstance(args[0], list):
        v = args
    else:
        log_deprecated(
            "Call add_moving_summary with positional args instead of a list!")
        v = args[0]
    for x in v:
        assert isinstance(x, tf.Tensor), x
        assert x.get_shape().ndims == 0, x.get_shape()
    G = tf.get_default_graph()
    # TODO variable not saved under distributed

    ema_ops = []
    for c in v:
        name = re.sub('tower[0-9]+/', '', c.op.name)
        with G.colocate_with(c), tf.name_scope(None):
            # assign_moving_average creates variables with op names, therefore clear ns first.
            with _enter_vs_reuse_ns('EMA') as vs:
                ema_var = tf.get_variable(
                    name,
                    shape=c.shape,
                    dtype=c.dtype,
                    initializer=tf.constant_initializer(),
                    trainable=False)
                ns = vs.original_name_scope
            with tf.name_scope(ns):  # reuse VS&NS so that EMA_1 won't appear
                ema_op = moving_averages.assign_moving_average(
                    ema_var,
                    c,
                    decay,
                    zero_debias=True,
                    name=name + '_EMA_apply')
            tf.summary.scalar(name + '-summary',
                              ema_op)  # write the EMA value as a summary
            ema_ops.append(ema_op)
    if coll is not None:
        for op in ema_ops:
            # TODO a new collection to summary every step?
            tf.add_to_collection(coll, op)
    return ema_ops
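A hedged usage sketch in the style of the library this helper appears to come from (tensorpack); it assumes the surrounding machinery (get_current_tower_context, the MOVING_SUMMARY_OPS_KEY collection, and the callback that runs the collected ops) is wired up as usual:

cost = tf.reduce_mean(tf.square(tf.random_normal([64])), name='total_cost')
ema_ops = add_moving_summary(cost, decay=0.95)
# Each returned op advances the EMA variable and the scalar summary records the EMA,
# so the collected ops should be run once per training step.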
Example 33
def official_batch_norm(inputs,
                        channels,
                        type=False,
                        decay=0.999,
                        center=True,
                        scale=False,
                        epsilon=0.001,
                        activation_fn=None,
                        updates_collections=ops.GraphKeys.UPDATE_OPS,
                        is_training=True,
                        reuse=None,
                        variables_collections=None,
                        outputs_collections=None,
                        trainable=True,
                        scope=None):
    """
        Args:
            inputs: a tensor of size `[batch_size, height, width, channels]`
                or `[batch_size, channels]`.
            type: False for non-convolutional batch norm, True for convolutional batch norm.
            decay: decay for the moving average.
            center: If True, subtract `beta`. If False, `beta` is ignored.
            scale: If True, multiply by `gamma`. If False, `gamma` is
                not used. When the next layer is linear (also e.g. `nn.relu`), this can be
                disabled since the scaling can be done by the next layer.
            epsilon: small float added to variance to avoid dividing by zero.
            activation_fn: Optional activation function.
            updates_collections: collections to collect the update ops for computation.
            is_training: whether or not the layer is in training mode.
            reuse: whether or not the layer and its variables should be reused.
            variables_collections: optional collections for the variables.
            outputs_collections: collections to add the outputs.
            trainable: If `True` also add variables to the graph collection
                `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
            scope: Optional scope for `variable_op_scope`.
        Returns:
            a tensor representing the output of the operation.
    """
    with variable_scope.variable_scope(scope,
                                       'BatchNorm', [inputs],
                                       reuse=reuse) as sc:
        dtype = inputs.dtype.base_dtype
        axis = [0, 1, 2] if type else [0]
        params_shape = [channels]
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        # param_initializers = {}
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            # beta_initializer = param_initializers.get('beta',init_ops.zeros_initializer)
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.zeros_initializer,
                collections=beta_collections,
                trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            # gamma_initializer = param_initializers.get('gamma',init_ops.ones_initializer())
            gamma = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=tf.ones_initializer(),
                                             collections=gamma_collections,
                                             trainable=trainable)
        # Create moving_mean and moving_variance variables and add them to the
        # appropriate collections.
        moving_mean_collections = utils.get_variable_collections(
            variables_collections, 'moving_mean')
        # moving_mean_initializer = param_initializers.get('moving_mean', init_ops.zeros_initializer)
        moving_mean = variables.model_variable(
            'moving_mean',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.zeros_initializer,
            trainable=False,
            collections=moving_mean_collections)
        moving_variance_collections = utils.get_variable_collections(
            variables_collections, 'moving_variance')
        # moving_variance_initializer = param_initializers.get('moving_variance', init_ops.ones_initializer())
        moving_variance = variables.model_variable(
            'moving_variance',
            shape=params_shape,
            dtype=dtype,
            initializer=tf.ones_initializer(),
            trainable=False,
            collections=moving_variance_collections)
        if is_training:
            # Calculate the moments based on the individual batch.
            mean, variance = nn.moments(inputs, axis, shift=moving_mean)
            # Update the moving_mean and moving_variance moments.
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay)
            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay)
            if updates_collections is None:
                # Make sure the updates are computed here.
                with ops.control_dependencies(
                    [update_moving_mean, update_moving_variance]):
                    outputs = nn.batch_normalization(inputs, mean, variance,
                                                     beta, gamma, epsilon)
            else:
                # Collect the updates to be computed later.
                ops.add_to_collections(updates_collections, update_moving_mean)
                ops.add_to_collections(updates_collections,
                                       update_moving_variance)
                outputs = nn.batch_normalization(inputs, mean, variance, beta,
                                                 gamma, epsilon)
        else:
            outputs = nn.batch_normalization(inputs, moving_mean,
                                             moving_variance, beta, gamma,
                                             epsilon)
            # TODO:shape
            # outputs.set_shape(inputs.get_shape())
        if activation_fn:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
Example 34
    def call(self, inputs, training=False):
        # First, compute the axes along which to reduce the mean / variance,
        # as well as the broadcast shape to be used for all parameters.
        input_shape = inputs.get_shape()
        ndim = len(input_shape)
        reduction_axes = list(range(len(input_shape)))
        del reduction_axes[self.axis]
        broadcast_shape = [1] * len(input_shape)
        broadcast_shape[self.axis] = input_shape[self.axis].value

        # Determines whether broadcasting is needed.
        needs_broadcasting = (sorted(reduction_axes) != range(ndim)[:-1])

        # Determine a boolean value for `training`: could be True, False, or None.
        training_value = utils.constant_value(training)

        if needs_broadcasting:
            # In this case we must explicitly broadcast all parameters.
            if self.center:
                broadcast_beta = array_ops.reshape(self.beta, broadcast_shape)
            else:
                broadcast_beta = None
            if self.scale:
                broadcast_gamma = array_ops.reshape(self.gamma,
                                                    broadcast_shape)
            else:
                broadcast_gamma = None

        if training_value is not False:
            # Use a copy of moving_mean as a shift to compute more reliable moments.
            shift = math_ops.add(self.moving_mean, 0)
            if needs_broadcasting:
                shift = array_ops.reshape(shift, broadcast_shape)
                broadcast_mean, broadcast_variance = nn.moments(inputs,
                                                                reduction_axes,
                                                                shift=shift,
                                                                keep_dims=True)
                mean = array_ops.reshape(broadcast_mean, [-1])
                variance = array_ops.reshape(broadcast_variance, [-1])
            else:
                mean, variance = nn.moments(inputs,
                                            reduction_axes,
                                            shift=shift)

            # Prepare updates if necessary.
            if not self.updates:
                mean_update = moving_averages.assign_moving_average(
                    self.moving_mean, mean, self.momentum, zero_debias=False)
                variance_update = moving_averages.assign_moving_average(
                    self.moving_variance,
                    variance,
                    self.momentum,
                    zero_debias=False)
                # In the future this should be refactored into a self.add_update
                # methods in order to allow for instance-based BN layer sharing
                # across unrelated input streams (e.g. like in Keras).
                self.updates.append(mean_update)
                self.updates.append(variance_update)

        # Normalize batch. We do this inside separate functions for training
        # and inference so as to avoid evaluating both branches.
        def normalize_in_test():
            if needs_broadcasting:
                broadcast_moving_mean = array_ops.reshape(
                    self.moving_mean, broadcast_shape)
                broadcast_moving_variance = array_ops.reshape(
                    self.moving_variance, broadcast_shape)
                return nn.batch_normalization(inputs, broadcast_moving_mean,
                                              broadcast_moving_variance,
                                              broadcast_beta, broadcast_gamma,
                                              self.epsilon)
            else:
                return nn.batch_normalization(
                    inputs, self.moving_mean, self.moving_variance,
                    self.beta if self.center else None,
                    self.gamma if self.scale else None, self.epsilon)

        def normalize_in_training():
            if needs_broadcasting:
                return nn.batch_normalization(inputs, broadcast_mean,
                                              broadcast_variance,
                                              broadcast_beta, broadcast_gamma,
                                              self.epsilon)
            else:
                return nn.batch_normalization(
                    inputs, mean, variance, self.beta if self.center else None,
                    self.gamma if self.scale else None, self.epsilon)

        return utils.smart_cond(training, normalize_in_training,
                                normalize_in_test)
Example 35
def moving_average_update(variable, value, momentum):
    return moving_averages.assign_moving_average(
        variable, value, momentum)
Example 36
 def update_fn(v, value):
   v.assign_add(value)
   moving_averages.assign_moving_average(var2, [2.0, 4.0], decay=0.25)
   moving_averages.assign_moving_average(
       var3, [2.0, 4.0], decay=0.25, zero_debias=False)
Example 37
def batch_norm(inputs,
               decay=0.999,
               center=True,
               scale=False,
               epsilon=0.001,
               moving_vars='moving_vars',
               activation_fn=None,
               is_training=True,
               data_format='NHWC',
               reuse=None,
               num_shards=None,
               distributed_group_size=1,
               scope=None):
  """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

    "Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift"

    Sergey Ioffe, Christian Szegedy

  Can be used as a normalizer function for conv2d and fully_connected.

  Note: When is_training is True the moving_mean and moving_variance need to be
  updated, by default the update_ops are placed in `tf.GraphKeys.UPDATE_OPS` so
  they need to be added as a dependency to the `train_op`, example:

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
      updates = tf.group(*update_ops)
      total_loss = control_flow_ops.with_dependencies([updates], total_loss)

  One can set updates_collections=None to force the updates in place, but that
  can have speed penalty, especially in distributed settings.

  Args:
    inputs: A tensor with 2 or more dimensions, where the first dimension has
      `batch_size`. The normalization is over all but the last dimension if
      `data_format` is `NHWC` and the second dimension if `data_format` is
      `NCHW`.
    decay: Decay for the moving average. Reasonable values for `decay` are close
      to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc.
      Lower `decay` value (recommend trying `decay`=0.9) if model experiences
      reasonably good training performance but poor validation and/or test
      performance.
    center: If True, add offset of `beta` to normalized tensor.  If False,
      `beta` is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is
      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
      disabled since the scaling can be done by the next layer.
    epsilon: Small float added to variance to avoid dividing by zero.
    moving_vars: Name of collection created for moving variables.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    is_training: Whether or not the layer is in training mode. In training mode
      it would accumulate the statistics of the moments into `moving_mean` and
      `moving_variance` using an exponential moving average with the given
      `decay`. When it is not in training mode then it would use the values of
      the `moving_mean` and the `moving_variance`.
    data_format: input data format. NHWC or NCHW
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    num_shards: Number of shards that participate in the global
      reduction. Default is set to None, that will skip the cross replica sum in
      and normalize across local examples only.
    distributed_group_size: Number of replicas to normalize across in the
      distributed batch normalization.
    scope: Optional scope for `variable_scope`.

  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If the rank of `inputs` is neither 2 nor 4.
    ValueError: If rank or `C` dimension of `inputs` is undefined.
  """
  trainable = True

  with tf.variable_scope(scope, 'BatchNorm', [inputs], reuse=reuse):
    inputs = tf.convert_to_tensor(inputs)
    original_shape = inputs.get_shape()
    original_rank = original_shape.ndims
    if original_rank is None:
      raise ValueError('Inputs %s has undefined rank' % inputs.name)
    elif original_rank not in [2, 4]:
      raise ValueError('Inputs %s has unsupported rank.'
                       ' Expected 2 or 4 but got %d' % (inputs.name,
                                                        original_rank))
    if original_rank == 2:
      channels = inputs.get_shape()[-1].value
      if channels is None:
        raise ValueError('`C` dimension must be known but is None')
      new_shape = [-1, 1, 1, channels]
      if data_format == 'NCHW':
        new_shape = [-1, channels, 1, 1]
      inputs = tf.reshape(inputs, new_shape)
    inputs_shape = inputs.get_shape()
    if data_format == 'NHWC':
      params_shape = inputs_shape[-1:]
    else:
      params_shape = inputs_shape[1:2]
    if not params_shape.is_fully_defined():
      raise ValueError('Inputs %s has undefined `C` dimension %s.' %
                       (inputs.name, params_shape))

    # Allocate parameters for the beta and gamma of the normalization.
    trainable_beta = trainable and center
    collections = [tf.GraphKeys.MODEL_VARIABLES, tf.GraphKeys.GLOBAL_VARIABLES]
    beta = tf.contrib.framework.variable(
        'beta',
        params_shape,
        collections=collections,
        initializer=tf.zeros_initializer(),
        trainable=trainable_beta)
    trainable_gamma = trainable and scale
    gamma = tf.contrib.framework.variable(
        'gamma',
        params_shape,
        collections=collections,
        initializer=tf.ones_initializer(),
        trainable=trainable_gamma)

    # Create moving_mean and moving_variance variables and add them to the
    # appropriate collections.
    moving_collections = [moving_vars,
                          tf.GraphKeys.MOVING_AVERAGE_VARIABLES,
                          tf.GraphKeys.MODEL_VARIABLES,
                          tf.GraphKeys.GLOBAL_VARIABLES]
    # Disable partition setting for moving_mean and moving_variance
    # as assign_moving_average op below doesn't support partitioned variable.
    scope = tf.get_variable_scope()
    partitioner = scope.partitioner
    scope.set_partitioner(None)
    moving_mean = tf.contrib.framework.variable(
        'moving_mean',
        params_shape,
        initializer=tf.zeros_initializer(),
        trainable=False,
        collections=moving_collections)
    moving_variance = tf.contrib.framework.variable(
        'moving_variance',
        params_shape,
        initializer=tf.ones_initializer(),
        trainable=False,
        collections=moving_collections)
    # Restore scope's partitioner setting.
    scope.set_partitioner(partitioner)

    # Add cross replica sum to do subset mean and variance calculation
    # First compute mean and variance
    if is_training:
      if distributed_group_size > 1:
        # Execute a distributed batch normalization
        if data_format == 'NCHW':
          axis = 1
        else:
          axis = 3
        input_shape = inputs.get_shape()
        inputs_dtype = inputs.dtype
        inputs = tf.cast(inputs, tf.float32)
        ndims = len(input_shape)
        reduction_axes = [i for i in range(ndims) if i != axis]
        counts, mean_ss, variance_ss, _ = tf.nn.sufficient_statistics(
            inputs, reduction_axes, keep_dims=False)
        mean_ss = cross_replica_average(mean_ss, num_shards,
                                        distributed_group_size)
        variance_ss = cross_replica_average(variance_ss, num_shards,
                                            distributed_group_size)
        mean, variance = tf.nn.normalize_moments(
            counts, mean_ss, variance_ss, shift=None)
        outputs = tf.nn.batch_normalization(inputs, mean, variance, beta, gamma,
                                            epsilon)
        outputs = tf.cast(outputs, inputs_dtype)
      else:
        outputs, mean, variance = tf.nn.fused_batch_norm(
            inputs, gamma, beta, epsilon=epsilon, data_format=data_format)
    else:
      outputs, mean, variance = tf.nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          mean=moving_mean,
          variance=moving_variance,
          epsilon=epsilon,
          is_training=False,
          data_format=data_format)

    if is_training:
      update_moving_mean = moving_averages.assign_moving_average(
          moving_mean,
          tf.cast(mean, moving_mean.dtype),
          decay,
          zero_debias=False)
      update_moving_variance = moving_averages.assign_moving_average(
          moving_variance,
          tf.cast(variance, moving_variance.dtype),
          decay,
          zero_debias=False)
      tf.add_to_collection('update_ops', update_moving_mean)
      tf.add_to_collection('update_ops', update_moving_variance)

    outputs.set_shape(inputs_shape)
    if original_shape.ndims == 2:
      outputs = tf.reshape(outputs, original_shape)
    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return outputs
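
The moving-average updates above are added to the 'update_ops' collection (the same string as tf.GraphKeys.UPDATE_OPS), so they must be run together with the train op, as the docstring's snippet suggests. A minimal end-to-end sketch, assuming TF 1.x graph mode with tf.contrib available and the batch_norm above in scope; the toy model itself is purely illustrative:

import tensorflow as tf  # TF 1.x graph mode, tf.contrib available

images = tf.placeholder(tf.float32, [None, 32, 32, 3])
labels = tf.placeholder(tf.int64, [None])

net = tf.layers.conv2d(images, 16, 3, padding='same')
net = batch_norm(net, is_training=True, scope='bn1')   # registers its update ops
logits = tf.layers.dense(tf.reduce_mean(net, axis=[1, 2]), 10)

loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))

# Run the moving_mean / moving_variance updates with every training step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)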
Esempio n. 38
0
def moving_average_update(variable, value, momentum):
  try:
    return moving_averages.assign_moving_average(
        variable, value, momentum, zero_debias=False)
  except TypeError:
    return moving_averages.assign_moving_average(variable, value, momentum)
Esempio n. 39
0
    def __init__(
            self,
            prev_layer,
            n_units=100,
            act=None,
            decay=0.9,
            epsilon=1e-5,
            is_train=False,
            bitW=8,
            bitA=8,
            gamma_init=tf.compat.v1.initializers.ones,
            beta_init=tf.compat.v1.initializers.zeros,
            use_gemm=False,
            W_init=tf.compat.v1.initializers.truncated_normal(stddev=0.05),
            W_init_args=None,
            name=None,  #'quan_dense_with_bn',
    ):
        super(QuanDenseLayerWithBN, self).__init__(prev_layer=prev_layer,
                                                   act=act,
                                                   W_init_args=W_init_args,
                                                   name=name)

        logging.info(
            "QuanDenseLayerWithBN  %s: %d %s" %
            (self.name, n_units,
             self.act.__name__ if self.act is not None else 'No Activation'))

        if self.inputs.get_shape().ndims != 2:
            raise Exception(
                "The input dimension must be rank 2, please reshape or flatten it"
            )

        if use_gemm:
            raise Exception(
                "TODO. The current version use tf.matmul for inferencing.")

        n_in = int(self.inputs.get_shape()[-1])
        x = self.inputs
        self.inputs = quantize_active_overflow(self.inputs, bitA)
        self.n_units = n_units

        with tf.compat.v1.variable_scope(name):

            W = tf.compat.v1.get_variable(name='W',
                                          shape=(n_in, n_units),
                                          initializer=W_init,
                                          dtype=LayersConfig.tf_dtype,
                                          **self.W_init_args)

            mid_out = tf.matmul(x, W)

            para_bn_shape = mid_out.get_shape()[-1:]

            if gamma_init:
                scale_para = tf.compat.v1.get_variable(
                    name='scale_para',
                    shape=para_bn_shape,
                    initializer=gamma_init,
                    dtype=LayersConfig.tf_dtype,
                    trainable=is_train)
            else:
                scale_para = None

            if beta_init:
                offset_para = tf.compat.v1.get_variable(
                    name='offset_para',
                    shape=para_bn_shape,
                    initializer=beta_init,
                    dtype=LayersConfig.tf_dtype,
                    trainable=is_train)
            else:
                offset_para = None

            moving_mean = tf.compat.v1.get_variable(
                'moving_mean',
                para_bn_shape,
                initializer=tf.compat.v1.initializers.constant(1.),
                dtype=LayersConfig.tf_dtype,
                trainable=False)

            moving_variance = tf.compat.v1.get_variable(
                'moving_variance',
                para_bn_shape,
                initializer=tf.compat.v1.initializers.constant(1.),
                dtype=LayersConfig.tf_dtype,
                trainable=False,
            )

            mean, variance = tf.nn.moments(
                x=mid_out, axes=list(range(len(mid_out.get_shape()) - 1)))

            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay,
                zero_debias=False)  # if zero_debias=True, has bias

            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay,
                zero_debias=False)  # if zero_debias=True, has bias

            def mean_var_with_update():
                with tf.control_dependencies(
                    [update_moving_mean, update_moving_variance]):
                    return tf.identity(mean), tf.identity(variance)

            if is_train:
                mean, var = mean_var_with_update()
            else:
                mean, var = moving_mean, moving_variance

            w_fold = _w_fold(W, scale_para, var, epsilon)
            bias_fold = _bias_fold(offset_para, scale_para, mean, var, epsilon)

            W = quantize_weight_overflow(w_fold, bitW)
            # W = tl.act.sign(W)    # dont update ...

            # W = tf.Variable(W)

            self.outputs = tf.matmul(self.inputs, W)
            # self.outputs = xnor_gemm(self.inputs, W) # TODO

            self.outputs = tf.nn.bias_add(self.outputs,
                                          bias_fold,
                                          name='bias_add')

            self.outputs = self._apply_activation(self.outputs)

        self._add_layers(self.outputs)

        self._add_params(
            [W, scale_para, offset_para, moving_mean, moving_variance])
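
The _w_fold and _bias_fold helpers are not included in this excerpt. A minimal sketch of the standard batch-norm folding they are expected to perform, with the bodies below being an assumption inferred from the call sites rather than the library's own definition:

import tensorflow as tf  # TF 1.x style, matching the layer above

def _w_fold(w, gamma, var, epsilon):
    # Fold the BN scale into the weights: W' = W * gamma / sqrt(var + eps).
    # gamma/var have shape [n_units], so they broadcast over the last axis of W.
    return w * gamma / tf.sqrt(var + epsilon)

def _bias_fold(beta, gamma, mean, var, epsilon):
    # Fold the BN shift into a bias: b' = beta - gamma * mean / sqrt(var + eps).
    return beta - gamma * mean / tf.sqrt(var + epsilon)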
Esempio n. 40
0
def batch_norm2d(inputs,
                 is_training=True,
                 eps=1e-05,
                 decay=0.9,
                 affine=True,
                 force_update=False,
                 name=None):
    """
    Do channel-wise batch normalization
    :param inputs: input tensor, channels-last (normalized over all but the last dim)
    :param is_training: bool indicating training or inference mode
    :param eps: small constant added to the variance for numerical stability
    :param decay: momentum factor for the moving averages
    :param affine: whether to apply the learnable scale (gamma) and offset (beta)
    :param force_update: unused in the body shown; kept for API compatibility
    :param name: var_scope & operation name
    :return: batch_norm output
    """
    with tf.variable_scope(name, default_name='BatchNorm2d'):
        params_shape = tensor_shape(inputs)[-1:]
        moving_mean = tf.get_variable('mean',
                                      params_shape,
                                      initializer=tf.zeros_initializer,
                                      trainable=False)
        moving_variance = tf.get_variable('variance',
                                          params_shape,
                                          initializer=tf.ones_initializer,
                                          trainable=False)

        # mean_var_with_update is deprecated!
        # tf.nn.moments computes the (biased) sample variance,
        # whereas tf.nn.fused_batch_norm reports the unbiased variance estimator.
        # The difference between the two is a factor of n/(n-1).
        # def mean_var_with_update():
        #     # update moving_moments
        #     axes = list(np.arange(len(inputs.get_shape()) - 1))
        #     mean, variance = tf.nn.moments(inputs, axes, name='moments')
        #     with tf.control_dependencies([assign_moving_average(moving_mean, mean, decay, zero_debias=False),
        #                                   assign_moving_average(moving_variance, variance, decay, zero_debias=False)]):
        #         # https://stackoverflow.com/questions/34877523/in-tensorflow-what-is-tf-identity-used-for
        #         return tf.identity(mean), tf.identity(variance)

        if affine:
            beta = tf.get_variable('beta',
                                   params_shape,
                                   initializer=tf.zeros_initializer,
                                   collections=BN_COLLECTIONS)
            gamma = tf.get_variable('gamma',
                                    params_shape,
                                    initializer=tf.ones_initializer,
                                    collections=BN_COLLECTIONS)
        else:
            gamma = tf.constant(value=np.ones(params_shape, dtype=np.float32))
            beta = tf.constant(value=np.zeros(params_shape, dtype=np.float32))

        def training_mode():
            outputs, batch_mean, batch_var = tf.nn.fused_batch_norm(
                inputs, gamma, beta, epsilon=eps)
            return outputs, batch_mean, batch_var

        def inference_mode():
            outputs, batch_mean, batch_var = tf.nn.fused_batch_norm(
                inputs,
                gamma,
                beta,
                moving_mean,
                moving_variance,
                epsilon=eps,
                is_training=False)
            return outputs, batch_mean, batch_var

        outputs, batch_mean, batch_var = tf.cond(tf.constant(is_training),
                                                 training_mode, inference_mode)

        if is_training:
            tf.add_to_collection(
                tf.GraphKeys.UPDATE_OPS,
                assign_moving_average(moving_mean,
                                      batch_mean,
                                      decay,
                                      zero_debias=False))
            tf.add_to_collection(
                tf.GraphKeys.UPDATE_OPS,
                assign_moving_average(moving_variance,
                                      batch_var,
                                      decay,
                                      zero_debias=False))

        return outputs
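
The commented-out block explains why the moments-based update was dropped: tf.nn.moments returns the biased sample variance, while tf.nn.fused_batch_norm reports a Bessel-corrected variance for the moving-average update. A quick numeric sanity check of that n/(n-1) factor, assuming the TF 1.x behaviour described in the comment above:

import numpy as np
import tensorflow as tf  # TF 1.x graph mode

x = tf.constant(np.random.randn(4, 2, 2, 3).astype(np.float32))
gamma, beta = tf.ones([3]), tf.zeros([3])

_, _, fused_var = tf.nn.fused_batch_norm(x, gamma, beta, epsilon=1e-5)
_, moments_var = tf.nn.moments(x, axes=[0, 1, 2])

with tf.Session() as sess:
    v_fused, v_moments = sess.run([fused_var, moments_var])
    n = 4 * 2 * 2  # elements reduced per channel
    print(v_fused / v_moments)  # expected to be close to n / (n - 1)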
Esempio n. 41
0
    def _batch_norm(self, name, x):
        """Batch normalization."""
        with tf.variable_scope(name):
            params_shape = [x.get_shape()[-1]]

            beta = tf.get_variable('beta',
                                   params_shape,
                                   tf.float32,
                                   initializer=tf.constant_initializer(
                                       0.0, tf.float32))
            gamma = tf.get_variable('gamma',
                                    params_shape,
                                    tf.float32,
                                    initializer=tf.constant_initializer(
                                        1.0, tf.float32))

            if self.mode == 'train':
                # Batch statistics: mean/variance over the N, H, W axes per channel.
                mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')

                moving_mean = tf.get_variable(
                    'moving_mean',
                    params_shape,
                    tf.float32,
                    initializer=tf.constant_initializer(0.0, tf.float32),
                    trainable=False)
                moving_variance = tf.get_variable(
                    'moving_variance',
                    params_shape,
                    tf.float32,
                    initializer=tf.constant_initializer(1.0, tf.float32),
                    trainable=False)

                self._extra_train_ops.append(
                    moving_averages.assign_moving_average(
                        moving_mean, mean, 0.9))
                self._extra_train_ops.append(
                    moving_averages.assign_moving_average(
                        moving_variance, variance, 0.9))

            # Eval mode: use the stored moving statistics instead of batch statistics.
            else:
                mean = tf.get_variable('moving_mean',
                                       params_shape,
                                       tf.float32,
                                       initializer=tf.constant_initializer(
                                           0.0, tf.float32),
                                       trainable=False)
                variance = tf.get_variable('moving_variance',
                                           params_shape,
                                           tf.float32,
                                           initializer=tf.constant_initializer(
                                               1.0, tf.float32),
                                           trainable=False)
                tf.summary.histogram(mean.op.name, mean)
                tf.summary.histogram(variance.op.name, variance)
            # epsilon used to be 1e-5. Maybe 0.001 solves the NaN problem in deeper nets.
            y = tf.nn.batch_normalization(x, mean, variance, beta, gamma,
                                          0.001)
            y.set_shape(x.get_shape())
            return y
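
The update ops created here are only appended to self._extra_train_ops; they still have to be run alongside the weight update. A minimal sketch of the usual wiring in the ResNet-style model this method comes from; the optimizer choice and helper name are assumptions:

import tensorflow as tf  # TF 1.x graph mode

def build_train_op(model, loss, learning_rate=0.1):
    # Group the moving-average updates collected by _batch_norm with the
    # gradient-application op so both run on every training step.
    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
    apply_op = optimizer.minimize(
        loss, global_step=tf.train.get_or_create_global_step())
    return tf.group(*([apply_op] + model._extra_train_ops))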
Esempio n. 42
0
    def __init__(
        self,
        prev_layer,
        n_filter=32,
        filter_size=(3, 3),
        strides=(1, 1),
        padding='SAME',
        act=None,
        decay=0.9,
        epsilon=1e-5,
        is_train=False,
        gamma_init=tf.compat.v1.initializers.ones,
        beta_init=tf.compat.v1.initializers.zeros,
        bitW=8,
        bitA=8,
        use_gemm=False,
        W_init=tf.compat.v1.initializers.truncated_normal(stddev=0.02),
        W_init_args=None,
        use_cudnn_on_gpu=None,
        data_format=None,
        name='quan_cnn2d_bn',
    ):
        super(QuanConv2dWithBN, self).__init__(prev_layer=prev_layer,
                                               act=act,
                                               W_init_args=W_init_args,
                                               name=name)

        logging.info(
            "QuanConv2dWithBN %s: n_filter: %d filter_size: %s strides: %s pad: %s act: %s "
            % (self.name, n_filter, filter_size, str(strides), padding,
               self.act.__name__ if self.act is not None else 'No Activation'))

        x = self.inputs
        self.inputs = quantize_active_overflow(self.inputs,
                                               bitA)  # Do not remove

        if use_gemm:
            raise Exception(
                "TODO. The current version use tf.matmul for inferencing.")

        if len(strides) != 2:
            raise ValueError("len(strides) should be 2.")

        try:
            pre_channel = int(prev_layer.outputs.get_shape()[-1])
        except Exception:  # if pre_channel is ?, it happens when using Spatial Transformer Net
            pre_channel = 1
            logging.warning("[warnings] unknow input channels, set to 1")

        shape = (filter_size[0], filter_size[1], pre_channel, n_filter)
        strides = (1, strides[0], strides[1], 1)

        with tf.compat.v1.variable_scope(name):
            W = tf.compat.v1.get_variable(name='W_conv2d',
                                          shape=shape,
                                          initializer=W_init,
                                          dtype=LayersConfig.tf_dtype,
                                          **self.W_init_args)

            conv = tf.nn.conv2d(x,
                                W,
                                strides=strides,
                                padding=padding,
                                use_cudnn_on_gpu=use_cudnn_on_gpu,
                                data_format=data_format)

            para_bn_shape = conv.get_shape()[-1:]

            if gamma_init:
                scale_para = tf.compat.v1.get_variable(
                    name='scale_para',
                    shape=para_bn_shape,
                    initializer=gamma_init,
                    dtype=LayersConfig.tf_dtype,
                    trainable=is_train)
            else:
                scale_para = None

            if beta_init:
                offset_para = tf.compat.v1.get_variable(
                    name='offset_para',
                    shape=para_bn_shape,
                    initializer=beta_init,
                    dtype=LayersConfig.tf_dtype,
                    trainable=is_train)
            else:
                offset_para = None

            moving_mean = tf.compat.v1.get_variable(
                'moving_mean',
                para_bn_shape,
                initializer=tf.compat.v1.initializers.constant(1.),
                dtype=LayersConfig.tf_dtype,
                trainable=False)

            moving_variance = tf.compat.v1.get_variable(
                'moving_variance',
                para_bn_shape,
                initializer=tf.compat.v1.initializers.constant(1.),
                dtype=LayersConfig.tf_dtype,
                trainable=False,
            )

            mean, variance = tf.nn.moments(
                x=conv, axes=list(range(len(conv.get_shape()) - 1)))

            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay,
                zero_debias=False)  # if zero_debias=True, has bias

            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay,
                zero_debias=False)  # if zero_debias=True, has bias

            def mean_var_with_update():
                with tf.control_dependencies(
                    [update_moving_mean, update_moving_variance]):
                    return tf.identity(mean), tf.identity(variance)

            if is_train:
                mean, var = mean_var_with_update()
            else:
                mean, var = moving_mean, moving_variance

            w_fold = _w_fold(W, scale_para, var, epsilon)
            bias_fold = _bias_fold(offset_para, scale_para, mean, var, epsilon)

            W = quantize_weight_overflow(w_fold, bitW)

            conv_fold = tf.nn.conv2d(self.inputs,
                                     W,
                                     strides=strides,
                                     padding=padding,
                                     use_cudnn_on_gpu=use_cudnn_on_gpu,
                                     data_format=data_format)

            self.outputs = tf.nn.bias_add(conv_fold,
                                          bias_fold,
                                          name='bn_bias_add')

            self.outputs = self._apply_activation(self.outputs)

        self._add_layers(self.outputs)

        self._add_params(
            [W, scale_para, offset_para, moving_mean, moving_variance])
Esempio n. 43
0
    def relu(self, inputs, init_x=None):
        """Construct a relu/relu_x layer on top of cnn."""
        if ((not self.params.use_relu_x)
                or self.params.last_act_name in tf.get_variable_scope().name):
            return tf.nn.relu(inputs)

        if self.params.relu_x_per_channel:
            if self.params.data_format == 'NCHW':
                inputs = tf.transpose(inputs, [0, 2, 3, 1])
            shape = [inputs.get_shape()[3]]
            reduce_dim = [0, 1, 2]
        else:
            shape = []
            reduce_dim = None

        if init_x is None:
            init_x = self.params.init_relu_x
        with tf.variable_scope('relu_x'):
            act = inputs
            trainable_x = self.params.relu_x_update == 'gradient_descent'
            x = tf.get_variable('x',
                                shape,
                                tf.float32,
                                initializer=tf.constant_initializer(init_x),
                                trainable=trainable_x)
            if self.params.relu_x_update == 'moving_average':
                act = tf.maximum(tf.minimum(inputs, init_x), 0)
                batch_max = tf.reduce_max(act,
                                          axis=reduce_dim,
                                          name='BatchMax')
                x = moving_averages.assign_moving_average(x,
                                                          tf.cast(
                                                              batch_max,
                                                              tf.float32),
                                                          0.999,
                                                          zero_debias=False,
                                                          name='MovingAvgX')
            x = tf.cast(x, self.dtype)
            if self.params.relu_x_per_channel:
                act = tf.maximum(tf.minimum(act, tf.reshape(x, [1, 1, 1, -1])),
                                 0)
            else:
                act = tf.maximum(tf.minimum(act, x), 0)
            if self.params.quant_act:
                print('Quantizing activation %s' % act.name)
                if self.params.relu_x_per_channel:
                    zeros = tf.constant(0, dtype=tf.float32, shape=shape)
                else:
                    zeros = 0
                act = self.delayed_quant(
                    act,
                    zeros,
                    x,
                    per_channel=self.params.relu_x_per_channel,
                    num_bits=self.params.quant_act_bits,
                    narrow_range=False,
                    quant_delay=self.params.quant_act_delay)
            if self.params.relu_x_per_channel and self.params.data_format == 'NCHW':
                act = tf.transpose(act, [0, 3, 1, 2])

        return act
Esempio n. 44
0
def batch_norm(inputs,
               decay=0.999,
               center=True,
               scale=False,
               epsilon=0.001,
               moving_vars='moving_vars',
               activation=None,
               is_training=True,
               trainable=True,
               restore=True,
               scope=None,
               reuse=None):
  """Adds a Batch Normalization layer.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels]
            or [batch_size, channels].
    decay: decay for the moving average.
    center: If True, add offset beta to the normalized tensor. If False, beta is not created and ignored.
    scale: If True, multiply by gamma. If False, gamma is
      not used. When the next layer is linear (also e.g. ReLU), this can be
      disabled since the scaling can be done by the next layer.
    epsilon: small float added to variance to avoid dividing by zero.
    moving_vars: collection to store the moving_mean and moving_variance.
    activation: activation function.
    is_training: whether or not the model is in training mode.
    trainable: whether or not the variables should be trainable or not.
    restore: whether or not the variables should be marked for restore.
    scope: Optional scope for variable_scope.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.

  Returns:
    a tensor representing the output of the operation.

  """
  inputs_shape = inputs.get_shape()
  with tf.variable_scope(scope, 'BatchNorm', [inputs], reuse=reuse):
    axis = list(range(len(inputs_shape) - 1))
    params_shape = inputs_shape[-1:]
    # Allocate parameters for the beta and gamma of the normalization.
    beta, gamma = None, None
    if center:
      beta = variables.variable('beta',
                                params_shape,
                                initializer=tf.zeros_initializer(),
                                trainable=trainable,
                                restore=restore)
    if scale:
      gamma = variables.variable('gamma',
                                 params_shape,
                                 initializer=tf.ones_initializer(),
                                 trainable=trainable,
                                 restore=restore)
    # Create moving_mean and moving_variance and add them to the
    # GraphKeys.MOVING_AVERAGE_VARIABLES collections.
    moving_collections = [moving_vars, tf.GraphKeys.MOVING_AVERAGE_VARIABLES]
    moving_mean = variables.variable('moving_mean',
                                     params_shape,
                                     initializer=tf.zeros_initializer(),
                                     trainable=False,
                                     restore=restore,
                                     collections=moving_collections)
    moving_variance = variables.variable('moving_variance',
                                         params_shape,
                                         initializer=tf.ones_initializer(),
                                         trainable=False,
                                         restore=restore,
                                         collections=moving_collections)
    if is_training:
      # Calculate the moments based on the individual batch.
      mean, variance = tf.nn.moments(inputs, axis)

      update_moving_mean = moving_averages.assign_moving_average(
          moving_mean, mean, decay)
      tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_mean)
      update_moving_variance = moving_averages.assign_moving_average(
          moving_variance, variance, decay)
      tf.add_to_collection(UPDATE_OPS_COLLECTION, update_moving_variance)
    else:
      # Just use the moving_mean and moving_variance.
      mean = moving_mean
      variance = moving_variance
    # Normalize the activations.
    outputs = tf.nn.batch_normalization(
        inputs, mean, variance, beta, gamma, epsilon)
    outputs.set_shape(inputs.get_shape())
    if activation:
      outputs = activation(outputs)
    return outputs
Esempio n. 45
0
def batch_norm(inputs,
               decay=0.999,
               center=True,
               scale=False,
               epsilon=0.001,
               updates_collections=ops.GraphKeys.UPDATE_OPS,
               is_training=True,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               scope=None):
    """Code modification of tensorflow/contrib/layers/python/layers/layers.py
  """
    with variable_scope.variable_op_scope([inputs],
                                          scope,
                                          'BatchNorm',
                                          reuse=reuse) as sc:
        inputs = ops.convert_to_tensor(inputs)
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        axis = list(range(inputs_rank - 1))
        params_shape = inputs_shape[-1:]
        if not params_shape.is_fully_defined():
            raise ValueError('Inputs %s has undefined last dimension %s.' %
                             (inputs.name, params_shape))
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.zeros_initializer,
                collections=beta_collections,
                trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.ones_initializer,
                collections=gamma_collections,
                trainable=trainable)
        # Create moving_mean and moving_variance variables and add them to the
        # appropriate collections.
        moving_mean_collections = utils.get_variable_collections(
            variables_collections, 'moving_mean')
        moving_mean = variables.model_variable(
            'moving_mean',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.zeros_initializer,
            trainable=False,
            collections=moving_mean_collections)
        moving_variance_collections = utils.get_variable_collections(
            variables_collections, 'moving_variance')
        moving_variance = variables.model_variable(
            'moving_variance',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.ones_initializer,
            trainable=False,
            collections=moving_variance_collections)

        # Calculate the moments based on the individual batch.
        mean, variance = nn.moments(inputs, axis, shift=moving_mean)
        # Update the moving_mean and moving_variance moments.
        update_moving_mean = moving_averages.assign_moving_average(
            moving_mean, mean, decay)
        update_moving_variance = moving_averages.assign_moving_average(
            moving_variance, variance, decay)
        if updates_collections is None:
            # Make sure the updates are computed here.
            with ops.control_dependencies(
                [update_moving_mean, update_moving_variance]):
                outputs = nn.batch_normalization(inputs, mean, variance, beta,
                                                 gamma, epsilon)
        else:
            # Collect the updates to be computed later.
            ops.add_to_collections(updates_collections, update_moving_mean)
            ops.add_to_collections(updates_collections, update_moving_variance)
            outputs = nn.batch_normalization(inputs, mean, variance, beta,
                                             gamma, epsilon)

        test_outputs = nn.batch_normalization(inputs, moving_mean,
                                              moving_variance, beta, gamma,
                                              epsilon)

        outputs = tf.cond(is_training, lambda: outputs, lambda: test_outputs)
        outputs.set_shape(inputs_shape)

        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
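
Unlike the previous variant, which branches on a Python bool, this one selects between batch and moving statistics with tf.cond, so is_training must be a boolean Tensor that can be fed at run time. A minimal usage sketch, assuming the batch_norm above and its module imports are in scope; the placeholder names are illustrative:

import tensorflow as tf  # TF 1.x graph mode

is_training = tf.placeholder(tf.bool, shape=[], name='is_training')
x = tf.placeholder(tf.float32, [None, 28, 28, 16])

y = batch_norm(x, is_training=is_training, scope='bn')

# The moving-average updates land in tf.GraphKeys.UPDATE_OPS by default;
# run them with the train op and feed is_training=True only on training steps.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)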
Esempio n. 46
0
    def __init__(
        self,
        layer=None,
        decay=0.9,
        epsilon=2e-5,
        act=tf.identity,
        is_train=False,
        fix_gamma=True,
        beta_init=tf.zeros_initializer,
        gamma_init=tf.random_normal_initializer(
            mean=1.0, stddev=0.002),  # tf.ones_initializer,
        # dtype = tf.float32,
        trainable=None,
        name='batchnorm_layer',
    ):
        #Layer.__init__(self, name=name)
        super(Layer, self).__init__()
        self.name = name
        self.inputs = layer.outputs
        print(
            "  [TL] BatchNormLayer %s: decay:%f epsilon:%f act:%s is_train:%s"
            % (self.name, decay, epsilon, act.__name__, is_train))
        x_shape = self.inputs.get_shape()
        params_shape = x_shape[-1:]

        from tensorflow.python.training import moving_averages
        from tensorflow.python.ops import control_flow_ops

        with tf.variable_scope(name) as vs:
            axis = list(range(len(x_shape) - 1))

            ## 1. beta, gamma
            if tf.__version__ > '0.12.1' and beta_init == tf.zeros_initializer:
                beta_init = beta_init()
            beta = tf.get_variable('beta',
                                   shape=params_shape,
                                   initializer=beta_init,
                                   dtype=tf.float32,
                                   trainable=is_train)  #, restore=restore)

            gamma = tf.get_variable(
                'gamma',
                shape=params_shape,
                initializer=gamma_init,
                dtype=tf.float32,
                trainable=fix_gamma,
            )  #restore=restore)

            ## 2.
            if tf.__version__ > '0.12.1':
                moving_mean_init = tf.zeros_initializer()
            else:
                moving_mean_init = tf.zeros_initializer
            moving_mean = tf.get_variable(
                'moving_mean',
                params_shape,
                initializer=moving_mean_init,
                dtype=tf.float32,
                trainable=False)  #   restore=restore)
            moving_variance = tf.get_variable(
                'moving_variance',
                params_shape,
                initializer=tf.constant_initializer(1.),
                dtype=tf.float32,
                trainable=False,
            )  #   restore=restore)

            ## 3.
            # These ops will only be performed when training.
            mean, variance = tf.nn.moments(self.inputs, axis)
            try:  # TF12
                update_moving_mean = moving_averages.assign_moving_average(
                    moving_mean, mean, decay,
                    zero_debias=False)  # if zero_debias=True, has bias
                update_moving_variance = moving_averages.assign_moving_average(
                    moving_variance, variance, decay,
                    zero_debias=False)  # if zero_debias=True, has bias
                # print("TF12 moving")
            except Exception as e:  # TF11
                update_moving_mean = moving_averages.assign_moving_average(
                    moving_mean, mean, decay)
                update_moving_variance = moving_averages.assign_moving_average(
                    moving_variance, variance, decay)
                # print("TF11 moving")

            def mean_var_with_update():
                with tf.control_dependencies(
                    [update_moving_mean, update_moving_variance]):
                    return tf.identity(mean), tf.identity(variance)

            if trainable:
                mean, var = mean_var_with_update()
                print(mean)
                print(var)
                self.outputs = act(
                    tf.nn.batch_normalization(self.inputs, mean, var, beta,
                                              gamma, epsilon))
            else:
                self.outputs = act(
                    tf.nn.batch_normalization(self.inputs, moving_mean,
                                              moving_variance, beta, gamma,
                                              epsilon))
            variables = [beta, gamma, moving_mean, moving_variance]
        self.all_layers = list(layer.all_layers)
        self.all_params = list(layer.all_params)
        self.all_drop = dict(layer.all_drop)
        self.all_layers.extend([self.outputs])
        self.all_params.extend(variables)
Esempio n. 47
0
 def _do_update(var, value):
     return moving_averages.assign_moving_average(var,
                                                  value,
                                                  self.momentum,
                                                  zero_debias=False)
Esempio n. 48
0
    def __init__(
            self,
            prev_layer,
            decay=0.9,
            epsilon=0.00001,
            act=None,
            is_train=False,
            beta_init=tf.zeros_initializer,
            gamma_init=tf.random_normal_initializer(mean=1.0, stddev=0.002),
            moving_mean_init=tf.zeros_initializer(),
            name='batchnorm_layer',
    ):
        super(BatchNormLayer, self).__init__(prev_layer=prev_layer, act=act, name=name)

        logging.info(
            "BatchNormLayer %s: decay: %f epsilon: %f act: %s is_train: %s" %
            (self.name, decay, epsilon, self.act.__name__ if self.act is not None else 'No Activation', is_train)
        )

        x_shape = self.inputs.get_shape()
        params_shape = x_shape[-1:]

        with tf.variable_scope(name):
            axis = list(range(len(x_shape) - 1))
            # 1. beta, gamma
            variables = []

            if beta_init:

                if beta_init == tf.zeros_initializer:
                    beta_init = beta_init()

                beta = tf.get_variable(
                    'beta', shape=params_shape, initializer=beta_init, dtype=LayersConfig.tf_dtype, trainable=is_train
                )

                variables.append(beta)

            else:
                beta = None

            if gamma_init:
                gamma = tf.get_variable(
                    'gamma',
                    shape=params_shape,
                    initializer=gamma_init,
                    dtype=LayersConfig.tf_dtype,
                    trainable=is_train,
                )
                variables.append(gamma)
            else:
                gamma = None

            # 2.

            moving_mean = tf.get_variable(
                'moving_mean', params_shape, initializer=moving_mean_init, dtype=LayersConfig.tf_dtype, trainable=False
            )

            moving_variance = tf.get_variable(
                'moving_variance',
                params_shape,
                initializer=tf.constant_initializer(1.),
                dtype=LayersConfig.tf_dtype,
                trainable=False,
            )

            # 3.
            # These ops will only be performed when training.
            mean, variance = tf.nn.moments(self.inputs, axis)

            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay, zero_debias=False
            )  # if zero_debias=True, has bias

            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay, zero_debias=False
            )  # if zero_debias=True, has bias

            def mean_var_with_update():
                with tf.control_dependencies([update_moving_mean, update_moving_variance]):
                    return tf.identity(mean), tf.identity(variance)

            if is_train:
                mean, var = mean_var_with_update()
            else:
                mean, var = moving_mean, moving_variance

            self.outputs = self._apply_activation(
                tf.nn.batch_normalization(self.inputs, mean, var, beta, gamma, epsilon)
            )

            variables.extend([moving_mean, moving_variance])

        self._add_layers(self.outputs)
        self._add_params(variables)
Esempio n. 49
0
    def batch_norm_layer(self,
                         x,
                         scope,
                         is_training,
                         epsilon=0.001,
                         decay=0.99,
                         reuse=None):
        """
        Performs a batch normalization layer

        Args:
            x: input tensor
            scope: scope name
            is_training: python boolean value
            epsilon: the variance epsilon - a small float number to avoid dividing by 0
            decay: the moving average decay
            reuse: whether to reuse an existing variable scope

        Returns:
            The ops of a batch normalization layer
        """
        with tf.variable_scope(scope, reuse=reuse):
            shape = x.get_shape().as_list()
            # gamma: a trainable scale factor
            gamma = tf.get_variable("gamma",
                                    shape[-1:],
                                    initializer=tf.constant_initializer(1.0),
                                    trainable=True)
            # beta: a trainable shift value
            beta = tf.get_variable("beta",
                                   1,
                                   initializer=tf.constant_initializer(0.0),
                                   trainable=True)
            moving_avg = tf.get_variable(
                "moving_avg",
                shape[-1:],
                initializer=tf.constant_initializer(0.0),
                trainable=False)
            moving_var = tf.get_variable(
                "moving_var",
                shape[-1:],
                initializer=tf.constant_initializer(1.0),
                trainable=False)
            if is_training:
                # tf.nn.moments computes the mean and variance of the tensor x
                avg, var = tf.nn.moments(x, list(range(len(shape) - 1)))
                avg = tf.cast(avg, tf.float32)
                var = tf.cast(var, tf.float32)
                update_moving_avg = moving_averages.assign_moving_average(
                    moving_avg, avg, decay)
                update_moving_var = moving_averages.assign_moving_average(
                    moving_var, var, decay)
                control_inputs = [update_moving_avg, update_moving_var]
            else:
                avg = moving_avg
                var = moving_var
                control_inputs = []
            with tf.control_dependencies(control_inputs):
                output = tf.nn.batch_normalization(x,
                                                   avg,
                                                   var,
                                                   offset=beta,
                                                   scale=gamma,
                                                   variance_epsilon=epsilon)

        return output
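
Because the update ops are attached as control dependencies of the normalized output itself, fetching the output during training already refreshes the moving statistics; no separate UPDATE_OPS plumbing is needed. A minimal usage sketch, where the enclosing object (here called net) is an assumption:

import tensorflow as tf  # TF 1.x graph mode

x = tf.placeholder(tf.float32, [None, 64, 64, 32])

# Training graph: batch statistics are used and the moving averages are
# updated as a side effect of computing the output.
train_out = net.batch_norm_layer(x, scope='bn1', is_training=True)

# Evaluation graph: reuse the same variables and the stored moving statistics.
eval_out = net.batch_norm_layer(x, scope='bn1', is_training=False, reuse=True)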
Esempio n. 50
0
def weighted_resample(inputs,
                      weights,
                      overall_rate,
                      scope=None,
                      mean_decay=0.999,
                      warmup=10,
                      seed=None):
    """Performs an approximate weighted resampling of `inputs`.

  This method chooses elements from `inputs` where each item's rate of
  selection is proportional to its value in `weights`, and the average
  rate of selection across all inputs (and many invocations!) is
  `overall_rate`.

  Args:
    inputs: A list of tensors whose first dimension is `batch_size`.
    weights: A `[batch_size]`-shaped tensor with each batch member's weight.
    overall_rate: Desired overall rate of resampling.
    scope: Scope to use for the op.
    mean_decay: How quickly to decay the running estimate of the mean weight.
    warmup: Until the resulting tensor has been evaluated `warmup`
      times, the resampling method uses the true mean over all calls
      as its weight estimate, rather than a decayed mean.
    seed: Random seed.

  Returns:
    A list of tensors exactly like `inputs`, but with an unknown (and
      possibly zero) first dimension.
    A tensor containing the effective resampling rate used for each output.

  """
    # Algorithm: Just compute rates as weights/mean_weight *
    # overall_rate. This way the average weight corresponds to the
    # overall rate, and a weight twice the average has twice the rate,
    # etc.
    with ops.name_scope(scope, 'weighted_resample', inputs) as opscope:
        # First: Maintain a running estimated mean weight, with decay
        # adjusted (by also maintaining an invocation count) during the
        # warmup period so that at the beginning, there aren't too many
        # zeros mixed in, throwing the average off.

        with variable_scope.variable_scope(scope, 'estimate_mean', inputs):
            count_so_far = variable_scope.get_local_variable('resample_count',
                                                             initializer=0)

            estimated_mean = variable_scope.get_local_variable(
                'estimated_mean', initializer=0.0)

            count = count_so_far.assign_add(1)
            real_decay = math_ops.minimum(
                math_ops.truediv((count - 1), math_ops.minimum(count, warmup)),
                mean_decay)

            batch_mean = math_ops.reduce_mean(weights)
            mean = moving_averages.assign_moving_average(
                estimated_mean, batch_mean, real_decay)

        # Then, normalize the weights into rates using the mean weight and
        # overall target rate:
        rates = weights * overall_rate / mean

        results = resample_at_rate([rates] + inputs,
                                   rates,
                                   scope=opscope,
                                   seed=seed,
                                   back_prop=False)

        return (results[1:], results[0])
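
To make the rate computation in the comment concrete, here is a small NumPy worked example (the numbers are illustrative): each weight is divided by the running mean weight and scaled by overall_rate, so the average selection rate matches overall_rate and relative weights translate directly into relative sampling rates.

import numpy as np

weights = np.array([1.0, 2.0, 3.0, 2.0])
overall_rate = 0.5
mean_weight = weights.mean()             # 2.0
rates = weights / mean_weight * overall_rate
print(rates)                             # [0.25 0.5  0.75 0.5 ]
print(rates.mean())                      # 0.5 == overall_rate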
Esempio n. 51
0
    def _build(self, inputs, is_training):
        """Connects the module to some inputs.

    Args:
      inputs: Tensor, final dimension must be equal to embedding_dim. All other
        leading dimensions will be flattened and treated as a large batch.
      is_training: boolean, whether this connection is to training data. When
        this is set to False, the internal moving average statistics will not be
        updated.

    Returns:
      dict containing the following keys and values:
        quantize: Tensor containing the quantized version of the input.
        loss: Tensor containing the loss to optimize.
        perplexity: Tensor containing the perplexity of the encodings.
        encodings: Tensor containing the discrete encodings, ie which element
          of the quantized space each input element was mapped to.
        encoding_indices: Tensor containing the discrete encoding indices, ie
          which element of the quantized space each input element was mapped to.
    """
        # Ensure that the weights are read fresh for each timestep, which otherwise
        # would not be guaranteed in an RNN setup. Note that this relies on inputs
        # having a data dependency with the output of the previous timestep - if
        # this is not the case, there is no way to serialize the order of weight
        # updates within the module, so explicit external dependencies must be used.
        with tf.control_dependencies([inputs]):
            w = self._w.read_value()
        input_shape = tf.shape(inputs)
        with tf.control_dependencies([
                tf.Assert(tf.equal(input_shape[-1], self._embedding_dim),
                          [input_shape])
        ]):
            flat_inputs = tf.reshape(inputs, [-1, self._embedding_dim])

        distances = (tf.reduce_sum(flat_inputs**2, 1, keepdims=True) -
                     2 * tf.matmul(flat_inputs, w) +
                     tf.reduce_sum(w**2, 0, keepdims=True))

        encoding_indices = tf.argmax(-distances, 1)
        encodings = tf.one_hot(encoding_indices, self._num_embeddings)
        encoding_indices = tf.reshape(encoding_indices, tf.shape(inputs)[:-1])
        quantized = self.quantize(encoding_indices)
        e_latent_loss = tf.reduce_mean(
            (tf.stop_gradient(quantized) - inputs)**2)

        if is_training:
            updated_ema_cluster_size = moving_averages.assign_moving_average(
                self._ema_cluster_size, tf.reduce_sum(encodings, 0),
                self._decay)
            dw = tf.matmul(flat_inputs, encodings, transpose_a=True)
            updated_ema_w = moving_averages.assign_moving_average(
                self._ema_w, dw, self._decay)
            n = tf.reduce_sum(updated_ema_cluster_size)
            updated_ema_cluster_size = (
                (updated_ema_cluster_size + self._epsilon) /
                (n + self._num_embeddings * self._epsilon) * n)

            normalised_updated_ema_w = (
                updated_ema_w / tf.reshape(updated_ema_cluster_size, [1, -1]))
            with tf.control_dependencies([e_latent_loss]):
                update_w = tf.assign(self._w, normalised_updated_ema_w)
                with tf.control_dependencies([update_w]):
                    loss = self._commitment_cost * e_latent_loss

        else:
            loss = self._commitment_cost * e_latent_loss
        quantized = inputs + tf.stop_gradient(quantized - inputs)
        avg_probs = tf.reduce_mean(encodings, 0)
        perplexity = tf.exp(-tf.reduce_sum(avg_probs *
                                           tf.log(avg_probs + 1e-10)))

        return {
            'quantize': quantized,
            'loss': loss,
            'perplexity': perplexity,
            'encodings': encodings,
            'encoding_indices': encoding_indices,
        }
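
The EMA codebook update above can be written out explicitly. A NumPy sketch of the same arithmetic, where decay, epsilon and the random inputs are illustrative and the nearest-code assignment is a stand-in for the argmin over distances computed in the TF code:

import numpy as np

decay, epsilon = 0.99, 1e-5
K, D = 4, 2                                   # num_embeddings, embedding_dim
ema_cluster_size = np.ones(K)
ema_w = np.random.randn(D, K)                 # embeddings live in the columns of w

flat_inputs = np.random.randn(16, D)          # flattened batch
nearest = np.random.randint(0, K, size=16)    # stand-in for the argmin over distances
encodings = np.eye(K)[nearest]                # one-hot, shape [16, K]

# EMA updates, matching assign_moving_average(var, value, decay):
ema_cluster_size = decay * ema_cluster_size + (1 - decay) * encodings.sum(0)
dw = flat_inputs.T @ encodings                # [D, K], sum of inputs assigned to each code
ema_w = decay * ema_w + (1 - decay) * dw

# Laplace-smooth the cluster sizes, then normalise to obtain the new codebook.
n = ema_cluster_size.sum()
smoothed = (ema_cluster_size + epsilon) / (n + K * epsilon) * n
w = ema_w / smoothed[None, :]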
Esempio n. 52
0
    def layer_op(self, inputs, is_training, use_local_stats=False):
        input_shape = inputs.shape

        # operates on all dims except the last dim
        params_shape = input_shape[-1:]
        axes = list(range(input_shape.ndims - 1))

        # create trainable variables and moving average variables
        beta = tf.get_variable('beta',
                               shape=params_shape,
                               initializer=self.initializers['beta'],
                               regularizer=self.regularizers['beta'],
                               dtype=tf.float32,
                               trainable=True)
        gamma = tf.get_variable('gamma',
                                shape=params_shape,
                                initializer=self.initializers['gamma'],
                                regularizer=self.regularizers['gamma'],
                                dtype=tf.float32,
                                trainable=True)

        collections = [tf.GraphKeys.GLOBAL_VARIABLES]
        moving_mean = tf.get_variable(
            'moving_mean',
            shape=params_shape,
            initializer=self.initializers['moving_mean'],
            dtype=tf.float32,
            trainable=False,
            collections=collections)
        moving_variance = tf.get_variable(
            'moving_variance',
            shape=params_shape,
            initializer=self.initializers['moving_variance'],
            dtype=tf.float32,
            trainable=False,
            collections=collections)

        # mean and var
        mean, variance = tf.nn.moments(inputs, axes)
        update_moving_mean = moving_averages.assign_moving_average(
            moving_mean, mean, self.moving_decay).op
        update_moving_variance = moving_averages.assign_moving_average(
            moving_variance, variance, self.moving_decay).op
        tf.add_to_collection(BN_COLLECTION, update_moving_mean)
        tf.add_to_collection(BN_COLLECTION, update_moving_variance)

        # call the normalisation function
        if is_training or use_local_stats:
            # with tf.control_dependencies(
            #         [update_moving_mean, update_moving_variance]):
            outputs = tf.nn.batch_normalization(inputs,
                                                mean,
                                                variance,
                                                beta,
                                                gamma,
                                                self.eps,
                                                name='batch_norm')
        else:
            outputs = tf.nn.batch_normalization(inputs,
                                                moving_mean,
                                                moving_variance,
                                                beta,
                                                gamma,
                                                self.eps,
                                                name='batch_norm')
        outputs.set_shape(inputs.get_shape())
        return outputs
Esempio n. 53
0
    def build_candidate(self,
                        ensemble_spec,
                        training,
                        iteration_step,
                        summary,
                        is_previous_best=False,
                        track_moving_average=True):
        """Builds and returns an AdaNet candidate.

    Args:
      ensemble_spec: `_EnsembleSpec` instance to track.
      training: A python boolean indicating whether the graph is in training
        mode or prediction mode.
      iteration_step: Integer `Tensor` representing the step since the beginning
        of the current iteration, as opposed to the global step.
      summary: A `Summary` for recording summaries for TensorBoard.
      is_previous_best: Bool identifying whether this ensemble came from a
        previous iteration. If `True`, `is_training` will be `False` since its
        weights are frozen.
      track_moving_average: Bool whether to track the moving average of the
        ensemble's adanet loss.

    Returns:
      A _Candidate instance.
    """

        candidate_scope = "candidate_{}".format(ensemble_spec.name)

        with tf_compat.v1.variable_scope(candidate_scope):
            adanet_loss = ensemble_spec.adanet_loss
            if track_moving_average:
                adanet_loss = tf_compat.v1.get_variable("adanet_loss",
                                                        initializer=0.,
                                                        trainable=False)

            if is_previous_best:
                # This candidate is frozen, so it is already done training.
                is_training = tf.constant(False, name="is_training")
            elif self._max_steps is not None:
                # Train this candidate for `max_steps` steps.
                # NOTE: During training, the iteration step gets incremented at the very
                # end of the computation graph, so we need to account for that here.
                is_training = tf.less(iteration_step + 1 if training else 0,
                                      self._max_steps,
                                      name="is_training")
            else:
                # Train this candidate forever.
                is_training = tf.constant(True, name="is_training")

            if training and track_moving_average:
                update_adanet_loss_op = moving_averages.assign_moving_average(
                    adanet_loss,
                    ensemble_spec.adanet_loss,
                    decay=self._adanet_loss_decay)
                with tf.control_dependencies([update_adanet_loss_op]):
                    adanet_loss = adanet_loss.read_value()

                with summary.current_scope():
                    summary.scalar(
                        "adanet_loss/adanet/adanet_weighted_ensemble",
                        adanet_loss)
            return _Candidate(ensemble_spec=ensemble_spec,
                              adanet_loss=adanet_loss,
                              is_training=is_training,
                              is_previous_best=is_previous_best)
Esempio n. 54
0
# A closure from inside a batch-norm layer: `x`, `pop_mean`, `pop_var` and `decay`
# are assumed to be defined in the enclosing scope, and `assign_moving_average`
# comes from tensorflow.python.training.moving_averages.
def mean_var_with_update():
    axes = list(range(len(x.get_shape()) - 1))
    batch_mean, batch_var = tf.nn.moments(x, axes=axes, name='moments')
    # Update the population statistics, then return the batch statistics.
    with tf.control_dependencies([assign_moving_average(pop_mean, batch_mean, decay),
                                  assign_moving_average(pop_var, batch_var, decay)]):
        return tf.identity(batch_mean), tf.identity(batch_var)
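
A closure like this is usually paired with tf.cond so the population statistics are
only updated in training mode. A minimal sketch, assuming `is_training` is a boolean
tensor and `beta`, `gamma` and `eps` are defined alongside `pop_mean`/`pop_var`:

# Batch statistics (plus EMA updates) while training, frozen population stats otherwise.
mean, var = tf.cond(is_training,
                    mean_var_with_update,
                    lambda: (pop_mean, pop_var))
y = tf.nn.batch_normalization(x, mean, var, beta, gamma, eps)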
Esempio n. 55
0
def add_moving_summary(*args, **kwargs):
    """
    Summarize the moving average for scalar tensors.
    This function is a no-op if not calling from main training tower.

    Args:
        args: scalar tensors to summarize
        decay (float): the decay rate. Defaults to 0.95.
        collection (str or None): the name of the collection to add EMA-maintaining ops.
            The default will work together with the default
            :class:`MovingAverageSummary` callback.
        summary_collections ([str]): the names of collections to add the
            summary op. Default is TF's default (`tf.GraphKeys.SUMMARIES`).

    Returns:
        [tf.Tensor]: list of tensors returned by assign_moving_average,
            which can be used to maintain the EMA.
    """
    decay = kwargs.pop('decay', 0.95)
    coll = kwargs.pop('collection', MOVING_SUMMARY_OPS_KEY)
    summ_coll = kwargs.pop('summary_collections', None)
    assert len(kwargs) == 0, "Unknown arguments: " + str(kwargs)

    ctx = get_current_tower_context()
    # allow ctx to be none
    if ctx is not None and not ctx.is_main_training_tower:
        return []
    if tf.get_variable_scope().reuse is True:
        logger.warn(
            "add_moving_summary() called under reuse=True scope, ignored.")
        return []

    for x in args:
        assert isinstance(x, (tf.Tensor, tf.Variable)), x
        assert x.get_shape().ndims == 0, \
            "add_moving_summary() only accepts scalar tensor! Got one with {}".format(x.get_shape())
    # TODO variable not saved under distributed

    ema_ops = []
    for c in args:
        name = re.sub('tower[0-9]+/', '', c.op.name)
        with tf.name_scope(None):
            if not c.dtype.is_floating:
                c = tf.cast(c, tf.float32)
            # assign_moving_average creates variables with op names, therefore clear ns first.
            with _enter_vs_reuse_ns('EMA') as vs:
                ema_var = tf.get_variable(
                    name,
                    shape=c.shape,
                    dtype=c.dtype,
                    initializer=tf.constant_initializer(),
                    trainable=False)
                ns = vs.original_name_scope
            with tf.name_scope(ns):  # reuse VS&NS so that EMA_1 won't appear
                ema_op = moving_averages.assign_moving_average(
                    ema_var,
                    c,
                    decay,
                    zero_debias=True,
                    name=name + '_EMA_apply')
            ema_ops.append(ema_op)
        with tf.name_scope(None):
            tf.summary.scalar(
                name + '-summary', ema_op,
                collections=summ_coll)  # write the EMA value as a summary
    if coll is not None:
        for op in ema_ops:
            tf.add_to_collection(coll, op)
    return ema_ops
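
A hedged usage sketch, assuming this helper is called from tensorpack graph-building
code where scalar tensors such as a total cost and an accuracy already exist
(`per_example_loss` and `correct` are illustrative names):

cost = tf.reduce_mean(per_example_loss, name='total_cost')   # assumed scalar tensor
accuracy = tf.reduce_mean(correct, name='accuracy')          # assumed scalar tensor
add_moving_summary(cost, accuracy, decay=0.95)

The returned EMA ops land in the MOVING_SUMMARY_OPS_KEY collection, and (per the
docstring) the default MovingAverageSummary callback runs them during training.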
Esempio n. 56
0
    def _quantizable_concat(self,
                            inputs,
                            axis,
                            is_training,
                            is_quantized=True,
                            default_min=0,
                            default_max=6,
                            ema_decay=0.999,
                            scope='quantized_concat'):
        """Concat replacement with quantization option.

    Allows concat inputs to share the same min/max ranges; adapted from
    experimental/gazelle/synthetic/model/tpu/utils.py.

    Args:
      inputs: list of tensors to concatenate.
      axis: dimension along which to concatenate.
      is_training: true if the graph is a training graph.
      is_quantized: flag to enable/disable quantization.
      default_min: default min value for fake quant op.
      default_max: default max value for fake quant op.
      ema_decay: the moving average decay for the quantization variables.
      scope: Optional scope for variable_scope.

    Returns:
      Tensor resulting from concatenation of input tensors
    """
        if is_quantized:
            with tf.variable_scope(scope):
                min_var = self._quant_var('min', default_min)
                max_var = self._quant_var('max', default_max)
                if not is_training:
                    # If we are building an eval graph just use the values in the
                    # variables.
                    quant_inputs = [
                        tf.fake_quant_with_min_max_vars(t, min_var, max_var)
                        for t in inputs
                    ]
                else:
                    concat_tensors = tf.concat(inputs, axis=axis)
                    tf.logging.info(
                        'concat_tensors: {}'.format(concat_tensors))
                    # TFLite requires that 0.0 is always in the [min; max] range.
                    range_min = tf.minimum(tf.reduce_min(concat_tensors),
                                           0.0,
                                           name='SafeQuantRangeMin')
                    range_max = tf.maximum(tf.reduce_max(concat_tensors),
                                           0.0,
                                           name='SafeQuantRangeMax')
                    # Otherwise, track the moving averages of the (clamped) min
                    # and max of the concatenated input tensor.
                    min_val = moving_averages.assign_moving_average(
                        min_var, range_min, ema_decay, name='AssignMinEma')
                    max_val = moving_averages.assign_moving_average(
                        max_var, range_max, ema_decay, name='AssignMaxEma')
                    quant_inputs = [
                        tf.fake_quant_with_min_max_vars(t, min_val, max_val)
                        for t in inputs
                    ]
                outputs = tf.concat(quant_inputs, axis=axis)
        else:
            outputs = tf.concat(inputs, axis=axis)
        return outputs
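
The same EMA-tracked min/max pattern can be applied to a single tensor. A standalone
sketch under the same assumptions (TF1 graph mode, `moving_averages` imported from
tensorflow.python.training); the function name and defaults are chosen here for
illustration:

def ema_fake_quant(t, is_training, decay=0.999):
    # Non-trainable range variables, initialised to a conservative [0, 6] range.
    min_var = tf.get_variable('min', initializer=0.0, trainable=False)
    max_var = tf.get_variable('max', initializer=6.0, trainable=False)
    if not is_training:
        # Eval graph: quantize with the stored range.
        return tf.fake_quant_with_min_max_vars(t, min_var, max_var)
    # Keep 0.0 inside the range, as TFLite requires.
    range_min = tf.minimum(tf.reduce_min(t), 0.0)
    range_max = tf.maximum(tf.reduce_max(t), 0.0)
    min_val = moving_averages.assign_moving_average(min_var, range_min, decay)
    max_val = moving_averages.assign_moving_average(max_var, range_max, decay)
    return tf.fake_quant_with_min_max_vars(t, min_val, max_val)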
Esempio n. 57
0
def batch_norm(inputs,
               decay=0.999,
               center=True,
               scale=False,
               epsilon=0.001,
               activation_fn=None,
               updates_collections=ops.GraphKeys.UPDATE_OPS,
               is_training=True,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               scope=None):
    """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

    "Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift"

    Sergey Ioffe, Christian Szegedy

  Can be used as a normalizer function for conv2d and fully_connected.

  Args:
    inputs: a tensor of size `[batch_size, height, width, channels]`
            or `[batch_size, channels]`.
    decay: decay for the moving average.
    center: If True, subtract `beta`. If False, `beta` is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is
      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
      disabled since the scaling can be done by the next layer.
    epsilon: small float added to variance to avoid dividing by zero.
    activation_fn: Optional activation function.
    updates_collections: collections to collect the update ops for computation.
      If None, a control dependency would be added to make sure the updates are
      computed.
    is_training: whether or not the layer is in training mode. In training mode
      it would accumulate the statistics of the moments into `moving_mean` and
      `moving_variance` using an exponential moving average with the given
      `decay`. When it is not in training mode then it would use the values of
      the `moving_mean` and the `moving_variance`.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: optional collections for the variables.
    outputs_collections: collections to add the outputs.
    scope: Optional scope for `variable_op_scope`.

  Returns:
    a tensor representing the output of the operation.

  """
    with variable_scope.variable_op_scope([inputs],
                                          scope,
                                          'BatchNorm',
                                          reuse=reuse) as sc:
        inputs_shape = inputs.get_shape()
        dtype = inputs.dtype.base_dtype
        axis = list(range(len(inputs_shape) - 1))
        params_shape = inputs_shape[-1:]
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta = variables.model_variable(
                'beta',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.zeros_initializer,
                collections=beta_collections)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma = variables.model_variable(
                'gamma',
                shape=params_shape,
                dtype=dtype,
                initializer=init_ops.ones_initializer,
                collections=gamma_collections)
        # Create moving_mean and moving_variance variables and add them to the
        # appropriate collections.
        moving_mean_collections = utils.get_variable_collections(
            variables_collections, 'moving_mean')
        moving_mean = variables.model_variable(
            'moving_mean',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.zeros_initializer,
            trainable=False,
            collections=moving_mean_collections)
        moving_variance_collections = utils.get_variable_collections(
            variables_collections, 'moving_variance')
        moving_variance = variables.model_variable(
            'moving_variance',
            shape=params_shape,
            dtype=dtype,
            initializer=init_ops.ones_initializer,
            trainable=False,
            collections=moving_variance_collections)
        if is_training:
            # Calculate the moments based on the individual batch.
            mean, variance = nn.moments(inputs, axis, shift=moving_mean)
            # Update the moving_mean and moving_variance moments.
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay)
            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay)
            if updates_collections is None:
                # Make sure the updates are computed here.
                with ops.control_dependencies(
                    [update_moving_mean, update_moving_variance]):
                    outputs = nn.batch_normalization(inputs, mean, variance,
                                                     beta, gamma, epsilon)
            else:
                # Collect the updates to be computed later.
                ops.add_to_collections(updates_collections, update_moving_mean)
                ops.add_to_collections(updates_collections,
                                       update_moving_variance)
                outputs = nn.batch_normalization(inputs, mean, variance, beta,
                                                 gamma, epsilon)
        else:
            outputs = nn.batch_normalization(inputs, moving_mean,
                                             moving_variance, beta, gamma,
                                             epsilon)
        outputs.set_shape(inputs.get_shape())
        if activation_fn:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections, sc.name,
                                           outputs)
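
Illustrative use as a normalizer function for tf.contrib.layers.conv2d, followed by
the usual step of running the ops collected in UPDATE_OPS together with the training
op (`images` and `loss` are assumed to exist; the optimizer is an arbitrary choice):

net = tf.contrib.layers.conv2d(images, 64, [3, 3],
                               normalizer_fn=batch_norm,
                               normalizer_params={'is_training': True, 'decay': 0.99})
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)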
Esempio n. 58
0
    def myBatchNorm(self, x):
        epsilon = 0.001
        decay = 0.9

        with tf.variable_scope('batchNorm'):
            paramsShape = x.get_shape().dims[self.channelIndex]
            gamma = tf.get_variable('gamma',
                                    paramsShape,
                                    tf.float32,
                                    initializer=tf.constant_initializer(
                                        1.0, tf.float32))
            beta = tf.get_variable('beta',
                                   paramsShape,
                                   tf.float32,
                                   initializer=tf.constant_initializer(
                                       0.0, tf.float32))

            if self.isTraining:
                [y, mean, variance
                 ] = tf.nn.fused_batch_norm(x,
                                            gamma,
                                            beta,
                                            data_format=self.dataFormat,
                                            epsilon=epsilon)

                movingMean = tf.get_variable(
                    'movingMean',
                    paramsShape,
                    tf.float32,
                    initializer=tf.constant_initializer(0.0, tf.float32),
                    trainable=False)
                movingVariance = tf.get_variable(
                    'movingVariance',
                    paramsShape,
                    tf.float32,
                    initializer=tf.constant_initializer(1.0, tf.float32),
                    trainable=False)
                # L2 decay is tuned for numerical stability of the weights; with BN a
                # similar consideration may apply to these moving statistics.
                tf.summary.histogram('movingMean', movingMean)
                tf.summary.histogram('movingVariance', movingVariance)

                self.extraTrainOps.append(
                    moving_averages.assign_moving_average(
                        movingMean, mean, decay))
                self.extraTrainOps.append(
                    moving_averages.assign_moving_average(
                        movingVariance, variance, decay))
            else:
                mean = tf.get_variable('movingMean',
                                       paramsShape,
                                       tf.float32,
                                       initializer=tf.constant_initializer(
                                           0.0, tf.float32),
                                       trainable=False)
                variance = tf.get_variable('movingVariance',
                                           paramsShape,
                                           tf.float32,
                                           initializer=tf.constant_initializer(
                                               1.0, tf.float32),
                                           trainable=False)

                [y, _, _] = tf.nn.fused_batch_norm(x,
                                                   gamma,
                                                   beta,
                                                   mean=mean,
                                                   variance=variance,
                                                   epsilon=epsilon,
                                                   data_format=self.dataFormat,
                                                   is_training=self.isTraining)
        return y
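
The EMA update ops appended to self.extraTrainOps are typically grouped with the
optimizer step so they run once per training iteration. A minimal sketch, with the
optimizer, loss and train-op attribute names assumed:

apply_op = optimizer.minimize(self.loss)                 # optimizer / loss assumed
self.trainOp = tf.group(apply_op, *self.extraTrainOps)   # run EMA updates alongside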
Esempio n. 59
0
    def discrete_bottleneck(self, x):
        """Discretization bottleneck for latent variables.
    Args:
        x: Input to the discretization bottleneck.
    Returns:
        Embedding to pass to the decoder, discrete latent, loss, and the
        embedding
        function.
    Raises:
        ValueError: If projection_tensors is None for reshape_method
        project, or
        ema_count or ema_means is None if we are using ema, or unknown
        args.
    """
        x_reshaped = self.slice_hidden(x)
        x_means_hot = []
        x_means = 0
        loss = 0
        x_means_hot, x_means, q_loss, e_loss = self.embedding_lookup(
            x_reshaped, self.means)

        if self.hparams.ema:
            tf.logging.info("Using EMA with beta = {}".format(
                self.hparams.beta))
            updated_ema_count = \
                moving_averages.assign_moving_average(
                    self.ema_count,
                    tf.reduce_sum(
                        tf.reshape(
                            x_means_hot,
                            shape=[-1, self.hparams.num_blocks,
                                   self.hparams.block_v_size]),
                        axis=0),
                    self.hparams.decay,
                    zero_debias=False)

            dw = tf.matmul(tf.transpose(x_means_hot, perm=[1, 2, 0]),
                           tf.transpose(x_reshaped, perm=[1, 0, 2]))

            updated_ema_means = \
                moving_averages.assign_moving_average(
                    self.ema_means, dw, self.hparams.decay,
                    zero_debias=False)
            n = tf.reduce_sum(updated_ema_count, axis=-1, keep_dims=True)
            updated_ema_count = (
                (updated_ema_count + self.hparams.epsilon) /
                (n + 2**self.hparams.z_size * self.hparams.epsilon) * n)
            updated_ema_means = updated_ema_means / tf.expand_dims(
                updated_ema_count, axis=-1)

            with tf.control_dependencies([e_loss]):
                update_means = tf.assign(self.means, updated_ema_means)
                with tf.control_dependencies([update_means]):
                    loss += self.hparams.beta * e_loss
        else:
            # Use a gradient based loss for learning the cluster centers
            loss += q_loss + self.hparams.beta * e_loss

        # Get the discrete latent representation
        x_means_idx = tf.argmax(x_means_hot, axis=-1)

        # Get the binary representation
        num_bits = int(self.hparams.z_size // self.hparams.num_blocks)
        x_means_bits = self.int_to_bit(x_means_idx, num_bits=num_bits, base=2)
        x_discrete = self.bit_to_int(tf.to_int32(x_means_bits),
                                     num_bits=self.hparams.z_size,
                                     base=2)

        # Reshape x_discrete
        shape_x = shape_list(x)
        shape_discrete = shape_x[:-1]
        x_discrete = tf.reshape(x_discrete, shape_discrete)
        x_means = tf.reshape(x_means, shape=shape_x)
        h1 = x + tf.stop_gradient(x_means - x)

        h2 = tf.layers.dense(tf.nn.relu(h1),
                             self.hparams.filter_size,
                             name="vch2")
        res = tf.layers.dense(tf.nn.relu(h2),
                              self.hparams.hidden_size,
                              name="vcfin")
        embed_fn = partial(self.embed)
        return {
            "dense": res,
            "discrete": x_discrete,
            "loss": loss,
            "embed": embed_fn
        }
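
A hedged summary of the EMA branch above, per block, with $\gamma$ = hparams.decay,
$\epsilon$ = hparams.epsilon and $K = 2^{z\_size}$ the codebook size used in the
smoothing term:

$$c_j \leftarrow \gamma\, c_j + (1-\gamma)\sum_i \mathbf{1}[z_i = j], \qquad
m_j \leftarrow \gamma\, m_j + (1-\gamma)\sum_{i:\,z_i=j} x_i$$
$$\hat c_j = \frac{c_j + \epsilon}{\sum_k c_k + K\epsilon}\,\sum_k c_k, \qquad
e_j = \frac{m_j}{\hat c_j}$$

Here $z_i$ is the code assigned to input vector $x_i$, and $e_j$ overwrites
self.means once e_loss has been evaluated.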
Esempio n. 60
0
  def call(self, inputs, training=False):
    # First, compute the axes along which to reduce the mean / variance,
    # as well as the broadcast shape to be used for all parameters.
    input_shape = inputs.get_shape()
    ndim = len(input_shape)
    reduction_axes = list(range(len(input_shape)))
    del reduction_axes[self.axis]
    broadcast_shape = [1] * len(input_shape)
    broadcast_shape[self.axis] = input_shape[self.axis].value

    # Determines whether broadcasting is needed.
    needs_broadcasting = (sorted(reduction_axes) != list(range(ndim))[:-1])

    scale, offset = self.gamma, self.beta

    # Determine a boolean value for `training`: could be True, False, or None.
    training_value = utils.constant_value(training)
    if training_value is not False:
      # Some of the computations here are not necessary when training==False
      # but not a constant. However, this makes the code simpler.
      mean, variance = nn.moments(inputs, reduction_axes)
      mean = _smart_select(training,
                           lambda: mean,
                           lambda: self.moving_mean)
      variance = _smart_select(training,
                               lambda: variance,
                               lambda: self.moving_variance)

      if self.renorm:
        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
            mean, variance, training)
        # When training, the normalized values (say, x) will be transformed as
        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
        # = x * (r * gamma) + (d * gamma + beta) with renorm.
        scale = array_ops.stop_gradient(r, name='renorm_r')
        offset = array_ops.stop_gradient(d, name='renorm_d')
        if self.gamma is not None:
          scale *= self.gamma
          offset *= self.gamma
        if self.beta is not None:
          offset += self.beta
      else:
        new_mean, new_variance = mean, variance

      # Update moving averages when training, and prevent updates otherwise.
      decay = _smart_select(training, lambda: self.momentum, lambda: 1.)
      mean_update = moving_averages.assign_moving_average(
          self.moving_mean, new_mean, decay, zero_debias=False)
      variance_update = moving_averages.assign_moving_average(
          self.moving_variance, new_variance, decay, zero_debias=False)

      if not self.updates:
        # In the future this should be refactored into a self.add_update
        # methods in order to allow for instance-based BN layer sharing
        # across unrelated input streams (e.g. like in Keras).
        self.updates.append(mean_update)
        self.updates.append(variance_update)

    else:
      mean, variance = self.moving_mean, self.moving_variance

    def _broadcast(v):
      if needs_broadcasting and v is not None:
        # In this case we must explicitly broadcast all parameters.
        return array_ops.reshape(v, broadcast_shape)
      return v

    return nn.batch_normalization(inputs,
                                  _broadcast(mean),
                                  _broadcast(variance),
                                  _broadcast(offset),
                                  _broadcast(scale),
                                  self.epsilon)
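
For reference, the decay trick above relies on _smart_select constant-folding a plain
Python boolean and falling back to a graph conditional otherwise. A hedged sketch of
such a helper (not the library's exact implementation):

def _smart_select_sketch(pred, fn_then, fn_else):
    # Constant-fold when `pred` is a Python bool; otherwise build a tf.cond.
    if isinstance(pred, bool):
        return fn_then() if pred else fn_else()
    return tf.cond(pred, fn_then, fn_else)

With decay selected as 1.0 when not training, assign_moving_average leaves
moving_mean and moving_variance unchanged, which is how updates are suppressed
outside of training.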