Exemple #1
0
def max_pool_2d_nxn_regions(inputs, pool_dimension, mode):
    """Pool `inputs` over a `pool_dimension` x `pool_dimension` grid of regions.

    Axes 1 and 2 of `inputs` are each split into `pool_dimension` bins. Bin
    `i` along an axis of (dynamic) size `s` spans
    `[floor(i / n * s), ceil((i + 1) / n * s))`, so adjacent bins can share
    an element when `s` is not a multiple of `n`.

    Args:
      inputs: The tensor over which to pool. Must have rank 4.
      pool_dimension: The dimension level, i.e. the number of bins per
        spatial axis over which spatial pooling is performed.
      mode: Pooling mode 'max' or 'avg'.

    Returns:
      The output list of (pool_dimension * pool_dimension) tensors, ordered
      row by row; each is the reduction of one region over axes 1 and 2.

    Raises:
      ValueError: If `mode` is neither 'max' nor 'avg'.
    """
    inputs_shape = array_ops.shape(inputs)
    h = math_ops.cast(array_ops.gather(inputs_shape, 1), dtypes.int32)
    w = math_ops.cast(array_ops.gather(inputs_shape, 2), dtypes.int32)

    if mode == 'max':
        pooling_op = math_ops.reduce_max
    elif mode == 'avg':
        pooling_op = math_ops.reduce_mean
    else:
        msg = "Mode must be either 'max' or 'avg'. Got '{0}'"
        raise ValueError(msg.format(mode))

    n = pool_dimension
    # The float views of the spatial sizes do not depend on the bin index, so
    # they are built once instead of once per region.
    h_float = math_ops.cast(h, dtypes.float32)
    w_float = math_ops.cast(w, dtypes.float32)

    def _bin_bounds(index, size):
        """Return the int32 (start, end) bounds of bin `index` along `size`."""
        # start = floor(index / n * size)
        start = math_ops.cast(
            math_ops.floor(
                math_ops.multiply(math_ops.divide(index, n), size)),
            dtypes.int32)
        # end = ceil((index + 1) / n * size)
        end = math_ops.cast(
            math_ops.ceil(
                math_ops.multiply(math_ops.divide((index + 1), n), size)),
            dtypes.int32)
        return start, end

    # Bounds only depend on the row (resp. column) index, so compute n of each
    # rather than n * n.
    row_bounds = [_bin_bounds(row, h_float) for row in range(n)]
    col_bounds = [_bin_bounds(col, w_float) for col in range(n)]

    result = []
    for start_h, end_h in row_bounds:
        for start_w, end_w in col_bounds:
            pooling_region = inputs[:, start_h:end_h, start_w:end_w, :]
            result.append(pooling_op(pooling_region, axis=(1, 2)))
    return result
def frames(signal, frame_length, frame_step, name=None):
  """Frame a signal into overlapping frames.

  May be used in front of spectral functions. The signal is padded at its end
  so that the last frame has the full `frame_length`.

  For example:

  ```python
  pcm = tf.placeholder(tf.float32, [None, 9152])
  frames = tf.contrib.signal.frames(pcm, 512, 180)
  magspec = tf.abs(tf.spectral.rfft(frames, [512]))
  image = tf.expand_dims(magspec, 3)
  ```

  Args:
    signal: A `Tensor` of shape `[batch_size, signal_length]`.
    frame_length: An `int32` or `int64` `Tensor`. The length of each frame.
    frame_step: An `int32` or `int64` `Tensor`. The step between frames.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of frames with shape `[batch_size, num_frames, frame_length]`.

  Raises:
    ValueError: if signal does not have rank 2.
  """
  with ops.name_scope(name, "frames", [signal, frame_length, frame_step]):
    signal = ops.convert_to_tensor(signal, name="signal")
    frame_length = ops.convert_to_tensor(frame_length, name="frame_length")
    frame_step = ops.convert_to_tensor(frame_step, name="frame_step")

    signal_rank = signal.shape.ndims

    if signal_rank != 2:
      # `signal_rank` is an int (or None), not a str, so it must be formatted
      # into the message; concatenating it would raise TypeError instead of
      # the documented ValueError.
      raise ValueError(
          "expected signal to have rank 2 but was %s" % signal_rank)

    signal_length = array_ops.shape(signal)[1]

    # num_frames = 1 + ceil((signal_length - frame_length) / frame_step)
    num_frames = math_ops.ceil((signal_length - frame_length) / frame_step)
    num_frames = 1 + math_ops.cast(num_frames, dtypes.int32)

    # Pad the end of the signal up to the length covered by `num_frames`
    # frames.
    pad_length = (num_frames - 1) * frame_step + frame_length
    pad_signal = array_ops.pad(signal, [[0, 0], [0,
                                                 pad_length - signal_length]])

    # indices[f, k] = f * frame_step + k, built by adding a tiled row of
    # in-frame offsets to a tiled column of frame start positions.
    indices_frame = array_ops.expand_dims(math_ops.range(frame_length), 0)
    indices_frames = array_ops.tile(indices_frame, [num_frames, 1])

    indices_step = array_ops.expand_dims(
        math_ops.range(num_frames) * frame_step, 1)
    indices_steps = array_ops.tile(indices_step, [1, frame_length])

    indices = indices_frames + indices_steps

    # TODO(androbin): remove `transpose` when `gather` gets `axis` support
    pad_signal = array_ops.transpose(pad_signal)
    signal_frames = array_ops.gather(pad_signal, indices)
    # Move the batch axis back to the front.
    signal_frames = array_ops.transpose(signal_frames, perm=[2, 0, 1])

    return signal_frames
  def _log_survival_function(self, y):
    """Return log P[Y > y], overriding the wrapped value beyond the cutoffs.

    The contract, in non-log terms:
      P[Y > y] = 0        if y >= high,
               = 1        if y < low,
               = P[X > y] otherwise.

    Args:
      y: Points at which to evaluate; rounded up with `math_ops.ceil` first.

    Returns:
      The log survival function values, with 0 where ceil(y) < low and -inf
      where ceil(y) >= high (for whichever cutoffs are not None).
    """
    lower_bound = self._low
    upper_bound = self._high

    # Mass sits only on integers, so P[Y > y] == P[Y > ceil(y)].
    ceil_y = math_ops.ceil(y)

    # log P[X > ceil(y)]: the answer wherever no cutoff applies.
    log_sf = self.distribution.log_survival_function(ceil_y)

    # Broadcast ceil_y to the result's shape; a single distribution may be
    # evaluated on several samples at once.
    ceil_y = ceil_y + array_ops.zeros_like(log_sf)

    # Below the lower cutoff the survival probability is 1, i.e. log value 0.
    if lower_bound is not None:
      log_sf = array_ops.where(
          ceil_y < lower_bound, array_ops.zeros_like(log_sf), log_sf)

    # At or above the upper cutoff the survival probability is 0, i.e. -inf.
    if upper_bound is not None:
      minus_infinity = -np.inf * array_ops.ones_like(log_sf)
      log_sf = array_ops.where(ceil_y >= upper_bound, minus_infinity, log_sf)

    return log_sf
Exemple #4
0
    def decayed_lr(learning_rate, global_step, decay_steps, end_learning_rate,
                   power, cycle, name):
        """Helper to recompute learning rate; most helpful in eager-mode.

        Computes
          (learning_rate - end_learning_rate) * (1 - step / steps) ** power
          + end_learning_rate
        where `step` and `steps` are `global_step` and `decay_steps` cast to
        the dtype of `learning_rate` and adjusted according to `cycle`.

        Args:
          learning_rate: Initial learning rate; its dtype drives all casts.
          global_step: Current step.
          decay_steps: Number of steps over which to decay.
          end_learning_rate: Learning rate reached at the end of the decay.
          power: Exponent of the polynomial.
          cycle: If truthy, `decay_steps` is stretched to the first multiple
            of itself not below `global_step`; otherwise `global_step` is
            clamped to `decay_steps`.
          name: Name for the scope and for the returned op.

        Returns:
          The decayed learning rate tensor.
        """
        with ops.name_scope(name, "PolynomialDecay", [
                learning_rate, global_step, decay_steps, end_learning_rate,
                power
        ]) as name:
            learning_rate = ops.convert_to_tensor(learning_rate,
                                                  name="learning_rate")
            dtype = learning_rate.dtype
            end_learning_rate = math_ops.cast(end_learning_rate, dtype)
            power = math_ops.cast(power, dtype)

            global_step_recomp = math_ops.cast(global_step, dtype)
            decay_steps_recomp = math_ops.cast(decay_steps, dtype)
            if cycle:
                # Find the first multiple of decay_steps that is bigger than
                # global_step. If global_step is zero set the multiplier to 1
                multiplier = control_flow_ops.cond(
                    math_ops.equal(global_step_recomp, 0), lambda: 1.0,
                    lambda: math_ops.ceil(global_step_recomp / decay_steps))
                decay_steps_recomp = math_ops.multiply(decay_steps_recomp,
                                                       multiplier)
            else:
                # Make sure that the global_step used is not bigger than
                # decay_steps. Compare against the cast value so both
                # operands share `dtype` even when `decay_steps` is an
                # integer tensor.
                global_step_recomp = math_ops.minimum(global_step_recomp,
                                                      decay_steps_recomp)

            p = math_ops.div(global_step_recomp, decay_steps_recomp)
            return math_ops.add(math_ops.multiply(
                learning_rate - end_learning_rate, math_ops.pow(1 - p, power)),
                                end_learning_rate,
                                name=name)
  def _survival_function(self, y):
    """Return P[Y > y], overriding the wrapped value beyond the cutoffs.

    The contract:
      P[Y > y] = 0        if y >= upper_cutoff,
               = 1        if y < lower_cutoff,
               = P[X > y] otherwise.

    Args:
      y: Points at which to evaluate; rounded up with `math_ops.ceil` first.

    Returns:
      The survival function values with the cutoff overrides applied for
      whichever cutoffs are not None.
    """
    lower_cutoff = self._lower_cutoff
    upper_cutoff = self._upper_cutoff

    # Mass sits only on integers, so P[Y > y] == P[Y > ceil(y)].
    ceil_y = math_ops.ceil(y)

    # P[X > ceil(y)]: the answer wherever no cutoff applies.
    survival = self.distribution.survival_function(ceil_y)

    # Broadcast ceil_y to the result's shape; a single distribution may be
    # evaluated on several samples at once.
    ceil_y = ceil_y + array_ops.zeros_like(survival)

    # Everything survives below the lower cutoff.
    if lower_cutoff is not None:
      survival = math_ops.select(
          ceil_y < lower_cutoff, array_ops.ones_like(survival), survival)

    # Nothing survives at or above the upper cutoff.
    if upper_cutoff is not None:
      survival = math_ops.select(
          ceil_y >= upper_cutoff, array_ops.zeros_like(survival), survival)

    return survival
Exemple #6
0
    def _survival_function(self, y):
        """Return P[Y > y], overriding the wrapped value beyond the cutoffs.

        The contract:
          P[Y > y] = 0        if y >= upper_cutoff,
                   = 1        if y < lower_cutoff,
                   = P[X > y] otherwise.

        Args:
          y: Points at which to evaluate; rounded up with `math_ops.ceil`
            first.

        Returns:
          The survival function values with the cutoff overrides applied for
          whichever cutoffs are not None.
        """
        lower_cutoff = self._lower_cutoff
        upper_cutoff = self._upper_cutoff

        # Mass sits only on integers, so P[Y > y] == P[Y > ceil(y)].
        ceil_y = math_ops.ceil(y)

        # P[X > ceil(y)]: the answer wherever no cutoff applies.
        survival = self.distribution.survival_function(ceil_y)

        # Broadcast ceil_y to the result's shape; a single distribution may
        # be evaluated on several samples at once.
        ceil_y = ceil_y + array_ops.zeros_like(survival)

        # Everything survives below the lower cutoff.
        if lower_cutoff is not None:
            survival = array_ops.where(
                ceil_y < lower_cutoff, array_ops.ones_like(survival),
                survival)

        # Nothing survives at or above the upper cutoff.
        if upper_cutoff is not None:
            survival = array_ops.where(
                ceil_y >= upper_cutoff, array_ops.zeros_like(survival),
                survival)

        return survival
Exemple #7
0
    def __call__(self, step):
        """Return the polynomially decayed learning rate at `step`.

        Computes
          (initial - end) * (1 - step / decay_steps) ** power + end
        with every quantity cast to the dtype of the initial learning rate.
        When `self.cycle` is truthy, `decay_steps` is stretched to the first
        multiple of itself not below `step`; otherwise `step` is clamped to
        `decay_steps`.

        Args:
          step: The current training step.

        Returns:
          The decayed learning rate tensor, named after the enclosing scope.
        """
        with ops.name_scope_v2(self.name or "PolynomialDecay") as name:
            start_lr = ops.convert_to_tensor_v2_with_dispatch(
                self.initial_learning_rate, name="initial_learning_rate")
            dtype = start_lr.dtype
            final_lr = math_ops.cast(self.end_learning_rate, dtype)
            exponent = math_ops.cast(self.power, dtype)

            current_step = math_ops.cast(step, dtype)
            horizon = math_ops.cast(self.decay_steps, dtype)
            if self.cycle:
                # Stretch the horizon to the first multiple of decay_steps
                # that is bigger than the step; at step zero use one period.
                def _periods_needed():
                    return math_ops.ceil(current_step / self.decay_steps)

                at_start = math_ops.equal(current_step, 0)
                multiplier = control_flow_ops.cond(
                    at_start, lambda: 1.0, _periods_needed)
                horizon = math_ops.multiply(horizon, multiplier)
            else:
                # Never let the step run past the horizon.
                current_step = math_ops.minimum(current_step, horizon)

            progress = math_ops.divide(current_step, horizon)
            lr_span = start_lr - final_lr
            remaining = math_ops.pow(1 - progress, exponent)
            return math_ops.add(
                math_ops.multiply(lr_span, remaining), final_lr, name=name)
  def decayed_lr(learning_rate, global_step, decay_steps, end_learning_rate,
                 power, cycle, name):
    """Helper to recompute learning rate; most helpful in eager-mode.

    Computes
      (learning_rate - end_learning_rate) * (1 - step / steps) ** power
      + end_learning_rate
    where `step` and `steps` are `global_step` and `decay_steps` cast to the
    dtype of `learning_rate` and adjusted according to `cycle`.

    Args:
      learning_rate: Initial learning rate; its dtype drives all casts.
      global_step: Current step.
      decay_steps: Number of steps over which to decay.
      end_learning_rate: Learning rate reached at the end of the decay.
      power: Exponent of the polynomial.
      cycle: If truthy, `decay_steps` is stretched to the first multiple of
        itself not below `global_step`; otherwise `global_step` is clamped to
        `decay_steps`.
      name: Name for the scope and for the returned op.

    Returns:
      The decayed learning rate tensor.
    """
    with ops.name_scope(
        name, "PolynomialDecay",
        [learning_rate, global_step, decay_steps, end_learning_rate, power]
    ) as name:
      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
      dtype = learning_rate.dtype
      end_learning_rate = math_ops.cast(end_learning_rate, dtype)
      power = math_ops.cast(power, dtype)

      global_step_recomp = math_ops.cast(global_step, dtype)
      decay_steps_recomp = math_ops.cast(decay_steps, dtype)
      if cycle:
        # Find the first multiple of decay_steps that is bigger than
        # global_step. If global_step is zero set the multiplier to 1
        multiplier = control_flow_ops.cond(
            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
            lambda: math_ops.ceil(global_step_recomp / decay_steps))
        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
      else:
        # Make sure that the global_step used is not bigger than decay_steps.
        # Compare against the cast value so both operands share `dtype` even
        # when `decay_steps` is an integer tensor.
        global_step_recomp = math_ops.minimum(global_step_recomp,
                                              decay_steps_recomp)

      p = math_ops.div(global_step_recomp, decay_steps_recomp)
      return math_ops.add(
          math_ops.multiply(learning_rate - end_learning_rate,
                            math_ops.pow(1 - p, power)),
          end_learning_rate,
          name=name)
  def __call__(self, step):
    """Return the polynomially decayed learning rate at `step`.

    Computes
      (initial - end) * (1 - step / decay_steps) ** power + end
    with every quantity cast to the dtype of the initial learning rate. When
    `self.cycle` is truthy, `decay_steps` is stretched to the first multiple
    of itself not below `step`; otherwise `step` is clamped to `decay_steps`.

    Args:
      step: The current training step.

    Returns:
      The decayed learning rate tensor, named after the enclosing scope.
    """
    with ops.name_scope(
        self.name, "PolynomialDecay",
        [self.initial_learning_rate, step, self.decay_steps,
         self.end_learning_rate, self.power]
    ) as name:
      initial_learning_rate = ops.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
      power = math_ops.cast(self.power, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
      if self.cycle:
        # Find the first multiple of decay_steps that is bigger than
        # global_step. If global_step is zero set the multiplier to 1
        multiplier = control_flow_ops.cond(
            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
            lambda: math_ops.ceil(global_step_recomp / self.decay_steps))
        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
      else:
        # Make sure that the global_step used is not bigger than decay_steps.
        # Compare against the cast value so both operands share `dtype` even
        # when `self.decay_steps` is an integer tensor.
        global_step_recomp = math_ops.minimum(global_step_recomp,
                                              decay_steps_recomp)

      p = math_ops.div(global_step_recomp, decay_steps_recomp)
      return math_ops.add(
          math_ops.multiply(initial_learning_rate - end_learning_rate,
                            math_ops.pow(1 - p, power)),
          end_learning_rate,
          name=name)
    def get_selection_mask(self, input_ids, axis):
        """Build a boolean mask marking randomly chosen items of `input_ids`.

        Items the parent class reports as selectable are numbered by their
        flat position, shuffled per row with `self._shuffle_fn`, and the first
        ceil(row_length * self.selection_rate) of them, capped at
        `self.max_selections_per_batch`, are marked in the result.

        Args:
          input_ids: The ids to select from. NOTE(review): presumably a
            `RaggedTensor`, since `flat_values` and `with_flat_values` are
            used on it; confirm against callers.
          axis: The axis at which selection happens; inner ragged dimensions
            from 1 up to `axis` are merged before selecting.

        Returns:
          A boolean tensor built from `input_ids`' row structure, reduced with
          `reduce_all` over the axes beyond `axis` when
          `axis < results.ragged_rank`.
        """
        selectable = super(RandomItemSelector,
                           self).get_selectable(input_ids, axis)

        # Run the selection algorithm on positions RT
        # Each flat value is replaced by its own index, so a selected entry
        # identifies where to write in the flat result below.
        positions_flat = math_ops.range(array_ops.size(input_ids.flat_values))
        positions = input_ids.with_flat_values(positions_flat)
        # Mask out positions that are not selectable
        positions = ragged_array_ops.boolean_mask(positions, selectable)

        # merge to the desired axis
        positions = positions.merge_dims(1, axis) if axis > 1 else positions

        # Figure out how many we are going to select
        # Per row: ceil(row_length * selection_rate), capped by
        # max_selections_per_batch, then cast to int64 for slicing.
        num_to_select = math_ops.ceil(
            math_ops.cast(positions.row_lengths(), dtypes.float32) *
            self.selection_rate)
        num_to_select = math_ops.minimum(num_to_select,
                                         self.max_selections_per_batch)
        num_to_select = math_ops.cast(num_to_select, dtypes.int64)

        # Shuffle and trim to items that are going to be selected
        def _shuffle_and_trim(x):
            # x is one (row of positions, number to keep) pair from map_fn.
            positions, top_n = x
            if isinstance(positions, ragged_tensor.RaggedTensor):
                # Ragged row: shuffle the row indices and gather whole
                # sub-rows, so each sub-row is kept or dropped as a unit.
                positions_at_axis = math_ops.range(positions.nrows())
                chosen_positions_at_axis = self._shuffle_fn(
                    positions_at_axis)[:top_n]
                return array_ops.gather(positions, chosen_positions_at_axis)
            else:
                # Dense row: shuffle the positions themselves.
                shuffled = self._shuffle_fn(positions)
                return shuffled[:top_n]

        selected_for_mask = map_fn.map_fn(
            _shuffle_and_trim, (positions, num_to_select),
            fn_output_signature=ragged_tensor.RaggedTensorSpec(
                ragged_rank=positions.ragged_rank - 1, dtype=positions.dtype))
        # Pin the static shape of the flat values to rank 1.
        selected_for_mask.flat_values.set_shape([None])

        # Construct the result which is a boolean RT
        # Scatter 1's to positions that have been selected_for_mask
        update_values = array_ops.ones_like(selected_for_mask.flat_values)
        update_values = math_ops.cast(update_values, input_ids.dtype)
        update_indices = selected_for_mask.flat_values
        # tensor_scatter_update takes one index vector per update, hence the
        # trailing axis.
        update_indices = array_ops.expand_dims(update_indices, -1)
        update_indices = math_ops.cast(update_indices, input_ids.dtype)

        results_flat = array_ops.zeros_like(input_ids.flat_values)
        results_flat = gen_array_ops.tensor_scatter_update(
            results_flat, update_indices, update_values)
        results = math_ops.cast(input_ids.with_flat_values(results_flat),
                                dtypes.bool)

        # Collapse the dimensions beyond `axis`, innermost first.
        if axis < results.ragged_rank:
            reduce_axis = list(range(results.ragged_rank, axis, -1))
            results = math_ops.reduce_all(results, reduce_axis)
        return results
Exemple #11
0
 def testCeil(self):
     """Check `math_ops.ceil` against `np.ceil` for several dtypes."""
     base_values = np.arange(-5.0, 5.0, .25)
     for dtype in (np.float32, np.double, np.int32):
         typed_values = np.array(base_values, dtype=dtype)
         input_tensor = constant_op.constant(
             typed_values, shape=typed_values.shape)
         actual = self.evaluate(math_ops.ceil(input_tensor))
         expected = np.ceil(typed_values)
         self.assertAllClose(actual, expected, atol=1e-2)
Exemple #12
0
def odeint_fixed(func, y0, t, dt=None, method='rk4', name=None):
    """Integrate an ODE on a fixed grid, without step size control.

    Avoids the overhead of adaptive step size control, e.g. when the
    integration result is to be differentiated and/or the time grid is known
    a priori to be sufficient.

    Args:
        func (function): Maps a Tensor holding the state `y` and a scalar
            Tensor `t` to a Tensor of state derivatives with respect to time.
        y0 (float or complex): N-D Tensor giving the starting value of `y` at
            time point `t[0]`.
        t (float): 1-D Tensor of time points to solve for; each time must be
            larger than the previous one. May have any floating point dtype.
        dt (float, optional): 0-D or 1-D Tensor with a time step suggestion
            for the integration intervals in `t`. A 1-D Tensor gives one
            value per interval and must have one element fewer than `t`; a
            0-D Tensor applies the same suggestion to every interval. If
            None, the step is `t[1:] - t[:-1]`. The actual step is obtained
            by ensuring an integer number of steps per interval, which may
            reduce the suggested step. Defaults to None.
        method (str, optional): One of 'midpoint' or 'rk4'. Defaults to
            'rk4'.
        name (str, optional): Optional name for the resulting operation.
            Defaults to None.

    Returns:
        y (Tensor): (N+1)-D tensor whose first dimension corresponds to the
            time points. Holds the solved value of `y` for each time point in
            `t`, with the initial value `y0` first along that dimension.

    Raises:
        ValueError: Upon caller errors.
    """
    y0 = ops.convert_to_tensor(y0, name='y0')
    t = ops.convert_to_tensor(t, preferred_dtype=dtypes.float64, name='t')

    intervals = t[1:] - t[:-1]
    suggested_dt = intervals if dt is None else dt
    dt = ops.convert_to_tensor(
        suggested_dt, preferred_dtype=dtypes.float64, name='dt')

    # Round the step count up so each interval holds a whole number of steps,
    # then shrink the step to fit the interval exactly.
    step_counts = math_ops.ceil(intervals / dt)
    dt = intervals / step_counts
    step_counts = math_ops.cast(step_counts, dtype=dtypes.int32)

    _check_input_types(y0, t, dt)
    _check_input_sizes(t, dt)

    with _assert_increasing(t):
        if method == 'midpoint':
            integrator = _MidpointFixedGridIntegrator()
        elif method == 'rk4':
            integrator = _RK4FixedGridIntegrator()
        else:
            raise ValueError('method not supported: {!s}'.format(method))
        return integrator.integrate(func, y0, t, dt, step_counts)
Exemple #13
0
 def _compare(self, x):
   """Check floor/ceil ops on `x` against NumPy, in value and shape."""
   expected_floor = np.floor(x)
   expected_ceil = np.ceil(x)
   with self.cached_session() as sess:
     x_tensor = ops.convert_to_tensor(x)
     floor_op = math_ops.floor(x_tensor)
     ceil_op = math_ops.ceil(x_tensor)
     actual_floor, actual_ceil = sess.run([floor_op, ceil_op])
   self.assertAllEqual(expected_floor, actual_floor)
   self.assertAllEqual(expected_ceil, actual_ceil)
   self.assertShapeEqual(expected_floor, floor_op)
   self.assertShapeEqual(expected_ceil, ceil_op)
def _enclosing_power_of_two(value):
  """Return 2**N for integer N such that 2**N >= value.

  Args:
    value: A `Tensor`; the result is returned in its dtype.

  Returns:
    A constant when `value` is statically known, otherwise a tensor computed
    at run time; either way 2 raised to ceil(log2(value)).
  """
  value_static = tensor_util.constant_value(value)
  if value_static is not None:
    # Use log2 directly: the quotient log(v) / log(2) can come out slightly
    # above the integer exponent for an exact power of two, which makes ceil()
    # jump to the next power.
    return constant_op.constant(
        int(2**np.ceil(np.log2(value_static))), value.dtype)
  # NOTE(review): the dynamic path still divides two float32 logs and may be
  # subject to the same rounding; left unchanged here.
  return math_ops.cast(
      math_ops.pow(2.0, math_ops.ceil(
          math_ops.log(math_ops.to_float(value)) / math_ops.log(2.0))),
      value.dtype)
  def _compare(self, x):
    """Check floor/ceil ops on `x` against NumPy, in value and shape."""
    expected_floor = np.floor(x)
    expected_ceil = np.ceil(x)

    x_tensor = ops.convert_to_tensor(x)
    floor_op = math_ops.floor(x_tensor)
    ceil_op = math_ops.ceil(x_tensor)
    actual_floor, actual_ceil = self.evaluate([floor_op, ceil_op])

    self.assertAllEqual(expected_floor, actual_floor)
    self.assertAllEqual(expected_ceil, actual_ceil)
    self.assertShapeEqual(expected_floor, floor_op)
    self.assertShapeEqual(expected_ceil, ceil_op)
Exemple #16
0
def _enclosing_power_of_two(value):
    """Return 2**N for integer N such that 2**N >= value.

    Args:
        value: A `Tensor`; the result is returned in its dtype.

    Returns:
        A constant when `value` is statically known, otherwise a tensor
        computed at run time; either way 2 raised to ceil(log2(value)).
    """
    value_static = tensor_util.constant_value(value)
    if value_static is not None:
        # Use log2 directly: the quotient log(v) / log(2) can come out
        # slightly above the integer exponent for an exact power of two,
        # which makes ceil() jump to the next power.
        return constant_op.constant(
            int(2**np.ceil(np.log2(value_static))), value.dtype)
    # NOTE(review): the dynamic path still divides two float32 logs and may
    # be subject to the same rounding; left unchanged here.
    return math_ops.cast(
        math_ops.pow(
            2.0,
            math_ops.ceil(
                math_ops.log(math_ops.cast(value, dtypes.float32)) /
                math_ops.log(2.0))), value.dtype)
Exemple #17
0
    def testProbAndGradGivesFiniteResultsForCommonEvents(self):
        """Prob and its gradients w.r.t. mu and sigma must all be finite."""
        with self.test_session():
            loc = variables.Variable(0.0, name="mu")
            scale = variables.Variable(1.0, name="sigma")
            base_normal = distributions.Normal(mu=loc, sigma=scale)
            quantized = distributions.QuantizedDistribution(
                distribution=base_normal)
            # 100 events: ceil of values drawn as 4 * rand - 2.
            events = math_ops.ceil(4 * rng.rand(100).astype(np.float32) - 2)

            variables.global_variables_initializer().run()

            probabilities = quantized.prob(events)
            self._assert_all_finite(probabilities.eval())

            loc_grad, scale_grad = gradients_impl.gradients(
                probabilities, [loc, scale])
            self._assert_all_finite(loc_grad.eval())
            self._assert_all_finite(scale_grad.eval())
  def testProbAndGradGivesFiniteResultsForCommonEvents(self):
    """Prob and its gradients w.r.t. mu and sigma must all be finite."""
    with self.test_session():
      loc = variables.Variable(0.0, name="mu")
      scale = variables.Variable(1.0, name="sigma")
      base_normal = distributions.Normal(mu=loc, sigma=scale)
      quantized = distributions.QuantizedDistribution(
          distribution=base_normal)
      # 100 events: ceil of values drawn as 4 * rand - 2.
      events = math_ops.ceil(4 * rng.rand(100).astype(np.float32) - 2)

      variables.global_variables_initializer().run()

      probabilities = quantized.prob(events)
      self._assert_all_finite(probabilities.eval())

      loc_grad, scale_grad = gradients_impl.gradients(
          probabilities, [loc, scale])
      self._assert_all_finite(loc_grad.eval())
      self._assert_all_finite(scale_grad.eval())
Exemple #19
0
    def _apply_dense(self, grad, var):
        """Apply a dense gradient as a weighted blend of four update terms.

        The step subtracted from `var` is
          lr * (w1 * m + w2 * grad / sqrt(n + 1e-8)
                + w3 * coefficient * m_adam / (sqrt(n_adam) + 1e-8)
                + w4 * grad)
        where m, n, m_adam and n_adam are the freshly updated slot values and
        w4 = 1 - ceil((w1 + w2 + w3) / 3).

        Args:
          grad: The gradient for `var`.
          var: The variable to update.

        Returns:
          An op grouping the variable update with the four slot updates.
        """
        base_dtype = var.dtype.base_dtype
        locking = self._use_locking

        lr_t = math_ops.cast(self.lr_t, base_dtype)
        w1_t = math_ops.cast(self.w1_t, base_dtype)
        w2_t = math_ops.cast(self.w2_t, base_dtype)
        w3_t = math_ops.cast(self.w3_t, base_dtype)
        beta1_t = math_ops.cast(self.beta1_t, base_dtype)
        beta2_t = math_ops.cast(self.beta2_t, base_dtype)

        beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, base_dtype)
        beta2_power = math_ops.cast(beta2_power, base_dtype)

        # Accumulator decayed by a fixed factor of 0.9.
        m = self.get_slot(var, "m")
        m_t = state_ops.assign(m, 0.9 * m + grad, use_locking=locking)

        # Running sum of squared gradients.
        n = self.get_slot(var, "n")
        n_t = state_ops.assign(n, n + grad * grad, use_locking=locking)

        # Exponential moving averages of the gradient and its square.
        m_adam = self.get_slot(var, 'm_adam')
        m_adam_t = state_ops.assign(
            m_adam, beta1_t * m_adam + (1 - beta1_t) * grad,
            use_locking=locking)
        n_adam = self.get_slot(var, 'n_adam')
        n_adam_t = state_ops.assign(
            n_adam, beta2_t * n_adam + (1 - beta2_t) * grad * grad,
            use_locking=locking)

        # Weight of the raw gradient term.
        w4_t = 1.0 - math_ops.ceil((w1_t + w2_t + w3_t) / 3.0)

        n_adam_sqrt = math_ops.sqrt(n_adam_t)
        coefficient = math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)

        accumulated_term = w1_t * m_t
        scaled_grad_term = w2_t * grad / math_ops.sqrt(n_t + 1e-8)
        averaged_term = w3_t * coefficient * m_adam_t / (n_adam_sqrt + 1e-8)
        raw_grad_term = w4_t * grad
        step = lr_t * (accumulated_term + scaled_grad_term + averaged_term +
                       raw_grad_term)
        var_update = state_ops.assign_sub(var, step, use_locking=locking)

        return control_flow_ops.group(
            var_update, m_t, n_t, m_adam_t, n_adam_t)
    def _sample_n(self, n, seed=None):
        """Draw `n` samples, round them up and clip them to the cutoffs.

        Args:
          n: Number of samples; converted to a tensor.
          seed: Seed forwarded to the base distribution's sampler.

        Returns:
          The ceil of the base samples, with values below `_lower_cutoff`
          or above `_upper_cutoff` replaced by the respective cutoff (for
          whichever cutoffs are not None).
        """
        lower_cutoff = self._lower_cutoff
        upper_cutoff = self._upper_cutoff
        with ops.name_scope("transform"):
            n = ops.convert_to_tensor(n, name="n")
            raw_samples = self.base_distribution.sample_n(n=n, seed=seed)
            ones = array_ops.ones_like(raw_samples)

            # Snap values to the intervals (j - 1, j].
            snapped = math_ops.ceil(raw_samples)

            if lower_cutoff is not None:
                too_low = snapped < lower_cutoff
                snapped = math_ops.select(
                    too_low, lower_cutoff * ones, snapped)

            if upper_cutoff is not None:
                too_high = snapped > upper_cutoff
                snapped = math_ops.select(
                    too_high, upper_cutoff * ones, snapped)

            return snapped
  def _sample_n(self, n, seed=None):
    """Draw `n` samples, round them up and clip them to the cutoffs.

    Args:
      n: Number of samples; converted to a tensor.
      seed: Seed forwarded to the wrapped distribution's sampler.

    Returns:
      The ceil of the wrapped samples, with values below `_lower_cutoff` or
      above `_upper_cutoff` replaced by the respective cutoff (for whichever
      cutoffs are not None).
    """
    lower_cutoff = self._lower_cutoff
    upper_cutoff = self._upper_cutoff
    with ops.name_scope("transform"):
      n = ops.convert_to_tensor(n, name="n")
      raw_samples = self.distribution.sample_n(n=n, seed=seed)
      ones = array_ops.ones_like(raw_samples)

      # Snap values to the intervals (j - 1, j].
      snapped = math_ops.ceil(raw_samples)

      if lower_cutoff is not None:
        too_low = snapped < lower_cutoff
        snapped = array_ops.where(too_low, lower_cutoff * ones, snapped)

      if upper_cutoff is not None:
        too_high = snapped > upper_cutoff
        snapped = array_ops.where(too_high, upper_cutoff * ones, snapped)

      return snapped
  def _sample_n(self, n, seed=None):
    """Draw `n` samples, round them up and clip them to `[low, high]`.

    Args:
      n: Number of samples; converted to a tensor.
      seed: Seed forwarded to the wrapped distribution's sampler.

    Returns:
      The ceil of the wrapped samples, with values below `_low` or above
      `_high` replaced by the respective bound (for whichever bounds are not
      None).
    """
    low = self._low
    high = self._high
    with ops.name_scope("transform"):
      n = ops.convert_to_tensor(n, name="n")
      raw_samples = self.distribution.sample(n, seed=seed)
      ones = array_ops.ones_like(raw_samples)

      # Snap values to the intervals (j - 1, j].
      snapped = math_ops.ceil(raw_samples)

      if low is not None:
        too_low = snapped < low
        snapped = array_ops.where(too_low, low * ones, snapped)

      if high is not None:
        too_high = snapped > high
        snapped = array_ops.where(too_high, high * ones, snapped)

      return snapped
Exemple #23
0
    def _sample_n(self, n, seed=None):
        """Draw `n` samples, round them up and clip them to `[low, high]`.

        Args:
          n: Number of samples; converted to a tensor.
          seed: Seed forwarded to the wrapped distribution's sampler.

        Returns:
          The ceil of the wrapped samples, with values below `_low` or above
          `_high` replaced by the respective bound (for whichever bounds are
          not None).
        """
        low = self._low
        high = self._high
        with ops.name_scope("transform"):
            n = ops.convert_to_tensor(n, name="n")
            raw_samples = self.distribution.sample(n, seed=seed)
            ones = array_ops.ones_like(raw_samples)

            # Snap values to the intervals (j - 1, j].
            snapped = math_ops.ceil(raw_samples)

            if low is not None:
                too_low = snapped < low
                snapped = array_ops.where(too_low, low * ones, snapped)

            if high is not None:
                too_high = snapped > high
                snapped = array_ops.where(too_high, high * ones, snapped)

            return snapped
Exemple #24
0
    def decayed_lr():
      """Helper to recompute learning rate; most helpful in eager-mode.

      Reads `global_step`, `decay_steps`, `dtype`, `cycle`, `learning_rate`,
      `end_learning_rate`, `power` and `name` from the enclosing scope and
      computes
        (learning_rate - end_learning_rate) * (1 - step / steps) ** power
        + end_learning_rate
      where `step` and `steps` are `global_step` and `decay_steps` cast to
      `dtype` and adjusted according to `cycle`.

      Returns:
        The decayed learning rate tensor.
      """
      global_step_recomp = math_ops.cast(global_step, dtype)
      decay_steps_recomp = math_ops.cast(decay_steps, dtype)
      if cycle:
        # Find the first multiple of decay_steps that is bigger than
        # global_step. If global_step is zero set the multiplier to 1
        multiplier = control_flow_ops.cond(
            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
            lambda: math_ops.ceil(global_step_recomp / decay_steps))
        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
      else:
        # Make sure that the global_step used is not bigger than decay_steps.
        # Compare against the cast value so both operands share `dtype` even
        # when `decay_steps` is an integer tensor.
        global_step_recomp = math_ops.minimum(global_step_recomp,
                                              decay_steps_recomp)

      p = math_ops.div(global_step_recomp, decay_steps_recomp)
      return math_ops.add(
          math_ops.multiply(learning_rate - end_learning_rate,
                            math_ops.pow(1 - p, power)),
          end_learning_rate,
          name=name)
Exemple #25
0
def sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
  """Returns a [batch_size] Tensor with per-example sequence length.

  Args:
    sp_tensor: A sparse tensor whose `indices` hold the row in column 0 and
      the position within the row in column 1.
    num_elements: Number of raw values that make up one sequence element.

  Returns:
    For each row, ceil((largest column index + 1) / num_elements) as int64,
    with zeros appended for trailing rows that have no entries.
  """
  with ops.name_scope(None, 'sequence_length') as name_scope:
    rows = sp_tensor.indices[:, 0]
    columns = sp_tensor.indices[:, 1]
    # A column index c means the row holds at least c + 1 raw values.
    raw_lengths = columns + array_ops.ones_like(columns)
    # Largest such count per row.
    lengths = math_ops.segment_max(raw_lengths, segment_ids=rows)

    # Raw values are grouped num_elements at a time, so count the groups.
    # Example: orig tensor [[1, 2], [3]], col_ids = (0, 1, 1),
    # row_ids = (0, 0, 1), seq_length = [2, 1]. If num_elements = 2,
    # these will get grouped, and the final seq_length is [1, 1]
    grouped = math_ops.ceil(lengths / num_elements)
    lengths = math_ops.cast(grouped, dtypes.int64)

    # Rows at the end without ids are missing from `lengths`, which then has
    # shape [batch_size - n]; append n zeros to restore [batch_size].
    batch_dim = array_ops.shape(sp_tensor)[:1]
    present_dim = array_ops.shape(lengths)[:1]
    missing = batch_dim - present_dim
    zeros = array_ops.zeros(missing, dtype=lengths.dtype)
    return array_ops.concat([lengths, zeros], axis=0, name=name_scope)
Exemple #26
0
def sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
  """Computes a [batch_size] Tensor of sequence lengths, one per example.

  Args:
    sp_tensor: Sparse tensor; the first two columns of `sp_tensor.indices`
      are read as row and column ids.
    num_elements: How many raw values make up a single sequence element.

  Returns:
    An `int64` Tensor of shape [batch_size]. Rows at the end of the batch
    without any entries are reported with length zero.
  """
  with ops.name_scope(None, 'sequence_length') as output_name:
    rows = sp_tensor.indices[:, 0]
    cols = sp_tensor.indices[:, 1]
    # Column ids are zero-based; adding one yields "values up to here".
    cols = cols + array_ops.ones_like(cols)
    # Per row, the maximum of those counts is the row's raw value count.
    lengths = math_ops.segment_max(cols, segment_ids=rows)

    # Group raw values in chunks of `num_elements`, rounding up.
    # E.g. [[1, 2], [3]] -> raw lengths [2, 1]; with num_elements = 2 the
    # final lengths are [1, 1].
    lengths = math_ops.ceil(lengths / num_elements)
    lengths = math_ops.cast(lengths, dtypes.int64)

    # When the last n rows are empty, `lengths` has only batch_size - n
    # entries. Append n zeros so the output always has batch_size entries.
    missing = array_ops.shape(sp_tensor)[:1] - array_ops.shape(lengths)[:1]
    zeros_tail = array_ops.zeros(missing, dtype=lengths.dtype)
    return array_ops.concat([lengths, zeros_tail], axis=0, name=output_name)
Exemple #27
0
def percentile(x,
               q,
               axis=None,
               interpolation=None,
               keep_dims=False,
               validate_args=False,
               name=None):
  """Compute the `q`-th percentile of `x`.

  Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of the
  way from the minimum to the maximum in a sorted copy of `x`.

  The values and distances of the two nearest neighbors as well as the
  `interpolation` parameter will determine the percentile if the normalized
  ranking does not match the location of `q` exactly.

  This function is the same as the median if `q = 50`, the same as the minimum
  if `q = 0` and the same as the maximum if `q = 100`.


  ```python
  # Get 30th percentile with default ('nearest') interpolation.
  x = [1., 2., 3., 4.]
  percentile(x, q=30.)
  ==> 2.0

  # Get 30th percentile with 'lower' interpolation
  x = [1., 2., 3., 4.]
  percentile(x, q=30., interpolation='lower')
  ==> 1.0

  # Get 100th percentile (maximum).  By default, this is computed over every dim
  x = [[1., 2.],
       [3., 4.]]
  percentile(x, q=100.)
  ==> 4.0

  # Treat the leading dim as indexing samples, and find the 100th quantile (max)
  # over all such samples.
  x = [[1., 2.],
       [3., 4.]]
  percentile(x, q=100., axis=[0])
  ==> [3., 4.]
  ```

  Compare to `numpy.percentile`.

  Args:
    x:  Floating point `N-D` `Tensor` with `N > 0`.  If `axis` is not `None`,
      `x` must have statically known number of dimensions.
    q:  Scalar `Tensor` in `[0, 100]`. The percentile.
    axis:  Optional `0-D` or `1-D` integer `Tensor` with constant values.
      The axis that hold independent samples over which to return the desired
      percentile.  If `None` (the default), treat every dimension as a sample
      dimension, returning a scalar.
    interpolation : {"lower", "higher", "nearest"}.  Default: "nearest"
      This optional parameter specifies the interpolation method to
      use when the desired quantile lies between two data points `i < j`:
        * lower: `i`.
        * higher: `j`.
        * nearest: `i` or `j`, whichever is nearest.
    keep_dims:  Python `bool`. If `True`, the sample dimensions that were
      reduced over are kept with size 1 instead of being removed from the
      output shape.
    validate_args:  Whether to add runtime checks of argument validity.
      If False, and arguments are incorrect, correct behavior is not guaranteed.
    name:  A Python string name to give this `Op`.  Default is "percentile"

  Returns:
    A `(N - len(axis))` dimensional `Tensor` of same dtype as `x`, or, if
      `axis` is `None`, a scalar.

  Raises:
    ValueError:  If argument 'interpolation' is not an allowed type.
    ValueError:  If `axis` is given but its value is not statically available.
  """
  name = name or "percentile"
  allowed_interpolations = {"lower", "higher", "nearest"}

  if interpolation is None:
    interpolation = "nearest"
  elif interpolation not in allowed_interpolations:
    raise ValueError("Argument 'interpolation' must be in %s.  Found %s" %
                     (allowed_interpolations, interpolation))

  # `values` has to be passed by keyword: the second positional parameter of
  # name_scope is `default_name`, not the list of input values.
  with ops.name_scope(name, values=[x, q]):
    x = ops.convert_to_tensor(x, name="x")
    # Work in double precision: float32 cannot represent every integer above
    # 2**24, so a float32 index would be wrong for very long sample axes.
    q = math_ops.to_double(q, name="q")
    _get_static_ndims(q, expect_ndims=0)

    if validate_args:
      q = control_flow_ops.with_dependencies([
          check_ops.assert_rank(q, 0),
          check_ops.assert_greater_equal(q, math_ops.to_double(0.)),
          check_ops.assert_less_equal(q, math_ops.to_double(100.))
      ], q)

    if axis is None:
      # Every element is a sample: flatten so samples sit on the last axis.
      y = array_ops.reshape(x, [-1])
    else:
      axis = ops.convert_to_tensor(axis, name="axis")
      check_ops.assert_integer(axis)
      axis_ndims = _get_static_ndims(
          axis, expect_static=True, expect_ndims_no_more_than=1)
      axis_const = tensor_util.constant_value(axis)
      if axis_const is None:
        raise ValueError(
            "Expected argument 'axis' to be statically available.  Found: %s" %
            axis)
      axis = axis_const
      if axis_ndims == 0:
        axis = [axis]
      axis = [int(a) for a in axis]
      x_ndims = _get_static_ndims(
          x, expect_static=True, expect_ndims_at_least=1)
      axis = _make_static_axis_non_negative(axis, x_ndims)
      y = _move_dims_to_flat_end(x, axis, x_ndims)

    frac_at_q_or_above = 1. - q / 100.
    num_samples = array_ops.shape(y)[-1]
    d = math_ops.to_double(num_samples)

    if interpolation == "lower":
      index = math_ops.ceil((d - 1) * frac_at_q_or_above)
    elif interpolation == "higher":
      index = math_ops.floor((d - 1) * frac_at_q_or_above)
    elif interpolation == "nearest":
      index = math_ops.round((d - 1) * frac_at_q_or_above)

    # Clamp in integer arithmetic so rounding in the floating point product
    # can never push the index outside [0, num_samples - 1].
    index = math_ops.minimum(
        math_ops.maximum(math_ops.to_int32(index), 0), num_samples - 1)

    # Sort everything, not just the top 'k' entries, which allows multiple calls
    # to sort only once (under the hood) and use CSE.
    sorted_y = _sort_tensor(y)

    # result.shape = B
    result = sorted_y[..., index]
    result.set_shape(y.get_shape()[:-1])

    if keep_dims:
      if axis is None:
        # ones_vec = [1, 1,..., 1], total length = len(S) + len(B).
        ones_vec = array_ops.ones(
            shape=[_get_best_effort_ndims(x)], dtype=dtypes.int32)
        # Broadcasting against an all-ones tensor of x's rank restores the
        # reduced dimensions with size 1.
        result *= array_ops.ones(ones_vec, dtype=x.dtype)
      else:
        result = _insert_back_keep_dims(result, axis)

    return result
def polynomial_decay(learning_rate,
                     global_step,
                     decay_steps,
                     end_learning_rate=0.0001,
                     power=1.0,
                     cycle=False,
                     name=None):
    """Applies a polynomial decay to the learning rate.

    It is commonly observed that a monotonically decreasing learning rate,
    whose degree of change is carefully chosen, results in a better performing
    model.  This function applies a polynomial decay function to a provided
    initial `learning_rate` to reach an `end_learning_rate` in the given
    `decay_steps`.

    It requires a `global_step` value to compute the decayed learning rate.
    You can just pass a TensorFlow variable that you increment at each
    training step.

    The function returns the decayed learning rate.  It is computed as:

    ```python
    global_step = min(global_step, decay_steps)
    decayed_learning_rate = (learning_rate - end_learning_rate) *
                            (1 - global_step / decay_steps) ^ (power) +
                            end_learning_rate

    ```

    If `cycle` is True then a multiple of `decay_steps` is used, the first one
    that is bigger than `global_steps`.

    ```python
    decay_steps = decay_steps * ceil(global_step / decay_steps)
    decayed_learning_rate = (learning_rate - end_learning_rate) *
                            (1 - global_step / decay_steps) ^ (power) +
                            end_learning_rate

    ```

    Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):

    ```python
    ...
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = 0.1
    end_learning_rate = 0.01
    decay_steps = 10000
    learning_rate = tf.train.polynomial_decay(starter_learning_rate,
                                              global_step,
                                              decay_steps, end_learning_rate,
                                              power=0.5)
    # Passing global_step to minimize() will increment it at each step.
    learning_step = (
        tf.train.GradientDescentOptimizer(learning_rate)
        .minimize(...my loss..., global_step=global_step)
    )
    ```

    Args:
      learning_rate: A scalar `float32` or `float64` `Tensor` or a
        Python number.  The initial learning rate.
      global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
        Global step to use for the decay computation.  Must not be negative.
      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
        Must be positive.  See the decay computation above.
      end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
        Python number.  The minimal end learning rate.
      power: A scalar `float32` or `float64` `Tensor` or a
        Python number.  The power of the polynomial. Defaults to linear, 1.0.
      cycle: A boolean, whether or not it should cycle beyond decay_steps.
      name: String.  Optional name of the operation. Defaults to
        'PolynomialDecay'.

    Returns:
      A scalar `Tensor` of the same type as `learning_rate`.  The decayed
      learning rate.

    Raises:
      ValueError: if `global_step` is not supplied.
    """
    if global_step is None:
        raise ValueError("global_step is required for polynomial_decay.")
    with ops.name_scope(
            name, "PolynomialDecay",
            [learning_rate, global_step, decay_steps, end_learning_rate,
             power]) as name:
        learning_rate = ops.convert_to_tensor(learning_rate,
                                              name="learning_rate")
        # All arithmetic below is done in the learning rate's dtype.
        dtype = learning_rate.dtype
        global_step = math_ops.cast(global_step, dtype)
        decay_steps = math_ops.cast(decay_steps, dtype)
        end_learning_rate = math_ops.cast(end_learning_rate, dtype)
        power = math_ops.cast(power, dtype)
        if cycle:
            # Find the first multiple of decay_steps that is bigger than
            # global_step.  If global_step is zero set the multiplier to 1.
            # Both cond branches must yield the same dtype, so the constant is
            # cast explicitly; a bare Python 1.0 would become float32 and
            # clash with a float64 learning rate.
            multiplier = control_flow_ops.cond(
                math_ops.equal(global_step, 0),
                lambda: math_ops.cast(1.0, dtype),
                lambda: math_ops.ceil(global_step / decay_steps))
            decay_steps = math_ops.multiply(decay_steps, multiplier)
        else:
            # Make sure that the global_step used is not bigger than
            # decay_steps.
            global_step = math_ops.minimum(global_step, decay_steps)

        # Fraction of the (possibly extended) decay period already elapsed.
        p = math_ops.div(global_step, decay_steps)
        return math_ops.add(
            math_ops.multiply(learning_rate - end_learning_rate,
                              math_ops.pow(1 - p, power)),
            end_learning_rate,
            name=name)
Exemple #29
0
def odeint_fixed(func, y0, t, dt=None, method='rk4', name=None):
  """Integrates an ODE on a fixed grid, without step size control.

  Skipping adaptive step size control avoids its overhead, which is useful
  e.g. when the integration result is to be differentiated and/or the time
  grid is known in advance to be fine enough.

  Args:
    func: Function mapping a Tensor holding the state `y` and a scalar Tensor
      `t` to a Tensor of state derivatives with respect to time.
    y0: N-D Tensor with the value of `y` at time point `t[0]`.
    t: 1-D Tensor with the time points at which `y` is solved for. The first
      entry is the initial time point and each subsequent time must be larger
      than the one before it. May have any floating point dtype.
    dt: 0-D or 1-D Tensor with a suggested time step for the integration
      intervals of `t`. A 1-D Tensor provides one value per interval and so
      has one element fewer than `t`; a 0-D Tensor is used for all intervals.
      If `None` (the default), `t[1:] - t[:-1]` is used. The step actually
      taken is obtained by ensuring an integer number of steps per interval,
      which may reduce the suggested step.
    method: One of 'midpoint' or 'rk4'.
    name: Optional name for the resulting operation.

  Returns:
    y: (N+1)-D tensor whose first dimension indexes the time points. It holds
      the solved value of `y` for every time point in `t`, with the initial
      value `y0` as the first element along the first dimension.

  Raises:
    ValueError: Upon caller errors.
  """
  with ops.name_scope(name, 'odeint_fixed', [y0, t, dt]):
    t = ops.convert_to_tensor(t, preferred_dtype=dtypes.float64, name='t')
    y0 = ops.convert_to_tensor(y0, name='y0')

    intervals = t[1:] - t[:-1]
    # Without a suggestion, take exactly one step per interval.
    suggested_dt = intervals if dt is None else dt
    suggested_dt = ops.convert_to_tensor(
        suggested_dt, preferred_dtype=dtypes.float64, name='dt')

    # Round the step count up so the steps tile each interval exactly, then
    # derive the (possibly smaller) step size that is actually used.
    step_counts = math_ops.ceil(intervals / suggested_dt)
    dt = intervals / step_counts
    steps_on_intervals = math_ops.cast(step_counts, dtype=dtypes.int32)

    _check_input_types(y0, t, dt)
    _check_input_sizes(t, dt)

    with _assert_increasing(t), ops.name_scope(method):
      if method == 'midpoint':
        integrator = _MidpointFixedGridIntegrator()
      elif method == 'rk4':
        integrator = _RK4FixedGridIntegrator()
      else:
        raise ValueError('method not supported: {!s}'.format(method))
      return integrator.integrate(func, y0, t, dt, steps_on_intervals)
def percentile(x,
               q,
               axis=None,
               interpolation=None,
               keep_dims=False,
               validate_args=False,
               name=None):
  """Compute the `q`-th percentile of `x`.

  Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of the
  way from the minimum to the maximum in a sorted copy of `x`.

  The values and distances of the two nearest neighbors as well as the
  `interpolation` parameter will determine the percentile if the normalized
  ranking does not match the location of `q` exactly.

  This function is the same as the median if `q = 50`, the same as the minimum
  if `q = 0` and the same as the maximum if `q = 100`.


  ```python
  # Get 30th percentile with default ('nearest') interpolation.
  x = [1., 2., 3., 4.]
  percentile(x, q=30.)
  ==> 2.0

  # Get 30th percentile with 'lower' interpolation
  x = [1., 2., 3., 4.]
  percentile(x, q=30., interpolation='lower')
  ==> 1.0

  # Get 100th percentile (maximum).  By default, this is computed over every dim
  x = [[1., 2.],
       [3., 4.]]
  percentile(x, q=100.)
  ==> 4.0

  # Treat the leading dim as indexing samples, and find the 100th quantile (max)
  # over all such samples.
  x = [[1., 2.],
       [3., 4.]]
  percentile(x, q=100., axis=[0])
  ==> [3., 4.]
  ```

  Compare to `numpy.percentile`.

  Args:
    x:  Floating point `N-D` `Tensor` with `N > 0`.  If `axis` is not `None`,
      `x` must have statically known number of dimensions.
    q:  Scalar `Tensor` in `[0, 100]`. The percentile.
    axis:  Optional `0-D` or `1-D` integer `Tensor` with constant values.
      The axis that hold independent samples over which to return the desired
      percentile.  If `None` (the default), treat every dimension as a sample
      dimension, returning a scalar.
    interpolation : {"lower", "higher", "nearest"}.  Default: "nearest"
      This optional parameter specifies the interpolation method to
      use when the desired quantile lies between two data points `i < j`:
        * lower: `i`.
        * higher: `j`.
        * nearest: `i` or `j`, whichever is nearest.
    keep_dims:  Python `bool`. If `True`, the sample dimensions that were
      reduced over are kept with size 1 instead of being removed from the
      output shape.
    validate_args:  Whether to add runtime checks of argument validity.
      If False, and arguments are incorrect, correct behavior is not guaranteed.
    name:  A Python string name to give this `Op`.  Default is "percentile"

  Returns:
    A `(N - len(axis))` dimensional `Tensor` of same dtype as `x`, or, if
      `axis` is `None`, a scalar.

  Raises:
    ValueError:  If argument 'interpolation' is not an allowed type.
    ValueError:  If `axis` is given but its value is not statically available.
  """
  name = name or "percentile"
  allowed_interpolations = {"lower", "higher", "nearest"}

  if interpolation is None:
    interpolation = "nearest"
  elif interpolation not in allowed_interpolations:
    raise ValueError("Argument 'interpolation' must be in %s.  Found %s" %
                     (allowed_interpolations, interpolation))

  # `values` has to be passed by keyword: the second positional parameter of
  # name_scope is `default_name`, not the list of input values.
  with ops.name_scope(name, values=[x, q]):
    x = ops.convert_to_tensor(x, name="x")
    # Double is needed here and below, else we get the wrong index if the array
    # is huge along axis.
    q = math_ops.to_double(q, name="q")
    _get_static_ndims(q, expect_ndims=0)

    if validate_args:
      q = control_flow_ops.with_dependencies([
          check_ops.assert_rank(q, 0),
          check_ops.assert_greater_equal(q, math_ops.to_double(0.)),
          check_ops.assert_less_equal(q, math_ops.to_double(100.))
      ], q)

    if axis is None:
      # Every element is a sample: flatten so samples sit on the last axis.
      y = array_ops.reshape(x, [-1])
    else:
      axis = ops.convert_to_tensor(axis, name="axis")
      check_ops.assert_integer(axis)
      axis_ndims = _get_static_ndims(
          axis, expect_static=True, expect_ndims_no_more_than=1)
      axis_const = tensor_util.constant_value(axis)
      if axis_const is None:
        raise ValueError(
            "Expected argument 'axis' to be statically available.  Found: %s" %
            axis)
      axis = axis_const
      if axis_ndims == 0:
        axis = [axis]
      axis = [int(a) for a in axis]
      x_ndims = _get_static_ndims(
          x, expect_static=True, expect_ndims_at_least=1)
      axis = _make_static_axis_non_negative(axis, x_ndims)
      y = _move_dims_to_flat_end(x, axis, x_ndims)

    frac_at_q_or_above = 1. - q / 100.
    d = math_ops.to_double(array_ops.shape(y)[-1])

    if interpolation == "lower":
      index = math_ops.ceil((d - 1) * frac_at_q_or_above)
    elif interpolation == "higher":
      index = math_ops.floor((d - 1) * frac_at_q_or_above)
    elif interpolation == "nearest":
      index = math_ops.round((d - 1) * frac_at_q_or_above)

    # If d is gigantic, then we would have d == d - 1, even in double... So
    # let's use max/min to avoid out of bounds errors.
    d = array_ops.shape(y)[-1]
    # d - 1 will be distinct from d in int32.
    index = clip_ops.clip_by_value(math_ops.to_int32(index), 0, d - 1)

    # Sort everything, not just the top 'k' entries, which allows multiple calls
    # to sort only once (under the hood) and use CSE.
    sorted_y = _sort_tensor(y)

    # result.shape = B
    result = sorted_y[..., index]
    result.set_shape(y.get_shape()[:-1])

    if keep_dims:
      if axis is None:
        # ones_vec = [1, 1,..., 1], total length = len(S) + len(B).
        ones_vec = array_ops.ones(
            shape=[_get_best_effort_ndims(x)], dtype=dtypes.int32)
        # Broadcasting against an all-ones tensor of x's rank restores the
        # reduced dimensions with size 1.
        result *= array_ops.ones(ones_vec, dtype=x.dtype)
      else:
        result = _insert_back_keep_dims(result, axis)

    return result
def auto_correlation(
    x,
    axis=-1,
    max_lags=None,
    center=True,
    normalize=True,
    name="auto_correlation"):
  """Auto correlation along one axis.

  Given a `1-D` wide sense stationary (WSS) sequence `X`, the auto correlation
  `RXX` may be defined as  (with `E` expectation and `Conj` complex conjugate)

  ```
  RXX[m] := E{ W[m] Conj(W[0]) } = E{ W[0] Conj(W[-m]) },
  W[n]   := (X[n] - MU) / S,
  MU     := E{ X[0] },
  S**2   := E{ (X[0] - MU) Conj(X[0] - MU) }.
  ```

  This function takes the viewpoint that `x` is (along one axis) a finite
  sub-sequence of a realization of (WSS) `X`, and then uses `x` to produce an
  estimate of `RXX[m]` as follows:

  After extending `x` from length `L` to `inf` by zero padding, the auto
  correlation estimate `rxx[m]` is computed for `m = 0, 1, ..., max_lags` as

  ```
  rxx[m] := (L - m)**-1 sum_n w[n + m] Conj(w[n]),
  w[n]   := (x[n] - mu) / s,
  mu     := L**-1 sum_n x[n],
  s**2   := L**-1 sum_n (x[n] - mu) Conj(x[n] - mu)
  ```

  The error in this estimate is proportional to `1 / sqrt(len(x) - m)`, so users
  often set `max_lags` small enough so that the entire output is meaningful.

  Note that since `mu` is an imperfect estimate of `E{ X[0] }`, and we divide by
  `len(x) - m` rather than `len(x) - m - 1`, our estimate of auto correlation
  contains a slight bias, which goes to zero as `len(x) - m --> infinity`.

  Args:
    x:  `float32` or `complex64` `Tensor`.
    axis:  Python `int`. The axis number along which to compute correlation.
      Other dimensions index different batch members.
    max_lags:  Positive `int` tensor.  The maximum value of `m` to consider
      (in equation above).  If `max_lags >= x.shape[axis]`, we effectively
      re-set `max_lags` to `x.shape[axis] - 1`.  If `None` (the default),
      `x.shape[axis] - 1` is used.
    center:  Python `bool`.  If `False`, do not subtract the mean estimate `mu`
      from `x[n]` when forming `w[n]`.
    normalize:  Python `bool`.  If `False`, do not divide by the variance
      estimate `s**2` when forming `w[n]`.
    name:  `String` name to prepend to created ops.

  Returns:
    `rxx`: `Tensor` of same `dtype` as `x`.  `rxx.shape[i] = x.shape[i]` for
      `i != axis`, and `rxx.shape[axis] = max_lags + 1`.

  Raises:
    TypeError:  If `x` is not a supported type (neither floating point nor
      complex).
  """
  # Implementation details:
  # Extend length N / 2 1-D array x to length N by zero padding onto the end.
  # Then, set
  #   F[x]_k := sum_n x_n exp{-i 2 pi k n / N }.
  # It is not hard to see that
  #   F[x]_k Conj(F[x]_k) = F[R]_k, where
  #   R_m := sum_n x_n Conj(x_{(n - m) mod N}).
  # One can also check that R_m / (N / 2 - m) is an unbiased estimate of RXX[m].

  # Since F[x] is the DFT of x, this leads us to a zero-padding and FFT/IFFT
  # based version of estimating RXX.
  # Note that this is a special case of the Wiener-Khinchin Theorem.
  with ops.name_scope(name, values=[x]):
    x = ops.convert_to_tensor(x, name="x")

    # Rotate dimensions of x in order to put axis at the rightmost dim.
    # FFT op requires this.
    rank = util.prefer_static_rank(x)
    # Convert a negative axis into its non-negative equivalent.
    if axis < 0:
      axis = rank + axis
    shift = rank - 1 - axis
    # Suppose x.shape[axis] = T, so there are T "time" steps.
    #   ==> x_rotated.shape = B + [T],
    # where B is x_rotated's batch shape.
    x_rotated = util.rotate_transpose(x, shift)

    if center:
      # Subtract the mean taken along the time (last) dimension.
      x_rotated -= math_ops.reduce_mean(x_rotated, axis=-1, keepdims=True)

    # x_len = N / 2 from above explanation.  The length of x along axis.
    # Get a value for x_len that works in all cases.
    x_len = util.prefer_static_shape(x_rotated)[-1]

    # TODO(langmore) Investigate whether this zero padding helps or hurts.  At
    # the moment it is necessary so that all FFT implementations work.
    # Zero pad to the smallest power of 2 that is at least 2 * x_len, which
    # equals 2**(ceil(Log_2(2 * x_len))).  Note: Log_2(X) = Log_e(X) / Log_e(2).
    x_len_float64 = math_ops.cast(x_len, np.float64)
    target_length = math_ops.pow(
        np.float64(2.),
        math_ops.ceil(math_ops.log(x_len_float64 * 2) / np.log(2.)))
    pad_length = math_ops.cast(target_length - x_len_float64, np.int32)

    # We should have:
    # x_rotated_pad.shape = x_rotated.shape[:-1] + [T + pad_length]
    #                     = B + [T + pad_length]
    x_rotated_pad = util.pad(x_rotated, axis=-1, back=True, count=pad_length)

    dtype = x.dtype
    if not dtype.is_complex:
      if not dtype.is_floating:
        raise TypeError("Argument x must have either float or complex dtype"
                        " found: {}".format(dtype))
      # Real input: promote to complex with a zero imaginary part before the
      # FFT.
      x_rotated_pad = math_ops.complex(x_rotated_pad,
                                       dtype.real_dtype.as_numpy_dtype(0.))

    # Autocorrelation is IFFT of power-spectral density (up to some scaling).
    fft_x_rotated_pad = spectral_ops.fft(x_rotated_pad)
    spectral_density = fft_x_rotated_pad * math_ops.conj(fft_x_rotated_pad)
    # shifted_product is R[m] from above detailed explanation.
    # It is the inner product sum_n X[n] * Conj(X[n - m]).
    shifted_product = spectral_ops.ifft(spectral_density)

    # Cast back to real-valued if x was real to begin with.
    shifted_product = math_ops.cast(shifted_product, dtype)

    # Figure out if we can deduce the final static shape, and set max_lags.
    # Use x_rotated as a reference, because it has the time dimension in the far
    # right, and was created before we performed all sorts of crazy shape
    # manipulations.
    know_static_shape = True
    if not x_rotated.shape.is_fully_defined():
      know_static_shape = False
    if max_lags is None:
      # Default: use every available lag.
      max_lags = x_len - 1
    else:
      max_lags = ops.convert_to_tensor(max_lags, name="max_lags")
      max_lags_ = tensor_util.constant_value(max_lags)
      if max_lags_ is None or not know_static_shape:
        # Either max_lags or the shape is only known at run time, so the cap
        # has to be applied with a graph op.
        know_static_shape = False
        max_lags = math_ops.minimum(x_len - 1, max_lags)
      else:
        # Everything is static: cap max_lags in plain Python.
        max_lags = min(x_len - 1, max_lags_)

    # Chop off the padding.
    # We allow users to provide a huge max_lags, but cut it off here.
    # shifted_product_chopped.shape = x_rotated.shape[:-1] + [max_lags + 1]
    shifted_product_chopped = shifted_product[..., :max_lags + 1]

    # If possible, set shape.
    if know_static_shape:
      chopped_shape = x_rotated.shape.as_list()
      chopped_shape[-1] = min(x_len, max_lags + 1)
      shifted_product_chopped.set_shape(chopped_shape)

    # Recall R[m] is a sum of N / 2 - m nonzero terms x[n] Conj(x[n - m]).  The
    # other terms were zeros arising only due to zero padding.
    # `denominator = (N / 2 - m)` (defined below) is the proper term to
    # divide by to make this an unbiased estimate of the expectation
    # E[X[n] Conj(X[n - m])].
    x_len = math_ops.cast(x_len, dtype.real_dtype)
    max_lags = math_ops.cast(max_lags, dtype.real_dtype)
    denominator = x_len - math_ops.range(0., max_lags + 1.)
    denominator = math_ops.cast(denominator, dtype)
    shifted_product_rotated = shifted_product_chopped / denominator

    if normalize:
      # Divide by the lag-0 value so that the output at lag 0 equals 1.
      shifted_product_rotated /= shifted_product_rotated[..., :1]

    # Transpose dimensions back to those of x.
    return util.rotate_transpose(shifted_product_rotated, -shift)
def kernel_classifier_distance_and_std_from_activations(
        real_activations,
        generated_activations,
        max_block_size=10,
        dtype=None):
    """Kernel "classifier" distance for evaluating a generative model.

    This method computes the kernel classifier distance from activations of
    real images and generated images. This can be used independently of the
    kernel_classifier_distance() method, especially in the case of using large
    batches during evaluation where we would like to precompute all of the
    activations before computing the classifier distance, or if we want to
    compute multiple metrics based on the same images. It also returns a rough
    estimate of the standard error of the estimator.

    This technique is described in detail in https://arxiv.org/abs/1801.01401.
    Given two distributions P and Q of activations, this function calculates

        E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
          - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]

    where k is the polynomial kernel

        k(x, y) = ( x^T y / dimension + 1 )^3.

    This captures how different the distributions of real and generated images'
    visual features are. Like the Frechet distance (and unlike the Inception
    score), this is a true distance and incorporates information about the
    target images. Unlike the Frechet score, this function computes an
    *unbiased* and asymptotically normal estimator, which makes comparing
    estimates across models much more intuitive.

    The estimator used takes time quadratic in max_block_size. Larger values of
    max_block_size will decrease the variance of the estimator but increase the
    computational cost. This differs slightly from the estimator used by the
    original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
    The estimate of the standard error will also be more reliable when there are
    more blocks, i.e. when max_block_size is smaller.

    NOTE: the blocking code assumes that real_activations and
    generated_activations are both in random order. If either is sorted in a
    meaningful order, the estimator will behave poorly.

    Args:
      real_activations: 2D Tensor containing activations of real data. Shape is
        [batch_size, activation_size].
      generated_activations: 2D Tensor containing activations of generated data.
        Shape is [batch_size, activation_size].
      max_block_size: integer, default 10. The distance estimator splits samples
        into blocks for computational efficiency. Larger values are more
        computationally expensive but decrease the variance of the distance
        estimate. Having a smaller block size also gives a better estimate of the
        standard error.
      dtype: if not None, coerce activations to this dtype before computations.
        If None, both activation tensors must already share the same dtype.

    Returns:
     The Kernel Inception Distance. A floating-point scalar of the same type
       as the output of the activations.
     An estimate of the standard error of the distance estimator (a scalar of
       the same type). It is NaN when there is only a single block.
    """

    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)
    real_activations.shape[1].assert_is_compatible_with(
        generated_activations.shape[1])

    if dtype is None:
        dtype = real_activations.dtype
        assert generated_activations.dtype == dtype
    else:
        real_activations = math_ops.cast(real_activations, dtype)
        generated_activations = math_ops.cast(generated_activations, dtype)

    # Figure out how to split the activations into blocks of approximately
    # equal size, with none larger than max_block_size.
    n_r = array_ops.shape(real_activations)[0]
    n_g = array_ops.shape(generated_activations)[0]

    n_bigger = math_ops.maximum(n_r, n_g)
    n_blocks = math_ops.to_int32(math_ops.ceil(n_bigger / max_block_size))

    # Base block size for each set of samples.
    v_r = n_r // n_blocks
    v_g = n_g // n_blocks

    # Number of blocks that receive one extra sample so every sample is used.
    n_plusone_r = n_r - v_r * n_blocks
    n_plusone_g = n_g - v_g * n_blocks

    sizes_r = array_ops.concat([
        array_ops.fill([n_blocks - n_plusone_r], v_r),
        array_ops.fill([n_plusone_r], v_r + 1),
    ], 0)
    sizes_g = array_ops.concat([
        array_ops.fill([n_blocks - n_plusone_g], v_g),
        array_ops.fill([n_plusone_g], v_g + 1),
    ], 0)

    # Block boundaries: block i spans inds[i] (inclusive) to inds[i + 1]
    # (exclusive).
    zero = array_ops.zeros([1], dtype=dtypes.int32)
    inds_r = array_ops.concat([zero, math_ops.cumsum(sizes_r)], 0)
    inds_g = array_ops.concat([zero, math_ops.cumsum(sizes_g)], 0)

    # Activation dimension used to scale the kernel. Uses array_ops like the
    # rest of this function rather than the top-level `tf` namespace.
    dim = math_ops.cast(array_ops.shape(real_activations)[1], dtype)

    def compute_kid_block(i):
        'Compute the ith block of the KID estimate.'
        r_s = inds_r[i]
        r_e = inds_r[i + 1]
        r = real_activations[r_s:r_e]
        m = math_ops.cast(r_e - r_s, dtype)

        g_s = inds_g[i]
        g_e = inds_g[i + 1]
        g = generated_activations[g_s:g_e]
        n = math_ops.cast(g_e - g_s, dtype)

        k_rr = (math_ops.matmul(r, r, transpose_b=True) / dim + 1)**3
        k_rg = (math_ops.matmul(r, g, transpose_b=True) / dim + 1)**3
        k_gg = (math_ops.matmul(g, g, transpose_b=True) / dim + 1)**3
        # The within-set terms exclude the diagonal (each sample paired with
        # itself), hence the trace subtraction and the m * (m - 1) /
        # n * (n - 1) normalizers.
        return (-2 * math_ops.reduce_mean(k_rg) +
                (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) /
                (m * (m - 1)) +
                (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) /
                (n * (n - 1)))

    ests = functional_ops.map_fn(compute_kid_block,
                                 math_ops.range(n_blocks),
                                 dtype=dtype,
                                 back_prop=False)

    mn = math_ops.reduce_mean(ests)

    # nn_impl.moments doesn't use the Bessel correction, which we want here
    n_blocks_ = math_ops.cast(n_blocks, dtype)
    var = control_flow_ops.cond(
        math_ops.less_equal(n_blocks, 1),
        lambda: array_ops.constant(float('nan'), dtype=dtype),
        lambda: math_ops.reduce_sum(math_ops.square(ests - mn)) /
        (n_blocks_ - 1))

    return mn, math_ops.sqrt(var / n_blocks_)
def polynomial_decay(exploration_rate,
                     timestep,
                     decay_steps,
                     end_exploration_rate=0.0001,
                     power=1.0,
                     cycle=False,
                     name=None):
    """Applies a polynomial decay to the exploration rate.

    It is commonly observed that a monotonically decreasing exploration rate, whose
    degree of change is carefully chosen, results in a better performing model.
    This function applies a polynomial decay function to a provided initial
    `exploration_rate` to reach an `end_exploration_rate` in the given `decay_steps`.

    It requires a `timestep` value to compute the decayed exploration rate.  You
    can just pass a TensorFlow variable that you increment at each training step.

    The function returns the decayed exploration rate.  It is computed as:

    ```python
    >>> timestep = min(timestep, decay_steps)
    >>> decayed_exploration_rate = (exploration_rate - end_exploration_rate) *
    ...                            (1 - timestep / decay_steps) ** (power) +
    ...                            end_exploration_rate
    ```

    If `cycle` is True then `timestep` is not clamped; instead `decay_steps`
    is replaced by the smallest positive multiple of `decay_steps` that is
    not smaller than `timestep` (at `timestep == 0` it is left unchanged).

    ```python
    >>> decay_steps = decay_steps * max(ceil(timestep / decay_steps), 1)
    >>> decayed_exploration_rate = (exploration_rate - end_exploration_rate) *
    ...                            (1 - timestep / decay_steps) ** (power) +
    ...                            end_exploration_rate
    ```

    Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):

    ```python
    >>> timestep = tf.Variable(0, trainable=False)
    >>> starter_exploration_rate = 0.1
    >>> end_exploration_rate = 0.01
    >>> decay_steps = 10000
    >>> exploration_rate = polynomial_decay(starter_exploration_rate, timestep,
    ...                                     decay_steps, end_exploration_rate,
    ...                                     power=0.5)
    ```

    Args:
        exploration_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The initial exploration rate.
        timestep: A scalar `int32` or `int64` `Tensor` or a Python number.
            Global step to use for the decay computation.  Must not be negative.
        decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
            Must be positive.  See the decay computation above.
        end_exploration_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The minimal end exploration rate.
        power: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The power of the polynomial. Defaults to linear, 1.0.
        cycle: A boolean, whether or not it should cycle beyond decay_steps.
        name: String.  Optional name of the operation. Defaults to
            'PolynomialDecay'.

    Returns:
        A scalar `Tensor` of the same type as `exploration_rate`.  The decayed exploration rate.

    Raises:
        ValueError: if `timestep` is not supplied.
    """
    if timestep is None:
        raise ValueError("timestep is required for polynomial_decay.")
    with get_name_scope(name=name,
                        scope="PolynomialDecay",
                        values=[
                            exploration_rate, timestep, decay_steps,
                            end_exploration_rate, power
                        ]) as name:
        exploration_rate = ops.convert_to_tensor(exploration_rate,
                                                 name="exploration_rate")
        # All arithmetic below is carried out in the dtype of the initial rate.
        dtype = exploration_rate.dtype
        timestep = math_ops.cast(timestep, dtype)
        decay_steps = math_ops.cast(decay_steps, dtype)
        end_exploration_rate = math_ops.cast(end_exploration_rate, dtype)
        power = math_ops.cast(power, dtype)
        if cycle:
            # Stretch decay_steps to the smallest multiple that covers timestep.
            # ceil(0 / decay_steps) is 0, which would zero out decay_steps and
            # turn p into 0 / 0 = NaN at the very first step, so the multiplier
            # is clamped to at least 1.
            multiplier = math_ops.maximum(
                math_ops.ceil(timestep / decay_steps),
                math_ops.cast(1, dtype))
            decay_steps = math_ops.multiply(decay_steps, multiplier)
        else:
            # Make sure that the timestep used is not bigger than decay_steps.
            timestep = math_ops.minimum(timestep, decay_steps)

        # Fraction of the (possibly stretched) decay horizon already elapsed.
        p = math_ops.div(timestep, decay_steps)
        return math_ops.add(math_ops.multiply(
            exploration_rate - end_exploration_rate,
            math_ops.pow(1 - p, power)),
                            end_exploration_rate,
                            name=name)
def estimator_model_fn(features, labels, mode, params):
    """Build the DANN train/eval graph for a `tf.estimator.Estimator`.

    The same `dann_model_fn` is applied to a source batch and a target batch.
    The total loss is the source classification loss, plus (unless the
    `source_only` flag is set) the domain classification losses of both
    batches.

    Args:
        features: Dict with the entries 'x_s' (source inputs) and 'x_t'
            (target inputs).
        labels: Dict with the entries 'y_s' and 'y_t' (class labels, cast to
            int32 here).
        mode: A `tf.estimator.ModeKeys` value.
        params: Dict providing 'feature_columns' (index 0 for the source,
            index 1 for the target) and 'iter_ratio' (divisor that turns the
            global step into an epoch counter).

    Returns:
        A `tf.estimator.EstimatorSpec` for TRAIN and EVAL. PREDICT is not
        implemented and yields `None`.
    """
    input_layer_source = tf.feature_column.input_layer(
        {"x_s": features['x_s']}, params['feature_columns'][0])
    input_layer_target = tf.feature_column.input_layer(
        {"x_t": features['x_t']}, params['feature_columns'][1])
    # Reshape the flat feature columns into 32x32 image batches.
    input_layer_source = tf.reshape(input_layer_source,
                                    [-1, 32, 32, FLAGS.channel_size])
    input_layer_target = tf.reshape(input_layer_target,
                                    [-1, 32, 32, FLAGS.channel_size])

    # NOTE(review): labels are indexed before the PREDICT check below, so this
    # requires `labels` to be a dict in every mode - confirm against callers.
    y_s = tf.cast(labels['y_s'], tf.int32)
    y_t = tf.cast(labels['y_t'], tf.int32)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # TODO: To be implemented
        return
    if mode in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL]:
        training = mode == tf.estimator.ModeKeys.TRAIN
        iter_ratio = params['iter_ratio']
        # Epoch counter derived from the global step.
        current_epoch = math_ops.ceil(
            math_ops.divide(tf.train.get_global_step(), iter_ratio))
        # Weight passed to the model as `alpha`; presumably the gradient
        # reversal strength schedule - see utilities.reverse_gradient_weight.
        alpha = utilities.reverse_gradient_weight(current_epoch,
                                                  FLAGS.total_epochs, 10.)
        # Apply DANN model
        class_logits_source, domain_logits_source = dann_model_fn(
            input_layer_source, alpha=alpha, is_training=training)
        class_logits_target, domain_logits_target = dann_model_fn(
            input_layer_target, alpha=alpha, is_training=training)
        # Get predictions for accuracy
        pred_classes_target = tf.argmax(class_logits_target,
                                        axis=1,
                                        output_type=tf.int32)
        pred_classes_source = tf.argmax(class_logits_source,
                                        axis=1,
                                        output_type=tf.int32)
        # Create domain labels: 0 for every source sample, 1 for every target
        # sample.
        domain_labels_source = tf.zeros([tf.shape(features['x_s'])[0]],
                                        dtype=tf.int32)
        domain_labels_target = tf.ones([tf.shape(features['x_t'])[0]],
                                       dtype=tf.int32)
        # Compute losses
        class_loss = tf.losses.sparse_softmax_cross_entropy(
            labels=y_s, logits=class_logits_source)
        domain_loss_source = tf.losses.sparse_softmax_cross_entropy(
            labels=domain_labels_source, logits=domain_logits_source)
        # Parse the flag case-insensitively, like FLAGS.annealing below, so
        # that e.g. 'false' is not silently treated as source-only training.
        source_only = str(FLAGS.source_only).lower() != 'false'
        if not source_only:
            domain_loss_target = tf.losses.sparse_softmax_cross_entropy(
                labels=domain_labels_target, logits=domain_logits_target)
            total_loss = tf.reduce_mean(class_loss) + tf.reduce_mean(domain_loss_source) + \
                         tf.reduce_mean(domain_loss_target)
        else:
            total_loss = tf.reduce_mean(class_loss)

        if mode == tf.estimator.ModeKeys.EVAL:
            # Streaming accuracies, aggregated over the whole evaluation run.
            source_class_acc = tf.metrics.accuracy(
                labels=y_s,
                predictions=pred_classes_source,
                name='source_class_acc_op')
            target_class_acc = tf.metrics.accuracy(
                labels=y_t,
                predictions=pred_classes_target,
                name='target_class_acc_op')
            metrics = {
                'source_class_acc': source_class_acc,
                'target_class_acc': target_class_acc
            }
            return tf.estimator.EstimatorSpec(mode,
                                              loss=total_loss,
                                              eval_metric_ops=metrics)

        # Calculate a non streaming (per batch) accuracy
        source_class_acc = utilities.non_streaming_accuracy(
            pred_classes_source, y_s)
        target_class_acc = utilities.non_streaming_accuracy(
            pred_classes_target, y_t)

        # Initialize learning rate
        if FLAGS.annealing.lower() == 'true':
            learning_rate = utilities.lr_annealing(
                learning_rate=FLAGS.base_learning_rate,
                current_epoch=current_epoch,
                total_epochs=FLAGS.total_epochs,
                alpha=10,
                beta=0.75)
        else:
            learning_rate = FLAGS.base_learning_rate
        # Named identity ops give these tensors stable graph names
        # (presumably for logging hooks that look them up by name).
        tf.identity(learning_rate, 'learning_rate')
        tf.identity(alpha, 'alpha')
        tf.identity(total_loss, 'loss')
        tf.identity(source_class_acc, 'source_class_acc')
        tf.identity(target_class_acc, 'target_class_acc')
        # TensorBoard
        tf.summary.scalar('Train_source_acc', source_class_acc)
        tf.summary.scalar('Train_target_acc', target_class_acc)
        tf.summary.scalar('Learning_rate', learning_rate)
        tf.summary.scalar('Alpha', alpha)
        tf.summary.merge_all()

        # Optimize; UPDATE_OPS are run before each training step.
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(
                total_loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          train_op=train_op)
def estimator_model_fn(features, labels, mode, params):
    """Build the self-ensembling (student/teacher) graph for an Estimator.

    A student network is applied to the source batch and to the target batch;
    a teacher network (built with `getter=get_getter(ema)`) is applied to the
    target batch. During training the loss is the source classification loss
    plus a weighted consistency loss between student and teacher target
    outputs, and the returned train op also applies the exponential moving
    average after the optimizer step.

    Args:
        features: Dict with the entries 'x_s' (source inputs) and 'x_t'
            (target inputs).
        labels: Dict with the entries 'y_s' and 'y_t' (class labels, cast to
            int32 here).
        mode: A `tf.estimator.ModeKeys` value.
        params: Dict providing 'feature_columns' (index 0 for the source,
            index 1 for the target), 'source_size', 'target_size' and, when
            `FLAGS.rampup_epochs > 0`, 'iter_ratio'.

    Returns:
        A `tf.estimator.EstimatorSpec` for TRAIN and EVAL. PREDICT is not
        implemented and yields `None`.
    """
    training = mode == tf.estimator.ModeKeys.TRAIN

    input_layer_source = tf.feature_column.input_layer(
        {"x_s": features['x_s']}, params['feature_columns'][0])
    input_layer_target = tf.feature_column.input_layer(
        {"x_t": features['x_t']}, params['feature_columns'][1])

    # Reshape the flat feature columns into square image batches.
    input_layer_source = tf.reshape(
        input_layer_source,
        [-1, params['source_size'], params['source_size'], FLAGS.channel_size])
    input_layer_target = tf.reshape(
        input_layer_target,
        [-1, params['target_size'], params['target_size'], FLAGS.channel_size])

    # NOTE(review): labels are indexed before the PREDICT check below, so this
    # requires `labels` to be a dict in every mode - confirm against callers.
    y_s = tf.cast(labels['y_s'], tf.int32)
    y_t = tf.cast(labels['y_t'], tf.int32)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # TODO: To be implemented
        return
    if mode in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL]:
        # Prepare the three different pipelines: source (student), target
        # (student) and target (teacher).
        if training:
            # Student and teacher get independently augmented target views.
            # NOTE(review): the target views are augmented with
            # params['source_size'], not 'target_size' - confirm this is
            # intended (e.g. to bring both domains to one input size).
            input_layer_source_aug = augment_input(input_layer_source,
                                                   params['source_size'])
            input_layer_target_aug_student = augment_input(
                input_layer_target, params['source_size'])
            input_layer_target_aug_teacher = augment_input(
                input_layer_target, params['source_size'])
        else:
            input_layer_source_aug = input_layer_source
            input_layer_target_aug_student = input_layer_target
            input_layer_target_aug_teacher = input_layer_target

        # Initialize the exponential moving average
        ema = tf.train.ExponentialMovingAverage(decay=0.99)
        # Apply self ensembling for the student network
        class_logits_source_student = self_ensembling_fn(
            input_layer_source_aug, scope='classifier', is_training=training)

        # Snapshot of the trainable variables that exist after the first
        # network call; these are the variables the EMA is applied to.
        var_class = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        class_logits_target_student = self_ensembling_fn(
            input_layer_target_aug_student,
            scope='classifier',
            is_training=training)
        class_logits_target_teacher = self_ensembling_fn(
            input_layer_target_aug_teacher,
            scope='classifier',
            is_training=training,
            getter=get_getter(ema))
        # Get predictions for accuracy
        pred_classes_source_student = tf.argmax(class_logits_source_student,
                                                axis=1,
                                                output_type=tf.int32)
        pred_classes_target_student = tf.argmax(class_logits_target_student,
                                                axis=1,
                                                output_type=tf.int32)
        pred_classes_target_teacher = tf.argmax(class_logits_target_teacher,
                                                axis=1,
                                                output_type=tf.int32)

        # Compute losses
        class_loss_source = tf.losses.sparse_softmax_cross_entropy(
            labels=y_s, logits=class_logits_source_student)
        if training:
            if FLAGS.rampup_epochs > 0:
                # Consistency loss on class probabilities, scaled by a
                # ramp-up factor that depends on the current epoch.
                squared_difference_loss = tf.losses.mean_squared_error(
                    tf.nn.softmax(class_logits_target_student),
                    tf.nn.softmax(class_logits_target_teacher))
                iter_ratio = params['iter_ratio']
                current_epoch = math_ops.ceil(
                    math_ops.divide(tf.train.get_global_step(), iter_ratio))
                rampup = utilities.calculate_ramp_up(current_epoch,
                                                     FLAGS.rampup_epochs)
                squared_difference_loss = squared_difference_loss * rampup
            else:
                # Consistency loss on raw logits, kept only for samples whose
                # largest teacher logit exceeds the confidence threshold.
                # NOTE(review): the threshold is compared against logits, not
                # softmax probabilities - confirm this is intended.
                max_target_teacher = tf.reduce_max(class_logits_target_teacher,
                                                   axis=1)
                binary_mask = tf.cast(
                    math_ops.greater(max_target_teacher,
                                     FLAGS.confidence_threshold), tf.float32)
                loss = class_logits_target_student - class_logits_target_teacher
                loss = loss * loss
                loss = tf.reduce_mean(loss, axis=1)
                squared_difference_loss = tf.reduce_mean(loss * binary_mask)

            # Compute weighted loss
            total_loss = class_loss_source + squared_difference_loss * FLAGS.self_ensembling_loss_weight
        else:
            total_loss = class_loss_source

        if mode == tf.estimator.ModeKeys.EVAL:
            # Evaluation reports the teacher network's accuracy on both
            # domains.
            class_logits_source_teacher = self_ensembling_fn(
                input_layer_source_aug,
                scope='classifier',
                is_training=training,
                getter=get_getter(ema))
            pred_classes_source_teacher = tf.argmax(
                class_logits_source_teacher, axis=1, output_type=tf.int32)
            source_class_acc = tf.metrics.accuracy(
                labels=y_s,
                predictions=pred_classes_source_teacher,
                name='source_class_acc_op')
            # Distinct op name for the target metric (it previously reused
            # the source metric's name).
            target_class_acc = tf.metrics.accuracy(
                labels=y_t,
                predictions=pred_classes_target_teacher,
                name='target_class_acc_op')
            metrics = {
                'source_class_acc': source_class_acc,
                'target_class_acc': target_class_acc
            }
            return tf.estimator.EstimatorSpec(mode,
                                              loss=total_loss,
                                              eval_metric_ops=metrics)

        # Calculate a non streaming (per batch) accuracy
        source_class_acc_student = utilities.non_streaming_accuracy(
            pred_classes_source_student, y_s)
        target_class_acc_student = utilities.non_streaming_accuracy(
            pred_classes_target_student, y_t)
        target_class_acc_teacher = utilities.non_streaming_accuracy(
            pred_classes_target_teacher, y_t)

        # Named identity ops give these tensors stable graph names
        # (presumably for logging hooks that look them up by name).
        tf.identity(FLAGS.learning_rate, 'learning_rate')
        tf.identity(total_loss, 'loss')
        tf.identity(source_class_acc_student, 'source_class_acc_student')
        tf.identity(target_class_acc_student, 'target_class_acc_student')
        tf.identity(target_class_acc_teacher, 'target_class_acc_teacher')
        # TensorBoard
        tf.summary.scalar('Train_source_acc_student', source_class_acc_student)
        tf.summary.scalar('Train_target_acc_student', target_class_acc_student)
        tf.summary.scalar('Train_target_acc_teacher', target_class_acc_teacher)
        tf.summary.scalar('Learning_rate', FLAGS.learning_rate)
        tf.summary.merge_all()

        # Optimize; UPDATE_OPS are run before each training step.
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(
                total_loss, global_step=tf.train.get_global_step())
        # The EMA update runs after the optimizer step; returning it as the
        # train op makes every training step execute both.
        with tf.control_dependencies([train_op]):
            ema_op = ema.apply(var_class)
        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          train_op=ema_op)
def kernel_classifier_distance_and_std_from_activations(real_activations,
                                                        generated_activations,
                                                        max_block_size=1024,
                                                        dtype=None):
  """Kernel "classifier" distance, with a standard error, from activations.

  Works directly on precomputed activations of real and generated images, so
  it can be used independently of kernel_classifier_distance(): for example
  when large evaluation batches make it preferable to compute all activations
  up front, or when several metrics share the same images. Alongside the
  distance it returns a rough estimate of the estimator's standard error.

  The technique is described in https://arxiv.org/abs/1801.01401. For two
  activation distributions P and Q the quantity being estimated is

      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]

  with the polynomial kernel

      k(x, y) = ( x^T y / dimension + 1 )^3.

  It measures how different the visual features of real and generated images
  are. Like the Frechet distance (and unlike the Inception score) it is a true
  distance and uses information about the target images. Unlike the Frechet
  score, the estimator is *unbiased* and asymptotically normal, which makes
  estimates easier to compare across models.

  The samples are split into blocks and one estimate is computed per block
  (the block estimator of https://arxiv.org/abs/1307.1954, which differs
  slightly from the estimator of the original paper). Run time is quadratic in
  max_block_size. Larger blocks lower the variance of the distance estimate
  but cost more; more blocks (i.e. a smaller max_block_size) make the standard
  error estimate more reliable.

  NOTE: the blocking assumes that real_activations and generated_activations
  are both in random order. If either is sorted in a meaningful way, the
  estimator will behave poorly.

  Args:
    real_activations: 2D Tensor of activations of real data, shaped
      [batch_size, activation_size].
    generated_activations: 2D Tensor of activations of generated data, shaped
      [batch_size, activation_size].
    max_block_size: integer, default 1024. Upper bound on the number of
      samples per block. Larger values are more expensive but reduce the
      variance of the distance estimate; smaller values give a better estimate
      of the standard error.
    dtype: If not None, activations are cast to this dtype before any
      computation.

  Returns:
   The Kernel Inception Distance. A floating-point scalar of the same type
     as the output of the activations.
   An estimate of the standard error of the distance estimator (a scalar of
     the same type). It is NaN when there is only a single block.
  """

  for activations in (real_activations, generated_activations):
    activations.shape.assert_has_rank(2)
  real_activations.shape[1].assert_is_compatible_with(
      generated_activations.shape[1])

  if dtype is not None:
    real_activations = math_ops.cast(real_activations, dtype)
    generated_activations = math_ops.cast(generated_activations, dtype)
  else:
    dtype = real_activations.dtype
    assert generated_activations.dtype == dtype

  # Number of blocks: enough that the larger sample set fits into blocks of at
  # most max_block_size samples.
  num_real = array_ops.shape(real_activations)[0]
  num_generated = array_ops.shape(generated_activations)[0]
  num_larger_set = math_ops.maximum(num_real, num_generated)
  num_blocks = math_ops.to_int32(
      math_ops.ceil(num_larger_set / max_block_size))

  def block_sizes(num_samples):
    """Splits num_samples into num_blocks sizes differing by at most one."""
    base_size = num_samples // num_blocks
    num_bigger_blocks = num_samples - base_size * num_blocks
    return array_ops.concat([
        array_ops.fill([num_blocks - num_bigger_blocks], base_size),
        array_ops.fill([num_bigger_blocks], base_size + 1),
    ], 0)

  real_sizes = block_sizes(num_real)
  generated_sizes = block_sizes(num_generated)

  # Block i covers rows bounds[i]:bounds[i + 1].
  leading_zero = array_ops.zeros([1], dtype=dtypes.int32)
  real_bounds = array_ops.concat(
      [leading_zero, math_ops.cumsum(real_sizes)], 0)
  generated_bounds = array_ops.concat(
      [leading_zero, math_ops.cumsum(generated_sizes)], 0)

  feature_dim = math_ops.cast(real_activations.shape[1], dtype)

  def cubic_kernel(lhs, rhs):
    """Pairwise polynomial kernel (x^T y / dimension + 1)^3."""
    return (math_ops.matmul(lhs, rhs, transpose_b=True) / feature_dim + 1)**3

  def off_diagonal_mean(gram, count):
    """Mean of the off-diagonal entries of a count x count Gram matrix."""
    return ((math_ops.reduce_sum(gram) - math_ops.trace(gram)) /
            (count * (count - 1)))

  def compute_kid_block(block_index):
    """Computes the KID estimate of one block."""
    real_start = real_bounds[block_index]
    real_end = real_bounds[block_index + 1]
    real_block = real_activations[real_start:real_end]
    real_count = math_ops.cast(real_end - real_start, dtype)

    generated_start = generated_bounds[block_index]
    generated_end = generated_bounds[block_index + 1]
    generated_block = generated_activations[generated_start:generated_end]
    generated_count = math_ops.cast(generated_end - generated_start, dtype)

    gram_real = cubic_kernel(real_block, real_block)
    gram_cross = cubic_kernel(real_block, generated_block)
    gram_generated = cubic_kernel(generated_block, generated_block)

    cross_term = -2 * math_ops.reduce_mean(gram_cross)
    real_term = off_diagonal_mean(gram_real, real_count)
    generated_term = off_diagonal_mean(gram_generated, generated_count)
    return cross_term + real_term + generated_term

  block_estimates = map_fn.map_fn(
      compute_kid_block,
      math_ops.range(num_blocks),
      dtype=dtype,
      back_prop=False)

  mean_estimate = math_ops.reduce_mean(block_estimates)

  # The sample variance is computed by hand because nn_impl.moments does not
  # apply the Bessel correction, which is wanted here.
  num_blocks_float = math_ops.cast(num_blocks, dtype)

  def unbiased_variance():
    squared_deviations = math_ops.square(block_estimates - mean_estimate)
    return math_ops.reduce_sum(squared_deviations) / (num_blocks_float - 1)

  variance = control_flow_ops.cond(
      math_ops.less_equal(num_blocks, 1),
      lambda: array_ops.constant(float('nan'), dtype=dtype),
      unbiased_variance)

  return mean_estimate, math_ops.sqrt(variance / num_blocks_float)
    def __call__(self, step: int):
        """
        Compute the learning rate for a given training step.

        While `step - start_warmup_step` does not exceed `warm_up_steps`, the
        rate grows linearly as
        `initial_learning_rate * (step - start_warmup_step) / warm_up_steps`.
        Afterwards it follows the polynomial decay
        `(initial_learning_rate - end_learning_rate) * (1 - p) ** power
        + end_learning_rate`, where `p = step / decay_steps`.

        Without `cycle`, `step` is first clamped to `decay_steps` (this also
        affects the warm-up comparison). With `cycle`, `decay_steps` is
        stretched to the smallest multiple covering `step` instead.

        Args:
            step (int): current training step (a Python number or a scalar
                tensor).

        Returns:
            A scalar tensor with the dtype of `initial_learning_rate`.
        """
        with ops.name_scope_v2(
            self.name or "PolynomialDecayWithWarmup"
        ):
            initial_learning_rate = ops.convert_to_tensor_v2(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # All arithmetic is carried out in the dtype of the initial rate.
            dtype = initial_learning_rate.dtype
            end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
            power = math_ops.cast(self.power, dtype)
            warm_up_steps = math_ops.cast(self.warm_up_steps, dtype)
            start_warmup_step = math_ops.cast(self.start_warmup_step, dtype)

            global_step_recomp = math_ops.cast(step, dtype)
            decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
            if self.cycle:
                # Find the first multiple of decay_steps that is bigger than
                # global_step. If global_step is zero set the multiplier to 1.
                # Both branches return `dtype` so the cond stays valid for
                # non-float32 learning rates.
                multiplier = control_flow_ops.cond(
                    math_ops.equal(global_step_recomp, 0),
                    lambda: math_ops.cast(1.0, dtype),
                    lambda: math_ops.ceil(
                        global_step_recomp / self.decay_steps
                    ),
                )
                decay_steps_recomp = math_ops.multiply(
                    decay_steps_recomp, multiplier
                )
            else:
                # Make sure that the global_step used is not bigger than decay_steps.
                global_step_recomp = math_ops.minimum(
                    global_step_recomp, decay_steps_recomp
                )

            p = math_ops.divide(global_step_recomp, decay_steps_recomp)
            # Add end_learning_rate back so that the schedule ends at
            # end_learning_rate rather than at zero when p reaches 1.
            decay_learning_rate = math_ops.add(
                math_ops.multiply(
                    initial_learning_rate - end_learning_rate,
                    math_ops.pow(1 - p, power),
                ),
                end_learning_rate,
            )

            # Linear warm-up, measured from start_warmup_step.
            global_step_warmup = math_ops.sub(
                global_step_recomp, start_warmup_step
            )
            warmup_percent_done = math_ops.divide(
                global_step_warmup, warm_up_steps
            )
            warmup_learning_rate = math_ops.multiply(
                initial_learning_rate, warmup_percent_done,
            )
            # Switch to the decayed rate once the warm-up window is exceeded.
            learning_rate = control_flow_ops.cond(
                math_ops.greater(global_step_warmup, warm_up_steps),
                lambda: decay_learning_rate,
                lambda: warmup_learning_rate,
            )
            return learning_rate
def kernel_classifier_distance_and_std_from_activations(
        real_activations,
        generated_activations,
        max_block_size=10,
        dtype=None):
    """Block estimate of the kernel distance between two activation sets.

    Both inputs are split into the same number of blocks, chosen so that the
    larger input needs blocks of at most `max_block_size` rows. For block `i`
    with real rows `r` (count `m`) and generated rows `g` (count `n`) the
    estimate is

        mean_offdiag(k(r, r)) + mean_offdiag(k(g, g)) - 2 * mean(k(r, g))

    with the kernel `k(x, y) = (x^T y / dim + 1) ** 3`, where `dim` is the
    size of the second axis of `real_activations`.

    Args:
        real_activations: Rank-2 tensor of activations of real data.
        generated_activations: Rank-2 tensor of activations of generated
            data; its second dimension must be compatible with that of
            `real_activations`.
        max_block_size: Upper bound on the number of rows per block.
        dtype: If not None, both inputs are cast to this dtype first.
            Otherwise both inputs must already share one dtype.

    Returns:
        A pair of scalar tensors of type `dtype`: the mean of the per-block
        estimates, and the standard error of that mean (computed from the
        Bessel-corrected variance of the block estimates; NaN when there is
        at most one block).
    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)
    real_activations.shape[1].assert_is_compatible_with(
        generated_activations.shape[1])

    if dtype is not None:
        real_activations = math_ops.cast(real_activations, dtype)
        generated_activations = math_ops.cast(generated_activations, dtype)
    else:
        dtype = real_activations.dtype
        assert generated_activations.dtype == dtype

    real_count = array_ops.shape(real_activations)[0]
    gen_count = array_ops.shape(generated_activations)[0]

    larger_count = math_ops.maximum(real_count, gen_count)
    block_count = math_ops.to_int32(
        math_ops.ceil(larger_count / max_block_size))

    def sizes_for(sample_count):
        # `block_count` sizes that sum to `sample_count`: the trailing
        # `n_large` blocks hold one row more than the leading ones.
        small = sample_count // block_count
        n_large = sample_count - small * block_count
        return array_ops.concat([
            array_ops.fill([block_count - n_large], small),
            array_ops.fill([n_large], small + 1),
        ], 0)

    real_sizes = sizes_for(real_count)
    gen_sizes = sizes_for(gen_count)

    # Prefix sums with a leading 0: block i spans offsets[i]:offsets[i + 1].
    origin = array_ops.zeros([1], dtype=dtypes.int32)
    real_offsets = array_ops.concat(
        [origin, math_ops.cumsum(real_sizes)], 0)
    gen_offsets = array_ops.concat(
        [origin, math_ops.cumsum(gen_sizes)], 0)

    dim = math_ops.cast(tf.shape(real_activations)[1], dtype)

    def poly_kernel(a, b):
        # Pairwise cubic polynomial kernel between the rows of a and b.
        return (math_ops.matmul(a, b, transpose_b=True) / dim + 1)**3

    def compute_kid_block(idx):
        real_lo = real_offsets[idx]
        real_hi = real_offsets[idx + 1]
        real_rows = real_activations[real_lo:real_hi]
        m = math_ops.cast(real_hi - real_lo, dtype)

        gen_lo = gen_offsets[idx]
        gen_hi = gen_offsets[idx + 1]
        gen_rows = generated_activations[gen_lo:gen_hi]
        n = math_ops.cast(gen_hi - gen_lo, dtype)

        k_real = poly_kernel(real_rows, real_rows)
        k_cross = poly_kernel(real_rows, gen_rows)
        k_gen = poly_kernel(gen_rows, gen_rows)

        # Within-set terms exclude the diagonal (a sample paired with itself).
        cross_part = -2 * math_ops.reduce_mean(k_cross)
        real_part = ((math_ops.reduce_sum(k_real) - math_ops.trace(k_real)) /
                     (m * (m - 1)))
        gen_part = ((math_ops.reduce_sum(k_gen) - math_ops.trace(k_gen)) /
                    (n * (n - 1)))
        return cross_part + real_part + gen_part

    block_estimates = functional_ops.map_fn(compute_kid_block,
                                            math_ops.range(block_count),
                                            dtype=dtype,
                                            back_prop=False)

    mean_estimate = math_ops.reduce_mean(block_estimates)

    block_count_f = math_ops.cast(block_count, dtype)

    def sample_variance():
        # Bessel-corrected variance of the per-block estimates.
        deviations = math_ops.square(block_estimates - mean_estimate)
        return math_ops.reduce_sum(deviations) / (block_count_f - 1)

    variance = control_flow_ops.cond(
        math_ops.less_equal(block_count, 1),
        lambda: array_ops.constant(float('nan'), dtype=dtype),
        sample_variance)

    return mean_estimate, math_ops.sqrt(variance / block_count_f)
  def build(self, input_shape):
    """Builds the layer.

    Creates the variables for the network modeling the densities, creates the
    auxiliary loss estimating the median and tail quantiles of the densities,
    and then uses that to create the probability mass functions and the update
    op that produces the discrete cumulative density functions used by the range
    coder.

    Besides the variables, this sets `self.input_spec`, `self._matrices`,
    `self._biases`, `self._factors`, `self._medians`, `self._pmf` and
    `self._quantized_cdf`, and registers one loss (`add_loss`) and one update
    op (`add_update`) on the layer.

    Args:
      input_shape: Shape of the input tensor, used to get the number of
        channels.

    Raises:
      ValueError: if `input_shape` doesn't specify the length of the channel
        dimension.
    """
    input_shape = tensor_shape.TensorShape(input_shape)
    channel_axis = self._channel_axis(input_shape.ndims)
    channels = input_shape[channel_axis].value
    if channels is None:
      raise ValueError("The channel dimension of the inputs must be defined.")
    self.input_spec = base_layer.InputSpec(
        ndim=input_shape.ndims, axes={channel_axis: channels})
    # Pad the filter sizes with 1 on both ends, so that layer `i` maps
    # `filters[i]` inputs to `filters[i + 1]` outputs and the chain starts and
    # ends with a single value per channel.
    filters = (1,) + self.filters + (1,)
    # Per-layer scale: the (number of layers)-th root of `init_scale`.
    scale = self.init_scale ** (1 / (len(self.filters) + 1))

    # Create variables.
    self._matrices = []
    self._biases = []
    self._factors = []
    for i in range(len(self.filters) + 1):
      # `log(expm1(x))` is the inverse of softplus, so after the softplus
      # below every matrix entry starts out as `1 / scale / filters[i + 1]`.
      init = np.log(np.expm1(1 / scale / filters[i + 1]))
      matrix = self.add_variable(
          "matrix_{}".format(i), dtype=self.dtype,
          shape=(channels, filters[i + 1], filters[i]),
          initializer=init_ops.Constant(init))
      # Softplus keeps the effective matrix entries positive.
      matrix = nn.softplus(matrix)
      self._matrices.append(matrix)

      bias = self.add_variable(
          "bias_{}".format(i), dtype=self.dtype,
          shape=(channels, filters[i + 1], 1),
          initializer=init_ops.RandomUniform(-.5, .5))
      self._biases.append(bias)

      # Every layer except the last one also gets a factor, squashed into
      # (-1, 1) by tanh and initialized to zero.
      if i < len(self.filters):
        factor = self.add_variable(
            "factor_{}".format(i), dtype=self.dtype,
            shape=(channels, filters[i + 1], 1),
            initializer=init_ops.Zeros())
        factor = math_ops.tanh(factor)
        self._factors.append(factor)

    # To figure out what range of the densities to sample, we need to compute
    # the quantiles given by `tail_mass / 2` and `1 - tail_mass / 2`. Since we
    # can't take inverses of the cumulative directly, we make it an optimization
    # problem:
    # `quantiles = argmin(|logit(cumulative) - target|)`
    # where `target` is `logit(tail_mass / 2)` or `logit(1 - tail_mass / 2)`.
    # Taking the logit (inverse of sigmoid) of the cumulative makes the
    # representation of the right target more numerically stable.

    # Numerically stable way of computing logits of `tail_mass / 2`
    # and `1 - tail_mass / 2`.
    target = np.log(2 / self.tail_mass - 1)
    # Compute lower and upper tail quantile as well as median.
    target = constant_op.constant([-target, 0, target], dtype=self.dtype)

    def quantiles_initializer(shape, dtype=None, partition_info=None):
      # Initializes every channel's (lower, median, upper) quantiles to
      # `(-init_scale, 0, init_scale)`.
      del partition_info  # unused
      assert tuple(shape[1:]) == (1, 3)
      init = constant_op.constant(
          [[[-self.init_scale, 0, self.init_scale]]], dtype=dtype)
      return array_ops.tile(init, (shape[0], 1, 1))

    quantiles = self.add_variable(
        "quantiles", shape=(channels, 1, 3), dtype=self.dtype,
        initializer=quantiles_initializer)
    logits = self._logits_cumulative(quantiles, stop_gradient=True)
    # Auxiliary loss: absolute distance between the cumulative logits at the
    # quantile estimates and their targets, summed over all channels.
    loss = math_ops.reduce_sum(abs(logits - target))
    self.add_loss(loss, inputs=None)

    # Save medians for `call`, `compress`, and `decompress`.
    self._medians = quantiles[:, :, 1:2]
    if not self.optimize_integer_offset:
      self._medians = math_ops.round(self._medians)

    # Largest distance observed between lower tail quantile and median,
    # or between median and upper tail quantile.
    minima = math_ops.reduce_max(self._medians - quantiles[:, :, 0:1])
    maxima = math_ops.reduce_max(quantiles[:, :, 2:3] - self._medians)
    minmax = math_ops.maximum(minima, maxima)
    # Round up to a whole number of bins and sample at least one bin on
    # either side of the median.
    minmax = math_ops.ceil(minmax)
    minmax = math_ops.maximum(minmax, 1)

    # Sample the density up to `minmax` around the median.
    # (`range` excludes its limit, hence `minmax + 1` to include `+minmax`.)
    samples = math_ops.range(-minmax, minmax + 1, dtype=self.dtype)
    samples += self._medians

    half = constant_op.constant(.5, dtype=self.dtype)
    # We strip the sigmoid from the end here, so we can use the special rule
    # below to only compute differences in the left tail of the sigmoid.
    # This increases numerical stability (see explanation in `call`).
    lower = self._logits_cumulative(samples - half, stop_gradient=True)
    upper = self._logits_cumulative(samples + half, stop_gradient=True)
    # Flip signs if we can move more towards the left tail of the sigmoid.
    sign = -math_ops.sign(math_ops.add_n([lower, upper]))
    pmf = abs(math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower))
    # Add tail masses to first and last bin of pmf, as we clip values for
    # compression, meaning that out-of-range values get mapped to these bins.
    pmf = array_ops.concat([
        math_ops.add_n([pmf[:, 0, :1], math_ops.sigmoid(lower[:, 0, :1])]),
        pmf[:, 0, 1:-1],
        math_ops.add_n([pmf[:, 0, -1:], math_ops.sigmoid(-upper[:, 0, -1:])]),
        ], axis=-1)
    self._pmf = pmf

    cdf = coder_ops.pmf_to_quantized_cdf(
        pmf, precision=self.range_coder_precision)
    # Custom getter: ignores whatever `add_variable` passes in and creates the
    # variable directly from `cdf`, without shape validation and outside of
    # any collection.
    def cdf_getter(*args, **kwargs):
      del args, kwargs  # ignored
      return variable_scope.get_variable(
          "quantized_cdf", dtype=dtypes.int32, initializer=cdf,
          trainable=False, validate_shape=False, collections=())
    # Need to provide a fake shape here since add_variable insists on it.
    self._quantized_cdf = self.add_variable(
        "quantized_cdf", shape=(channels, 1), dtype=dtypes.int32,
        getter=cdf_getter, trainable=False)

    # Update op that overwrites the stored CDF with the one computed from the
    # current variables; shape validation is disabled here as well.
    update_op = state_ops.assign(
        self._quantized_cdf, cdf, validate_shape=False)
    self.add_update(update_op, inputs=None)

    super(EntropyBottleneck, self).build(input_shape)
def estimator_model_fn(features, labels, mode, params):
    """Build the Estimator graph for a DANN model on source/target batches.

    Both inputs are pushed through `dann_model_fn`. The training loss is the
    sum of the source classification loss and the two domain-classification
    losses (source batch labelled 0, target batch labelled 1).

    Args:
        features: Mapping with keys 'x_s' (source batch) and 'x_t' (target
            batch).
        labels: Mapping with keys 'y_s' and 'y_t' holding class ids; 'y_s' is
            one-hot encoded over 10 classes for the loss, 'y_t' is only used
            for accuracy.
        mode: A `tf.estimator.ModeKeys` value.
        params: Mapping with 'feature_columns' (index 0 is used for the source
            input, index 1 for the target input) and 'iter_ratio' (the global
            step is divided by it and rounded up to get the current epoch).

    Returns:
        A `tf.estimator.EstimatorSpec` for TRAIN and EVAL. `None` for PREDICT
        (not implemented yet) and for any other mode.
    """
    batch_size = tf.shape(features['x_s'])[0]
    input_layer_source = tf.feature_column.input_layer(
        {"x_s": features['x_s']}, params['feature_columns'][0])
    input_layer_target = tf.feature_column.input_layer(
        {"x_t": features['x_t']}, params['feature_columns'][1])
    # CNNs need input data to be of shape [batch_size, width, height, channel]
    input_layer_source = tf.reshape(input_layer_source, [batch_size, 28, 28, 3])
    input_layer_target = tf.reshape(input_layer_target, [batch_size, 28, 28, 3])

    if mode == tf.estimator.ModeKeys.PREDICT:
        # TODO: To be implemented
        return None
    if mode not in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
        return None

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    iter_ratio = params['iter_ratio']
    current_epoch = math_ops.ceil(
        math_ops.divide(tf.train.get_global_step(), iter_ratio))
    alpha = utilities.reverse_gradient_weight(
        current_epoch, FLAGS.total_epochs, 10.)

    # Apply the DANN model to both input layers.
    class_logits_source, domain_logits_source = dann_model_fn(
        input_layer_source, alpha=alpha, is_training=is_training)
    class_logits_target, domain_logits_target = dann_model_fn(
        input_layer_target, alpha=alpha, is_training=is_training)

    # Labels must be one-hot for softmax_cross_entropy.
    with tf.control_dependencies([class_logits_source]):
        class_labels = tf.one_hot(labels['y_s'], 10)
        # tf.one_hot only accepts integer indices, so the domain ids have to
        # be created as int32 (tf.zeros / tf.ones default to float32).
        domain_ids_source = tf.zeros(
            [tf.shape(features['x_s'])[0]], dtype=tf.int32)
        domain_labels_source = tf.one_hot(domain_ids_source, 2)
        domain_ids_target = tf.ones(
            [tf.shape(features['x_t'])[0]], dtype=tf.int32)
        domain_labels_target = tf.one_hot(domain_ids_target, 2)

    # Compute losses
    class_loss = tf.losses.softmax_cross_entropy(
        class_labels, logits=class_logits_source)
    domain_loss_source = tf.losses.softmax_cross_entropy(
        domain_labels_source, logits=domain_logits_source)
    domain_loss_target = tf.losses.softmax_cross_entropy(
        domain_labels_target, logits=domain_logits_target)
    total_loss = (tf.reduce_mean(class_loss)
                  + tf.reduce_mean(domain_loss_source)
                  + tf.reduce_mean(domain_loss_target))

    # Get predicted classes
    predicted_classes_source = tf.argmax(
        class_logits_source, axis=1, output_type=tf.int32)
    predicted_classes_target = tf.argmax(
        class_logits_target, axis=1, output_type=tf.int32)

    if mode == tf.estimator.ModeKeys.EVAL:
        source_class_acc = tf.metrics.accuracy(
            labels=labels['y_s'],
            predictions=predicted_classes_source,
            name='source_class_acc_op')
        target_class_acc = tf.metrics.accuracy(
            labels=labels['y_t'],
            predictions=predicted_classes_target,
            name='target_class_acc_op')
        metrics = {'source_class_acc': source_class_acc,
                   'target_class_acc': target_class_acc}
        source_class_acc_2 = utilities.non_streaming_accuracy(
            predicted_classes_source, tf.cast(labels['y_s'], tf.int32))
        target_class_acc_2 = utilities.non_streaming_accuracy(
            predicted_classes_target, tf.cast(labels['y_t'], tf.int32))
        # Give the per-batch accuracies stable op names so they can be looked
        # up in the graph by name.
        tf.identity(source_class_acc_2, 'source_class_acc_2')
        tf.identity(target_class_acc_2, 'target_class_acc_2')
        return tf.estimator.EstimatorSpec(
            mode, loss=total_loss, eval_metric_ops=metrics)

    # Calculate a non streaming (per batch) accuracy
    source_class_acc = utilities.non_streaming_accuracy(
        predicted_classes_source, tf.cast(labels['y_s'], tf.int32))
    target_class_acc = utilities.non_streaming_accuracy(
        predicted_classes_target, tf.cast(labels['y_t'], tf.int32))

    # Initialize learning rate
    learning_rate = utilities.lr_annealing(
        learning_rate=FLAGS.base_learning_rate,
        current_epoch=current_epoch,
        total_epochs=FLAGS.total_epochs,
        alpha=10,
        beta=0.75)
    # Named identities expose these tensors under fixed op names.
    tf.identity(learning_rate, 'learning_rate')
    tf.identity(total_loss, 'loss')
    tf.identity(source_class_acc, 'source_class_acc')
    tf.identity(target_class_acc, 'target_class_acc')

    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
    # Run collected UPDATE_OPS before each training step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(
            total_loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
Exemple #41
0
def auto_correlation(
    x,
    axis=-1,
    max_lags=None,
    center=True,
    normalize=True,
    name="auto_correlation"):
  """Estimate the auto correlation of `x` along a single axis.

  For a `1-D` wide sense stationary (WSS) sequence `X`, the auto correlation
  `RXX` is (with `E` expectation and `Conj` complex conjugate)

  ```
  RXX[m] := E{ W[m] Conj(W[0]) } = E{ W[0] Conj(W[-m]) },
  W[n]   := (X[n] - MU) / S,
  MU     := E{ X[0] },
  S**2   := E{ (X[0] - MU) Conj(X[0] - MU) }.
  ```

  `x` is treated (along `axis`) as a finite piece of one realization of `X`.
  After zero padding `x` from length `L` to infinity, the estimate `rxx[m]` is
  computed for `m = 0, 1, ..., max_lags` as

  ```
  rxx[m] := (L - m)**-1 sum_n w[n + m] Conj(w[n]),
  w[n]   := (x[n] - mu) / s,
  mu     := L**-1 sum_n x[n],
  s**2   := L**-1 sum_n (x[n] - mu) Conj(x[n] - mu)
  ```

  The estimation error scales like `1 / sqrt(len(x) - m)`, which is why
  `max_lags` is usually kept small enough for every returned lag to be
  meaningful. Because `mu` is itself only an estimate of `E{ X[0] }` and the
  sum is divided by `len(x) - m` instead of `len(x) - m - 1`, the result is
  slightly biased; the bias vanishes as `len(x) - m --> infinity`.

  Args:
    x:  `float32` or `complex64` `Tensor`.
    axis:  Python `int`. Axis along which correlation is computed; all other
      dimensions index batch members.
    max_lags:  Positive `int` tensor.  Largest lag `m` to return.  Values
      `>= x.shape[axis]` are effectively replaced by `x.shape[axis] - 1`.
    center:  Python `bool`.  If `False`, the mean estimate `mu` is not
      subtracted from `x[n]` when forming `w[n]`.
    normalize:  Python `bool`.  If `False`, the result is not divided by the
      variance estimate `s**2`.
    name:  `String` name to prepend to created ops.

  Returns:
    `rxx`: `Tensor` of same `dtype` as `x`.  `rxx.shape[i] = x.shape[i]` for
      `i != axis`, and `rxx.shape[axis] = max_lags + 1`.

  Raises:
    TypeError:  If `x` is not a supported type.
  """
  # How it works (special case of the Wiener-Khinchin Theorem):
  # Zero pad a length N / 2 array x to length N and define the DFT
  #   F[x]_k := sum_n x_n exp{-i 2 pi k n / N }.
  # Then F[x]_k Conj(F[x]_k) = F[R]_k with
  #   R_m := sum_n x_n Conj(x_{(n - m) mod N}),
  # and R_m / (N / 2 - m) is an unbiased estimate of RXX[m].  So the estimate
  # is obtained by zero padding, FFT, multiplying by the conjugate, and IFFT.
  with ops.name_scope(name, values=[x]):
    x = ops.convert_to_tensor(x, name="x")

    # The FFT op works on the innermost dimension, so move `axis` there.
    ndims = util.prefer_static_rank(x)
    if axis < 0:
      axis = ndims + axis
    shift = ndims - 1 - axis
    # series.shape = B + [T], with B the batch shape and T the time steps.
    series = util.rotate_transpose(x, shift)

    if center:
      series = series - math_ops.reduce_mean(series, axis=-1, keepdims=True)

    # Number of time steps (N / 2 in the notes above); static when available.
    num_steps = util.prefer_static_shape(series)[-1]

    # Pad with zeros up to 2**ceil(log2(2 * num_steps)); note that
    # log2(v) = ln(v) / ln(2).
    num_steps_f64 = math_ops.cast(num_steps, np.float64)
    log2_doubled = math_ops.log(num_steps_f64 * 2) / np.log(2.)
    padded_steps = math_ops.pow(np.float64(2.), math_ops.ceil(log2_doubled))
    num_zeros = math_ops.cast(padded_steps - num_steps_f64, np.int32)

    # series_padded.shape = B + [T + num_zeros]
    series_padded = util.pad(series, axis=-1, back=True, count=num_zeros)

    dtype = x.dtype
    if not (dtype.is_complex or dtype.is_floating):
      raise TypeError("Argument x must have either float or complex dtype"
                      " found: {}".format(dtype))
    if not dtype.is_complex:
      # Real input: attach a zero imaginary part before the FFT.
      series_padded = math_ops.complex(
          series_padded, dtype.real_dtype.as_numpy_dtype(0.))

    # Power spectral density, whose IFFT is R[m] (up to scaling), i.e. the
    # inner product sum_n X[n] * Conj(X[n - m]).
    spectrum = spectral_ops.fft(series_padded)
    power_spectrum = spectrum * math_ops.conj(spectrum)
    lagged_products = spectral_ops.ifft(power_spectrum)

    # Back to the dtype of x (drops the imaginary part for real input).
    lagged_products = math_ops.cast(lagged_products, dtype)

    # Decide whether the output's static shape can be set, and resolve
    # max_lags.  `series` is the reference since time is its last dimension.
    shape_is_static = series.shape.is_fully_defined()
    if max_lags is None:
      max_lags = num_steps - 1
    else:
      max_lags = ops.convert_to_tensor(max_lags, name="max_lags")
      static_max_lags = tensor_util.constant_value(max_lags)
      if shape_is_static and static_max_lags is not None:
        max_lags = min(num_steps - 1, static_max_lags)
      else:
        shape_is_static = False
        max_lags = math_ops.minimum(num_steps - 1, max_lags)

    # Drop the padding; an oversized max_lags was already clipped above.
    lagged_products = lagged_products[..., :max_lags + 1]

    if shape_is_static:
      static_shape = series.shape.as_list()
      static_shape[-1] = min(num_steps, max_lags + 1)
      lagged_products.set_shape(static_shape)

    # R[m] sums N / 2 - m nonzero terms (the rest come from zero padding), so
    # dividing by (N / 2 - m) gives an unbiased estimate of
    # E[X[n] Conj(X[n - m])].
    num_steps = math_ops.cast(num_steps, dtype.real_dtype)
    max_lags = math_ops.cast(max_lags, dtype.real_dtype)
    terms_per_lag = num_steps - math_ops.range(0., max_lags + 1.)
    terms_per_lag = math_ops.cast(terms_per_lag, dtype)
    estimate = lagged_products / terms_per_lag

    if normalize:
      # Scale by the lag-0 value.
      estimate = estimate / estimate[..., :1]

    # Undo the initial rotation so dimensions line up with those of x.
    return util.rotate_transpose(estimate, -shift)
def polynomial_decay(learning_rate, global_step, decay_steps,
                     end_learning_rate=0.0001, power=1.0,
                     cycle=False, name=None):
  """Applies a polynomial decay to the learning rate.

  It is commonly observed that a monotonically decreasing learning rate, whose
  degree of change is carefully chosen, results in a better performing model.
  This function applies a polynomial decay function to a provided initial
  `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.

  It requires a `global_step` value to compute the decayed learning rate.  You
  can just pass a TensorFlow variable that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  global_step = min(global_step, decay_steps)
  decayed_learning_rate = (learning_rate - end_learning_rate) *
                          (1 - global_step / decay_steps) ^ (power) +
                          end_learning_rate

  ```

  If `cycle` is True then `decay_steps` is replaced by the smallest multiple
  of `decay_steps` that is not below `global_step`, using at least one full
  period so that step 0 is well defined:

  ```python
  decay_steps = decay_steps * max(ceil(global_step / decay_steps), 1)
  decayed_learning_rate = (learning_rate - end_learning_rate) *
                          (1 - global_step / decay_steps) ^ (power) +
                          end_learning_rate

  ```

  Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.1
  end_learning_rate = 0.01
  decay_steps = 10000
  learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step,
                                            decay_steps, end_learning_rate,
                                            power=0.5)
  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Must be positive.  See the decay computation above.
    end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The minimal end learning rate.
    power: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The power of the polynomial. Defaults to linear, 1.0.
    cycle: A boolean, whether or not it should cycle beyond decay_steps.
    name: String.  Optional name of the operation. Defaults to 'PolynomialDecay'

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.
  """
  with ops.name_scope(name, "PolynomialDecay",
                      [learning_rate, global_step,
                       decay_steps, end_learning_rate, power]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    # Everything is computed in the dtype of the learning rate.
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    end_learning_rate = math_ops.cast(end_learning_rate, dtype)
    power = math_ops.cast(power, dtype)
    if cycle:
      # Stretch decay_steps to the smallest multiple covering global_step.
      # At global_step == 0 the ceil is 0, which would zero out decay_steps
      # and make the ratio below 0 / 0 (NaN); clamp to one full period.
      multiplier = math_ops.maximum(
          math_ops.ceil(global_step / decay_steps), 1.0)
      decay_steps = decay_steps * multiplier
    else:
      # Make sure that the global_step used is not bigger than decay_steps.
      global_step = math_ops.minimum(global_step, decay_steps)

    # Fraction of the (possibly stretched) decay period already elapsed.
    p = global_step / decay_steps
    decayed = (learning_rate - end_learning_rate) * math_ops.pow(1 - p, power)
    return math_ops.add(decayed, end_learning_rate, name=name)
Exemple #43
0
 def f(x):
     """Round `x` toward zero: ceil where negative, floor elsewhere."""
     is_negative = x < 0
     return array_ops.where_v2(
         is_negative, math_ops.ceil(x), math_ops.floor(x))
Exemple #44
0
def _update_confusion_matrix_variables_optimized(
    variables_to_update,
    y_true,
    y_pred,
    thresholds,
    multi_label=False,
    sample_weights=None,
    label_weights=None,
    thresholds_with_epsilon=False):
  """Memory-efficient update of the confusion matrix variables.

  The thresholds must be evenly spaced, i.e. the difference between any two
  consecutive thresholds is the same.

  TP/FP/TN/FN are measured for the binary classifier
    C(t) = (predictions >= t)
  at every threshold `t`:
    TP(t) = sum( C(t) * true_labels )
    FP(t) = sum( C(t) * false_labels )

  Evaluating C(t) separately per threshold is expensive. C(t) is a cumulative
  quantity, though: with
    thresholds = [t_0, ..., t_{n-1}];  t_0 < ... < t_{n-1}
  (n = num_thresholds) and the bucket function
    B(i) = Sum( (predictions == t), t_i <= t < t_{i+1} )
  we have
    C(t_i) = sum( B(j), j >= i )
  which is a reversed cumulative sum (tf.cumsum with reverse=True).

  Evenly spaced thresholds make B(i) cheap to compute:
    width = 1.0 / (num_thresholds - 1)
    thresholds = [0.0, 1*width, 2*width, 3*width, ..., 1.0]
  so a prediction p falls into
    bucket_index(p) = floor( p * (num_thresholds - 1) )
  and tf.math.unsorted_segment_sum() fills every bucket in a single pass.

  Worked example:
  y_true = [0, 0, 1, 1]
  y_pred = [0.1, 0.5, 0.3, 0.9]
  thresholds = [0.0, 0.5, 1.0]
  num_buckets = 2   # [0.0, 1.0], (1.0, 2.0]
  bucket_index(y_pred) = tf.math.floor(y_pred * num_buckets)
                       = tf.math.floor([0.2, 1.0, 0.6, 1.8])
                       = [0, 0, 0, 1]
  # A true label adds 1 to the bucket its prediction maps to: a true label
  # at 0.2 lands in bucket 0, a true label at 1.8 lands in bucket 1.
  #
  # The second entry "1.0" goes to bucket 0 because a value has to be strictly
  # larger than a bucket's lower bound to belong to it. The implementation
  # gets this behaviour from tf.math.ceil() - 1.
  tp_bucket_value = tf.math.unsorted_segment_sum(true_labels, bucket_indices,
                                                 num_segments=num_thresholds)
                  = [1, 1, 0]
  # One true value comes from bucket 0 and one from bucket 1. Aggregating
  # buckets [a, b, c] yields [a + b + c, b + c, c], because whatever passes a
  # larger threshold also passes every smaller one.
  true_positive = tf.math.cumsum(tp_bucket_value, reverse=True)
                = [2, 1, 0]

  Run time and space complexity are O(T + N), with T the number of thresholds
  and N the size of predictions; metrics relying on the standard
  implementation exhibit O(T * N) instead.

  Args:
    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
      and the variables to update as values.
    y_true: A floating point `Tensor` whose shape matches `y_pred`. Will be cast
      to `bool`.
    y_pred: A floating point `Tensor` of arbitrary shape with values in the
      range `[0, 1]`.
    thresholds: A sorted, evenly spaced floating point `Tensor` with values in
      `[0, 1]`.
    multi_label: Optional boolean indicating whether multidimensional
      prediction/labels should be treated as multilabel responses, or flattened
      into a single label. When True, the values of `variables_to_update` must
      have a second dimension equal to the number of labels in y_true and
      y_pred, and those tensors must not be RaggedTensors.
    sample_weights: Optional `Tensor` whose rank is either 0, or the same rank
      as `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions
      must be either `1`, or the same as the corresponding `y_true` dimension).
    label_weights: Optional tensor of non-negative weights for multilabel
      data. The weights are applied when calculating TP, FP, FN, and TN without
      explicit multilabel handling (i.e. when the data is to be flattened).
    thresholds_with_epsilon: Optional boolean indicating whether the first and
      last thresholds had an epsilon added for floating point imprecision.
      It changes how the first and last buckets are handled.

  Returns:
    Update op.
  """
  num_thresholds = thresholds.shape.as_list()[0]
  flatten = not multi_label

  # Bring both weight inputs to the shape of y_pred (or leave a neutral 1.0).
  if sample_weights is not None:
    sample_weights = weights_broadcast_ops.broadcast_weights(
        math_ops.cast(sample_weights, dtype=y_pred.dtype), y_pred)
    if flatten:
      sample_weights = array_ops.reshape(sample_weights, [-1])
  else:
    sample_weights = 1.0

  if label_weights is not None:
    label_weights = array_ops.expand_dims(label_weights, 0)
    label_weights = weights_broadcast_ops.broadcast_weights(
        label_weights, y_pred)
    if flatten:
      label_weights = array_ops.reshape(label_weights, [-1])
  else:
    label_weights = 1.0
  weights = math_ops.multiply(sample_weights, label_weights)

  # Predictions should already be inside [0.0, 1.0]; clip just in case.
  y_pred = clip_ops.clip_by_value(
      y_pred, clip_value_min=0.0, clip_value_max=1.0)

  # Round-trip through bool so every label is exactly 0 or 1.
  y_true = math_ops.cast(math_ops.cast(y_true, dtypes.bool), y_true.dtype)
  if flatten:
    y_true = array_ops.reshape(y_true, [-1])
    y_pred = array_ops.reshape(y_pred, [-1])

  true_labels = math_ops.multiply(y_true, weights)
  false_labels = math_ops.multiply((1.0 - y_true), weights)

  # Bucket index per prediction. A prediction must be strictly greater than a
  # threshold to count for it (buckets look like [0, 0.5], (0.5, 1], with 0.5
  # in the first one), hence ceil(val) - 1 rather than floor(val).
  bucket_indices = math_ops.ceil(y_pred * (num_thresholds - 1)) - 1

  if thresholds_with_epsilon:
    # Here every prediction in [0.0, 1.0] should be larger than the first
    # threshold, so bucket -1 is moved to bucket 0.
    bucket_indices = nn_ops.relu(bucket_indices)

  bucket_indices = math_ops.cast(bucket_indices, dtypes.int32)

  def bucket_sum(values, indices):
    # Sum `values` into one slot per threshold according to `indices`.
    return math_ops.unsorted_segment_sum(
        data=values, segment_ids=indices, num_segments=num_thresholds)

  if flatten:
    tp_buckets = bucket_sum(true_labels, bucket_indices)
    fp_buckets = bucket_sum(false_labels, bucket_indices)
    tp = math_ops.cumsum(tp_buckets, reverse=True)
    fp = math_ops.cumsum(fp_buckets, reverse=True)
  else:
    # The bucket sum has to run once per label class. Labels are rank 2 in the
    # multi_label case, so transpose to put the label dimension first and map
    # over it.
    true_labels = array_ops.transpose_v2(true_labels)
    false_labels = array_ops.transpose_v2(false_labels)
    bucket_indices = array_ops.transpose_v2(bucket_indices)

    def per_label_bucket_sum(values_and_indices):
      return bucket_sum(values_and_indices[0], values_and_indices[1])

    tp_buckets = vectorized_map(
        per_label_bucket_sum, (true_labels, bucket_indices))
    fp_buckets = vectorized_map(
        per_label_bucket_sum, (false_labels, bucket_indices))
    tp = array_ops.transpose_v2(
        math_ops.cumsum(tp_buckets, reverse=True, axis=1))
    fp = array_ops.transpose_v2(
        math_ops.cumsum(fp_buckets, reverse=True, axis=1))

  # fn = sum(true_labels) - tp
  # tn = sum(false_labels) - fp
  # The totals are only built when one of the negatives is requested.
  needs_totals = (
      ConfusionMatrix.TRUE_NEGATIVES in variables_to_update or
      ConfusionMatrix.FALSE_NEGATIVES in variables_to_update)
  if needs_totals:
    if flatten:
      total_true_labels = math_ops.reduce_sum(true_labels)
      total_false_labels = math_ops.reduce_sum(false_labels)
    else:
      total_true_labels = math_ops.reduce_sum(true_labels, axis=1)
      total_false_labels = math_ops.reduce_sum(false_labels, axis=1)

  # Each increment is wrapped in a thunk so that it is only built for the
  # variables that are actually present.
  increments = (
      (ConfusionMatrix.TRUE_POSITIVES, lambda: tp),
      (ConfusionMatrix.FALSE_POSITIVES, lambda: fp),
      (ConfusionMatrix.TRUE_NEGATIVES, lambda: total_false_labels - fp),
      (ConfusionMatrix.FALSE_NEGATIVES, lambda: total_true_labels - tp),
  )
  update_ops = [
      variables_to_update[key].assign_add(make_increment())
      for key, make_increment in increments
      if key in variables_to_update
  ]
  return control_flow_ops.group(update_ops)
def polynomial_decay(exploration_rate, timestep, decay_steps,
                     end_exploration_rate=0.0001, power=1.0,
                     cycle=False, name=None):
    """Applies a polynomial decay to the exploration rate.

    Decays the initial `exploration_rate` towards `end_exploration_rate` over
    `decay_steps` steps, following a polynomial of degree `power`.

    Without cycling the decayed value is computed as:

    ```python
    timestep = min(timestep, decay_steps)
    decayed_exploration_rate = (
        (exploration_rate - end_exploration_rate)
        * (1 - timestep / decay_steps) ** power
        + end_exploration_rate)
    ```

    If `cycle` is True, `decay_steps` is first replaced by a multiple of
    itself that is at least as large as `timestep`, so the decay restarts
    instead of staying at `end_exploration_rate`:

    ```python
    decay_steps = decay_steps * max(1, ceil(timestep / decay_steps))
    decayed_exploration_rate = (
        (exploration_rate - end_exploration_rate)
        * (1 - timestep / decay_steps) ** power
        + end_exploration_rate)
    ```

    The multiplier is clamped to at least 1 so that `timestep == 0` yields
    the initial `exploration_rate` rather than a 0 / 0 division (NaN).

    Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (power=0.5):

    ```python
    timestep = tf.Variable(0, trainable=False)
    exploration_rate = polynomial_decay(0.1, timestep, 10000,
                                        end_exploration_rate=0.01, power=0.5)
    ```

    Args:
        exploration_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The initial exploration rate.
        timestep: A scalar `int32` or `int64` `Tensor` or a Python number.
            Step to use for the decay computation.  Must not be negative.
        decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
            Must be positive.  See the decay computation above.
        end_exploration_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The minimal end exploration rate.
        power: A scalar `float32` or `float64` `Tensor` or a
            Python number.  The power of the polynomial. Defaults to linear, 1.0.
        cycle: A boolean, whether or not it should cycle beyond decay_steps.
        name: String.  Optional name of the operation. Defaults to
            'PolynomialDecay'.

    Returns:
        A scalar `Tensor` of the same type as `exploration_rate`.  The decayed
        exploration rate.

    Raises:
        ValueError: if `timestep` is not supplied.
    """
    if timestep is None:
        raise ValueError("timestep is required for polynomial_decay.")
    with get_name_scope(name=name, scope="PolynomialDecay",
                        values=[exploration_rate, timestep,
                                decay_steps, end_exploration_rate, power]) as name:
        exploration_rate = ops.convert_to_tensor(exploration_rate, name="exploration_rate")
        # Every operand is cast to the dtype of the initial rate so the
        # arithmetic below is done in a single floating-point type.
        dtype = exploration_rate.dtype
        timestep = math_ops.cast(timestep, dtype)
        decay_steps = math_ops.cast(decay_steps, dtype)
        end_exploration_rate = math_ops.cast(end_exploration_rate, dtype)
        power = math_ops.cast(power, dtype)
        if cycle:
            # Stretch decay_steps to the first multiple that covers timestep.
            # ceil(0 / decay_steps) is 0, which would zero out decay_steps and
            # turn the division below into 0 / 0, so clamp the multiplier to 1.
            multiplier = math_ops.maximum(
                math_ops.ceil(timestep / decay_steps),
                math_ops.cast(1, dtype))
            decay_steps = math_ops.multiply(decay_steps, multiplier)
        else:
            # Make sure that the timestep used is not bigger than decay_steps.
            timestep = math_ops.minimum(timestep, decay_steps)

        # Fraction of the (possibly stretched) decay horizon already elapsed.
        p = math_ops.div(timestep, decay_steps)
        return math_ops.add(math_ops.multiply(exploration_rate - end_exploration_rate,
                                              math_ops.pow(1 - p, power)),
                            end_exploration_rate, name=name)
    def build(self, input_shape):
        """Builds the layer.

        Creates the variables for the network modeling the densities, creates
        the auxiliary loss estimating the median and tail quantiles of the
        densities, and then uses that to create the probability mass functions
        and the update op that produces the discrete cumulative density
        functions used by the range coder.

        Args:
          input_shape: Shape of the input tensor, used to get the number of
            channels.

        Raises:
          ValueError: if `input_shape` doesn't specify the length of the
            channel dimension.
        """
        input_shape = tensor_shape.TensorShape(input_shape)
        channel_axis = self._channel_axis(input_shape.ndims)
        # `.value` is None when the channel dimension is not statically known.
        channels = input_shape[channel_axis].value
        if channels is None:
            raise ValueError(
                "The channel dimension of the inputs must be defined.")
        self.input_spec = engine.InputSpec(ndim=input_shape.ndims,
                                           axes={channel_axis: channels})
        # Pad the filter sizes with 1 on both ends so that the pair
        # (filters[i], filters[i + 1]) gives the input/output size of matrix i.
        filters = (1, ) + self.filters + (1, )
        # One matrix per entry of `self.filters` plus one; each gets an equal
        # multiplicative share of `init_scale`.
        scale = self.init_scale**(1 / (len(self.filters) + 1))

        # Create variables.
        self._matrices = []
        self._biases = []
        self._factors = []
        for i in range(len(self.filters) + 1):
            # log(expm1(x)) is the inverse of softplus, so after the softplus
            # below every matrix entry starts out at 1 / scale / filters[i + 1].
            init = np.log(np.expm1(1 / scale / filters[i + 1]))
            matrix = self.add_variable("matrix_{}".format(i),
                                       dtype=self.dtype,
                                       shape=(channels, filters[i + 1],
                                              filters[i]),
                                       initializer=init_ops.Constant(init))
            # Softplus keeps the effective matrix entries positive; the
            # transformed tensor (not the raw variable) is what gets stored.
            matrix = nn.softplus(matrix)
            self._matrices.append(matrix)

            bias = self.add_variable("bias_{}".format(i),
                                     dtype=self.dtype,
                                     shape=(channels, filters[i + 1], 1),
                                     initializer=init_ops.RandomUniform(
                                         -.5, .5))
            self._biases.append(bias)

            # Every matrix except the last one also gets a factor; it is
            # initialized to zero and squashed into (-1, 1) by tanh.
            if i < len(self.filters):
                factor = self.add_variable("factor_{}".format(i),
                                           dtype=self.dtype,
                                           shape=(channels, filters[i + 1], 1),
                                           initializer=init_ops.Zeros())
                factor = math_ops.tanh(factor)
                self._factors.append(factor)

        # To figure out what range of the densities to sample, we need to compute
        # the quantiles given by `tail_mass / 2` and `1 - tail_mass / 2`. Since we
        # can't take inverses of the cumulative directly, we make it an optimization
        # problem:
        # `quantiles = argmin(|logit(cumulative) - target|)`
        # where `target` is `logit(tail_mass / 2)` or `logit(1 - tail_mass / 2)`.
        # Taking the logit (inverse of sigmoid) of the cumulative makes the
        # representation of the right target more numerically stable.

        # Numerically stable way of computing logits of `tail_mass / 2`
        # and `1 - tail_mass / 2`.
        target = np.log(2 / self.tail_mass - 1)
        # Compute lower and upper tail quantile as well as median.
        target = constant_op.constant([-target, 0, target], dtype=self.dtype)

        def quantiles_initializer(shape, dtype=None, partition_info=None):
            """Starts every channel at [-init_scale, 0, init_scale]."""
            del partition_info  # unused
            assert tuple(shape[1:]) == (1, 3)
            init = constant_op.constant(
                [[[-self.init_scale, 0, self.init_scale]]], dtype=dtype)
            # Repeat the single (1, 1, 3) row along the first axis.
            return array_ops.tile(init, (shape[0], 1, 1))

        # Per channel: (lower tail quantile, median, upper tail quantile).
        quantiles = self.add_variable("quantiles",
                                      shape=(channels, 1, 3),
                                      dtype=self.dtype,
                                      initializer=quantiles_initializer)
        # NOTE(review): `_logits_cumulative` is defined outside this block;
        # per the comments above it presumably returns the logit of the
        # cumulative at the given points - confirm, including what
        # `stop_gradient=True` blocks.
        logits = self._logits_cumulative(quantiles, stop_gradient=True)
        # Auxiliary loss: summed absolute distance between the logits at the
        # quantiles and their targets.
        loss = math_ops.reduce_sum(abs(logits - target))
        self.add_loss(loss, inputs=None)

        # Save medians for `call`, `compress`, and `decompress`.
        # The middle entry of the last axis, kept as shape (channels, 1, 1).
        self._medians = quantiles[:, :, 1:2]
        # Unless integer offsets are being optimized, snap the medians to
        # integers.
        if not self.optimize_integer_offset:
            self._medians = math_ops.round(self._medians)

        # Largest distance observed between lower tail quantile and median,
        # or between median and upper tail quantile.
        minima = math_ops.reduce_max(self._medians - quantiles[:, :, 0:1])
        maxima = math_ops.reduce_max(quantiles[:, :, 2:3] - self._medians)
        minmax = math_ops.maximum(minima, maxima)
        # Round up to a whole number of bins and sample at least one bin on
        # either side of the median.
        minmax = math_ops.ceil(minmax)
        minmax = math_ops.maximum(minmax, 1)

        # Sample the density up to `minmax` around the median.
        samples = math_ops.range(-minmax, minmax + 1, dtype=self.dtype)
        # Broadcasts the 1-D offsets against the (channels, 1, 1) medians.
        samples += self._medians

        half = constant_op.constant(.5, dtype=self.dtype)
        # We strip the sigmoid from the end here, so we can use the special rule
        # below to only compute differences in the left tail of the sigmoid.
        # This increases numerical stability (see explanation in `call`).
        lower = self._logits_cumulative(samples - half, stop_gradient=True)
        upper = self._logits_cumulative(samples + half, stop_gradient=True)
        # Flip signs if we can move more towards the left tail of the sigmoid.
        sign = -math_ops.sign(math_ops.add_n([lower, upper]))
        # Probability mass of each unit-width bin centered on a sample.
        pmf = abs(
            math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower))
        # Add tail masses to first and last bin of pmf, as we clip values for
        # compression, meaning that out-of-range values get mapped to these bins.
        # Indexing with `[:, 0, ...]` also drops the middle axis.
        pmf = array_ops.concat([
            math_ops.add_n([pmf[:, 0, :1],
                            math_ops.sigmoid(lower[:, 0, :1])]),
            pmf[:, 0, 1:-1],
            math_ops.add_n(
                [pmf[:, 0, -1:],
                 math_ops.sigmoid(-upper[:, 0, -1:])]),
        ],
                               axis=-1)
        self._pmf = pmf

        # Quantize the pmf into a CDF at the configured range coder precision.
        cdf = coder_ops.pmf_to_quantized_cdf(
            pmf, precision=self.range_coder_precision)

        def cdf_getter(*args, **kwargs):
            """Creates the `quantized_cdf` variable initialized from `cdf`.

            Ignores whatever `add_variable` passes in; shape validation is
            disabled and the variable is added to no collections.
            """
            del args, kwargs  # ignored
            return variable_scope.get_variable("quantized_cdf",
                                               dtype=dtypes.int32,
                                               initializer=cdf,
                                               trainable=False,
                                               validate_shape=False,
                                               collections=())

        # Need to provide a fake shape here since add_variable insists on it.
        self._quantized_cdf = self.add_variable("quantized_cdf",
                                                shape=(channels, 1),
                                                dtype=dtypes.int32,
                                                getter=cdf_getter,
                                                trainable=False)

        # Register an update op that overwrites the stored CDF with the one
        # computed from the current pmf (shape validation disabled).
        update_op = state_ops.assign(self._quantized_cdf,
                                     cdf,
                                     validate_shape=False)
        self.add_update(update_op, inputs=None)

        super(EntropyBottleneck, self).build(input_shape)