def _phi(r, order):
  """Applies the polyharmonic-spline basis function element-wise.

  See https://en.wikipedia.org/wiki/Polyharmonic_spline for the definition.
  NOTE(review): the `0.5 * order` exponents suggest `r` holds squared
  distances rather than distances -- confirm against the caller.

  Args:
    r: input op
    order: interpolation order

  Returns:
    phi_k evaluated coordinate-wise on r, for k = order
  """

  # Clamping with EPSILON prevents log(0), sqrt(0), etc.
  # sqrt(0) itself is well-defined, but its gradient is not.
  with ops.name_scope('phi'):
    if order == 1:
      return math_ops.sqrt(math_ops.maximum(r, EPSILON))

    if order == 2:
      # Only the argument of the log is clamped; the factor uses r as given.
      half_r = 0.5 * r
      return half_r * math_ops.log(math_ops.maximum(r, EPSILON))

    if order == 4:
      half_r_squared = 0.5 * math_ops.square(r)
      return half_r_squared * math_ops.log(math_ops.maximum(r, EPSILON))

    # Remaining orders: clamp once and use it for both pow and log.
    is_even = order % 2 == 0
    clamped = math_ops.maximum(r, EPSILON)
    if is_even:
      return 0.5 * math_ops.pow(clamped, 0.5 * order) * math_ops.log(clamped)
    return math_ops.pow(clamped, 0.5 * order)
  def _renorm_correction_and_moments(self, mean, variance, training):
    """Returns the correction and update values for renorm.

    Args:
      mean: Mean to correct against the renorm moving statistics (presumably
        the current batch mean -- confirm against the caller).
      variance: Variance paired with `mean`; `self.epsilon` is added to it
        before the square root is taken.
      training: Passed to `_smart_select` to choose between the computed
        corrections and the neutral values r=1, d=0, decay=1.

    Returns:
      A tuple `(r, d, new_mean, new_variance)`: the clipped scale correction,
      the clipped shift correction, and the values obtained from updating the
      renorm mean and stddev averages, the latter converted back to a
      variance.
    """
    stddev = math_ops.sqrt(variance + self.epsilon)
    # Compute the average mean and standard deviation, as if they were
    # initialized with this batch's moments.
    mixed_renorm_mean = (self.renorm_mean +
                         (1. - self.renorm_mean_weight) * mean)
    mixed_renorm_stddev = (self.renorm_stddev +
                           (1. - self.renorm_stddev_weight) * stddev)
    # Compute the corrections for batch renorm.
    r = stddev / mixed_renorm_stddev
    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
    # Ensure the corrections use pre-update moving averages: the identity ops
    # below only become available once r and d have been computed, and the
    # variable updates further down consume these identities.
    with ops.control_dependencies([r, d]):
      mean = array_ops.identity(mean)
      stddev = array_ops.identity(stddev)
    # Clipping bounds are optional; a missing key yields None and that bound
    # is skipped.
    rmin, rmax, dmax = [self.renorm_clipping.get(key)
                        for key in ['rmin', 'rmax', 'dmax']]
    if rmin is not None:
      r = math_ops.maximum(r, rmin)
    if rmax is not None:
      r = math_ops.minimum(r, rmax)
    if dmax is not None:
      # d is clipped symmetrically to [-dmax, dmax].
      d = math_ops.maximum(d, -dmax)
      d = math_ops.minimum(d, dmax)
    # When not training, use r=1, d=0, and decay=1 meaning no updates.
    r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r))
    d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d))
    decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.)

    def _update_renorm_variable(var, weight, value):
      """Updates a moving average and weight, returns the unbiased value."""
      # Update the variables without zero debiasing. The debiasing will be
      # accomplished by dividing the exponential moving average by the weight.
      # For example, after a single update, the moving average would be
      # (1-decay) * value. and the weight will be 1-decay, with their ratio
      # giving value.
      # Make sure the weight is not updated until after the r and d
      # computation: the constant fed to the weight update is created under a
      # control dependency on `value`, which itself depends on r and d.
      value = array_ops.identity(value)
      with ops.control_dependencies([value]):
        weight_value = array_ops.constant(1., dtype=weight.dtype)
      new_var = moving_averages.assign_moving_average(
          var, value, decay, zero_debias=False)
      new_weight = moving_averages.assign_moving_average(
          weight, weight_value, decay, zero_debias=False)
      return new_var / new_weight

    # Place the update ops with the corresponding moving statistics.
    with ops.colocate_with(self.moving_mean):
      new_mean = _update_renorm_variable(self.renorm_mean,
                                         self.renorm_mean_weight,
                                         mean)
    with ops.colocate_with(self.moving_variance):
      new_stddev = _update_renorm_variable(self.renorm_stddev,
                                           self.renorm_stddev_weight,
                                           stddev)
      # Make sqrt(moving_variance + epsilon) = new_stddev.
      new_variance = math_ops.square(new_stddev) - self.epsilon

    return (r, d, new_mean, new_variance)
Example #3
0
def _tf_range(start_or_stop, stop, step):
  """Builds a range tensor from Python-`range`-style arguments.

  Arguments that were not supplied are expected to be the `UNDEFINED`
  sentinel.

  Args:
    start_or_stop: the limit in the one-argument form, otherwise the start.
    stop: the limit, or `UNDEFINED`.
    step: the step, or `UNDEFINED`.

  Returns:
    The result of `math_ops.range` for the supplied arguments.
  """
  # Note: for static inputs (e.g. constants), tf.range errors out at graph
  # construction time, instead of returning an empty tensor. Clamping the
  # limit below prevents that error, which aligns the semantics with Python.

  # TODO(mdan): We should optimize this when a full tensor is not required.
  has_step = step is not UNDEFINED
  has_stop = stop is not UNDEFINED

  if has_step:
    # TODO(mdan): Add argument coercion similar to other cases.
    return math_ops.range(start_or_stop, stop, step)

  if not has_stop:
    # One-argument form: the limit is never allowed below zero.
    limit = math_ops.maximum(start_or_stop, 0)
    return math_ops.range(limit)

  # Two-argument form: the limit is never allowed below the start.
  limit = math_ops.maximum(start_or_stop, stop)
  return math_ops.range(start_or_stop, limit)
Example #4
0
  def _compute_power_svd(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name):
    """Raises the symmetric PSD matrix mat_g to the power alpha via an SVD.

    Args:
      var: the variable we are updating.
      mat_g: the symmetric PSD matrix whose power is to be computed
      mat_g_size: size of mat_g
      alpha: a real number
      mat_h_slot_name: name of slot to store the power, if needed.

    Returns:
      mat_h = mat_g^alpha. When mat_h_slot_name is not None, the result of
      assigning mat_h to that slot is returned instead.

    Note that mat_g is PSD. So we could use linalg_ops.self_adjoint_eig.
    """
    if mat_g_size == 1:
      # Size-one statistic: an element-wise power of the damped value.
      mat_h = math_ops.pow(mat_g + self._epsilon, alpha)
    else:
      eye = linalg_ops.eye(math_ops.to_int32(mat_g_size))
      singular_values, left_vectors, right_vectors = linalg_ops.svd(
          mat_g + self._epsilon * eye, full_matrices=True)
      # Singular values are floored at epsilon before the power is applied.
      powered_values = math_ops.pow(
          math_ops.maximum(singular_values, self._epsilon), alpha)
      mat_h = math_ops.matmul(
          right_vectors * powered_values, array_ops.transpose(left_vectors))

    if mat_h_slot_name is None:
      return mat_h
    slot = self.get_slot(var, mat_h_slot_name)
    return state_ops.assign(slot, mat_h)
def _compute_vmeasure_score(labels, predictions):
  """Evaluates `metrics.v_measure_score` via a py_func, floored at zero.

  Args:
    labels: passed as the first argument of `metrics.v_measure_score`.
    predictions: passed as the second argument.

  Returns:
    The float64 py_func output cast to float32 and clamped below at 0.0.
  """
  raw_score = script_ops.py_func(
      metrics.v_measure_score, [labels, predictions], [dtypes.float64],
      name='vmeasure')
  score_f32 = math_ops.cast(raw_score, dtypes.float32)
  return math_ops.maximum(0.0, score_f32)
Example #6
0
def clip_by_value(t, clip_value_min, clip_value_max,
                  name=None):
  """Clips tensor values to a specified min and max.

  Returns a tensor of the same type and shape as `t` in which every value
  above `clip_value_max` is replaced by `clip_value_max` and every value below
  `clip_value_min` is replaced by `clip_value_min`. The upper bound is applied
  first, so if `clip_value_min` exceeds `clip_value_max` every element of the
  result equals `clip_value_min`.

  Args:
    t: A `Tensor`.
    clip_value_min: A 0-D (scalar) `Tensor`. The minimum value to clip by.
    clip_value_max: A 0-D (scalar) `Tensor`. The maximum value to clip by.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.name_scope(name, "clip_by_value",
                      [t, clip_value_min, clip_value_max]) as scope:
    values = ops.convert_to_tensor(t, name="t")

    # Cap from above, then from below; the final op carries the scope name.
    capped = math_ops.minimum(values, clip_value_max)
    return math_ops.maximum(capped, clip_value_min, name=scope)
Example #7
0
def calculate_reshape(original_shape, new_shape, validate=False, name=None):
  """Resolves a reshape target that may contain one `-1` entry.

  Args:
    original_shape: shape being reshaped from; only its product is used.
    new_shape: requested shape, in which a `-1` marks the dimension to infer.
    validate: whether to build assertion ops for the arguments.
    name: optional name scope for the created ops.

  Returns:
    A tuple `(expanded_new_shape, batch_shape_static, validations)`: the
    requested shape with the `-1` filled in, the statically known part of
    `new_shape`, and a (possibly empty) list of assertion ops.
  """
  static_shape = tensor_util.constant_value_as_shape(new_shape)
  if static_shape.is_fully_defined():
    # Nothing to infer: answer directly without creating ops or assertions.
    return np.int32(static_shape.as_list()), static_shape, []

  with ops.name_scope(name, "calculate_reshape", [original_shape, new_shape]):
    original_size = math_ops.reduce_prod(original_shape)
    is_implicit = math_ops.equal(new_shape, -1)
    # With a single `-1` present, negating the product gives the product of
    # the explicit dimensions; the floor of 1 keeps the divisor positive.
    explicit_size = math_ops.maximum(1, -math_ops.reduce_prod(new_shape))
    implicit_size = original_size // explicit_size
    new_ndims = array_ops.shape(new_shape)
    expanded_new_shape = array_ops.where(  # Assumes exactly one `-1`.
        is_implicit, array_ops.fill(new_ndims, implicit_size), new_shape)

    if not validate:
      return expanded_new_shape, static_shape, []

    validations = [
        check_ops.assert_rank(
            original_shape, 1, message="Original shape must be a vector."),
        check_ops.assert_rank(
            new_shape, 1, message="New shape must be a vector."),
        check_ops.assert_less_equal(
            math_ops.count_nonzero(is_implicit, dtype=dtypes.int32),
            1,
            message="At most one dimension can be unknown."),
        check_ops.assert_positive(
            expanded_new_shape, message="Shape elements must be >=-1."),
        check_ops.assert_equal(
            math_ops.reduce_prod(expanded_new_shape),
            original_size,
            message="Shape sizes do not match."),
    ]
    return expanded_new_shape, static_shape, validations
Example #8
0
def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name):
  """Derives a max norm from moving statistics of the log of `norm`.

  Args:
    norm: the norm to track.
    std_factor: number of standard deviations added to the mean log-norm.
    decay: decay of the moving averages.
    global_step: optional step counter; when given, the decay is capped at
      n / (n + 1) so the averages adapt faster at the beginning.
    epsilon: added to `norm` before the log and used as the variance floor.
    name: variable scope name.

  Returns:
    A tuple `(max_norms, mean)` where `max_norms` is
    exp(mean + std_factor * std) and `mean` is the moving average of the
    log-norm.
  """
  with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]):
    log_norm = math_ops.log(norm + epsilon)

    # quicker adaptation at the beginning
    effective_decay = decay
    if global_step is not None:
      step = math_ops.to_float(global_step)
      effective_decay = math_ops.minimum(decay, step / (step + 1.))

    def _update_average(var_name, value):
      """Creates a zero-initialised variable and folds `value` into it."""
      average_var = vs.get_variable(
          var_name,
          shape=value.get_shape(),
          dtype=value.dtype,
          initializer=init_ops.zeros_initializer(),
          trainable=False)
      return moving_averages.assign_moving_average(
          average_var, value, effective_decay, zero_debias=False)

    # update averages
    mean = _update_average("mean", log_norm)
    squared_log_norm = math_ops.square(log_norm)
    sq_mean = _update_average("sq_mean", squared_log_norm)

    # Var[x] = E[x^2] - E[x]^2, floored at epsilon before the square root.
    variance = sq_mean - math_ops.square(mean)
    std = math_ops.sqrt(math_ops.maximum(epsilon, variance))
    max_norms = math_ops.exp(mean + std_factor * std)
    return max_norms, mean
Example #9
0
def saturate_cast(image, dtype):
  """Casts image data to `dtype`, clamping first where the cast could overflow.

  No scaling is applied. Values are clamped to the representable range of
  `dtype` on whichever side(s) the source dtype extends beyond it, and are
  then cast.

  Args:
    image: An image to cast to a different data type.
    dtype: A `DType` to cast `image` to.

  Returns:
    `image`, safely cast to `dtype`.
  """
  source_dtype = image.dtype

  # When casting to a type with smaller representable range, clamp.
  # Note that this covers casting to unsigned types as well.
  clamp_below = source_dtype.min < dtype.min
  clamp_above = source_dtype.max > dtype.max

  if clamp_below and clamp_above:
    lower = math_ops.cast(dtype.min, source_dtype)
    upper = math_ops.cast(dtype.max, source_dtype)
    clamped = clip_ops.clip_by_value(image, lower, upper)
  elif clamp_below:
    clamped = math_ops.maximum(image, math_ops.cast(dtype.min, source_dtype))
  elif clamp_above:
    clamped = math_ops.minimum(image, math_ops.cast(dtype.max, source_dtype))
  else:
    clamped = image

  return math_ops.cast(clamped, dtype)
Example #10
0
def _accuracy_baseline(labels_mean):
  """Return accuracy baseline based on labels mean.

  This is the best the model could do by always predicting one class, i.e.
  the larger of the mean and one minus the mean.

  Args:
    labels_mean: Tuple of value and update op.

  Returns:
    Tuple of value and update op.
  """
  with ops.name_scope(None, 'accuracy_baseline', labels_mean):
    mean_value, mean_update_op = labels_mean
    baseline_value = math_ops.maximum(
        mean_value, 1. - mean_value, name='value')
    baseline_update_op = math_ops.maximum(
        mean_update_op, 1 - mean_update_op, name='update_op')
    return (baseline_value, baseline_update_op)
Example #11
0
  def make_inverse_update_ops(self):
    """Create and return update ops corresponding to registered computations.

    Returns:
      The ops from the parent class followed by one assign op per registered
      damped inverse and, on the eigendecomposition path, one per registered
      matrix power.
    """
    update_ops = super(InverseProvidingFactor, self).make_inverse_update_ops()

    num_inverses = len(self._inverses_by_damping)
    matrix_power_registered = bool(self._matpower_by_exp_and_damping)
    use_eig = (self._eigendecomp or matrix_power_registered or
               num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)

    if not use_eig:
      # No decomposition needed: invert each damped covariance directly.
      for damping, inv in self._inverses_by_damping.items():
        direct_inverse = utils.posdef_inv(self._cov, damping)
        update_ops.append(inv.assign(direct_inverse))
      return update_ops

    self.register_eigendecomp()  # ensures self._eigendecomp is set
    eigenvalues, eigenvectors = self._eigendecomp  # pylint: disable=unpacking-non-sequence

    # The matrix self._cov is positive semidefinite by construction, but the
    # numerical eigenvalues could be negative due to numerical errors, so here
    # we clip them to be at least EIGENVALUE_CLIPPING_THRESHOLD.
    clipped_eigenvalues = math_ops.maximum(eigenvalues,
                                           EIGENVALUE_CLIPPING_THRESHOLD)

    for damping, inv in self._inverses_by_damping.items():
      scaled_vectors = eigenvectors / (clipped_eigenvalues + damping)
      damped_inverse = math_ops.matmul(scaled_vectors,
                                       array_ops.transpose(eigenvectors))
      update_ops.append(inv.assign(damped_inverse))

    for (exp, damping), matpower in self._matpower_by_exp_and_damping.items():
      scaled_vectors = eigenvectors * (clipped_eigenvalues + damping)**exp
      damped_power = math_ops.matmul(scaled_vectors,
                                     array_ops.transpose(eigenvectors))
      update_ops.append(matpower.assign(damped_power))

    return update_ops
Example #12
0
    def BackwardLoopBody(*args):
      """Backward loop body function.

      Args:
        *args: `t` and `dev_t` followed by the flattened loop state, which is
          unpacked according to `bakloop_sig`.

      Returns:
        A list holding `dev_t - 1` followed by the flattened, updated loop
        state.
      """
      t, dev_t = args[0], args[1]
      (theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state1,
       d_inputs, d_acc_state) = _Pack(args[2:], bakloop_sig)

      # The input recurrent state for time step t is previous time step's
      # output, or the original state0 when on time step 0.
      # The index is clamped at 0 so the lookup stays in range when t == 0;
      # in that case the `If` below selects on t == 0 between the two
      # candidate states.
      state_from_acc = _Index(acc_state, math_ops.maximum(0, t - 1))
      state0 = functional_ops.If(
          math_ops.equal(t, array_ops.constant(0, dtypes.int32)),
          _Flatten([state_from_acc, orig_state0]), ReturnOrigState0,
          ReturnAccState)
      # `If` works on flat lists; restore the structure of orig_state0.
      state0 = nest.pack_sequence_as(orig_state0, state0)

      # The external inputs for time step t.
      inputs_t = _Index(inputs, t)
      # The extras for time step t.
      extras_t = _Index(acc_extras, t)

      # Combine the gradient stored for step t with the gradient carried in
      # from the previous iteration of this loop.
      d_state1 = _Add(_Index(d_acc_state, t), d_state1)
      (d_theta_t, d_state0, d_inputs_t) = _Pack(
          Bak(*_Flatten([theta, state0, inputs_t, extras_t, d_state1])),
          [self._theta, self._state, self._inputs])
      # Accumulate the theta gradient; write this step's input gradient into
      # d_inputs at dev_t.
      d_theta = _Add(d_theta, d_theta_t)
      d_inputs = _Update(d_inputs, d_inputs_t, dev_t)
      # d_state0 takes the place of d_state1 in the returned state.
      # NOTE(review): `t` itself is not returned; presumably the loop
      # construct driving this body supplies it -- confirm.
      return [math_ops.subtract(dev_t, 1)] + _Flatten([
          theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state0,
          d_inputs, d_acc_state
      ])
Example #13
0
def l2_normalize(x, dim, epsilon=1e-12, name=None):
  """Scales `x` to unit L2 norm along dimension `dim`.

  For a 1-D tensor with `dim = 0`, computes

      output = x / sqrt(max(sum(x**2), epsilon))

  For `x` with more dimensions, each 1-D slice along dimension `dim` is
  normalized independently.

  Args:
    x: A `Tensor`.
    dim: Dimension along which to normalize.
    epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
      divisor if `norm < sqrt(epsilon)`.
    name: A name for this operation (optional).

  Returns:
    A `Tensor` with the same shape as `x`.
  """
  with ops.op_scope([x], name, "l2_normalize") as scope:
    values = ops.convert_to_tensor(x, name="x")
    squared = math_ops.square(values)
    square_sum = math_ops.reduce_sum(squared, [dim], keep_dims=True)
    # Flooring the squared norm at epsilon bounds the reciprocal square root.
    inverse_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
    return math_ops.mul(values, inverse_norm, name=scope)
Example #14
0
  def _setup_sparsity(self):
    """Builds the op giving the sparsity level for the current global step.

    The level moves from `initial_sparsity` to `target_sparsity` as the global
    step advances from the begin step to the end step:

        p = clip((global_step - begin_step) / (end_step - begin_step), 0, 1)
        sparsity = (initial - target) * (1 - p)**exponent + target

    Returns:
      The op named 'sparsity' created in the spec's name scope.

    Raises:
      ValueError: If the begin step is not strictly before the end step.
    """
    spec = self._spec
    begin_step = spec.sparsity_function_begin_step
    end_step = spec.sparsity_function_end_step
    initial_sparsity = spec.initial_sparsity
    target_sparsity = spec.target_sparsity
    exponent = spec.sparsity_function_exponent

    if begin_step >= end_step:
      raise ValueError(
          'Pruning must begin before it can end. begin_step=%d, end_step=%d' %
          (begin_step, end_step))

    with ops.name_scope(spec.name):
      steps_elapsed = math_ops.cast(self._global_step - begin_step, np.float32)
      progress = math_ops.div(steps_elapsed, end_step - begin_step)
      # Clip progress to [0, 1] so the level is flat outside the schedule.
      p = math_ops.minimum(1.0, math_ops.maximum(0.0, progress))
      remaining = math_ops.pow(1 - p, exponent)
      offset = math_ops.multiply(initial_sparsity - target_sparsity, remaining)
      sparsity = math_ops.add(offset, target_sparsity, name='sparsity')

    return sparsity
Example #15
0
    def _testConfMatrixOnTensors(self, tf_dtype, np_dtype):
        """Checks confusion_matrix on randomly generated tensor predictions.

        Two groups of 20 normal samples (fed means 0 and 1, stddev 1) are
        rounded and clamped to {0, 1} to serve as predictions for labels 0 and
        1. The op output is compared with counts accumulated in NumPy.
        """
        float32 = dtypes.float32
        with self.test_session() as sess:
            m_neg = array_ops.placeholder(dtype=float32)
            m_pos = array_ops.placeholder(dtype=float32)
            s = array_ops.placeholder(dtype=float32)

            neg = random_ops.random_normal(
                [20], mean=m_neg, stddev=s, dtype=float32)
            pos = random_ops.random_normal(
                [20], mean=m_pos, stddev=s, dtype=float32)

            # Round to the nearest class id and clamp into {0, 1}.
            rounded = math_ops.round(array_ops.concat([neg, pos], 0))
            data = math_ops.cast(rounded, tf_dtype)
            data = math_ops.minimum(math_ops.maximum(data, 0), 1)

            zeros = array_ops.zeros([20], dtype=tf_dtype)
            ones = array_ops.ones([20], dtype=tf_dtype)
            lab = array_ops.concat([zeros, ones], 0)

            cm = confusion_matrix.confusion_matrix(
                lab, data, dtype=tf_dtype, num_classes=2)

            feed = {m_neg: 0.0, m_pos: 1.0, s: 1.0}
            d, l, cm_out = sess.run([data, lab, cm], feed)

            # Reference counts, indexed [label, prediction].
            truth = np.zeros([2, 2], dtype=np_dtype)
            for label, prediction in zip(l, d):
                truth[label, prediction] += 1

            self.assertEqual(cm_out.dtype, np_dtype)
            self.assertAllClose(cm_out, truth, atol=1e-10)
Example #16
0
    def _apply_dense(self, grad, var):
        """Builds the dense update ops for `var` (AMSGrad variant).

        Updates the "m", "v" and "vhat" slots and then `var` itself. "vhat"
        keeps the element-wise maximum of the second-moment estimates, and the
        step is divided by its square root plus epsilon.

        Args:
            grad: gradient for `var`.
            var: variable to update.

        Returns:
            An op grouping the variable update with the three slot updates.
        """
        dtype = var.dtype.base_dtype
        beta1_power = math_ops.cast(self._beta1_power, dtype)
        beta2_power = math_ops.cast(self._beta2_power, dtype)
        lr_t = math_ops.cast(self._lr_t, dtype)
        beta1_t = math_ops.cast(self._beta1_t, dtype)
        beta2_t = math_ops.cast(self._beta2_t, dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, dtype)

        # Learning rate with both bias corrections folded in.
        step_size = lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        first_moment_delta = grad * (1 - beta1_t)
        m_t = state_ops.assign(
            m, beta1_t * m + first_moment_delta,
            use_locking=self._use_locking)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        second_moment_delta = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(
            v, beta2_t * v + second_moment_delta,
            use_locking=self._use_locking)

        # amsgrad: keep the running element-wise maximum of v_t.
        vhat = self.get_slot(var, "vhat")
        vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
        v_sqrt = math_ops.sqrt(vhat_t)

        delta = step_size * m_t / (v_sqrt + epsilon_t)
        var_update = state_ops.assign_sub(
            var, delta, use_locking=self._use_locking)
        return control_flow_ops.group(var_update, m_t, v_t, vhat_t)
Example #17
0
    def _apply_sparse_shared(self, grad, var, indices, scatter_add):
        """Builds the sparse update ops for `var` (AMSGrad variant).

        The "m" and "v" slots are first decayed as a whole and then receive
        the gradient contributions at `indices` through `scatter_add`. "vhat"
        keeps the element-wise maximum of the second-moment estimates.

        Args:
            grad: gradient values for the rows selected by `indices`.
            var: variable to update.
            indices: indices the gradient values belong to.
            scatter_add: callable `(slot, indices, values)` performing the
                scatter-add.

        Returns:
            An op grouping the variable update with the three slot updates.
        """
        dtype = var.dtype.base_dtype
        beta1_power = math_ops.cast(self._beta1_power, dtype)
        beta2_power = math_ops.cast(self._beta2_power, dtype)
        lr_t = math_ops.cast(self._lr_t, dtype)
        beta1_t = math_ops.cast(self._beta1_t, dtype)
        beta2_t = math_ops.cast(self._beta2_t, dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, dtype)

        # Learning rate with both bias corrections folded in.
        step_size = lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        first_moment_delta = grad * (1 - beta1_t)
        decayed_m = state_ops.assign(
            m, m * beta1_t, use_locking=self._use_locking)
        # The scatter must only run once the decay has been applied.
        with ops.control_dependencies([decayed_m]):
            m_t = scatter_add(m, indices, first_moment_delta)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        second_moment_delta = (grad * grad) * (1 - beta2_t)
        decayed_v = state_ops.assign(
            v, v * beta2_t, use_locking=self._use_locking)
        with ops.control_dependencies([decayed_v]):
            v_t = scatter_add(v, indices, second_moment_delta)

        # amsgrad: keep the running element-wise maximum of v_t.
        vhat = self.get_slot(var, "vhat")
        vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
        v_sqrt = math_ops.sqrt(vhat_t)

        delta = step_size * m_t / (v_sqrt + epsilon_t)
        var_update = state_ops.assign_sub(
            var, delta, use_locking=self._use_locking)
        return control_flow_ops.group(var_update, m_t, v_t, vhat_t)
Example #18
0
  def _resource_apply_sparse(self, grad, var, indices):
    """Builds the sparse update ops for `var`, touching only `indices`.

    Only the slices of the 'm' and 'v' slots and of `var` selected by
    `indices` are read and written.

    Args:
      grad: gradient values for the selected slices.
      var: variable to update.
      indices: indices of the slices that `grad` refers to.

    Returns:
      An op grouping the variable update with the two slot updates.
    """
    dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(dtype)

    beta_1_t = self._get_hyper('beta_1', dtype)
    beta_2_t = self._get_hyper('beta_2', dtype)
    local_step = math_ops.cast(self.iterations + 1, dtype)
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    epsilon_t = self._get_hyper('epsilon', dtype)

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_rows = array_ops.gather(m, indices)
    new_m_rows = m_rows * beta_1_t + grad * (1 - beta_1_t)
    with ops.control_dependencies([new_m_rows]):
      m_t = self._resource_scatter_update(m, indices, new_m_rows)

    # u_t = max(beta2 * u, abs(g_t))
    v = self.get_slot(var, 'v')
    v_rows = array_ops.gather(v, indices)
    new_v_rows = math_ops.maximum(v_rows * beta_2_t, math_ops.abs(grad))
    with ops.control_dependencies([new_v_rows]):
      v_t = self._resource_scatter_update(v, indices, new_v_rows)

    # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t
    corrected_lr = -lr_t / (1 - beta_1_power)
    ratio = new_m_rows / (new_v_rows + epsilon_t)
    var_rows_delta = corrected_lr * ratio
    with ops.control_dependencies([var_rows_delta]):
      var_update = self._resource_scatter_add(var, indices, var_rows_delta)
    return control_flow_ops.group(var_update, m_t, v_t)
Example #19
0
def _compute_ami_score(labels, predictions):
  """Evaluates `metrics.adjusted_mutual_info_score` via a py_func.

  Args:
    labels: passed as the first argument of the metric function.
    predictions: passed as the second argument.

  Returns:
    The py_func output converted to float and clamped below at 0.0.
  """
  raw_score = script_ops.py_func(
      metrics.adjusted_mutual_info_score, [labels, predictions],
      [dtypes.float64],
      name='ami')
  score = math_ops.to_float(raw_score)
  return math_ops.maximum(0.0, score)
Example #20
0
def contrastive_loss(labels, embeddings_anchor, embeddings_positive,
                     margin=1.0):
  """Computes the contrastive loss.

  Pairs labelled 1 are penalised by their squared distance, which pulls
  embeddings of the same label together. Pairs labelled 0 are penalised by
  the squared amount by which their distance falls short of `margin`, which
  pushes embeddings of different labels at least `margin` apart.
  See: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      binary labels indicating positive vs negative pair.
    embeddings_anchor: 2-D float `Tensor` of embedding vectors for the anchor
      images. Embeddings should be l2 normalized.
    embeddings_positive: 2-D float `Tensor` of embedding vectors for the
      positive images. Embeddings should be l2 normalized.
    margin: margin term in the loss definition.

  Returns:
    contrastive_loss: tf.float32 scalar.
  """
  # Get per pair distances
  squared_difference = math_ops.square(embeddings_anchor - embeddings_positive)
  distances = math_ops.sqrt(math_ops.reduce_sum(squared_difference, 1))

  # Add contrastive loss for the siamese network.
  #   label here is {0,1} for neg, pos.
  positive_term = math_ops.to_float(labels) * math_ops.square(distances)
  negative_weight = 1. - math_ops.to_float(labels)
  margin_shortfall = math_ops.maximum(margin - distances, 0.)
  negative_term = negative_weight * math_ops.square(margin_shortfall)

  return math_ops.reduce_mean(
      positive_term + negative_term, name='contrastive_loss')
Example #21
0
 def _apply_sparse_shared(self, grad, var, indices,
                          scatter_add, scatter_update):
   """Builds the sparse update ops for `var`, touching only `indices`.

   Only the slices of the "m" and "v" slots and of `var` selected by
   `indices` are read and written.

   Args:
     grad: gradient values for the selected slices.
     var: variable to update.
     indices: indices of the slices that `grad` refers to.
     scatter_add: callable `(ref, indices, values)` adding into `ref`.
     scatter_update: callable `(ref, indices, values)` overwriting in `ref`.

   Returns:
     An op grouping the variable update with the two slot updates.
   """
   beta1_accumulator = self._get_beta_accumulators()
   dtype = var.dtype.base_dtype
   beta1_power = math_ops.cast(beta1_accumulator, dtype)
   lr_t = math_ops.cast(self._lr_t, dtype)
   beta1_t = math_ops.cast(self._beta1_t, dtype)
   beta2_t = math_ops.cast(self._beta2_t, dtype)
   epsilon_t = math_ops.cast(self._epsilon_t, dtype)

   # m_t = beta1 * m + (1 - beta1) * g_t
   m = self.get_slot(var, "m")
   m_rows = array_ops.gather(m, indices)
   new_m_rows = m_rows * beta1_t + grad * (1 - beta1_t)
   with ops.control_dependencies([new_m_rows]):
     m_t = scatter_update(m, indices, new_m_rows)

   # u_t = max(beta2 * u, abs(g_t))
   v = self.get_slot(var, "v")
   v_rows = array_ops.gather(v, indices)
   new_v_rows = math_ops.maximum(v_rows * beta2_t, math_ops.abs(grad))
   with ops.control_dependencies([new_v_rows]):
     v_t = scatter_update(v, indices, new_v_rows)

   # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t
   corrected_lr = -lr_t / (1 - beta1_power)
   ratio = new_m_rows / (new_v_rows + epsilon_t)
   var_rows_delta = corrected_lr * ratio
   with ops.control_dependencies([var_rows_delta]):
     var_update = scatter_add(var, indices, var_rows_delta)
   return control_flow_ops.group(var_update, m_t, v_t)
def confusion_matrix(predictions, labels, num_classes=None,
                     dtype=dtypes.int32, name=None):
  """Computes the confusion matrix from predictions and labels.

  Builds the confusion matrix for a pair of 1-D integer arrays holding the
  predictions and the labels.

  Considering a prediction array such as: `[1, 2, 3]`
  And a label array such as: `[2, 2, 3]`

  The confusion matrix returned would be the following one (the largest
  value seen is 3, so there are 4 classes):
      [[0, 0, 0, 0]
       [0, 0, 1, 0]
       [0, 0, 1, 0]
       [0, 0, 0, 1]]

  The matrix rows correspond to the predicted values and the columns to the
  real labels. The result is a 2-D array of shape [n, n], where n is the
  number of classes. Both predictions and labels are expected to be 1-D
  arrays of the same shape.

  Args:
    predictions: A 1-D array representing the predictions for a given
                 classification.
    labels: A 1-D array representing the real labels for the classification
            task.
    num_classes: The possible number of labels the classification task can
                 have. If this value is not provided, it is computed as one
                 more than the largest value found in predictions and labels.
    dtype: Data type of the confusion matrix.
    name: Scope name.

  Returns:
    A k X k matrix representing the confusion matrix, where k is the number of
    possible labels in the classification task.

  Raises:
    ValueError: If both predictions and labels are not 1-D vectors and do not
                have the same size. (Not raised by this function's own body;
                presumably comes from the helpers it calls -- confirm.)
  """
  with ops.name_scope(name, 'confusion_matrix',
                      [predictions, labels, num_classes]) as name:
    predictions_i64 = ops.convert_to_tensor(
        predictions, name='predictions', dtype=dtypes.int64)
    labels_i64 = ops.convert_to_tensor(
        labels, name='labels', dtype=dtypes.int64)
    predictions, labels = metric_ops_util.remove_squeezable_dimensions(
        predictions_i64, labels_i64)

    if num_classes is None:
      largest_prediction = math_ops.reduce_max(predictions)
      largest_label = math_ops.reduce_max(labels)
      num_classes = math_ops.maximum(largest_prediction, largest_label) + 1

    # One sparse entry of value 1 per (prediction, label) pair, added onto a
    # dense matrix of zeros.
    shape = array_ops.pack([num_classes, num_classes])
    indices = array_ops.transpose(array_ops.pack([predictions, labels]))
    values = array_ops.ones_like(predictions, dtype)
    cm_sparse = ops.SparseTensor(indices=indices, values=values, shape=shape)
    zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype)

    return sparse_ops.sparse_add(zero_matrix, cm_sparse)
  def compress(self, inputs):
    """Compress inputs and store their binary representations into strings.

    The inputs are shifted by per-channel offsets derived from the medians,
    clipped to the range covered by the quantized CDF, cast to integers and
    range-encoded one element of the leading dimension at a time. The layer is
    built on first use if necessary.

    Args:
      inputs: `Tensor` with values to be compressed.

    Returns:
      String `Tensor` vector containing the compressed representation of each
      batch element of `inputs`.
    """
    with ops.name_scope(self._name_scope()):
      inputs = ops.convert_to_tensor(inputs)
      if not self.built:
        # Check input assumptions set before layer building, e.g. input rank.
        self._assert_input_compatibility(inputs)
        # Adopt the dtype of the inputs when the layer has none yet.
        if self.dtype is None:
          self._dtype = inputs.dtype.base_dtype.name
        self.build(inputs.shape)

      # Check input assumptions set after layer building, e.g. input shape.
      if not context.executing_eagerly():
        self._assert_input_compatibility(inputs)

      ndim = self.input_spec.ndim
      channel_axis = self._channel_axis(ndim)
      # Tuple of slices for expanding dimensions of tensors below.
      # It holds `ndim` entries of None (new axis) plus one trailing full
      # slice, with the entry at the channel axis turned into a full slice.
      slices = ndim * [None] + [slice(None)]
      slices[channel_axis] = slice(None)
      slices = tuple(slices)

      # Expand dimensions of CDF to input dimensions, keeping the channels along
      # the right dimension.
      # The first entry of `slices` is dropped here (presumably the batch
      # axis, which the per-element encoding below iterates over -- confirm).
      cdf = self._quantized_cdf[slices[1:]]
      # The last axis of the CDF has one more entry than there are levels.
      num_levels = array_ops.shape(cdf)[-1] - 1

      # Bring inputs to the right range by centering the range on the medians.
      half = constant_op.constant(.5, dtype=self.dtype)
      medians = array_ops.squeeze(self._medians, [1, 2])
      offsets = (math_ops.cast(num_levels // 2, self.dtype) + half) - medians
      # Expand offsets to input dimensions and add to inputs.
      values = inputs + offsets[slices[:-1]]

      # Clip to range and cast to integers. Because we have added .5 above, and
      # all values are positive, the cast effectively implements rounding.
      values = math_ops.maximum(values, half)
      values = math_ops.minimum(
          values, math_ops.cast(num_levels, self.dtype) - half)
      # NOTE(review): the int16 cast presumably relies on num_levels fitting
      # in 16 bits -- confirm.
      values = math_ops.cast(values, dtypes.int16)

      def loop_body(tensor):
        # Encodes one element of the leading dimension against the shared CDF.
        return coder_ops.range_encode(
            tensor, cdf, precision=self.range_coder_precision)
      strings = functional_ops.map_fn(
          loop_body, values, dtype=dtypes.string, back_prop=False)

      # In graph mode, record that there is one string per leading-dimension
      # element of the inputs.
      if not context.executing_eagerly():
        strings.set_shape(inputs.shape[:1])

      return strings
Example #24
0
    def update_if_not_finite_grads():
      """Update assuming the gradients are nonfinite.

      Divides the current loss scale by the multiplier, never letting it drop
      below 1, and resets the count of good steps to zero.
      """
      reduced_scale = self._current_loss_scale / self._multiplier
      floored_scale = math_ops.maximum(reduced_scale, 1)
      reset_good_steps = self._num_good_steps.assign(0)
      store_loss_scale = self._current_loss_scale.assign(floored_scale)
      return control_flow_ops.group(reset_good_steps, store_loss_scale)
def _maybe_pad_for_rfft(input_tensor, fft_rank, fft_length, is_reverse=False):
  """Pads `input_tensor` to `fft_length` on its inner-most `fft_rank` dims.

  Padding is only ever added at the end of a dimension; dimensions already at
  least as long as requested are left alone.

  Args:
    input_tensor: tensor to pad.
    fft_rank: number of inner-most dimensions that take part in the FFT.
    fft_length: requested length for each of those dimensions. The dynamic
      fallback reads `fft_length.dtype`, so it must be a tensor there.
    is_reverse: when true, the inner-most dimension is padded only up to
      `fft_length[-1] // 2 + 1`.

  Returns:
    `input_tensor` itself when no padding is needed (or when it has a
    statically known dimension of size zero), otherwise a padded tensor.
  """
  fft_shape = _tensor_util.constant_value_as_shape(fft_length)

  # Edge case: skip padding empty tensors.
  if (input_tensor.shape.ndims is not None and
      any(dim.value == 0 for dim in input_tensor.shape.dims)):
    return input_tensor

  # If we know the shapes ahead of time, we can either skip or pre-compute the
  # appropriate paddings. Otherwise, fall back to computing paddings in
  # TensorFlow.
  if fft_shape.is_fully_defined() and input_tensor.shape.ndims is not None:
    # Slice the last FFT-rank dimensions from input_tensor's shape.
    input_fft_shape = input_tensor.shape[-fft_shape.ndims:]

    if input_fft_shape.is_fully_defined():
      # In reverse, we only pad the inner-most dimension to fft_length / 2 + 1.
      if is_reverse:
        fft_shape = fft_shape[:-1].concatenate(
            fft_shape.dims[-1].value // 2 + 1)

      # One [before, after] pair per FFT dimension; only trailing padding is
      # used, and never a negative amount.
      paddings = [[0, max(fft_dim.value - input_dim.value, 0)]
                  for fft_dim, input_dim in zip(
                      fft_shape.dims, input_fft_shape.dims)]
      if any(pad > 0 for _, pad in paddings):
        # The outer (non-FFT) dimensions are left unpadded.
        outer_paddings = [[0, 0]] * max((input_tensor.shape.ndims -
                                         fft_shape.ndims), 0)
        return _array_ops.pad(input_tensor, outer_paddings + paddings)
      return input_tensor

  # If we can't determine the paddings ahead of time, then we have to pad. If
  # the paddings end up as zero, tf.pad has a special-case that does no work.
  input_rank = _array_ops.rank(input_tensor)
  input_fft_shape = _array_ops.shape(input_tensor)[-fft_rank:]
  # Zero trailing padding for every dimension outside the FFT dimensions.
  outer_dims = _math_ops.maximum(0, input_rank - fft_rank)
  outer_paddings = _array_ops.zeros([outer_dims], fft_length.dtype)
  # In reverse, we only pad the inner-most dimension to fft_length / 2 + 1.
  if is_reverse:
    fft_length = _array_ops.concat([fft_length[:-1],
                                    fft_length[-1:] // 2 + 1], 0)
  fft_paddings = _math_ops.maximum(0, fft_length - input_fft_shape)
  paddings = _array_ops.concat([outer_paddings, fft_paddings], 0)
  # Pair every trailing amount with a leading amount of zero, giving one
  # [0, pad] row per dimension.
  paddings = _array_ops.stack([_array_ops.zeros_like(paddings), paddings],
                              axis=1)
  return _array_ops.pad(input_tensor, paddings)
Example #26
0
def _compute_ari_score(labels, predictions):
  """Returns the adjusted Rand score of `predictions`, clamped to be >= 0.

  The score is produced by `metrics.adjusted_rand_score`, wrapped in a
  `py_func` op named 'ari' that yields a float64 result, and is then passed
  through `math_ops.to_float`.

  Args:
    labels: ground-truth cluster assignments, forwarded to the scorer.
    predictions: predicted cluster assignments, forwarded to the scorer.

  Returns:
    The score with negative values replaced by 0.
  """
  raw_score = script_ops.py_func(
      metrics.adjusted_rand_score,
      [labels, predictions],
      [dtypes.float64],
      name='ari')
  score = math_ops.to_float(raw_score)
  # The adjusted Rand score may be negative, so floor it at zero. See
  # http://scikit-learn.org/stable/modules/clustering.html#adjusted-rand-score
  return math_ops.maximum(0.0, score)
Example #27
0
 def linear_decay_fn(global_step):
   """Returns the fraction of `decay_steps` still remaining at `global_step`.

   `decay_steps` comes from the enclosing scope. The step is capped at
   `decay_steps` and the result is floored at 0.

   Raises:
     ValueError: if `global_step` is None.
   """
   if global_step is None:
     raise ValueError("global_step is required for linear_decay.")
   # Cap the step so the remaining count cannot become negative.
   capped_step = math_ops.minimum(global_step, decay_steps)
   total_steps = math_ops.to_int32(decay_steps)
   steps_left = total_steps - math_ops.to_int32(capped_step)
   fraction_left = (
       math_ops.to_float(steps_left) / math_ops.to_float(decay_steps))
   return math_ops.maximum(0.0, fraction_left)
Example #28
0
 def _compare(self, x, y, use_gpu):
   """Asserts that TF element-wise min/max of `x` and `y` match NumPy's.

   Args:
     x: first operand, accepted by both NumPy and `convert_to_tensor`.
     y: second operand, accepted by both NumPy and `convert_to_tensor`.
     use_gpu: forwarded to `test_util.device` to pick the device.
   """
   expected_min = np.minimum(x, y)
   expected_max = np.maximum(x, y)
   with test_util.device(use_gpu=use_gpu):
     tensor_x = ops.convert_to_tensor(x)
     tensor_y = ops.convert_to_tensor(y)
     min_op = math_ops.minimum(tensor_x, tensor_y)
     max_op = math_ops.maximum(tensor_x, tensor_y)
     actual_min, actual_max = self.evaluate([min_op, max_op])
   self.assertAllEqual(expected_min, actual_min)
   self.assertAllEqual(expected_max, actual_max)
Example #29
0
  def inference_graph(self, input_data, **inference_args):
    """Builds the TF graph that evaluates every tree of the forest.

    Args:
      input_data: A tensor or dict of string->Tensor holding the input data.
        It must generate the same spec as the input_data used in
        training_graph: a dict must have the same keys, for example, and all
        tensors must have the same size in their first dimension.
      **inference_args: Keyword arguments forwarded to each tree.

    Returns:
      A tuple of (probabilities, tree_paths, variance). `variance` is the
      variance over all the trees and is computed for regression problems
      only; it is None otherwise.

    Raises:
      NotImplementedError: If feature bagging is requested together with
        sparse features.
    """
    dense_features, sparse_features, data_spec = (
        data_ops.ParseDataTensorOrDict(input_data))

    per_tree_probs = []
    per_tree_paths = []
    for tree_index in range(self.params.num_trees):
      # Build each tree's inference ops on that tree's own device.
      with ops.device(self.variables.device_dummies[tree_index].device):
        if self.params.bagged_features:
          if sparse_features is not None:
            raise NotImplementedError(
                'Feature bagging not supported with sparse features.')
          tree_input = self._bag_features(tree_index, dense_features)
        else:
          tree_input = dense_features
        tree_probs, tree_path = self.trees[tree_index].inference_graph(
            tree_input,
            data_spec,
            sparse_features=sparse_features,
            **inference_args)
        per_tree_probs.append(tree_probs)
        per_tree_paths.append(tree_path)

    # Aggregate the per-tree results on the first tree's device.
    with ops.device(self.variables.device_dummies[0].device):
      # stacked_probs should have shape [batch_size, num_trees, num_outputs].
      stacked_probs = array_ops.stack(per_tree_probs, axis=1)
      mean_probs = math_ops.div(
          math_ops.reduce_sum(stacked_probs, 1),
          self.params.num_trees,
          name='probabilities')
      stacked_paths = array_ops.stack(per_tree_paths, axis=1)
      if not self.params.regression:
        return mean_probs, stacked_paths, None
      # Variance as E[p^2] - E[p]^2, floored at zero.
      mean_of_squares = math_ops.div(
          math_ops.reduce_sum(stacked_probs * stacked_probs, 1),
          self.params.num_trees)
      variance = math_ops.maximum(
          0., mean_of_squares - mean_probs * mean_probs)
      return mean_probs, stacked_paths, variance
Example #30
0
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Rescales `t` so that its L2-norm does not exceed `clip_norm`.

  The L2-norm of `t` is computed along the dimensions listed in `axes` (all
  dimensions when `axes` is `None`) and the result is

  `t * clip_norm / max(l2norm(t), clip_norm)`

  Hence, where the L2-norm is already less than or equal to `clip_norm` the
  values are left (numerically) unscaled, and where it is larger they are
  scaled to `t * clip_norm / l2norm(t)`, whose L2-norm is `clip_norm`.

  For example, if `t` is a matrix and `axes == [1]`, every row of the output
  has an L2-norm of at most `clip_norm`; with `axes == [0]` the same holds
  for every column.

  This operation is typically used to clip gradients before applying them
  with an optimizer.

  Args:
    t: A `Tensor` or `IndexedSlices`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.
  """
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    is_indexed_slices = isinstance(t, ops.IndexedSlices)
    dense_values = ops.convert_to_tensor(
        t.values if is_indexed_slices else t, name="t")

    squared_norm = math_ops.reduce_sum(
        dense_values * dense_values, axes, keepdims=True)
    is_nonzero = squared_norm > 0
    # Take the sqrt on a copy whose zeros are replaced by ones, then select
    # the original zeros back, so that no NaN gradient is produced at zero.
    safe_squared_norm = array_ops.where(
        is_nonzero, squared_norm, array_ops.ones_like(squared_norm))
    norm = array_ops.where(
        is_nonzero, math_ops.sqrt(safe_squared_norm), squared_norm)

    scaled = dense_values * clip_norm
    # Check that multiplying by clip_norm did not change the shape, which
    # guards against unintentional broadcasting.
    dense_values.shape.merge_with(scaled.shape)
    clipped = array_ops.identity(
        scaled / math_ops.maximum(norm, clip_norm), name=name)

    if not is_indexed_slices:
      return clipped
    return ops.IndexedSlices(clipped, t.indices, t.dense_shape)
Example #31
0
def triplet_semihard_loss(labels, embeddings, margin=1.0):
    """Computes the triplet loss with semi-hard negative mining.

    The loss encourages the positive distances (between a pair of embeddings
    with the same labels) to be smaller than the minimum negative distance
    among those which are at least greater than the positive distance plus
    the margin constant (called semi-hard negative) in the mini-batch. If no
    such negative exists, uses the largest negative distance instead.
    See: https://arxiv.org/abs/1503.03832.

    Args:
      labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
        multiclass integer labels.
      embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should
        be l2 normalized.
      margin: Float, margin term in the loss definition.

    Returns:
      triplet_loss: tf.float32 scalar.
    """
    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    lshape = array_ops.shape(labels)
    assert lshape.shape == 1
    labels = array_ops.reshape(labels, [lshape[0], 1])

    # Build pairwise squared distance matrix.
    pdist_matrix = pairwise_distance(embeddings, squared=True)
    # Build pairwise binary adjacency matrix.
    adjacency = math_ops.equal(labels, array_ops.transpose(labels))
    # Invert so we can select negatives only.
    adjacency_not = math_ops.logical_not(adjacency)

    batch_size = array_ops.size(labels)

    # Compute the mask.
    pdist_matrix_tile = array_ops.tile(pdist_matrix, [batch_size, 1])
    mask = math_ops.logical_and(
        array_ops.tile(adjacency_not, [batch_size, 1]),
        math_ops.greater(
            pdist_matrix_tile,
            array_ops.reshape(array_ops.transpose(pdist_matrix), [-1, 1])))
    # `keepdims` is the supported spelling; the deprecated `keep_dims`
    # keyword is not accepted by the non-v1 `reduce_sum` signature.
    mask_final = array_ops.reshape(
        math_ops.greater(
            math_ops.reduce_sum(math_ops.cast(mask, dtype=dtypes.float32),
                                1,
                                keepdims=True), 0.0),
        [batch_size, batch_size])
    mask_final = array_ops.transpose(mask_final)

    adjacency_not = math_ops.cast(adjacency_not, dtype=dtypes.float32)
    mask = math_ops.cast(mask, dtype=dtypes.float32)

    # negatives_outside: smallest D_an where D_an > D_ap.
    negatives_outside = array_ops.reshape(
        masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
    negatives_outside = array_ops.transpose(negatives_outside)

    # negatives_inside: largest D_an.
    negatives_inside = array_ops.tile(
        masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
    # Use the semi-hard negative where one exists, else the largest negative.
    semi_hard_negatives = array_ops.where(mask_final, negatives_outside,
                                          negatives_inside)

    loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

    # Positive pairs are same-label pairs with the diagonal removed.
    mask_positives = math_ops.cast(adjacency,
                                   dtype=dtypes.float32) - array_ops.diag(
                                       array_ops.ones([batch_size]))

    # In lifted-struct, the authors multiply 0.5 for upper triangular
    #   in semihard, they take all positive pairs except the diagonal.
    # NOTE(review): when the batch holds no positive pair this sum is 0 and
    # the division below is by zero.
    num_positives = math_ops.reduce_sum(mask_positives)

    _triplet_loss = math_ops.truediv(math_ops.reduce_sum(
        math_ops.maximum(math_ops.multiply(loss_mat, mask_positives), 0.0)),
                                     num_positives,
                                     name='triplet_semihard_loss')
    return _triplet_loss
Example #32
0
def _dynamic_rnn_time_input_loop(cell,
                                 inputs,
                                 initial_state,
                                 parallel_iterations,
                                 swap_memory,
                                 sequence_length=None,
                                 dtype=None):
    """Internal implementation of Dynamic RNN, add current time to input of cell.

    The cell is invoked as `cell(input_t, state, time)`, i.e. it receives the
    current time step in addition to the input and the state.

    Args:
      cell: An instance of RNNCell.
      inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested
        tuple of such elements.
      initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if
        `cell.state_size` is a tuple, then this should be a tuple of
        tensors having shapes `[batch_size, s] for s in cell.state_size`.
      parallel_iterations: Positive Python int.
      swap_memory: A Python boolean
      sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].
      dtype: (optional) Expected dtype of output. If not specified, inferred
        from initial_state.

    Returns:
      Tuple `(final_outputs, final_state)`.
      final_outputs:
        A `Tensor` of shape `[time, batch_size, cell.output_size]`.  If
        `cell.output_size` is a (possibly nested) tuple of ints or
        `TensorShape` objects, then this returns a (possibly nested) tuple of
        Tensors matching the corresponding shapes.
      final_state:
        A `Tensor`, or possibly nested tuple of Tensors, matching in length
        and shapes to `initial_state`.

    Raises:
      ValueError: If the input depth cannot be inferred via shape inference
        from the inputs, or if the inputs disagree on their static time steps
        or batch size.
    """
    state = initial_state
    assert isinstance(parallel_iterations,
                      int), "parallel_iterations must be int"

    state_size = cell.state_size

    flat_input = nest.flatten(inputs)
    flat_output_size = nest.flatten(cell.output_size)

    # Construct an initial output
    input_shape = array_ops.shape(flat_input[0])
    time_steps = input_shape[0]
    batch_size = rnn._best_effort_input_batch_size(flat_input)

    inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3)
                             for input_ in flat_input)

    # Static time and batch sizes of the first input; every other input is
    # compared against these below.
    const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2]

    for shape in inputs_got_shape:
        if not shape[2:].is_fully_defined():
            raise ValueError(
                "Input size (depth of inputs) must be accessible via shape inference,"
                " but saw value None.")
        got_time_steps = shape[0].value
        got_batch_size = shape[1].value
        if const_time_steps != got_time_steps:
            raise ValueError(
                "Time steps is not the same for all the elements in the input in a "
                "batch.")
        if const_batch_size != got_batch_size:
            raise ValueError(
                "Batch_size is not the same for all the elements in the input."
            )

    # Prepare dynamic conditional copying of state & output
    def _create_zero_arrays(size):
        # Zeros of shape `_concat(batch_size, size)`, typed like the state.
        size = _concat(batch_size, size)
        return array_ops.zeros(array_ops.stack(size),
                               rnn._infer_state_dtype(dtype, state))

    flat_zero_output = tuple(
        _create_zero_arrays(output) for output in flat_output_size)
    zero_output = nest.pack_sequence_as(structure=cell.output_size,
                                        flat_sequence=flat_zero_output)

    # min_sequence_length is only bound when sequence_length is given;
    # _time_step reads it under that same condition only.
    if sequence_length is not None:
        min_sequence_length = math_ops.reduce_min(sequence_length)
        max_sequence_length = math_ops.reduce_max(sequence_length)
    else:
        max_sequence_length = time_steps

    time = array_ops.constant(0, dtype=dtypes.int32, name="time")

    # The scope is entered only to obtain its name, which prefixes the
    # TensorArray names created by _create_ta.
    with ops.name_scope("dynamic_rnn") as scope:
        base_name = scope

    def _create_ta(name, element_shape, dtype):
        # One TensorArray slot per time step.
        return tensor_array_ops.TensorArray(dtype=dtype,
                                            size=time_steps,
                                            element_shape=element_shape,
                                            tensor_array_name=base_name + name)

    in_graph_mode = not context.executing_eagerly()
    if in_graph_mode:
        # Graph mode: outputs are written to and inputs are read from
        # TensorArrays, one entry per time step.
        output_ta = tuple(
            _create_ta("output_%d" % i,
                       element_shape=(tensor_shape.TensorShape(
                           [const_batch_size]).concatenate(
                               rnn._maybe_tensor_shape_from_tensor(out_size))),
                       dtype=rnn._infer_state_dtype(dtype, state))
            for i, out_size in enumerate(flat_output_size))
        input_ta = tuple(
            _create_ta("input_%d" % i,
                       element_shape=flat_input_i.shape[1:],
                       dtype=flat_input_i.dtype)
            for i, flat_input_i in enumerate(flat_input))
        input_ta = tuple(
            ta.unstack(input_) for ta, input_ in zip(input_ta, flat_input))
    else:
        # Eager mode: plain Python lists stand in for the output arrays and
        # the inputs are indexed directly.
        output_ta = tuple([0 for _ in range(time_steps.numpy())]
                          for i in range(len(flat_output_size)))
        input_ta = flat_input

    def _time_step(time, output_ta_t, state):
        """Take a time step of the dynamic RNN.

        Args:
          time: int32 scalar Tensor.
          output_ta_t: List of `TensorArray`s that represent the output.
          state: nested tuple of vector tensors that represent the state.

        Returns:
          The tuple (time + 1, output_ta_t with updated flow, new_state).
        """

        if in_graph_mode:
            input_t = tuple(ta.read(time) for ta in input_ta)
            # Restore some shape information
            for input_, shape in zip(input_t, inputs_got_shape):
                input_.set_shape(shape[1:])
        else:
            input_t = tuple(ta[time.numpy()] for ta in input_ta)

        input_t = nest.pack_sequence_as(structure=inputs,
                                        flat_sequence=input_t)
        # The current `time` is passed to the cell as a third argument, next
        # to the input and the state.
        call_cell = lambda: cell(input_t, state, time)

        if sequence_length is not None:
            (output, new_state) = rnn._rnn_step(
                time=time,
                sequence_length=sequence_length,
                min_sequence_length=min_sequence_length,
                max_sequence_length=max_sequence_length,
                zero_output=zero_output,
                state=state,
                call_cell=call_cell,
                state_size=state_size,
                skip_conditionals=True)
        else:
            (output, new_state) = call_cell()

        # Pack state if using state tuples
        output = nest.flatten(output)

        if in_graph_mode:
            output_ta_t = tuple(
                ta.write(time, out) for ta, out in zip(output_ta_t, output))
        else:
            # Eager mode: store into the Python lists in place.
            for ta, out in zip(output_ta_t, output):
                ta[time.numpy()] = out

        return (time + 1, output_ta_t, new_state)

    if in_graph_mode:
        # Make sure that we run at least 1 step, if necessary, to ensure
        # the TensorArrays pick up the dynamic shape.
        loop_bound = math_ops.minimum(time_steps,
                                      math_ops.maximum(1, max_sequence_length))
    else:
        # Using max_sequence_length isn't currently supported in the Eager branch.
        loop_bound = time_steps

    _, output_final_ta, final_state = control_flow_ops.while_loop(
        cond=lambda time, *_: time < loop_bound,
        body=_time_step,
        loop_vars=(time, output_ta, state),
        parallel_iterations=parallel_iterations,
        maximum_iterations=time_steps,
        swap_memory=swap_memory)

    # Unpack final output if not using output tuples.
    if in_graph_mode:
        final_outputs = tuple(ta.stack() for ta in output_final_ta)
        # Restore some shape information
        for output, output_size in zip(final_outputs, flat_output_size):
            shape = _concat([const_time_steps, const_batch_size],
                            output_size,
                            static=True)
            output.set_shape(shape)
    else:
        final_outputs = output_final_ta

    final_outputs = nest.pack_sequence_as(structure=cell.output_size,
                                          flat_sequence=final_outputs)
    if not in_graph_mode:
        # Eager mode: each output is still a list of per-step values; stack
        # it along a new leading (time) axis.
        final_outputs = nest.map_structure_up_to(
            cell.output_size, lambda x: array_ops.stack(x, axis=0),
            final_outputs)

    return (final_outputs, final_state)
Example #33
0
def squared_hinge(y_true, y_pred):
    """Returns the mean over the last axis of `max(1 - y_true * y_pred, 0)**2`.

    Args:
      y_true: target values; multiplied element-wise with `y_pred`.
      y_pred: predicted values.
    """
    hinge = math_ops.maximum(1. - y_true * y_pred, 0.)
    squared = math_ops.square(hinge)
    return K.mean(squared, axis=-1)
Example #34
0
    def _validate_sample_arg(self, x):
        """Checks that `x` has the batch and event shape of this distribution.

        Whatever can be decided from static shapes is checked immediately;
        the remaining checks become assertion ops, and only when
        `self.validate_args` is set.

        Args:
          x: sample argument to validate, e.g., the input to `log_prob`.

        Returns:
          A (possibly empty) list of assertion ops to run before using `x`.

        Raises:
          NotImplementedError: if the static shapes show that `x` has too few
            dimensions, or a batch and event shape different from the
            expected one (broadcasting is not supported).
        """
        with ops.name_scope(name="validate_sample_arg", values=[x]):
            # Prefer static ranks; fall back to ops when they are unknown.
            if x.shape.ndims is None:
                x_ndims = array_ops.rank(x)
            else:
                x_ndims = x.shape.ndims
            if self.event_shape.ndims is None:
                event_ndims = array_ops.size(self.event_shape_tensor())
            else:
                event_ndims = self.event_shape.ndims
            if self.batch_shape.ndims is None:
                batch_ndims = array_ops.size(self._batch_shape_unexpanded)
            else:
                batch_ndims = self.batch_shape.ndims
            expected_ndims = batch_ndims + event_ndims

            ranks_are_static = (isinstance(x_ndims, int)
                                and isinstance(expected_ndims, int))
            if ranks_are_static:
                if x_ndims < expected_ndims:
                    raise NotImplementedError(
                        "Broadcasting is not supported; too few batch and event dims "
                        "(expected at least {}, saw {}).".format(
                            expected_ndims, x_ndims))
                ndims_assertion = []
            elif self.validate_args:
                ndims_assertion = [
                    check_ops.assert_greater_equal(
                        x_ndims,
                        expected_ndims,
                        message=("Broadcasting is not supported; too few "
                                 "batch and event dims."),
                        name="assert_batch_and_event_ndims_large_enough"),
                ]

            # Expected shape: a NumPy array when static, a Tensor otherwise.
            shapes_are_static = (self.batch_shape.is_fully_defined()
                                 and self.event_shape.is_fully_defined())
            if shapes_are_static:
                expected_shape = np.int32(
                    self.batch_shape.concatenate(self.event_shape).as_list())
            else:
                expected_shape = array_ops.concat(
                    [
                        self.batch_shape_tensor(),
                        self.event_shape_tensor(),
                    ],
                    axis=0)

            # Actual shape: the trailing dims of `x` after the sample dims.
            sample_ndims = x_ndims - expected_ndims
            sample_ndims_is_static = isinstance(sample_ndims, int)
            if sample_ndims_is_static:
                sample_ndims = max(sample_ndims, 0)
            if (sample_ndims_is_static
                    and x.shape[sample_ndims:].is_fully_defined()):
                actual_shape = np.int32(x.shape[sample_ndims:].as_list())
            else:
                sample_ndims = math_ops.maximum(sample_ndims, 0)
                actual_shape = array_ops.shape(x)[sample_ndims:]

            if (isinstance(expected_shape, np.ndarray)
                    and isinstance(actual_shape, np.ndarray)):
                if any(expected_shape != actual_shape):
                    raise NotImplementedError(
                        "Broadcasting is not supported; "
                        "unexpected batch and event shape "
                        "(expected {}, saw {}).".format(
                            expected_shape, actual_shape))
                # The shape check is settled statically, but a runtime rank
                # assertion may still have been created above; it already
                # reflects `self.validate_args`, so hand it back as is.
                return ndims_assertion

            if not self.validate_args:
                return []

            # Run the rank assertion first: comparing shapes of tensors with
            # different ranks would otherwise be ill-defined.
            with ops.control_dependencies(ndims_assertion):
                shape_assertion = check_ops.assert_equal(
                    expected_shape,
                    actual_shape,
                    message=("Broadcasting is not supported; "
                             "unexpected batch and event shape."),
                    name="assert_batch_and_event_shape_same")
            return [shape_assertion]
Example #35
0
def _embedding_lookup_and_transform(params,
                                    ids,
                                    partition_strategy="mod",
                                    name=None,
                                    max_norm=None,
                                    transform_fn=None):
    """Helper function for embedding_lookup and _compute_sampled_logits.

    This function is a generalization of embedding_lookup that optionally
    applies a caller-specified transformation to each embedding. This is
    done through the `transform_fn` argument. If provided, the function is
    applied to each partitioned tensor of retrieved embeddings, colocated
    with the embeddings. This function will be called with a single `Tensor`
    argument of the same type as the `params` tensor and should return a
    `Tensor`. The shape of the argument will be the same as `params` except
    for the size of the first dimension. The first dimension of the result's
    shape must be the same size as the argument's.

    Args:
      params: See embedding_lookup.
      ids: See embedding_lookup.
      partition_strategy: See embedding_lookup. Must be "mod" or "div" when
        the partitioned code path is taken.
      name: See embedding_lookup.
      max_norm: See embedding_lookup.
      transform_fn: An optional function to apply to each retrieved embedding.
        If max_norm is provided, transform_fn is applied to the norm-limited
        embeddings.

    Returns:
      See embedding_lookup for details.

    Raises:
      ValueError: If `params` is empty, or if `partition_strategy` is not
        recognized.
    """
    if params is None or params in ((), []):
        raise ValueError("Need at least one param")
    if isinstance(params, variables.PartitionedVariable):
        params = list(params)  # Iterate to get the underlying Variables.
    if not isinstance(params, list):
        params = [params]

    with ops.name_scope(name, "embedding_lookup", params + [ids]) as name:
        # NOTE(review): inside this function `np` is the partition count and
        # shadows any module-level `np` (e.g. a numpy alias).
        np = len(params)  # Number of partitions
        # Preserve the resource variable status to avoid accidental dense reads.
        if not any(
                isinstance(p, resource_variable_ops.ResourceVariable)
                for p in params):
            params = ops.convert_n_to_tensor_or_indexed_slices(params,
                                                               name="params")
        ids = ops.convert_to_tensor(ids, name="ids")
        # Fast path: a single partition needs no id partitioning/stitching.
        if np == 1 and (not transform_fn or ids.get_shape().ndims == 1):
            with ops.colocate_with(params[0]):
                result = _clip(_gather(params[0], ids, name=name), ids,
                               max_norm)
                if transform_fn:
                    result = transform_fn(result)
                return result
        else:
            # Flatten the ids. There are two cases where we need to do this.
            # - There is more than one params tensor.
            # - There is a transform_fn and ids is not statically known to be 1-D.
            #   We must flatten in this case because transform_fn expects a flat
            #   tensor of embeddings.
            flat_ids = array_ops.reshape(ids, [-1])
            original_indices = math_ops.range(array_ops.size(flat_ids))

            # Create p_assignments and set new_ids depending on the strategy.
            if partition_strategy == "mod":
                # id -> partition (id % np), row (id // np) in that partition.
                p_assignments = flat_ids % np
                new_ids = flat_ids // np
            elif partition_strategy == "div":
                # Compute num_total_ids as the sum of dim-0 of params, then assign to
                # partitions based on a constant number of ids per partition. Optimize
                # if we already know the full shape statically.
                dim_0_size = params[0].get_shape()[0]
                # NOTE(review): `xrange` must be provided by the module (e.g.
                # via six.moves) when running on Python 3 - confirm.
                for p in xrange(1, np):
                    dim_0_size += params[p].get_shape()[0]
                if dim_0_size.value:
                    num_total_ids = constant_op.constant(
                        dim_0_size.value, flat_ids.dtype)
                else:
                    # Mix static sizes with dynamic ones where unknown.
                    dim_0_sizes = []
                    for p in xrange(np):
                        if params[p].get_shape()[0].value is not None:
                            dim_0_sizes.append(params[p].get_shape()[0].value)
                        else:
                            with ops.colocate_with(params[p]):
                                dim_0_sizes.append(
                                    array_ops.shape(params[p])[0])
                    num_total_ids = math_ops.reduce_sum(
                        math_ops.cast(array_ops.stack(dim_0_sizes),
                                      flat_ids.dtype))
                ids_per_partition = num_total_ids // np
                extras = num_total_ids % np

                # The first `extras` partitions hold ids_per_partition + 1 ids
                # each, the remaining ones ids_per_partition; the maximum of
                # the two candidate quotients picks the right partition.
                p_assignments = math_ops.maximum(
                    flat_ids // (ids_per_partition + 1),
                    (flat_ids - extras) // ids_per_partition)

                # Emulate a conditional using a boolean indicator tensor
                is_in_first_extras_partitions = math_ops.cast(
                    p_assignments < extras, flat_ids.dtype)
                new_ids = (is_in_first_extras_partitions *
                           (flat_ids % (ids_per_partition + 1)) +
                           (1 - is_in_first_extras_partitions) *
                           ((flat_ids - extras) % ids_per_partition))
            else:
                raise ValueError("Unrecognized partition strategy: " +
                                 partition_strategy)

            # Cast partition assignments to int32 for use in dynamic_partition.
            # There really should not be more than 2^32 partitions.
            p_assignments = math_ops.cast(p_assignments, dtypes.int32)
            # Partition list of ids based on assignments into np separate lists
            gather_ids = data_flow_ops.dynamic_partition(
                new_ids, p_assignments, np)
            # Similarly, partition the original indices.
            pindices = data_flow_ops.dynamic_partition(original_indices,
                                                       p_assignments, np)
            # Do np separate lookups, finding embeddings for plist[p] in params[p]
            partitioned_result = []
            for p in xrange(np):
                pids = gather_ids[p]
                with ops.colocate_with(params[p]):
                    result = _gather(params[p], pids)
                    if transform_fn:
                        # If transform_fn is provided, the clip_by_norm precedes
                        # the transform and hence must be co-located. See below
                        # for the counterpart if transform_fn is not provided.
                        result = transform_fn(_clip(result, pids, max_norm))
                partitioned_result.append(result)
            # Stitch these back together
            ret = data_flow_ops.parallel_dynamic_stitch(pindices,
                                                        partitioned_result,
                                                        name=name)

            # Determine the static element shape.
            if transform_fn is None:
                element_shape_s = params[0].get_shape()[1:]
                for p in params[1:]:
                    element_shape_s = element_shape_s.merge_with(
                        p.get_shape()[1:])
            else:
                element_shape_s = ret.get_shape()[1:]

            # Compute the dynamic element shape.
            if element_shape_s.is_fully_defined():
                element_shape_d = element_shape_s
            elif transform_fn is None:
                # It's important that we compute params[0].shape on the right device
                # to avoid data motion.
                with ops.colocate_with(params[0]):
                    params_shape = array_ops.shape(params[0])
                element_shape_d = params_shape[1:]
            else:
                element_shape_d = array_ops.shape(ret)[1:]

            # Reshape to reverse the flattening of ids.
            ret = array_ops.reshape(
                ret,
                array_ops.concat([array_ops.shape(ids), element_shape_d], 0))

            # Normally the reshape is sufficient, but setting shape explicitly
            # teaches shape inference that params[1:].get_shape() matters
            # (in the case that transform_fn is None).
            ret.set_shape(ids.get_shape().concatenate(element_shape_s))
            if not transform_fn:
                # If transform_fn was provided, the clip_by_norm was done above.
                ret = _clip(ret, ids, max_norm)
            return ret
Example #36
0
 def map_negative_offset(offset):
     """Returns `limits + offset`, bounded below by `starts`.

     `limits` and `starts` come from the enclosing scope.
     """
     shifted = limits + offset
     return math_ops.maximum(shifted, starts)
  def _kmc2_multiple_centers(self):
    """Adds new initial cluster centers using the k-MC2 algorithm.

    Each call splits the provided batch into subsets of `kmc2_chain_length`
    points. On every subset a single Markov chain of the k-MC2 algorithm is
    run to add *one* new cluster center. At least one center is always added,
    so a batch with fewer than `kmc2_chain_length` points yields one center
    from a single chain over the full input. The batch is assumed to have
    been randomly permuted beforehand; otherwise k-MC2 may return suboptimal
    centers.

    Returns:
      An op that adds new cluster centers.
    """
    # Only the first shard of the data is used.
    first_shard = self._inputs[0]
    # Number of usable input points.
    num_points = array_ops.shape(first_shard)[0]
    # Largest number of subsets that each still hold at least
    # `kmc2_chain_length` points. Final subsets may be larger.
    max_chains = math_ops.cast(
        num_points / self._kmc2_chain_length, dtype=dtypes.int32)
    # Sample at least one new center and at most all remaining centers.
    centers_to_add = math_ops.maximum(
        math_ops.minimum(self._num_remaining, max_chains), 1)

    def _cond(i, _):
      """Keeps looping until `centers_to_add` centers were added."""
      return math_ops.less(i, centers_to_add)

    def _body(i, _):
      """Adds one new center derived from the i-th subset."""

      def _sample_random():
        """Returns the first point of the shard as the only center."""
        # The batch is assumed to be shuffled and this branch is only taken
        # for i=0, so the first point serves as a random point.
        center = array_ops.reshape(first_shard[0], [1, -1])
        if self._distance_metric == COSINE_DISTANCE:
          center = nn_impl.l2_normalize(center, dim=1)
        return center

      def _sample_kmc2_chain():
        """Returns the previous centers plus one center sampled with k-MC2."""
        # Slice the i-th subset out of the batch.
        chain_start = i * self._kmc2_chain_length
        chain_end = chain_start + self._kmc2_chain_length
        chain_points = first_shard[chain_start:chain_end]
        # Distances from the subset's points to the previous centers.
        _, chain_distances = gen_clustering_ops.nearest_neighbors(
            chain_points, self._cluster_centers, 1)
        # Index of the new center, chosen by the k-MC2 Markov chain.
        chosen_index = gen_clustering_ops.kmc2_chain_initialization(
            array_ops.squeeze(chain_distances), self._random_seed)
        # Pull out the chosen point as a [1, dim] row.
        sampled_center = array_ops.reshape(chain_points[chosen_index],
                                           [1, -1])
        if self._distance_metric == COSINE_DISTANCE:
          sampled_center = nn_impl.l2_normalize(sampled_center, dim=1)
        # Append it to the centers sampled so far.
        return array_ops.concat([self._cluster_centers, sampled_center], 0)

      # With no center selected yet, take a random point; otherwise run a
      # k-MC2 Markov chain.
      no_centers_yet = math_ops.equal(self._num_selected, 0)
      centers = control_flow_ops.cond(
          no_centers_yet, _sample_random, _sample_kmc2_chain)
      # Write the new centers back to the underlying variable(s).
      stored_centers = state_ops.assign(
          self._cluster_centers, centers, validate_shape=False)
      if self._cluster_centers_updated is not self._cluster_centers:
        stored_centers = state_ops.assign(
            self._cluster_centers_updated,
            stored_centers,
            validate_shape=False)
      return i + 1, self._num_clusters - array_ops.shape(stored_centers)[0]

    # Add `centers_to_add` new centers.
    _, centers_still_missing = control_flow_ops.while_loop(
        _cond, _body, [0, 0])
    return centers_still_missing
def cluster_loss(
    labels,
    embeddings,
    margin_multiplier,
    enable_pam_finetuning=True,
    margin_type='nmi',
    print_losses=False,
):
    """Builds the structured clustering hinge loss.

    The loss is `max(score_pred + margin_multiplier * (1 - clustering_score)
    - score_gt, 0)`, where `score_pred` is the facility energy of the
    loss-augmented solution, `clustering_score` compares the predicted
    assignment with `labels` using `margin_type`, and `score_gt` is the score
    of the ground-truth clustering.

    Supported structured margins (`margin_type`):
      nmi: normalized mutual information
      ami: adjusted mutual information
      ari: adjusted Rand index
      vmeasure: v-measure
      const: indicator checking whether the two clusterings are the same.

    Paper: https://arxiv.org/abs/1612.01213.

    Args:
      labels: 2-D Tensor of labels of shape [batch size, 1]. It is squeezed
        before use.
      embeddings: 2-D Tensor of embeddings of shape
        [batch size, embedding dimension]. Embeddings should be l2 normalized.
      margin_multiplier: float32 scalar. Multiplier on the structured margin
        term. See section 3.2 of the paper for discussion.
      enable_pam_finetuning: Boolean. Whether to run the local PAM refinement,
        initialized from the augmented facility solution. See section 3.4 of
        the paper for discussion.
      margin_type: Type of structured margin to use. See section 3.2 of the
        paper for discussion. Can be 'nmi', 'ami', 'ari', 'vmeasure', 'const'.
      print_losses: Boolean. When true, the returned tensor is routed through
        a Print op that reports the loss and its shape.

    Returns:
      clustering_loss: A float32 scalar `Tensor`.

    Raises:
      ImportError: If sklearn dependency is not installed.
    """
    if not HAS_SKLEARN:
        raise ImportError('Cluster loss depends on sklearn.')

    distances = pairwise_distance(embeddings)
    labels = array_ops.squeeze(labels)
    candidate_ids = math_ops.range(array_ops.shape(embeddings)[0])

    # Loss-augmented inference: pick the cluster centroids, then score them.
    facility_ids = compute_augmented_facility_locations(
        distances, labels, candidate_ids, margin_multiplier, margin_type)
    score_pred = compute_facility_energy(distances, facility_ids)

    # Optional PAM refinement, seeded with the augmented facility solution.
    if enable_pam_finetuning:
        facility_ids = compute_augmented_facility_locations_pam(
            distances, labels, margin_multiplier, margin_type, facility_ids)
        score_pred = compute_facility_energy(distances, facility_ids)

    # Cluster assignment implied by the chosen centroids.
    predictions = get_cluster_assignment(distances, facility_ids)

    # Clustering quality (e.g. NMI) of the prediction against the labels.
    clustering_score_pred = compute_clustering_score(
        labels, predictions, margin_type)

    # Score of the ground-truth clustering.
    score_gt = compute_gt_cluster_score(distances, labels)

    # Hinge on (predicted score + structured margin - ground-truth score).
    margin_term = margin_multiplier * (1.0 - clustering_score_pred)
    hinge_argument = score_pred + margin_term - score_gt
    clustering_loss = math_ops.maximum(
        hinge_argument, 0.0, name='clustering_loss')
    clustering_loss.set_shape([])

    if print_losses:
        printed_values = [
            'clustering_loss: ',
            clustering_loss,
            array_ops.shape(clustering_loss),
        ]
        clustering_loss = logging_ops.Print(clustering_loss, printed_values)

    # Clustering specific summaries.
    for tag, value in (
        ('losses/score_pred', score_pred),
        ('losses/' + margin_type, clustering_score_pred),
        ('losses/score_gt', score_gt),
    ):
        summary.scalar(tag, value)

    return clustering_loss
Example #39
0
def _safe_shape_div(x, y):
    """Floor-divides `x` by `y` with the divisor clamped to at least 1.

    Assumes `x, y >= 0`. Because the divisor never drops below 1, `0 / 0`
    evaluates to `0` instead of failing.
    """
    clamped_divisor = math_ops.maximum(y, 1)
    return x // clamped_divisor
Example #40
0
def LastValueQuantize(inputs,
                      per_channel=False,
                      init_min=-6.0,
                      init_max=6.0,
                      updates_collection=ops.GraphKeys.UPDATE_OPS,
                      vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                      scope=None,
                      reuse=None,
                      is_training=True,
                      num_bits=8,
                      narrow_range=False):
    """Adds a fake-quantization layer whose range is the last seen input range.

    Two variables named 'min' and 'max' hold the interval used for
    quantization and clamping. In training mode they are overwritten on every
    step with the min/max of the current `inputs` (widened so that 0.0 is
    always inside the interval); in eval mode they are only read.

    Args:
      inputs: a tensor containing values to be quantized.
      per_channel: (Optional) a boolean specifying whether to use different
        quantization ranges per output channel (the last dimension).
      init_min: a float scalar, the initial value for variable min.
      init_max: a float scalar, the initial value for variable max.
      updates_collection: (Optional) collection that receives the ops
        assigning the new min and max.
      vars_collection: (Optional) collection where to store variables for
        quantization interval ends.
      scope: Optional scope for variable_scope.
      reuse: whether or not the layer and its variables should be reused. To
        be able to reuse the layer scope must be given.
      is_training: Whether the op is applied to a training or eval graph.
      num_bits: Number of bits to use for quantization, must be between 2 and
        8.
      narrow_range: Whether to use the narrow quantization range
        [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].

    Returns:
      a tensor containing quantized values.
    """
    with variable_scope.variable_scope(
            scope, 'LastValueQuantize', values=[inputs], reuse=reuse):
        input_shape = inputs.get_shape()
        rank = len(input_shape)
        if per_channel:
            # Only support quantizing 1-, 2- and 4-dimensional tensors.
            assert rank in [1, 2, 4], (
                'Expected 1D, 2D or 4D input, was: %s in '
                ' scope: %s' % (input_shape, scope))
            range_shape = [input_shape[-1]]
        else:
            range_shape = []

        def _range_variable(var_name, initial_value):
            # Non-trainable variable holding one end of the interval.
            return model_variable(
                var_name,
                shape=range_shape,
                initializer=init_ops.constant_initializer(initial_value),
                collections=[vars_collection],
                trainable=False)

        min_var = _range_variable('min', init_min)
        max_var = _range_variable('max', init_max)

        quant_options = dict(per_channel=per_channel,
                             num_bits=num_bits,
                             narrow_range=narrow_range)

        # Eval graphs quantize with the stored range and never update it.
        if not is_training:
            return _FakeQuantWithMinMaxVars(inputs, min_var, max_var,
                                            **quant_options)

        # Per-channel statistics reduce over every axis except the last one.
        if per_channel:
            if rank == 2:
                reduce_dims = [0]
            elif rank == 4:
                reduce_dims = [0, 1, 2]

        def _batch_extreme(reduce_fn, op_name):
            # Global reduction, per-channel reduction, or (1-D per-channel
            # input) the values themselves.
            if not per_channel:
                return reduce_fn(inputs, name=op_name)
            if rank >= 2:
                return reduce_fn(inputs,
                                 reduction_indices=reduce_dims,
                                 name=op_name)
            return inputs

        # TFLite requires that 0.0 is always in the [min; max] range.
        batch_min = math_ops.minimum(
            _batch_extreme(math_ops.reduce_min, 'BatchMin'), 0.0)
        assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast')
        ops.add_to_collection(updates_collection, assign_min.op)

        # TFLite requires that 0.0 is always in the [min; max] range.
        batch_max = math_ops.maximum(
            _batch_extreme(math_ops.reduce_max, 'BatchMax'), 0.0)
        assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast')
        ops.add_to_collection(updates_collection, assign_max.op)

        # Quantize with the freshly assigned range.
        return _FakeQuantWithMinMaxVars(inputs, assign_min, assign_max,
                                        **quant_options)
def stateless_random_gamma(shape,
                           seed,
                           alpha,
                           beta=None,
                           dtype=dtypes.float32,
                           name=None):
    """Outputs deterministic pseudorandom values from a gamma distribution.

    The generated values follow a gamma distribution with specified
    concentration (`alpha`) and inverse scale (`beta`) parameters.

    This is a stateless version of `tf.random.gamma`: if run twice with the
    same seeds and shapes, it will produce the same pseudorandom numbers. The
    output is consistent across multiple runs on the same hardware (and
    between CPU and GPU), but may change between versions of TensorFlow or on
    non-CPU/GPU hardware.

    The `shape` parameter is interpreted slightly differently than in
    `gamma`: in `gamma`, `shape` is always prepended to the shape of the
    broadcast of `alpha` with `beta`; in `stateless_gamma`, `shape` must
    always encompass the shapes of each of `alpha` and `beta` (which must
    broadcast together to match the trailing dimensions of `shape`).

    Note: Because internal calculations are done using `float64` and casting
    has `floor` semantics, zero outcomes are manually mapped to the smallest
    possible positive floating-point value, i.e., `np.finfo(dtype).tiny`.
    This means that `np.finfo(dtype).tiny` occurs more frequently than it
    otherwise should. This bias can only happen for small values of `alpha`,
    i.e., `alpha << 1`, or large values of `beta`, i.e., `beta >> 1`.

    The samples are differentiable w.r.t. alpha and beta. The derivatives are
    computed using the approach described in (Figurnov et al., 2018).

    Example:

    ```python
    samples = tf.random.stateless_gamma([10, 2], seed=[12, 34], alpha=[0.5, 1.5])
    # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents
    # the samples drawn from each distribution

    samples = tf.random.stateless_gamma([7, 5, 2], seed=[12, 34], alpha=[.5, 1.5])
    # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1]
    # represents the 7x5 samples drawn from each of the two distributions

    alpha = tf.constant([[1.], [3.], [5.]])
    beta = tf.constant([[3., 4.]])
    samples = tf.random.stateless_gamma(
        [30, 3, 2], seed=[12, 34], alpha=alpha, beta=beta)
    # samples has shape [30, 3, 2], with 30 samples each of 3x2 distributions.

    with tf.GradientTape() as tape:
      tape.watch([alpha, beta])
      loss = tf.reduce_mean(tf.square(tf.random.stateless_gamma(
          [30, 3, 2], seed=[12, 34], alpha=alpha, beta=beta)))
    dloss_dalpha, dloss_dbeta = tape.gradient(loss, [alpha, beta])
    # unbiased stochastic derivatives of the loss function
    alpha.shape == dloss_dalpha.shape  # True
    beta.shape == dloss_dbeta.shape  # True
    ```

    Args:
      shape: A 1-D integer Tensor or Python array. The shape of the output
        tensor.
      seed: A shape [2] Tensor, the seed to the random number generator. Must
        have dtype `int32` or `int64`. (When using XLA, only `int32` is
        allowed.)
      alpha: Tensor. The concentration parameter of the gamma distribution.
        Must be broadcastable with `beta`, and broadcastable with the
        rightmost dimensions of `shape`.
      beta: Tensor. The inverse scale parameter of the gamma distribution.
        Must be broadcastable with `alpha` and broadcastable with the
        rightmost dimensions of `shape`. Defaults to 1 when `None`.
      dtype: Floating point dtype of `alpha`, `beta`, and the output.
      name: A name for the operation (optional).

    Returns:
      samples: A Tensor of the specified shape filled with random gamma
        values. For each i, each `samples[..., i]` is an independent draw from
        the gamma distribution with concentration alpha[i] and inverse scale
        beta[i].
    """
    with ops.name_scope(name, "stateless_random_gamma",
                        [shape, seed, alpha, beta]) as name:
        shape = tensor_util.shape_tensor(shape)
        alpha = ops.convert_to_tensor(alpha, dtype=dtype, name="alpha")
        if beta is None:
            # No inverse scale given: fall back to the unit rate.
            beta = 1
        beta = ops.convert_to_tensor(beta, name="beta", dtype=dtype)

        # The sampling op receives alpha already expanded to the joint
        # alpha/beta shape.
        joint_shape = array_ops.broadcast_dynamic_shape(
            array_ops.shape(alpha), array_ops.shape(beta))
        alpha_expanded = array_ops.broadcast_to(alpha, joint_shape)

        # Floor used to replace zero outcomes (see the note in the docstring).
        smallest_positive = np.finfo(alpha.dtype.as_numpy_dtype).tiny
        raw_draws = gen_stateless_random_ops.stateless_random_gamma_v2(
            shape, seed=seed, alpha=alpha_expanded)
        # Dividing by beta applies the inverse scale.
        result = math_ops.maximum(smallest_positive, raw_draws / beta)
        tensor_util.maybe_set_static_shape(result, shape)
        return result
def frame(signal,
          frame_length,
          frame_step,
          pad_end=False,
          pad_value=0,
          axis=-1,
          name=None):
    """Expands `signal`'s `axis` dimension into frames of `frame_length`.

  Slides a window of size `frame_length` over `signal`'s `axis` dimension
  with a stride of `frame_step`, replacing the `axis` dimension with
  `[frames, frame_length]` frames.

  If `pad_end` is True, window positions that are past the end of the `axis`
  dimension are padded with `pad_value` until the window moves fully past the
  end of the dimension. Otherwise, only window positions that fully overlap the
  `axis` dimension are produced.

  The number of frames is `ceil(samples / frame_step)` when `pad_end` is True
  and `max(0, 1 + (samples - frame_length) // frame_step)` otherwise.

  For example:

  ```python
  # A batch size 3 tensor of 9152 audio samples.
  audio = tf.random.normal([3, 9152])

  # Compute overlapping frames of length 512 with a step of 180 (frames overlap
  # by 332 samples). By default, only 49 frames are generated
  # (1 + (9152 - 512) // 180), since a frame starting at sample 49 * 180 = 8820
  # would extend past the end of the signal.
  frames = tf.signal.frame(audio, 512, 180)
  frames.shape.assert_is_compatible_with([3, 49, 512])

  # When pad_end is enabled, the final two frames are kept (padded with zeros),
  # giving ceil(9152 / 180) = 51 frames.
  frames = tf.signal.frame(audio, 512, 180, pad_end=True)
  frames.shape.assert_is_compatible_with([3, 51, 512])
  ```

  Args:
    signal: A `[..., samples, ...]` `Tensor`. The rank and dimensions
      may be unknown. Rank must be at least 1.
    frame_length: The frame length in samples. An integer or scalar `Tensor`.
    frame_step: The frame hop size in samples. An integer or scalar `Tensor`.
    pad_end: Whether to pad the end of `signal` with `pad_value`.
    pad_value: An optional scalar `Tensor` to use where the input signal
      does not exist when `pad_end` is True.
    axis: A scalar integer `Tensor` indicating the axis to frame. Defaults to
      the last axis. Supports negative values for indexing from the end.
    name: An optional name for the operation.

  Returns:
    A `Tensor` of frames with shape `[..., frames, frame_length, ...]`.

  Raises:
    ValueError: If `frame_length`, `frame_step`, `pad_value`, or `axis` are not
      scalar.
  """
    with ops.name_scope(name, "frame",
                        [signal, frame_length, frame_step, pad_value]):
        signal = ops.convert_to_tensor(signal, name="signal")
        frame_length = ops.convert_to_tensor(frame_length, name="frame_length")
        frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
        axis = ops.convert_to_tensor(axis, name="axis")

        # Static rank checks on the inputs.
        signal.shape.with_rank_at_least(1)
        frame_length.shape.assert_has_rank(0)
        frame_step.shape.assert_has_rank(0)
        axis.shape.assert_has_rank(0)

        # Static shape of the result (if it can be inferred); it is attached
        # to the output at the very end.
        result_shape = _infer_frame_shape(signal, frame_length, frame_step,
                                          pad_end, axis)

        # Returns (static value, True) when `val` can be constant-folded at
        # graph construction time, otherwise (val, False).
        def maybe_constant(val):
            val_static = tensor_util.constant_value(val)
            return (val_static, True) if val_static is not None else (val,
                                                                      False)

        signal_shape, signal_shape_is_static = maybe_constant(
            array_ops.shape(signal))
        axis, axis_is_static = maybe_constant(axis)

        # Split the signal shape into the dimensions before `axis`, the length
        # of `axis` itself, and the dimensions after it. When everything is
        # static this is done on plain NumPy/Python values; otherwise graph
        # ops are used.
        if signal_shape_is_static and axis_is_static:
            # Axis can be negative. Convert it to positive.
            axis = range(len(signal_shape))[axis]
            outer_dimensions, length_samples, inner_dimensions = np.split(
                signal_shape, indices_or_sections=[axis, axis + 1])
            length_samples = length_samples.item()
        else:
            signal_rank = array_ops.rank(signal)
            # Axis can be negative. Convert it to positive.
            axis = math_ops.range(signal_rank)[axis]
            outer_dimensions, length_samples, inner_dimensions = array_ops.split(
                signal_shape, [axis, 1, signal_rank - 1 - axis])
            length_samples = array_ops.reshape(length_samples, [])
        num_outer_dimensions = array_ops.size(outer_dimensions)
        num_inner_dimensions = array_ops.size(inner_dimensions)

        # If padding is requested, pad the input signal tensor with pad_value.
        if pad_end:
            pad_value = ops.convert_to_tensor(pad_value, signal.dtype)
            pad_value.shape.assert_has_rank(0)

            # Calculate number of frames, using double negatives to round up.
            num_frames = -(-length_samples // frame_step)

            # Pad the signal so that the last frame, which starts at
            # frame_step * (num_frames - 1), fits entirely; never pad by a
            # negative amount.
            pad_samples = math_ops.maximum(
                0,
                frame_length + frame_step * (num_frames - 1) - length_samples)

            # Pad only the end of the `axis` dimension by pad_samples; the
            # outer and inner dimensions get zero padding on both sides.
            paddings = array_ops.concat([
                array_ops.zeros([num_outer_dimensions, 2],
                                dtype=pad_samples.dtype), [[0, pad_samples]],
                array_ops.zeros([num_inner_dimensions, 2],
                                dtype=pad_samples.dtype)
            ], 0)
            signal = array_ops.pad(signal, paddings, constant_values=pad_value)

            # The signal grew, so refresh its shape and the framed length.
            signal_shape = array_ops.shape(signal)
            length_samples = signal_shape[axis]
        else:
            # Only frames that fit entirely inside the signal are produced.
            num_frames = math_ops.maximum(
                0, 1 + (length_samples - frame_length) // frame_step)

        # Work in units of "subframes" whose length is the gcd of frame_length
        # and frame_step, so that every frame and every hop is a whole number
        # of subframes.
        subframe_length, _ = maybe_constant(
            util_ops.gcd(frame_length, frame_step))
        subframes_per_frame = frame_length // subframe_length
        subframes_per_hop = frame_step // subframe_length
        num_subframes = length_samples // subframe_length

        # Drop trailing samples that do not fill a whole subframe, then
        # reshape the `axis` dimension into [num_subframes, subframe_length].
        slice_shape = array_ops.concat([
            outer_dimensions, [num_subframes * subframe_length],
            inner_dimensions
        ], 0)
        subframe_shape = array_ops.concat([
            outer_dimensions, [num_subframes, subframe_length],
            inner_dimensions
        ], 0)
        subframes = array_ops.reshape(
            array_ops.strided_slice(signal, array_ops.zeros_like(signal_shape),
                                    slice_shape), subframe_shape)

        # frame_selector is a [num_frames, 1] tensor holding the index of the
        # first subframe of each frame. Broadcast to
        # [num_frames, subframes_per_frame] it looks like, for example:
        # [[0, 0, 0, 0], [2, 2, 2, 2], [4, 4, 4, 4]]
        frame_selector = array_ops.reshape(
            math_ops.range(num_frames) * subframes_per_hop, [num_frames, 1])

        # subframe_selector is a [1, subframes_per_frame] tensor holding the
        # offset of each subframe within a frame. Broadcast to
        # [num_frames, subframes_per_frame] it looks like, for example:
        # [[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]
        subframe_selector = array_ops.reshape(
            math_ops.range(subframes_per_frame), [1, subframes_per_frame])

        # Adding the 2 selector tensors together (with broadcasting) produces a
        # [num_frames, subframes_per_frame] tensor of indices to use with
        # tf.gather to select subframes from subframes. We then reshape the
        # inner-most subframes_per_frame dimension to stitch the subframes
        # together into frames. For example:
        # [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7]].
        selector = frame_selector + subframe_selector

        frames = array_ops.reshape(
            array_ops.gather(subframes, selector, axis=axis),
            array_ops.concat([
                outer_dimensions, [num_frames, frame_length], inner_dimensions
            ], 0))

        # Attach the statically inferred shape when one is available.
        if result_shape:
            frames.set_shape(result_shape)
        return frames
Example #43
0
def lifted_struct_loss(labels, embeddings, margin=1.0):
    """Computes the lifted structured loss.

    The loss encourages the positive distances (between a pair of embeddings
    with the same labels) to be smaller than any negative distances (between
    a pair of embeddings with different labels) in the mini-batch, in a way
    that is differentiable with respect to the embedding vectors.
    See: https://arxiv.org/abs/1511.06452.

    Args:
      labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
        multiclass integer labels.
      embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should
        not be l2 normalized.
      margin: Float, margin term in the loss definition.

    Returns:
      lifted_loss: tf.float32 scalar.
    """
    # Turn the [batch_size] label vector into a [batch_size, 1] column.
    label_shape = array_ops.shape(labels)
    assert label_shape.shape == 1
    labels = array_ops.reshape(labels, [label_shape[0], 1])

    # Pairwise distance matrix of the embeddings.
    pairwise_distances = pairwise_distance(embeddings)

    # same_label[i, j] is true when samples i and j share a label; its
    # negation selects the negative pairs.
    same_label = math_ops.equal(labels, array_ops.transpose(labels))
    different_label = math_ops.logical_not(same_label)

    batch_size = array_ops.size(labels)

    diff = margin - pairwise_distances
    negative_mask = math_ops.cast(different_label, dtype=dtypes.float32)

    # Masked row-wise maximum over negatives only: shift each row so its
    # entries are non-negative, mask, take the max, then undo the shift.
    row_minimums = math_ops.reduce_min(diff, 1, keep_dims=True)
    shifted_negatives = math_ops.multiply(diff - row_minimums, negative_mask)
    row_negative_maximums = math_ops.reduce_max(
        shifted_negatives, 1, keep_dims=True) + row_minimums

    # The pairwise maximum is subtracted before exponentiating and added back
    # after the log.
    max_elements = math_ops.maximum(
        row_negative_maximums, array_ops.transpose(row_negative_maximums))
    diff_tiled = array_ops.tile(diff, [batch_size, 1])
    mask_tiled = array_ops.tile(negative_mask, [batch_size, 1])
    max_elements_vect = array_ops.reshape(
        array_ops.transpose(max_elements), [-1, 1])

    masked_exponentials = math_ops.multiply(
        math_ops.exp(diff_tiled - max_elements_vect), mask_tiled)
    loss_exp_left = array_ops.reshape(
        math_ops.reduce_sum(masked_exponentials, 1, keep_dims=True),
        [batch_size, batch_size])

    symmetric_exp_sum = loss_exp_left + array_ops.transpose(loss_exp_left)
    loss_mat = max_elements + math_ops.log(symmetric_exp_sum)
    # Add the positive distance.
    loss_mat = loss_mat + pairwise_distances

    # Positive pairs: same label, excluding each sample paired with itself.
    mask_positives = math_ops.cast(
        same_label, dtype=dtypes.float32) - array_ops.diag(
            array_ops.ones([batch_size]))

    # *0.5 for upper triangular, and another *0.5 for 1/2 factor for loss^2.
    num_positives = math_ops.reduce_sum(mask_positives) / 2.0

    hinged = math_ops.maximum(
        math_ops.multiply(loss_mat, mask_positives), 0.0)
    lifted_loss = math_ops.truediv(
        0.25 * math_ops.reduce_sum(math_ops.square(hinged)),
        num_positives,
        name='liftedstruct_loss')
    return lifted_loss
 def max_(x, y):
     """Returns the larger of `x` and `y`.

     Uses `math_ops.maximum` when either argument is a tensor (per
     `_is_tensor`); otherwise falls back to Python's built-in `max`.
     """
     if not (_is_tensor(x) or _is_tensor(y)):
         return max(x, y)
     return math_ops.maximum(x, y)
Example #45
0
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm.

        Computes the batch renorm corrections `r` and `d` from the batch
        moments and the renorm moving averages, clips them according to
        `self.renorm_clipping`, and updates the renorm moving averages.

        Args:
          mean: Batch mean.
          variance: Batch variance.
          training: Training flag passed to `_smart_select`; when it selects
            the non-training branch, `r` is all ones, `d` is all zeros and the
            decay is 1.

        Returns:
          A tuple `(r, d, new_mean, new_variance)`, where `new_mean` is the
          updated renorm mean divided by its weight and `new_variance` is
          chosen so that `sqrt(new_variance + epsilon)` equals the updated
          renorm stddev divided by its weight.
        """
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages: the copies of
        # mean and stddev that feed the updates below can only be produced
        # after r and d have been computed.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        # Optional clipping bounds; a missing key yields None (no clipping).
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            # Clip d symmetrically to [-dmax, dmax].
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0, and decay=1 meaning no updates.
        r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r))
        d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d))
        decay = _smart_select(training, lambda: self.renorm_momentum,
                              lambda: 1.)

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            # Update the variables without zero debiasing. The debiasing will be
            # accomplished by dividing the exponential moving average by the weight.
            # For example, after a single update, the moving average would be
            # (1-decay) * value. and the weight will be 1-decay, with their ratio
            # giving value.
            # Make sure the weight is not updated until after r and d computation:
            # `value` already depends on r and d (see the control dependency
            # above), and the constant fed to the weight update depends on
            # `value`.
            value = array_ops.identity(value)
            with ops.control_dependencies([value]):
                weight_value = array_ops.constant(1., dtype=weight.dtype)
            new_var = moving_averages.assign_moving_average(var,
                                                            value,
                                                            decay,
                                                            zero_debias=False)
            new_weight = moving_averages.assign_moving_average(
                weight, weight_value, decay, zero_debias=False)
            return new_var / new_weight

        with ops.colocate_with(self.moving_mean):
            new_mean = _update_renorm_variable(self.renorm_mean,
                                               self.renorm_mean_weight, mean)
        with ops.colocate_with(self.moving_variance):
            new_stddev = _update_renorm_variable(self.renorm_stddev,
                                                 self.renorm_stddev_weight,
                                                 stddev)
            # Make sqrt(moving_variance + epsilon) = new_stddev.
            new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
Example #46
0
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm.

        Computes the batch renorm corrections `r` and `d` from the batch
        moments and the renorm moving averages, clips them according to
        `self.renorm_clipping`, and (only when training) updates the renorm
        moving averages.

        Args:
          mean: Batch mean.
          variance: Batch variance.
          training: Training flag passed to `tf_utils.smart_cond`; when it
            selects the non-training branch, `r` is all ones, `d` is all zeros
            and the renorm variables are read without being updated.

        Returns:
          A tuple `(r, d, new_mean, new_variance)`. When training, `new_mean`
          is the updated renorm mean divided by its weight; otherwise it is
          the current renorm mean. `new_variance` is chosen so that
          `sqrt(new_variance + epsilon)` equals the corresponding stddev
          value.
        """
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages: the copies of
        # mean and stddev that feed the updates below can only be produced
        # after r and d have been computed.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        # Optional clipping bounds; a missing key yields None (no clipping).
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            # Clip d symmetrically to [-dmax, dmax].
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0.
        r = tf_utils.smart_cond(training, lambda: r,
                                lambda: array_ops.ones_like(r))
        d = tf_utils.smart_cond(training, lambda: d,
                                lambda: array_ops.zeros_like(d))

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            value = array_ops.identity(value)

            def _do_update():
                """Updates the var and weight, returns their updated ratio."""
                # Update the variables without zero debiasing. The debiasing will be
                # accomplished by dividing the exponential moving average by the weight.
                # For example, after a single update, the moving average would be
                # (1-decay) * value. and the weight will be 1-decay, with their ratio
                # giving the value.
                # Make sure the weight is not updated until after r and d computation:
                # `value` already depends on r and d (see the control dependency
                # above), and the constant fed to the weight update depends on
                # `value`.
                with ops.control_dependencies([value]):
                    weight_value = array_ops.constant(1., dtype=weight.dtype)
                new_var = self._assign_moving_average(var, value,
                                                      self.renorm_momentum)
                new_weight = self._assign_moving_average(
                    weight, weight_value, self.renorm_momentum)
                # TODO(yuefengz): the updates to var and weighted can not be batched
                # together if we fetch their updated values here. Consider calculating
                # new values and delaying the updates.
                return new_var / new_weight

            def _fake_update():
                # Not training: return the current value of the variable
                # without touching it or its weight.
                return array_ops.identity(var)

            return tf_utils.smart_cond(training, _do_update, _fake_update)

        # TODO(yuefengz): colocate the operations
        new_mean = _update_renorm_variable(self.renorm_mean,
                                           self.renorm_mean_weight, mean)
        new_stddev = _update_renorm_variable(self.renorm_stddev,
                                             self.renorm_stddev_weight, stddev)
        # Make sqrt(moving_variance + epsilon) = new_stddev.
        new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
Example #47
0
def confusion_matrix(labels,
                     predictions,
                     num_classes=None,
                     dtype=dtypes.int32,
                     name=None,
                     weights=None):
    """Computes the confusion matrix from predictions and labels.

    Calculate the Confusion Matrix for a pair of prediction and
    label 1-D int arrays.

    The matrix rows represent the prediction labels and the columns
    represent the real labels, i.e. every example contributes to the cell
    `[prediction, label]`. The confusion matrix is always a 2-D array
    of shape `[n, n]`, where `n` is the number of valid labels for a given
    classification task. Both prediction and labels must be 1-D arrays of
    the same shape in order for this function to work.

    If `num_classes` is None, then `num_classes` will be set to one plus
    the maximum value in either predictions or labels.
    Class labels are expected to start at 0. E.g., if `num_classes` was
    three, then the possible labels would be `[0, 1, 2]`.

    If `weights` is not `None`, then each prediction contributes its
    corresponding weight to the total value of the confusion matrix cell.

    For example:

    ```python
      tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
          [[0 0 0 0 0]
           [0 0 0 0 0]
           [0 1 1 0 0]
           [0 0 0 0 0]
           [0 0 0 0 1]]
    ```

    Here the labels are `[1, 2, 4]` and the predictions are `[2, 2, 4]`, so
    the cells `[2, 1]`, `[2, 2]` and `[4, 4]` (row = prediction,
    column = label) are set.

    Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`,
    resulting in a 5x5 confusion matrix.

    Args:
      labels: A 1-D array representing the real labels for the classification
        task.
      predictions: A 1-D array representing the predictions for a given
        classification.
      num_classes: The possible number of labels the classification task can
        have. If this value is not provided, it will be calculated
        using both predictions and labels array.
      dtype: Data type of the confusion matrix.
      name: Scope name.
      weights: An optional `Tensor` (or value convertible to one) whose shape
        matches `predictions`.

    Returns:
      A k X k matrix representing the confusion matrix, where k is the number
      of possible labels in the classification task.

    Raises:
      ValueError: If both predictions and labels are not 1-D vectors and have
        mismatched shapes, or if `weights` is not `None` and its shape doesn't
        match `predictions`.
    """
    with ops.name_scope(name, 'confusion_matrix',
                        [predictions, labels, num_classes]) as name:
        labels, predictions = remove_squeezable_dimensions(
            ops.convert_to_tensor(labels, name='labels'),
            ops.convert_to_tensor(predictions, name='predictions'))
        predictions = math_ops.cast(predictions, dtypes.int64)
        labels = math_ops.cast(labels, dtypes.int64)

        if num_classes is None:
            num_classes = math_ops.maximum(math_ops.reduce_max(predictions),
                                           math_ops.reduce_max(labels)) + 1

        if weights is not None:
            # Convert first so that plain Python lists / NumPy arrays are
            # accepted just like `labels` and `predictions`; without this the
            # shape check below fails with AttributeError on non-tensors.
            weights = ops.convert_to_tensor(weights, name='weights')
            predictions.get_shape().assert_is_compatible_with(
                weights.get_shape())
            weights = math_ops.cast(weights, dtype)

        shape = array_ops.pack([num_classes, num_classes])
        # Each index row is [prediction, label]: rows of the result are
        # predictions, columns are real labels.
        indices = array_ops.transpose(array_ops.pack([predictions, labels]))
        values = (array_ops.ones_like(predictions, dtype)
                  if weights is None else weights)
        cm_sparse = sparse_tensor.SparseTensor(indices=indices,
                                               values=values,
                                               shape=math_ops.to_int64(shape))
        zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype)

        # Adding the sparse entries onto a dense zero matrix yields the dense
        # confusion matrix.
        return sparse_ops.sparse_add(zero_matrix, cm_sparse)
Example #48
0
def shuffle_batch_join(tensors_list,
                       batch_size,
                       capacity,
                       min_after_dequeue,
                       seed=None,
                       enqueue_many=False,
                       shapes=None,
                       shared_name=None,
                       name=None):
    """Create batches by randomly shuffling tensors.

    `tensors_list` is a list of tuples of tensors, or a list of dictionaries
    of tensors. Each element of the list is treated similarly to the
    `tensors` argument of `tf.train.shuffle_batch()`.

    Unlike `shuffle_batch`, this version enqueues a different list of tensors
    in each thread. It adds the following to the current `Graph`:

    * A shuffling queue into which tensors from `tensors_list` are enqueued.
    * A `dequeue_many` operation to create batches from the queue.
    * A `QueueRunner` to `QUEUE_RUNNER` collection, to enqueue the tensors
      from `tensors_list`.

    `len(tensors_list)` threads will be started, with thread `i` enqueuing
    the tensors from `tensors_list[i]`. `tensors_list[i1][j]` must match
    `tensors_list[i2][j]` in type and shape, except in the first dimension if
    `enqueue_many` is true.

    If `enqueue_many` is `False`, each `tensors_list[i]` is assumed to
    represent a single example. An input tensor with shape `[x, y, z]`
    will be output as a tensor with shape `[batch_size, x, y, z]`.

    If `enqueue_many` is `True`, `tensors_list[i]` is assumed to represent a
    batch of examples, where the first dimension is indexed by example, and
    all members of `tensors_list[i]` should have the same size in the first
    dimension. If an input tensor has shape `[*, x, y, z]`, the output will
    have shape `[batch_size, x, y, z]`.

    The `capacity` argument controls how long the prefetching is allowed to
    grow the queues.

    The returned operation is a dequeue operation and will throw
    `tf.errors.OutOfRangeError` if the input queue is exhausted. If this
    operation is feeding another input queue, its queue runner will catch
    this exception; however, if this operation is used in your main thread
    you are responsible for catching this yourself.

    Args:
      tensors_list: A list of tuples or dictionaries of tensors to enqueue.
      batch_size: An integer. The new batch size pulled from the queue.
      capacity: An integer. The maximum number of elements in the queue.
      min_after_dequeue: Minimum number elements in the queue after a
        dequeue, used to ensure a level of mixing of elements.
      seed: Seed for the random shuffling within the queue.
      enqueue_many: If `False`, each `tensors_list[i]` is a single example;
        if `True`, each is a batch of examples.
      shapes: (Optional) The shapes for each example.  Defaults to the
        inferred shapes for `tensors_list[i]`.
      shared_name: (Optional) If set, this queue will be shared under the
        given name across multiple sessions.
      name: (Optional) A name for the operations.

    Returns:
      A list or dictionary of tensors with the same number and types as
      `tensors_list[i]`.

    Raises:
      ValueError: If the `shapes` are not specified, and cannot be
        inferred from the elements of `tensors_list`.
    """
    per_thread_tensors = _as_tensor_list_list(tensors_list)
    with ops.op_scope(_flatten(per_thread_tensors), name,
                      "shuffle_batch_join") as name:
        per_thread_tensors = _validate_join(per_thread_tensors)
        per_thread_tensors, sparse_info = _serialize_sparse_tensors_join(
            per_thread_tensors, enqueue_many)

        # Element dtypes/shapes are derived from the (serialized) inputs.
        queue = data_flow_ops.RandomShuffleQueue(
            capacity=capacity,
            min_after_dequeue=min_after_dequeue,
            seed=seed,
            dtypes=_dtypes(per_thread_tensors),
            shapes=_shapes(per_thread_tensors, shapes, enqueue_many),
            shared_name=shared_name)
        _enqueue_join(queue, per_thread_tensors, enqueue_many)

        # Summary: how full the queue is above the `min_after_dequeue` floor,
        # as a fraction of the room between that floor and `capacity`.
        above_floor = math_ops.maximum(0, queue.size() - min_after_dequeue)
        above_floor = math_ops.cast(above_floor, dtypes.float32)
        headroom = capacity - min_after_dequeue
        fill_fraction = above_floor * (1. / headroom)
        # `name` already ends with '/', hence no '/' after %s.
        summary_name = "queue/%sfraction_over_%d_of_%d_full" % (
            name, min_after_dequeue, headroom)
        logging_ops.scalar_summary(summary_name, fill_fraction)

        batch = queue.dequeue_many(batch_size, name=name)
        batch = _deserialize_sparse_tensors(batch, sparse_info)
        # tensors_list was validated to not be empty, so element 0 exists and
        # determines the container type of the result.
        return _as_original_type(tensors_list[0], batch)
Example #49
0
def categorical_hinge(y_true, y_pred):
    """Computes `max(0, neg - pos + 1)` along the last axis.

    `pos` is the sum over the last axis of `y_true * y_pred`, and `neg` is the
    maximum over the last axis of `(1 - y_true) * y_pred`.

    Args:
      y_true: Target values.
      y_pred: Predicted values.

    Returns:
      The element-wise maximum of `0` and `neg - pos + 1`.
    """
    true_score = math_ops.reduce_sum(y_true * y_pred, axis=-1)
    complement = 1. - y_true
    best_other_score = math_ops.reduce_max(complement * y_pred, axis=-1)
    margin = best_other_score - true_score + 1.
    return math_ops.maximum(0., margin)
Example #50
0
def shuffle_batch(tensors,
                  batch_size,
                  capacity,
                  min_after_dequeue,
                  num_threads=1,
                  seed=None,
                  enqueue_many=False,
                  shapes=None,
                  shared_name=None,
                  name=None):
    """Creates batches by randomly shuffling tensors.

    This function adds the following to the current `Graph`:

    * A shuffling queue into which tensors from `tensors` are enqueued.
    * A `dequeue_many` operation to create batches from the queue.
    * A `QueueRunner` to `QUEUE_RUNNER` collection, to enqueue the tensors
      from `tensors`.

    If `enqueue_many` is `False`, `tensors` is assumed to represent a
    single example.  An input tensor with shape `[x, y, z]` will be output
    as a tensor with shape `[batch_size, x, y, z]`.

    If `enqueue_many` is `True`, `tensors` is assumed to represent a
    batch of examples, where the first dimension is indexed by example,
    and all members of `tensors` should have the same size in the
    first dimension.  If an input tensor has shape `[*, x, y, z]`, the
    output will have shape `[batch_size, x, y, z]`.

    The `capacity` argument controls how long the prefetching is allowed to
    grow the queues.

    The returned operation is a dequeue operation and will throw
    `tf.errors.OutOfRangeError` if the input queue is exhausted. If this
    operation is feeding another input queue, its queue runner will catch
    this exception; however, if this operation is used in your main thread
    you are responsible for catching this yourself.

    For example:

    ```python
    # Creates batches of 32 images and 32 labels.
    image_batch, label_batch = tf.train.shuffle_batch(
          [single_image, single_label],
          batch_size=32,
          num_threads=4,
          capacity=50000,
          min_after_dequeue=10000)
    ```

    *N.B.:* You must ensure that either (i) the `shapes` argument is
    passed, or (ii) all of the tensors in `tensors` must have
    fully-defined shapes. `ValueError` will be raised if neither of
    these conditions holds.

    Args:
      tensors: The list or dictionary of tensors to enqueue.
      batch_size: The new batch size pulled from the queue.
      capacity: An integer. The maximum number of elements in the queue.
      min_after_dequeue: Minimum number elements in the queue after a
        dequeue, used to ensure a level of mixing of elements.
      num_threads: The number of threads enqueuing `tensors`.
      seed: Seed for the random shuffling within the queue.
      enqueue_many: If `False`, `tensors` is a single example; if `True`, it
        is a batch of examples.
      shapes: (Optional) The shapes for each example.  Defaults to the
        inferred shapes for `tensors`.
      shared_name: (Optional) If set, this queue will be shared under the
        given name across multiple sessions.
      name: (Optional) A name for the operations.

    Returns:
      A list or dictionary of tensors with the types as `tensors`.

    Raises:
      ValueError: If the `shapes` are not specified, and cannot be
        inferred from the elements of `tensors`.
    """
    flat_tensors = _as_tensor_list(tensors)
    with ops.op_scope(flat_tensors, name, "shuffle_batch") as name:
        flat_tensors = _validate(flat_tensors)
        flat_tensors, sparse_info = _serialize_sparse_tensors(
            flat_tensors, enqueue_many)

        # The dtype/shape helpers take a list of tensor lists, so wrap the
        # single list.
        queue = data_flow_ops.RandomShuffleQueue(
            capacity=capacity,
            min_after_dequeue=min_after_dequeue,
            seed=seed,
            dtypes=_dtypes([flat_tensors]),
            shapes=_shapes([flat_tensors], shapes, enqueue_many),
            shared_name=shared_name)
        _enqueue(queue, flat_tensors, num_threads, enqueue_many)

        # Summary: how full the queue is above the `min_after_dequeue` floor,
        # as a fraction of the room between that floor and `capacity`.
        above_floor = math_ops.maximum(0, queue.size() - min_after_dequeue)
        above_floor = math_ops.cast(above_floor, dtypes.float32)
        headroom = capacity - min_after_dequeue
        fill_fraction = above_floor * (1. / headroom)
        # `name` already ends with '/', hence no '/' after %s.
        summary_name = "queue/%sfraction_over_%d_of_%d_full" % (
            name, min_after_dequeue, headroom)
        logging_ops.scalar_summary(summary_name, fill_fraction)

        batch = queue.dequeue_many(batch_size, name=name)
        batch = _deserialize_sparse_tensors(batch, sparse_info)
        return _as_original_type(tensors, batch)
Example #51
0
def hinge(y_true, y_pred):
    """Computes the mean over the last axis of `max(1 - y_true * y_pred, 0)`.

    Args:
      y_true: Target values.
      y_pred: Predicted values.

    Returns:
      `K.mean` of the clipped margins, reduced along axis `-1`.
    """
    margins = 1. - y_true * y_pred
    clipped = math_ops.maximum(margins, 0.)
    return K.mean(clipped, axis=-1)
Example #52
0
    def get_updates(self, loss, params):
        """Builds and returns the list of update ops for one optimizer step.

        Uses Adam-style first/second moment estimates (optionally with the
        AMSGrad running maximum), scaled by `self.eta_t`, and optionally
        applies per-parameter learning-rate multipliers, weight decay and
        parameter constraints.

        Side effects: resets `self.updates`, and sets `self.weights`,
        `self.lr_t` and (when cosine annealing is active) `self.eta_t`.

        Args:
          loss: Loss passed to `self.get_gradients`.
          params: Iterable of parameters to update; each is passed to
            `K.int_shape` / `K.dtype`, has its `name` looked up in
            `self.weight_decays`, and may carry a `constraint` attribute.

        Returns:
          `self.updates`, the list of assignment ops built here.
        """
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        # Time-based decay: lr / (1 + decay * iterations).
        if self.initial_decay > 0:
            lr = lr * (  # pylint: disable=g-no-augmented-assignment
                1. / (1. + self.decay *
                      math_ops.cast(self.iterations, K.dtype(self.decay))))

        # Read the step counter only after it has been incremented.
        with ops.control_dependencies(
            [state_ops.assign_add(self.iterations, 1)]):
            t = math_ops.cast(self.iterations, K.floatx())
        self.updates.append(state_ops.assign_add(self.t_cur, 1))

        # Bias-corrected step size:
        # lr * sqrt(1 - beta_2^t) / (1 - beta_1^t).
        lr_t = lr * (math_ops.sqrt(1. - math_ops.pow(self.beta_2, t)) /
                     (1. - math_ops.pow(self.beta_1, t)))

        # Per-parameter slots: first moments, second moments, and (for
        # AMSGrad) the running maximum of the second moments.
        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            # Placeholders so that the zip below and `self.weights` keep the
            # same structure; they are never assigned in this branch.
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        total_iterations = self.total_iterations
        # Cosine annealing
        if self.use_cosine_annealing and total_iterations != 0:
            self.eta_t = _compute_eta_t(self)
        # Computed before the per-parameter loop, so it does not include any
        # learning-rate multipliers applied below.
        self.lr_t = lr_t * self.eta_t  # for external tracking

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            # Learning rate multipliers
            # NOTE(review): `lr_t` is reassigned here, so the value handed to
            # `_apply_lr_multiplier` for one parameter is the result from the
            # previous parameter. If the helper scales its input, multipliers
            # would compound across parameters -- confirm this is intended.
            if self.lr_multipliers is not None:
                lr_t = _apply_lr_multiplier(self, lr_t, p, force_eager=True)

            # Exponential moving averages of the gradient and its square.
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
            if self.amsgrad:
                # AMSGrad: normalize by the running maximum of v_t.
                vhat_t = math_ops.maximum(vhat, v_t)
                p_t = p - self.eta_t * lr_t * m_t / (math_ops.sqrt(vhat_t) +
                                                     self.epsilon)
                self.updates.append(state_ops.assign(vhat, vhat_t))
            else:
                p_t = p - self.eta_t * lr_t * m_t / (math_ops.sqrt(v_t) +
                                                     self.epsilon)

            self.updates.append(state_ops.assign(m, m_t))
            self.updates.append(state_ops.assign(v, v_t))

            # Weight decays
            # Applied only to parameters whose name is a key of
            # `self.weight_decays`, and only when `total_iterations` is set.
            if p.name in self.weight_decays.keys() and total_iterations != 0:
                p_t = _apply_weight_decays(self, p, p_t, force_eager=True)
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(state_ops.assign(p, new_p))
        return self.updates
Example #53
0
def ragged_reduce_aggregate(reduce_op,
                            unsorted_segment_op,
                            rt_input,
                            axis,
                            keepdims,
                            separator=None,
                            name=None):
    """Aggregates across axes of a RaggedTensor using the given `Tensor` ops.

    Reduces `rt_input` along the dimensions given in `axis`.  The rank of the
    tensor is reduced by 1 for each entry in `axis`.  If `axis` is not
    specified, then all dimensions are reduced, and a scalar value is
    returned.

    This op assumes that `reduce_op` and `unsorted_segment_op` are
    associative; if not, then reducing multiple axes will return incorrect
    results.  (In particular, reducing multiple axes is currently implemented
    by reducing the axes one at a time.)

    Args:
      reduce_op: The tensorflow `op` that should be used to reduce values in
        uniform dimensions.  Must have the same signature and basic behavior
        as `reduce_sum`, `reduce_max`, etc.
      unsorted_segment_op: The tensorflow `op` that should be used to combine
        values in ragged dimensions.  Must have the same signature and basic
        behavior as `unsorted_segment_sum`, `unsorted_segment_max`, etc.
      rt_input: A `Tensor` or `RaggedTensor` containing the values to be
        reduced.
      axis: The axis or axes to reduce.  May be `None` (to reduce all axes),
        an `int` (to reduce a single axis), a `list` or `tuple` of `int` (to
        reduce a given set of axes), or a `Tensor` with a constant value.
        Must be in the range `[0, rt_input.rank)`.
      keepdims: If true, retains reduced dimensions with length 1.
      separator: An optional string. Defaults to None. The separator to use
        when joining. The separator must not be set for non-string data types
        (i.e. if separator is not None then string ops are used).
      name: A name prefix for the returned tensor (optional).

    Returns:
      A `RaggedTensor` containing the reduced values.  The returned tensor
      has the same dtype as `rt_input`, and its shape is given by removing the
      dimensions specified in `axis` from `rt_input.shape`.  The `ragged_rank`
      of the returned tensor is given by subtracting any ragged dimensions
      specified in `axis` from `rt_input.ragged_rank`.

    Raises:
      ValueError: If `axis` contains a `Tensor` whose value is not constant.
    """
    # Dense input: delegate directly to the dense reduction op.
    if not ragged_tensor.is_ragged(rt_input):
        if separator is None:
            return reduce_op(rt_input, axis, keepdims=keepdims, name=name)
        else:
            # When separator is not None, We infer that dtype is string and
            # reduce_join will be called.
            return reduce_op(rt_input,
                             axis,
                             keepdims=keepdims,
                             name=name,
                             separator=separator)

    # A tensor-valued axis must be statically known; fold it to a Python
    # value (a list when it is an array) so the branches below can inspect it.
    if isinstance(axis, ops.Tensor):
        axis = tensor_util.constant_value(axis)
        if axis is None:
            raise ValueError('axis must be known at graph construction time.')
        if isinstance(axis, np.ndarray):
            axis = axis.tolist()

    # When reducing all axes, just ignore splits & reduce the inner values.
    if axis is None:
        result = reduce_op(rt_input.flat_values,
                           None,
                           keepdims=keepdims,
                           name=name)
        if keepdims:
            # Expand the result to the input number of dimensions.
            for _ in rt_input.shape[1:]:
                result = array_ops.expand_dims(result, axis=0)
        return result

    with ops.name_scope(name, 'RaggedReduce', [rt_input, axis]):
        if isinstance(axis, (tuple, list)):
            if not axis:
                # Nothing to reduce.
                return rt_input
            elif len(axis) == 1:
                axis = axis[0]
            else:
                # When reducing multiple axes, as we reduce one at a time (see below),
                # the negative axis has to be converted to positive at the first run
                # as the sort with negative axis will have different orders.
                # See GitHub issue 27497.
                axis = [
                    array_ops.get_positive_axis(a, rt_input.shape.ndims,
                                                'axis[%s]' % i,
                                                'rank(input_tensor)')
                    for i, a in enumerate(axis)
                ]
                # When reducing multiple axes, just reduce one at a time.  This is less
                # efficient, and only works for associative ops.  (In particular, it
                # does not work for reduce_mean.)  However, reducing multiple axes at
                # once will probably require a nontrivial c++ op.
                # The largest axis is reduced first, then the remaining
                # (smaller) axes recursively; `name` is not forwarded to the
                # recursive calls.
                axis = sorted(axis)
                inner_reduced = ragged_reduce_aggregate(
                    reduce_op, unsorted_segment_op, rt_input, axis[-1],
                    keepdims, separator)
                return ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
                                               inner_reduced, axis[:-1],
                                               keepdims, separator)

        rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
            rt_input, name='rt_input')

        # From here on `axis` is a single non-negative integer.
        axis = array_ops.get_positive_axis(axis,
                                           rt_input.shape.ndims,
                                           ndims_name='rank(input_tensor)')

        if axis == 0:
            # out[i_1, i_2, ..., i_N] = sum_{j} rt_input[j, i_1, i_2, ..., i_N]
            row_lengths = rt_input.row_splits[1:] - rt_input.row_splits[:-1]
            num_segments = math_ops.maximum(math_ops.reduce_max(row_lengths),
                                            0)
            # NOTE(review): `range` is presumably a ragged range helper in
            # scope rather than the Python builtin (the result must expose
            # `.values`) -- confirm against the module's definitions.
            segment_ids = range(row_lengths).values
            result = _ragged_segment_aggregate(unsorted_segment_op,
                                               rt_input.values, segment_ids,
                                               num_segments, separator)
            if keepdims:
                result = array_ops.expand_dims(result, axis=0)
            return result
        elif axis == 1:
            # out[i_0, i_1, i_2, ..., i_N] = sum_{j} rt_input[i_0, j, i_2, ..., i_N]
            # One segment per row: len(row_splits) - 1 rows in total.
            num_segments = array_ops.shape(rt_input.row_splits)[0] - 1
            segment_ids = segment_id_ops.row_splits_to_segment_ids(
                rt_input.row_splits)
            result = _ragged_segment_aggregate(unsorted_segment_op,
                                               rt_input.values, segment_ids,
                                               num_segments, separator)
            if keepdims:
                result = array_ops.expand_dims(result, axis=1)
            return result
        else:
            # out[i_0, ..., i_[axis-1], i_axis+1], ..., i_N] =
            #     sum_{j} rt_input [i_0, ..., i_[axis-1], j, i_axis+1], ..., i_N]
            # Recurse into `values` with the axis shifted down by one, keeping
            # the outer row partition unchanged.
            return rt_input.with_values(
                ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
                                        rt_input.values, axis - 1, keepdims,
                                        separator))
Example #54
0
def random_gamma(shape,
                 alpha,
                 beta=None,
                 dtype=dtypes.float32,
                 seed=None,
                 name=None):
    """Draws `shape` samples from each of the given Gamma distribution(s).

    `alpha` is the shape parameter describing the distribution(s), and `beta`
    is the inverse scale parameter(s).

    Note: Because internal calculations are done using `float64` and casting
    has `floor` semantics, we must manually map zero outcomes to the smallest
    possible positive floating-point value, i.e., `np.finfo(dtype).tiny`.
    This means that `np.finfo(dtype).tiny` occurs more frequently than it
    otherwise should.  This bias can only happen for small values of `alpha`,
    i.e., `alpha << 1` or large values of `beta`, i.e., `beta >> 1`.

    The samples are differentiable w.r.t. alpha and beta.
    The derivatives are computed using the approach described in the paper

    [Michael Figurnov, Shakir Mohamed, Andriy Mnih.
    Implicit Reparameterization Gradients, 2018](https://arxiv.org/abs/1805.08498)

    Example:

    ```python
    samples = tf.random.gamma([10], [0.5, 1.5])
    # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents
    # the samples drawn from each distribution

    samples = tf.random.gamma([7, 5], [0.5, 1.5])
    # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1]
    # represents the 7x5 samples drawn from each of the two distributions

    alpha = tf.constant([[1.],[3.],[5.]])
    beta = tf.constant([[3., 4.]])
    samples = tf.random.gamma([30], alpha=alpha, beta=beta)
    # samples has shape [30, 3, 2], with 30 samples each of 3x2 distributions.

    loss = tf.reduce_mean(tf.square(samples))
    dloss_dalpha, dloss_dbeta = tf.gradients(loss, [alpha, beta])
    # unbiased stochastic derivatives of the loss function
    alpha.shape == dloss_dalpha.shape  # True
    beta.shape == dloss_dbeta.shape  # True
    ```

    Args:
      shape: A 1-D integer Tensor or Python array. The shape of the output
        samples to be drawn per alpha/beta-parameterized distribution.
      alpha: A Tensor or Python value or N-D array of type `dtype`. `alpha`
        provides the shape parameter(s) describing the gamma distribution(s)
        to sample. Must be broadcastable with `beta`.
      beta: A Tensor or Python value or N-D array of type `dtype`. Defaults to
        1. `beta` provides the inverse scale parameter(s) of the gamma
        distribution(s) to sample. Must be broadcastable with `alpha`.
      dtype: The type of alpha, beta, and the output: `float16`, `float32`, or
        `float64`.
      seed: A Python integer. Used to create a random seed for the
        distributions. See `tf.compat.v1.set_random_seed` for behavior.
      name: Optional name for the operation.

    Returns:
      samples: a `Tensor` of shape
        `tf.concat([shape, tf.shape(alpha + beta)], axis=0)` with values of
        type `dtype`.
    """
    with ops.name_scope(name, "random_gamma", [shape, alpha, beta]):
        shape = ops.convert_to_tensor(shape, name="shape", dtype=dtypes.int32)
        alpha = ops.convert_to_tensor(alpha, name="alpha", dtype=dtype)
        if beta is None:
            beta = 1
        beta = ops.convert_to_tensor(beta, name="beta", dtype=dtype)

        # Adding zeros shaped like `beta` broadcasts `alpha` against `beta`.
        broadcast_alpha = alpha + array_ops.zeros_like(beta)
        op_seed, op_seed2 = random_seed.get_seed(seed)

        # Floor applied to the samples so that exact zeros are never returned.
        smallest_positive = np.finfo(dtype.as_numpy_dtype).tiny
        unit_rate_samples = gen_random_ops.random_gamma(
            shape, broadcast_alpha, seed=op_seed, seed2=op_seed2)
        return math_ops.maximum(smallest_positive, unit_rate_samples / beta)
Example #55
0
def _ragged_segment_aggregate(unsorted_segment_op,
                              data,
                              segment_ids,
                              num_segments,
                              separator=None,
                              name=None):
    """Aggregates along segments of a RaggedTensor using `unsorted_segment_op`.

    Returns a RaggedTensor `output` with `num_segments` rows, where the row
    `output[i]` is formed by combining all rows of `data` whose corresponding
    `segment_id` is `i`.  The values in each row are combined using
    `unsorted_segment_op`.

    The length of the row `output[i]` will be the maximum of the lengths of
    all rows of `data` whose corresponding `segment_id` is `i`.  If no `data`
    rows correspond to a given segment ID, then the output row for that
    segment ID will be empty.

    Args:
      unsorted_segment_op: The tensorflow `op` that should be used to combine
        values in each row.  Must have the same signature and basic behavior
        as `unsorted_segment_sum`, `unsorted_segment_max`, etc.
      data: A `RaggedTensor` containing the values to be combined.
      segment_ids: A `Tensor` or `RaggedTensor`.  Must have type `int64` or
        `int32`.  `segment_ids.shape` must be a prefix of `data.shape`.
        `segment_ids` is not required to be sorted.
      num_segments: An `int32` or `int64` scalar.
      separator: An optional string. Defaults to None. The separator to use
        when joining. Only used for string types.
      name: A name prefix for the returned tensor (optional).

    Returns:
      A `RaggedTensor` containing the aggregated values.  The returned tensor
      has the same dtype as `data`, and its shape is
      `[num_segments] + data.shape[segment_ids.rank:]`.

    Raises:
      ValueError: If segment_ids.shape is not a prefix of data.shape.
    """
    # Base case: with no ragged input, hand off to the wrapped op directly.
    if (not ragged_tensor.is_ragged(data)
            and not ragged_tensor.is_ragged(segment_ids)):
        if separator is None:
            return unsorted_segment_op(data, segment_ids, num_segments, name)
        # A separator means the op is unsorted_segment_join, which takes it
        # as an extra positional argument.
        return unsorted_segment_op(data, segment_ids, num_segments,
                                   separator, name)

    with ops.name_scope(name, 'RaggedSegment',
                        [data, segment_ids, num_segments]) as name:
        data = ragged_tensor.convert_to_tensor_or_ragged_tensor(
            data, name='data')
        segment_ids = ragged_tensor.convert_to_tensor_or_ragged_tensor(
            segment_ids, name='segment_ids')
        data, segment_ids = ragged_tensor.match_row_splits_dtypes(
            data, segment_ids)
        if segment_ids.dtype not in (dtypes.int32, dtypes.int64):
            raise ValueError('segment_ids must have dtype int32 or int64.')

        if ragged_tensor.is_ragged(segment_ids):
            # Ragged ids must share the outer row partition with `data`;
            # once that is asserted, strip one level from both and recurse.
            if not ragged_tensor.is_ragged(data):
                raise ValueError(
                    'segment_ids.shape must be a prefix of data.shape, '
                    'but segment_ids is ragged and data is not.')
            splits_match = check_ops.assert_equal(
                segment_ids.row_splits,
                data.row_splits,
                message='segment_ids.shape must be a prefix of data.shape')
            with ops.control_dependencies([splits_match]):
                return _ragged_segment_aggregate(unsorted_segment_op,
                                                 data.values,
                                                 segment_ids.values,
                                                 num_segments, separator)

        # Length of every row of `data`.  (shape=[data_nrows])
        row_lengths = data.row_splits[1:] - data.row_splits[:-1]

        # Length of every output row: for segment `id` this is the largest
        # `row_lengths[i]` with `segment_ids[i] == id`, clamped to be
        # non-negative.  (shape=[output_nrows])
        longest_per_segment = math_ops.unsorted_segment_max(
            row_lengths, segment_ids, num_segments)
        out_row_lengths = math_ops.maximum(longest_per_segment, 0)

        # Row splits of the output: a leading zero followed by the running
        # total of the output row lengths.
        leading_zero = array_ops.zeros([1], out_row_lengths.dtype)
        out_splits = array_ops.concat(
            [leading_zero, math_ops.cumsum(out_row_lengths)], axis=0)

        # For each row of `data`, the [start, limit) span of positions in
        # `output.values` into which that row's values are aggregated.
        span_starts = array_ops.gather(out_splits, segment_ids)
        span_limits = span_starts + row_lengths

        # Flattening those spans gives, for every entry of `data.values`, the
        # index in `output.values` it is aggregated into.
        value_targets = range(span_starts, span_limits).values

        # Aggregate the values one level down, using the target indices as
        # segment ids and the total output length as the segment count.
        out_values = _ragged_segment_aggregate(unsorted_segment_op,
                                               data.values,
                                               value_targets,
                                               out_splits[-1], separator)
        return ragged_tensor.RaggedTensor.from_row_splits(out_values,
                                                          out_splits,
                                                          validate=False)
Example #56
0
    def _model_fn(features, labels, mode):
        """Build the TensorForest model ops for the requested mode.

        A bare tensor input is wrapped into a feature dict, the optional weight
        and key columns are taken out of `features`, the inference graph is
        built, the training graph is built when training with labels, and the
        result is handed to the model head.

        Args:
          features: A `Tensor`, a `SparseTensor`, or a dict of features. A dict
            is modified in place: the keys column (when configured and
            present) is removed, and the weights column is removed and then
            restored before the head is invoked.
          labels: Labels forwarded to the training graph and to the head; may
            be None, in which case no training graph is built.
          mode: One of the `model_fn_lib.ModeKeys` values.

        Returns:
          The object produced by `model_head.create_model_fn_ops`, with the
          extra training hooks attached and, when keys were supplied, the keys
          stored in its predictions under `keys_name`.
        """
        if isinstance(features, (ops.Tensor, sparse_tensor.SparseTensor)):
            features = {'features': features}

        # Take the optional weight and key columns out of the feature dict
        # before the graph builder sees it.
        has_weights = weights_name and weights_name in features
        weights = features.pop(weights_name) if has_weights else None
        has_keys = keys_name and keys_name in features
        keys = features.pop(keys_name) if has_keys else None

        # The device assigner is ignored when exporting (mode == INFER) and,
        # when `local_eval` is set, during evaluation as well.
        skip_device_assigner = (
            mode == model_fn_lib.ModeKeys.INFER
            or (local_eval and mode == model_fn_lib.ModeKeys.EVAL))
        dev_assn = None if skip_device_assigner else device_assigner

        graph_builder = graph_builder_class(params, device_assigner=dev_assn)
        logits = graph_builder.inference_graph(features)

        # For binary classification problems, convert probabilities to logits.
        # The denominator and the ratio are both floored at EPSILON because a
        # probability might be exactly 0 or 1.
        if not params.regression and params.num_classes == 2:
            class_1_probs = array_ops.slice(logits, [0, 1], [-1, 1])
            odds = class_1_probs / math_ops.maximum(1.0 - class_1_probs,
                                                    EPSILON)
            logits = math_ops.log(math_ops.maximum(odds, EPSILON))

        # Without labels (e.g. prediction) there is nothing to train on, so
        # the training graph stays None.
        training_graph = None
        if labels is not None and mode == model_fn_lib.ModeKeys.TRAIN:
            with ops.control_dependencies([logits.op]):
                forest_train_op = graph_builder.training_graph(
                    features,
                    labels,
                    input_weights=weights,
                    num_trainers=num_trainers,
                    trainer_id=trainer_id)
                step_increment = state_ops.assign_add(
                    contrib_framework.get_global_step(), 1)
                training_graph = control_flow_ops.group(forest_train_op,
                                                        step_increment)

        # Restore the weights column so the head can see it.
        if weights is not None:
            features[weights_name] = weights

        # The training op is not derived from the loss, so the head's
        # train_op_fn simply returns the graph built above.
        def _train_fn(unused_loss):
            return training_graph

        model_ops = model_head.create_model_fn_ops(
            features=features,
            labels=labels,
            mode=mode,
            train_op_fn=_train_fn,
            logits=logits,
            scope=head_scope)

        extra_hooks = []
        if report_feature_importances:
            importances = {
                'feature_importances': graph_builder.feature_importances()
            }
            extra_hooks.append(TensorForestRunOpAtEndHook(importances))
        if early_stopping_rounds:
            extra_hooks.append(
                TensorForestLossHook(
                    early_stopping_rounds,
                    early_stopping_loss_threshold=early_stopping_loss_threshold,
                    loss_op=model_ops.loss))
        model_ops.training_hooks.extend(extra_hooks)

        if keys is not None:
            model_ops.predictions[keys_name] = keys

        return model_ops
Example #57
0
def log_ndtr(x, series_order=3, name="log_ndtr"):
  """Log of the Normal cumulative distribution function.

  See `ndtr` for the Normal distribution function itself.

  `(log o ndtr)(x)` is evaluated piecewise, with the piece chosen per element:
  - `x > upper_segment`: return `-ndtr(-x)`, which relies on
    `log(1-x) ~= -x, x << 1`.
  - `lower_segment < x <= upper_segment`: return `log(ndtr(x))`.
  - `x <= lower_segment`: compute the log CDF directly from an asymptotic
    series (`_log_ndtr_lower`).

  The segment boundaries depend on the precision of `x` and are taken from the
  module constants `LOGNDTR_FLOAT64_LOWER` / `LOGNDTR_FLOAT64_UPPER` and
  `LOGNDTR_FLOAT32_LOWER` / `LOGNDTR_FLOAT32_UPPER` (historically documented
  as):

  ```
  lower_segment = { -20,  x.dtype=float64
                  { -10,  x.dtype=float32
  upper_segment = {   8,  x.dtype=float64
                  {   5,  x.dtype=float32
  ```

  For `x < lower_segment` the asymptotic series approximation of `ndtr` is:

  ```
     ndtr(x) = scale * (1 + sum) + R_N
     scale   = exp(-0.5 x**2) / (-x sqrt(2 pi))
     sum     = Sum{(-1)^n (2n-1)!! / (x**2)^n, n=1:N}
     R_N     = O(exp(-0.5 x**2) (2N+1)!! / |x|^{2N+3})
  ```

  where `(2n-1)!! = (2n-1) (2n-3) (2n-5) ...  (3) (1)` is a
  [double-factorial](https://en.wikipedia.org/wiki/Double_factorial).

  Args:
    x: `Tensor` of type `float32`, `float64`.
    series_order: Python `int` in `[0, 30]`. Maximum depth to evaluate the
      asymptotic expansion. This is the `N` above.
    name: Python string. A name for the operation (default="log_ndtr").

  Returns:
    log_ndtr: `Tensor` with `dtype=x.dtype`.

  Raises:
    TypeError: if `x.dtype` is not handled.
    TypeError: if `series_order` is not a Python `int`.
    ValueError: if `series_order` is not in `[0, 30]`.
  """
  # Validate the series depth before any graph construction happens.
  if not isinstance(series_order, int):
    raise TypeError("series_order must be a Python integer.")
  if series_order < 0:
    raise ValueError("series_order must be non-negative.")
  if series_order > 30:
    raise ValueError("series_order must be <= 30.")

  with ops.name_scope(name, values=[x]):
    x = ops.convert_to_tensor(x, name="x")

    np_dtype = x.dtype.as_numpy_dtype
    if np_dtype == np.float64:
      lower_segment, upper_segment = (LOGNDTR_FLOAT64_LOWER,
                                      LOGNDTR_FLOAT64_UPPER)
    elif np_dtype == np.float32:
      lower_segment, upper_segment = (LOGNDTR_FLOAT32_LOWER,
                                      LOGNDTR_FLOAT32_UPPER)
    else:
      raise TypeError("x.dtype=%s is not supported." % x.dtype)

    # The basic idea here was ported from py/scipy/special/cephes/ndtr.c,
    # with a few changes:
    # * For x >> 1, and X ~ Normal(0, 1),
    #     Log[P[X < x]] = Log[1 - P[X < -x]] approx -P[X < -x],
    #   which extends the range of validity of this function.
    # * One fixed series_order is used for all of 'x', rather than an
    #   adaptive one.
    # * The expansion is an asymptotic series, not a Taylor series, and the
    #   docstring gives a bound on the remainder.
    # * The arguments of the middle and lower pieces are clamped with
    #   max/min to avoid nan when x=0. This is needed even though the branch
    #   is unchosen, because when x=0 the gradient of a select involves the
    #   calculation 1*dy+0*(-inf)=nan regardless of whether dy is finite.
    #   The clamp is a NOP whenever the corresponding branch is chosen.
    in_upper = math_ops.greater(x, upper_segment)
    upper_value = -_ndtr(-x)  # log(1-x) ~= -x, x << 1
    above_lower = math_ops.greater(x, lower_segment)
    middle_value = math_ops.log(_ndtr(math_ops.maximum(x, lower_segment)))
    lower_value = _log_ndtr_lower(math_ops.minimum(x, lower_segment),
                                  series_order)
    middle_or_lower = array_ops.where(above_lower, middle_value, lower_value)
    return array_ops.where(in_upper, upper_value, middle_or_lower)
def lifted_struct_loss(y_true, y_preds, margin=1.0):
    """Computes the lifted structured loss.

    The loss encourages the positive distances (between a pair of embeddings
    with the same labels) to be smaller than any negative distances (between a
    pair of embeddings with different labels) in the mini-batch in a way
    that is differentiable with respect to the embedding vectors.
    See: https://arxiv.org/abs/1511.06452.

    Args:
      y_true: Labels; reshaped below to `[batch_size, 1]`, so a 1-D tensor of
        multiclass integer labels with shape [batch_size] is expected.
      y_preds: Embedding vectors handed to `pairwise_distance`
        (presumably a 2-D float `Tensor` that is not l2 normalized; confirm
        against `pairwise_distance`).
      margin: Float, margin term in the loss definition.

    Returns:
      lifted_loss: scalar tensor named 'liftedstruct_loss'.
      NOTE(review): the result is divided by the number of positive pairs, so
      a batch with no positive pairs looks like it yields a non-finite value;
      confirm callers guarantee at least one positive pair.
    """
    # Turn the [batch_size] label tensor into a [batch_size, 1] column.
    label_shape = array_ops.shape(y_true)
    label_column = array_ops.reshape(y_true, [label_shape[0], 1])

    # Pairwise distance matrix of the embeddings.
    distances = pairwise_distance(y_preds)

    # same_label[i, j] is True when examples i and j share a label; its
    # negation selects the negative pairs.
    same_label = math_ops.equal(label_column,
                                array_ops.transpose(label_column))
    different_label = math_ops.logical_not(same_label)

    batch_size = array_ops.size(label_column)

    margin_gap = margin - distances
    negative_mask = math_ops.cast(different_label, dtype=dtypes.float32)

    # Safe maximum: shift each row so its minimum is zero before masking, so
    # that the masked-out entries (zeros) cannot win the max; then shift back.
    # This takes the max only among negatives.
    row_floor = math_ops.reduce_min(margin_gap, 1, keepdims=True)
    shifted_negatives = math_ops.multiply(margin_gap - row_floor,
                                          negative_mask)
    row_negative_max = (
        math_ops.reduce_max(shifted_negatives, 1, keepdims=True) + row_floor)

    # Compute the loss.
    # pair_max holds M_ij = max(m_i, m_j), where m_i is the max of
    #   alpha - negative D_i's.
    # This matches the Caffe loss layer implementation at:
    #   https://github.com/rksltnl/Caffe-Deep-Metric-Learning-CVPR16/blob/0efd7544a9846f58df923c8b992198ba5c355454/src/caffe/layers/lifted_struct_similarity_softmax_layer.cpp  # pylint: disable=line-too-long
    pair_max = math_ops.maximum(row_negative_max,
                                array_ops.transpose(row_negative_max))
    gap_tiled = array_ops.tile(margin_gap, [batch_size, 1])
    mask_tiled = array_ops.tile(negative_mask, [batch_size, 1])
    pair_max_column = array_ops.reshape(array_ops.transpose(pair_max),
                                        [-1, 1])

    # Masked exponentials, stabilised by subtracting the pairwise maximum,
    # summed per row and folded back into a [batch_size, batch_size] matrix.
    masked_exp = math_ops.multiply(math_ops.exp(gap_tiled - pair_max_column),
                                   mask_tiled)
    exp_row_sums = math_ops.reduce_sum(masked_exp, 1, keepdims=True)
    loss_exp_left = array_ops.reshape(exp_row_sums, [batch_size, batch_size])

    log_sum = math_ops.log(loss_exp_left + array_ops.transpose(loss_exp_left))
    # Add the positive distance on top of the log-sum-exp term.
    loss_mat = (pair_max + log_sum) + distances

    # Positive pairs, excluding each example paired with itself.
    mask_positives = (math_ops.cast(same_label, dtype=dtypes.float32) -
                      array_ops.diag(array_ops.ones([batch_size])))

    # *0.5 for upper triangular, and another *0.5 for 1/2 factor for loss^2.
    num_positives = math_ops.reduce_sum(mask_positives) / 2.0

    hinged = math_ops.maximum(math_ops.multiply(loss_mat, mask_positives),
                              0.0)
    lifted_loss = math_ops.truediv(
        0.25 * math_ops.reduce_sum(math_ops.square(hinged)),
        num_positives,
        name='liftedstruct_loss',
    )
    return lifted_loss
Example #59
0
    def minimize(self, global_step=None, name=None):
        """Add operations to train a linear model by minimizing the loss function.

        The method gathers the current sparse weights (handling both plain and
        partitioned variables), runs the SDCA optimizer op once, and then
        builds the ops that apply the returned example-state update and the
        sparse/dense weight deltas.

        Args:
          global_step: Optional `Variable` to increment by one after the
            variables have been updated.
          name: Optional name for the returned operation.

        Returns:
          An Operation that updates the variables passed in the constructor.
          Without `global_step` this is a group of all update ops; with it,
          it is the op of the `assign_add` on `global_step`, which runs after
          all update ops.
        """
        # Technically, the op depends on a lot more than the variables,
        # but we'll keep the list short.
        with name_scope(name, 'sdca/minimize'):
            # Collect the per-feature-column sparse inputs for the solver op.
            sparse_example_indices = []
            sparse_feature_indices = []
            sparse_features_values = []
            for sf in self._examples['sparse_features']:
                sparse_example_indices.append(sf.example_indices)
                sparse_feature_indices.append(sf.feature_indices)
                # If feature values are missing, sdca assumes a value of 1.0f.
                if sf.feature_values is not None:
                    sparse_features_values.append(sf.feature_values)

            # pylint: disable=protected-access
            example_ids_hashed = gen_sdca_ops.sdca_fprint(
                internal_convert_to_tensor(self._examples['example_ids']))
            # pylint: enable=protected-access
            # Per-example solver state, looked up by hashed example id.
            example_state_data = self._hashtable.lookup(example_ids_hashed)
            # Solver returns example_state_update, new delta sparse_feature_weights
            # and delta dense_feature_weights.

            sparse_weights = []
            sparse_indices = []
            # If we have partitioned variables, keep a few dictionaries of Tensors
            # around that we need for the assign_add after the op call to
            # gen_sdca_ops.sdca_optimizer().  These are keyed because we may have a
            # mix of partitioned and un-partitioned variables.
            num_partitions_by_var = {}
            p_assignments_by_var = {}
            gather_ids_by_var = {}
            for v_num, (w, i) in enumerate(
                    zip(self._slots['unshrinked_sparse_features_weights'],
                        sparse_feature_indices)):
                # Append the sparse_indices (in full-variable space).
                # Only the unique feature indices are kept; the round trip
                # through int32 is there for the `unique` call.
                sparse_idx = math_ops.cast(
                    array_ops.unique(math_ops.cast(i, dtypes.int32))[0],
                    dtypes.int64)
                sparse_indices.append(sparse_idx)
                if isinstance(w, list) or isinstance(
                        w, var_ops.PartitionedVariable):
                    num_partitions = len(w)
                    flat_ids = array_ops.reshape(sparse_idx, [-1])
                    # We use div partitioning, which is easiest to support downstream.
                    # Compute num_total_ids as the sum of dim-0 of w, then assign
                    # to partitions based on a constant number of ids per partition.
                    # Optimize if we already know the full shape statically.
                    dim_0_size = self._get_first_dimension_size_statically(
                        w, num_partitions)

                    # A static size of None (or 0) falls through to summing the
                    # per-partition sizes, using dynamic shapes where needed.
                    if dim_0_size.value:
                        num_total_ids = constant_op.constant(
                            dim_0_size.value, flat_ids.dtype)
                    else:
                        dim_0_sizes = []
                        for p in range(num_partitions):
                            if w[p].get_shape()[0].value is not None:
                                dim_0_sizes.append(w[p].get_shape()[0].value)
                            else:
                                with ops.colocate_with(w[p]):
                                    dim_0_sizes.append(
                                        array_ops.shape(w[p])[0])
                        num_total_ids = math_ops.reduce_sum(
                            math_ops.cast(array_ops.stack(dim_0_sizes),
                                          flat_ids.dtype))
                    # The first `extras` partitions are treated as holding
                    # `ids_per_partition + 1` ids, the rest `ids_per_partition`.
                    ids_per_partition = num_total_ids // num_partitions
                    extras = num_total_ids % num_partitions

                    # Partition index of each id: the larger of the two
                    # candidate quotients picks the right regime.
                    p_assignments = math_ops.maximum(
                        flat_ids // (ids_per_partition + 1),
                        (flat_ids - extras) // ids_per_partition)

                    # Emulate a conditional using a boolean indicator tensor
                    # (offset of each id within its assigned partition).
                    new_ids = array_ops.where(
                        p_assignments < extras,
                        flat_ids % (ids_per_partition + 1),
                        (flat_ids - extras) % ids_per_partition)

                    # Cast partition assignments to int32 for use in dynamic_partition.
                    # There really should not be more than 2^32 partitions.
                    p_assignments = math_ops.cast(p_assignments, dtypes.int32)
                    # Partition list of ids based on assignments into num_partitions
                    # separate lists.
                    gather_ids = data_flow_ops.dynamic_partition(
                        new_ids, p_assignments, num_partitions)
                    # Add these into the dictionaries for use in the later update.
                    num_partitions_by_var[v_num] = num_partitions
                    p_assignments_by_var[v_num] = p_assignments
                    gather_ids_by_var[v_num] = gather_ids

                    # Gather the weights from each partition.
                    partition_gathered_weights = []
                    for p in range(num_partitions):
                        with ops.colocate_with(w[p]):
                            partition_gathered_weights.append(
                                array_ops.gather(w[p], gather_ids[p]))

                    # Stitch the weights back together in the same order they were before
                    # we dynamic_partitioned them.
                    condition_indices = data_flow_ops.dynamic_partition(
                        math_ops.range(array_ops.shape(new_ids)[0]),
                        p_assignments, num_partitions)
                    batch_gathered_weights = data_flow_ops.dynamic_stitch(
                        condition_indices, partition_gathered_weights)
                else:
                    # Un-partitioned variable: gather directly on its device.
                    w_as_tensor = internal_convert_to_tensor(w)
                    with ops.device(w_as_tensor.device):
                        batch_gathered_weights = array_ops.gather(
                            w_as_tensor, sparse_idx)
                sparse_weights.append(batch_gathered_weights)

            # Run the solver. The two branches pass the same arguments; they
            # differ only in the op used and in the spelling of the last
            # keyword (`adaptive` vs `adaptative`).
            # NOTE(review): `adaptative` presumably matches the older op's
            # attribute name; check gen_sdca_ops before "fixing" the spelling.
            # pylint: disable=protected-access
            if compat.forward_compatible(year=2018, month=10, day=30):
                esu, sfw, dfw = gen_sdca_ops.sdca_optimizer_v2(
                    sparse_example_indices,
                    sparse_feature_indices,
                    sparse_features_values,
                    self._convert_n_to_tensor(
                        self._examples['dense_features']),
                    internal_convert_to_tensor(
                        self._examples['example_weights']),
                    internal_convert_to_tensor(
                        self._examples['example_labels']),
                    sparse_indices,
                    sparse_weights,
                    self._convert_n_to_tensor(
                        self._slots['unshrinked_dense_features_weights']),
                    example_state_data,
                    loss_type=self._options['loss_type'],
                    l1=self._options['symmetric_l1_regularization'],
                    l2=self._symmetric_l2_regularization(),
                    num_loss_partitions=self._num_loss_partitions(),
                    num_inner_iterations=1,
                    adaptive=self._adaptive())
            else:
                esu, sfw, dfw = gen_sdca_ops.sdca_optimizer(
                    sparse_example_indices,
                    sparse_feature_indices,
                    sparse_features_values,
                    self._convert_n_to_tensor(
                        self._examples['dense_features']),
                    internal_convert_to_tensor(
                        self._examples['example_weights']),
                    internal_convert_to_tensor(
                        self._examples['example_labels']),
                    sparse_indices,
                    sparse_weights,
                    self._convert_n_to_tensor(
                        self._slots['unshrinked_dense_features_weights']),
                    example_state_data,
                    loss_type=self._options['loss_type'],
                    l1=self._options['symmetric_l1_regularization'],
                    l2=self._symmetric_l2_regularization(),
                    num_loss_partitions=self._num_loss_partitions(),
                    num_inner_iterations=1,
                    adaptative=self._adaptive())
            # pylint: enable=protected-access

            # All updates below are created under a dependency on the
            # solver's example-state output.
            with ops.control_dependencies([esu]):
                update_ops = [self._hashtable.insert(example_ids_hashed, esu)]
                # Update the weights before the proximal step.
                for v_num, (w, i, u) in enumerate(
                        zip(self._slots['unshrinked_sparse_features_weights'],
                            sparse_indices, sfw)):
                    if (isinstance(w, var_ops.PartitionedVariable)
                            or isinstance(w, list)):
                        # NOTE(review): `p_assignments` and `num_partitions`
                        # here are whatever the gather loop above assigned
                        # last, not values specific to this `v_num` — confirm
                        # the helper only relies on the `*_by_var` dicts.
                        update_ops += self._get_partitioned_update_ops(
                            v_num, num_partitions_by_var, p_assignments_by_var,
                            gather_ids_by_var, w, u, p_assignments,
                            num_partitions)
                    else:
                        update_ops.append(state_ops.scatter_add(w, i, u))
                for w, u in zip(
                        self._slots['unshrinked_dense_features_weights'], dfw):
                    if (isinstance(w, var_ops.PartitionedVariable)
                            or isinstance(w, list)):
                        # Split the dense delta along its first axis to match
                        # the static dim-0 size of each partition.
                        split_updates = array_ops.split(
                            u,
                            num_or_size_splits=[
                                v.shape.as_list()[0] for v in w
                            ])
                        for v, split_update in zip(w, split_updates):
                            update_ops.append(
                                state_ops.assign_add(v, split_update))
                    else:
                        update_ops.append(state_ops.assign_add(w, u))
            # NOTE(review): this is a truthiness test rather than `is None`;
            # confirm it behaves as intended for the variable type passed in.
            if not global_step:
                return control_flow_ops.group(*update_ops)
            with ops.control_dependencies(update_ops):
                return state_ops.assign_add(global_step, 1, name=name).op
Example #60
0
 def _merge_function(self, inputs):
   output = inputs[0]
   for i in range(1, len(inputs)):
     output = math_ops.maximum(output, inputs[i])
   return output