Example #1
def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
  """Maximum entropy loss for multiclass problems.

  Maximum entropy is a generalization of logistic loss for the case when more
  than 2 classes are present.

  Args:
    labels: Rank 2 (N, 1) or Rank 1 (N) tensor of per-example labels.
    weights: Rank 2 (N, 1) tensor of per-example weights.
    logits: Rank 2 (N, K) tensor of per-example predictions, K - num of
      classes.
    num_classes: number of classes in classification task. Used to expand label
      indices into one-hot encodings.
    eps: tolerance, used as a minimum possible value.

  Returns:
    loss: A Rank 2 (N, 1) tensor of per-example maxent loss
    update_op: An update operation to update the loss's internal state.
  """
  labels = math_ops.to_int64(labels)
  # If labels are of rank 1, make them rank 2.
  labels_shape = labels.get_shape()
  if len(labels_shape) != 2:
    labels = array_ops.expand_dims(labels, 1)
  # Labels are indices of classes, convert them to one hot encodings.
  target_one_hot = array_ops.one_hot(indices=labels, depth=num_classes)
  labels = math_ops.reduce_sum(
      input_tensor=target_one_hot, reduction_indices=[1])
  labels = math_ops.to_float(labels)

  # Calculate softmax probabilities for each class.
  unnormalized_probs = math_ops.exp(logits)
  normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keepdims=True)
  softmax_predictions = math_ops.divide(unnormalized_probs,
                                        math_ops.add(normalizers, eps))

  # Pull out the probabilities for real label.
  probs_for_real_class = math_ops.reduce_sum(labels * softmax_predictions, 1)

  # Add handling for values near 0 and 1.
  zeros = array_ops.zeros_like(probs_for_real_class, dtype=logits.dtype) + eps
  one_minus_eps = array_ops.ones_like(
      probs_for_real_class, dtype=logits.dtype) - eps

  # Take maximum(eps, pred)
  cond = (probs_for_real_class >= eps)
  probs_for_real_class = array_ops.where(cond, probs_for_real_class, zeros)

  # Take minimum(1-eps, pred)
  cond = (probs_for_real_class <= 1 - eps)
  probs_for_real_class = array_ops.where(cond, probs_for_real_class,
                                         one_minus_eps)

  unweighted_loss = array_ops.expand_dims(-math_ops.log(probs_for_real_class),
                                          1)
  if weights is None:
    return unweighted_loss, control_flow_ops.no_op()
  else:
    return unweighted_loss * weights, control_flow_ops.no_op()
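
A quick numerical restatement of what the loss above computes (plain NumPy rather than the library code, with the example values assumed purely for illustration): the negative log of the softmax probability assigned to the true class.

import numpy as np

logits = np.array([[2.0, 0.5, -1.0]])  # one example, three classes
labels = np.array([0])                 # true class index
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
loss = -np.log(probs[np.arange(len(labels)), labels])
print(loss)  # approximately [0.241]
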
    def body(i, prev_c, prev_h, actions, log_probs):
      # pylint: disable=g-long-lambda
      signal = control_flow_ops.cond(
          math_ops.equal(i, 0),
          lambda: array_ops.tile(device_go_embedding,
                                 [self.hparams.num_children, 1]),
          lambda: embedding_ops.embedding_lookup(device_embeddings,
                                                 actions.read(i - 1))
      )
      if self.hparams.keep_prob is not None:
        signal = nn_ops.dropout(signal, self.hparams.keep_prob)
      next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)
      query = math_ops.matmul(next_h, attn_w_2)
      query = array_ops.reshape(
          query, [self.hparams.num_children, 1, self.hparams.hidden_size])
      query = math_ops.tanh(query + attn_mem)
      query = array_ops.reshape(query, [
          self.hparams.num_children * self.num_groups, self.hparams.hidden_size
      ])
      query = math_ops.matmul(query, attn_v)
      query = array_ops.reshape(query,
                                [self.hparams.num_children, self.num_groups])
      query = nn_ops.softmax(query)
      query = array_ops.reshape(query,
                                [self.hparams.num_children, self.num_groups, 1])
      query = math_ops.reduce_sum(attn_mem * query, axis=1)
      query = array_ops.concat([next_h, query], axis=1)
      logits = math_ops.matmul(query, device_softmax)
      logits /= self.hparams.temperature
      if self.hparams.tanh_constant > 0:
        logits = math_ops.tanh(logits) * self.hparams.tanh_constant
      if self.hparams.logits_std_noise > 0:
        num_in_logits = math_ops.cast(
            array_ops.size(logits), dtype=dtypes.float32)
        avg_norm = math_ops.divide(
            linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
        logits_noise = random_ops.random_normal(
            array_ops.shape(logits),
            stddev=self.hparams.logits_std_noise * avg_norm)
        logits = control_flow_ops.cond(
            self.global_step > self.hparams.stop_noise_step, lambda: logits,
            lambda: logits + logits_noise)

      if mode == "sample":
        next_y = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
      elif mode == "greedy":
        next_y = math_ops.argmax(logits, 1)
      elif mode == "target":
        next_y = array_ops.slice(y, [0, i], [-1, 1])
      else:
        raise NotImplementedError
      next_y = math_ops.to_int32(next_y)
      next_y = array_ops.reshape(next_y, [self.hparams.num_children])
      actions = actions.write(i, next_y)
      log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=next_y)
      return i + 1, next_c, next_h, actions, log_probs
Example #3
def _safe_div(numerator, denominator, name):
    """Divides two values, returning 0 if the denominator is <= 0.
    Args:
      numerator: A real `Tensor`.
      denominator: A real `Tensor`, with dtype matching `numerator`.
      name: Name for the returned op.
    Returns:
      0 if `denominator` <= 0, else `numerator` / `denominator`
    """
    return tf.where(math_ops.greater(denominator, 0),
                    math_ops.divide(numerator, denominator),
                    tf.zeros_like(numerator),
                    name=name)
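
A minimal usage sketch (TF2 eager execution assumed). For the common zero-denominator case, the public tf.math.divide_no_nan behaves similarly, although it does not zero out negative denominators the way _safe_div does.

import tensorflow as tf

num = tf.constant([1.0, 2.0, 3.0])
den = tf.constant([2.0, 0.0, -4.0])
# _safe_div(num, den, "safe_div") would yield [0.5, 0.0, 0.0], while the closest
# public helper only zeroes out the division by zero:
print(tf.math.divide_no_nan(num, den))  # [0.5, 0.0, -0.75]
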
 def testFloorDivGrad(self):
   with self.test_session():
     a = variables.Variable(2.)
     b = variables.Variable(4.)
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
       c_grad = gradients.gradients(math_ops.divide(a, b), [a, b])
       self.assertAllEqual([x.eval() for x in c_grad], [.25, -.125])
       c_grad = gradients.gradients(math_ops.div(a, b), [a, b])
       self.assertAllEqual([x.eval() for x in c_grad], [.25, -.125])
       c_grad = gradients.gradients(math_ops.floordiv(a, b), [a, b])
       self.assertAllEqual([None if x is None else x.eval()
                            for x in c_grad], [None, None])
Example #5
def _SegmentMinOrMaxGrad(op, grad):
  """ Gradient for SegmentMin and SegmentMax. """
  zeros = array_ops.zeros_like(op.inputs[0], dtype=op.inputs[0].dtype)
  # Get the number of selected (minimum or maximum) elements in each segment.
  gathered_outputs = array_ops.gather(op.outputs[0], op.inputs[1])
  is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
  num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype),
                                      op.inputs[1])
  # Compute the gradient for each segment. The gradient for the ith segment is
  # divided evenly among the selected elements in that segment.
  weighted_grads = math_ops.divide(grad, num_selected)
  gathered_grads = array_ops.gather(weighted_grads, op.inputs[1])
  return array_ops.where(is_selected, gathered_grads, zeros), None
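
The even split among tied elements can be checked with the public API (a small sketch assuming TF2 eager mode and GradientTape): both maxima in the first segment receive half of that segment's incoming gradient.

import tensorflow as tf

data = tf.constant([1.0, 3.0, 3.0, 2.0])
segment_ids = tf.constant([0, 0, 0, 1])
with tf.GradientTape() as tape:
    tape.watch(data)
    out = tf.math.segment_max(data, segment_ids)  # [3.0, 2.0]
print(tape.gradient(out, data))                   # [0.0, 0.5, 0.5, 1.0]
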
Example #7
    def __call__(self, step):
        with ops.name_scope(self.name, "CyclicLearningRate",
                            [self.learning_rate, step]) as name:
            learning_rate = ops.convert_to_tensor(self.learning_rate,
                                                  name="learning_rate")
            dtype = learning_rate.dtype
            step = math_ops.cast(step, dtype)
            step_size = math_ops.cast(self.step_size, dtype)
            max_lr = math_ops.cast(self.max_lr, dtype)

            # computing: cycle = floor( 1 + step / ( 2 * step_size ) )
            double_step = math_ops.multiply(2., step_size)
            global_div_double_step = math_ops.divide(step, double_step)
            cycle = math_ops.floor(math_ops.add(1., global_div_double_step))

            # computing: x = abs( step / step_size - 2 * cycle + 1 )
            double_cycle = math_ops.multiply(2., cycle)
            global_div_step = math_ops.divide(step, step_size)
            tmp = math_ops.subtract(global_div_step, double_cycle)
            x = math_ops.abs(math_ops.add(1., tmp))

            # computing: clr = learning_rate + ( max_lr - learning_rate ) * max( 0, 1 - x )
            a1 = math_ops.maximum(0., math_ops.subtract(1., x))
            a2 = math_ops.subtract(max_lr, learning_rate)
            clr = math_ops.multiply(a1, a2)

            if self.mode == 'triangular2':
                clr = math_ops.divide(
                    clr,
                    math_ops.cast(
                        math_ops.pow(2, math_ops.cast(cycle - 1, tf.int32)),
                        tf.float32))
            if self.mode == 'exp_range':
                gamma = math_ops.cast(self.gamma, dtype)
                clr = math_ops.multiply(math_ops.pow(gamma, step), clr)
            #if self.mode == 'cosine':

            return math_ops.add(clr, learning_rate, name=name)
        def cyclic_lr():

            double_step = math_ops.multiply(2., step_size)
            global_div_double_step = math_ops.divide(global_step, double_step)
            cycle = math_ops.floor(math_ops.add(1., global_div_double_step))

            double_cycle = math_ops.multiply(2., cycle)
            global_div_step = math_ops.divide(global_step, step_size)
            tmp = math_ops.subtract(global_div_step, double_cycle)
            x = math_ops.abs(math_ops.add(1., tmp))

            a1 = math_ops.maximum(0., math_ops.subtract(1., x))
            a2 = math_ops.subtract(max_lr, learning_rate)
            clr = math_ops.multiply(a1, a2)
            if mode == 'triangular2':
                clr = math_ops.divide(
                    clr,
                    math_ops.cast(
                        math_ops.pow(2, math_ops.cast(cycle - 1, tf.int32)),
                        tf.float32))
            if mode == 'exp_range':
                clr = math_ops.multiply(math_ops.pow(gamma, global_step), clr)
            return math_ops.add(clr, learning_rate, name=name)
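
A plain-Python restatement of the triangular schedule above, with illustrative values assumed for the base learning rate, max_lr and step_size; it shows the expected rise-and-fall shape over one cycle.

import math

def triangular_clr(step, lr=0.001, max_lr=0.006, step_size=4.0):
    cycle = math.floor(1 + step / (2 * step_size))
    x = abs(step / step_size - 2 * cycle + 1)
    return lr + (max_lr - lr) * max(0.0, 1 - x)

# Rises linearly from 0.001 at step 0 to 0.006 at step 4, then falls back
# to 0.001 at step 8, and repeats.
print([triangular_clr(s) for s in range(0, 9)])
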
Example #9
 def calculate_bboxes_intersection(self, bbox_ref, bboxes):
     bboxes = tf.transpose(bboxes)
     bbox_ref = tf.transpose(bbox_ref)
     int_ymin = tf.maximum(bboxes[0], bbox_ref[0])
     int_xmin = tf.maximum(bboxes[1], bbox_ref[1])
     int_ymax = tf.minimum(bboxes[2], bbox_ref[2])
     int_xmax = tf.minimum(bboxes[3], bbox_ref[3])
     inter_vol = tf.maximum(int_ymax - int_ymin, 0.) * tf.maximum(
         int_xmax - int_xmin, 0.)
     bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1])
     scores = tf.where(math_ops.greater(bboxes_vol, 0),
                       math_ops.divide(inter_vol, bboxes_vol),
                       tf.zeros_like(inter_vol))
     return scores
Example #10
def ignore(x, binary_tensor, name=None):
    with ops.name_scope(name, "ignore", [x]) as name:
        x = ops.convert_to_tensor(x, name="x")
        # Fraction of kept entries; cast the element count to float for the
        # division (array_ops.shape only supports integer out_types).
        keep_ratio = math_ops.divide(
            math_ops.reduce_sum(binary_tensor),
            math_ops.cast(
                math_ops.reduce_prod(array_ops.shape(binary_tensor)),
                dtypes.float32))
        keep_ratio.get_shape().assert_is_compatible_with(tensor_shape.scalar())
        with tf.Session() as sess:

            print(keep_ratio.eval(session=sess))
        ret = math_ops.div(x, keep_ratio) * binary_tensor
        ret.set_shape(x.get_shape())
        return ret
Example #11
def _optimal_step_size(last_step,
                       error_ratio,
                       safety=0.9,
                       ifactor=10.0,
                       dfactor=0.2,
                       order=5):
    """Calculate the optimal size for the next Runge-Kutta step."""

    error_ratio = math_ops.cast(error_ratio, last_step.dtype)
    exponent = math_ops.cast(1 / order, last_step.dtype)
    factor = math_ops.maximum(
        1 / ifactor,
        math_ops.minimum(error_ratio**exponent / safety, 1 / dfactor))
    return math_ops.divide(last_step, factor)
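
The same rule in plain Python (default safety/ifactor/dfactor/order values assumed), to show how the step reacts to the error ratio.

def optimal_step_size(last_step, error_ratio, safety=0.9, ifactor=10.0,
                      dfactor=0.2, order=5):
    factor = max(1 / ifactor,
                 min(error_ratio ** (1 / order) / safety, 1 / dfactor))
    return last_step / factor

print(optimal_step_size(0.1, 0.01))  # small error ratio -> the step grows
print(optimal_step_size(0.1, 16.0))  # large error ratio -> the step shrinks
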
Example #12
def gen_crossentropy(y_true, y_pred, q=0.7, k=-1.0):
    # Filter true values ("y_true") in "y_pred"
    y_ok = array_ops.boolean_mask(y_pred, gen_math_ops.equal(y_true, 1))
    # Conversion for Float64 for valid operations in TensorFlow
    um = np.float64(1.)
    q = np.float64(q)

    if k == -1:  # cross entropy loss
        # mean[ (1-y_ok^q)/q ]
        return K.mean(math_ops.divide(
            math_ops.subtract(um, math_ops.pow(y_ok, q)), q),
                      axis=-1)
    else:  # truncated cross entropy loss

        k = np.float64(k)
        # if y_ok < k
        #     [ (1-k^q)/q    ]  (no broadcasting in Where())
        #     [ (1-y_ok^q)/q ]
        vfunct = array_ops.where(
            gen_math_ops.less_equal(y_ok, k),
            gen_array_ops.fill(array_ops.shape(y_ok), (um - k**q) / q),
            math_ops.divide(math_ops.subtract(um, math_ops.pow(y_ok, q)), q))
        return K.mean(vfunct, axis=-1)  # mean [ above values ]
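
A numeric sketch of the (1 - p^q) / q term used above (q = 0.7 assumed, matching the default): it approaches the ordinary -log(p) cross entropy as q goes to 0, but stays bounded for small p.

import numpy as np

q = 0.7
for p in (0.9, 0.5, 0.1):
    generalized = (1.0 - p ** q) / q
    standard = -np.log(p)
    print(p, round(generalized, 3), round(standard, 3))
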
Example #13
def safe_divide(numerator, denominator, name):
    """Divides two values, returning 0 if the denominator is <= 0.
    Args:
      numerator: A real `Tensor`.
      denominator: A real `Tensor`, with dtype matching `numerator`.
      name: Name for the returned op.
    Returns:
      0 if `denominator` <= 0, else `numerator` / `denominator`
    """
    return tf.where(
        math_ops.greater(denominator, 0),
        math_ops.divide(numerator, denominator),
        tf.zeros_like(numerator),
        name=name)
Example #14
 def cyclic_lr():
     """Helper to recompute learning rate; most helpful in eager-mode."""
     # computing: cycle = floor( 1 + global_step / ( 2 * step_size ) )
     double_step = math_ops.multiply(2., step_size)
     global_div_double_step = math_ops.divide(global_step, double_step)
     cycle = math_ops.floor(math_ops.add(1., global_div_double_step))
     # computing: x = abs( global_step / step_size - 2 * cycle + 1 )
     double_cycle = math_ops.multiply(2., cycle)
     global_div_step = math_ops.divide(global_step, step_size)
     tmp = math_ops.subtract(global_div_step, double_cycle)
     x = math_ops.abs(math_ops.add(1., tmp))
     # computing: clr = learning_rate + ( max_lr - learning_rate ) * max( 0, 1 - x )
     a1 = math_ops.maximum(0., math_ops.subtract(1., x))
     a2 = math_ops.subtract(max_lr, learning_rate)
     clr = math_ops.multiply(a1, a2)
     if mode == 'triangular2':
         clr = math_ops.divide(
             clr,
             math_ops.cast(
                 math_ops.pow(2, math_ops.cast(cycle - 1, tf.int32)),
                 tf.float32))
     if mode == 'exp_range':
         clr = math_ops.multiply(math_ops.pow(gamma, global_step), clr)
     return math_ops.add(clr, learning_rate, name=name)
Example #15
    def __call__(self, step):
        with ops.name_scope_v2(self.name or "InverseTimeDecay") as name:
            initial_learning_rate = ops.convert_to_tensor_v2_with_dispatch(
                self.initial_learning_rate, name="initial_learning_rate")
            dtype = initial_learning_rate.dtype
            decay_steps = math_ops.cast(self.decay_steps, dtype)
            decay_rate = math_ops.cast(self.decay_rate, dtype)

            global_step_recomp = math_ops.cast(step, dtype)
            p = global_step_recomp / decay_steps
            if self.staircase:
                p = math_ops.floor(p)
            const = math_ops.cast(constant_op.constant(1), dtype)
            denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
            return math_ops.divide(initial_learning_rate, denom, name=name)
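
A plain-Python restatement of the schedule above, i.e. initial_lr / (1 + decay_rate * step / decay_steps), with illustrative parameter values assumed.

def inverse_time_decay(step, initial_lr=0.1, decay_steps=100.0, decay_rate=0.5,
                       staircase=False):
    p = step / decay_steps
    if staircase:
        p = float(int(p))
    return initial_lr / (1 + decay_rate * p)

print(inverse_time_decay(0))    # 0.1
print(inverse_time_decay(100))  # ~0.0667
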
    def make_grouping_predictions(self, input_layer, reuse=None):
        """model that predicts grouping (grouping_actions).

    Args:
      input_layer: group_input_layer
      reuse: reuse

    Returns:
       grouping_actions: actions
       grouping_log_probs: log probabilities corresponding to actions
    """
        with variable_scope.variable_scope(self.hparams.name, reuse=True):
            # input_layer: tensor of size [1, num_ops, hidden_size]
            w_grouping_ff = variable_scope.get_variable("w_grouping_ff")
            w_grouping_softmax = variable_scope.get_variable(
                "w_grouping_softmax")

        batch_size = array_ops.shape(input_layer)[0]
        embedding_dim = array_ops.shape(input_layer)[2]

        reshaped = array_ops.reshape(
            input_layer, [batch_size * self.num_ops, embedding_dim])
        ff_output = math_ops.matmul(reshaped, w_grouping_ff)
        logits = math_ops.matmul(ff_output, w_grouping_softmax)
        if self.hparams.logits_std_noise > 0:
            num_in_logits = math_ops.cast(array_ops.size(logits),
                                          dtype=dtypes.float32)
            avg_norm = math_ops.divide(linalg_ops.norm(logits),
                                       math_ops.sqrt(num_in_logits))
            logits_noise = random_ops.random_normal(
                array_ops.shape(logits),
                stddev=self.hparams.logits_std_noise * avg_norm)
            logits = control_flow_ops.cond(
                self.global_step > self.hparams.stop_noise_step,
                lambda: logits, lambda: logits + logits_noise)
        logits = array_ops.reshape(
            logits, [batch_size * self.num_ops, self.num_groups])
        actions = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
        actions = math_ops.to_int32(actions)
        actions = array_ops.reshape(actions, [batch_size, self.num_ops])
        action_label = array_ops.reshape(actions, [-1])
        log_probs = nn_ops.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=action_label)
        log_probs = array_ops.reshape(log_probs, [batch_size, -1])
        log_probs = math_ops.reduce_sum(log_probs, 1)
        grouping_actions = actions
        grouping_log_probs = log_probs
        return grouping_actions, grouping_log_probs
Example #17
    def __call__(self, step):
        """
        return a float(learning rate)
        """

        # decrease linearly
        steprate = math_ops.abs(math_ops.divide(
            math_ops.subtract(self.finallr, self.initlr), self.nsteps))

        lr = math_ops.subtract(self.initlr, math_ops.multiply(
            steprate, math_ops.subtract(step, self.shiftstep)))

        pred = math_ops.greater(step, self.shiftstep)
        lr = control_flow_ops.cond(pred, lambda: lr, lambda: self.initlr)

        return lr
Example #18
  def __call__(self, step):
    with tf.name_scope(self.name or "Stolera") as name:
      dtype = tf.dtypes.float32

      initial_learning_rate = tf.convert_to_tensor(self.initial_learning_rate, dtype=dtype, name="initial_learning_rate")

      sigma = math_ops.cast(self.sigma, dtype)
      t_step = math_ops.cast(step, dtype)
      # t_step = math_ops.multiply(t_step, t_step)
      t_step = math_ops.add(t_step, tf.constant(1, dtype=dtype))
      Z_t = tf.random.normal([1], mean=0.0, stddev=1.0, dtype=dtype)

      term_a = math_ops.divide(Z_t[0], t_step)
      term_b = math_ops.multiply(sigma, term_a)
      term_c = math_ops.subtract(initial_learning_rate, term_b, name=name)
      return term_c
Example #19
def _MinOrMaxGrad(op, grad):
  """Gradient for Min or Max. Amazingly it's precisely the same code."""
  input_shape = array_ops.shape(op.inputs[0])
  output_shape_kept_dims = math_ops.reduced_shape(input_shape, op.inputs[1])
  y = op.outputs[0]
  y = array_ops.reshape(y, output_shape_kept_dims)
  grad = array_ops.reshape(grad, output_shape_kept_dims)

  # Compute the number of selected (maximum or minimum) elements in each
  # reduction dimension. If there are multiple minimum or maximum elements
  # then the gradient will be divided between them.
  indicators = math_ops.cast(math_ops.equal(y, op.inputs[0]), grad.dtype)
  num_selected = array_ops.reshape(
      math_ops.reduce_sum(indicators, op.inputs[1]), output_shape_kept_dims)

  return [math_ops.divide(indicators, num_selected) * grad, None]
Example #20
        def apply_dropout():
            keep_prob = 1 - rate
            # uniform [keep_prob, 1.0 + keep_prob)
            random_tensor = keep_prob
            random_tensor += random_ops.random_uniform(noise_shape,
                                                       seed=seed,
                                                       dtype=x.dtype)
            # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
            binary_tensor = math_ops.floor(random_tensor)

            # save binary tensor to variable
            assign_op = binary_tensor_var.assign(binary_tensor)
            with tf.control_dependencies([assign_op]):
                ret = math_ops.divide(x, keep_prob) * binary_tensor
                #ret = tf.Print(ret,["apply dropout and save", _is_reuse_binary_tensor_var])
            return ret
  def make_grouping_predictions(self, input_layer, reuse=None):
    """model that predicts grouping (grouping_actions).

    Args:
      input_layer: group_input_layer
      reuse: reuse

    Returns:
       grouping_actions: actions
       grouping_log_probs: log probabilities corresponding to actions
    """
    with variable_scope.variable_scope(self.hparams.name, reuse=True):
      # input_layer: tensor of size [1, num_ops, hidden_size]
      w_grouping_ff = variable_scope.get_variable("w_grouping_ff")
      w_grouping_softmax = variable_scope.get_variable("w_grouping_softmax")

    batch_size = array_ops.shape(input_layer)[0]
    embedding_dim = array_ops.shape(input_layer)[2]

    reshaped = array_ops.reshape(input_layer,
                                 [batch_size * self.num_ops, embedding_dim])
    ff_output = math_ops.matmul(reshaped, w_grouping_ff)
    logits = math_ops.matmul(ff_output, w_grouping_softmax)
    if self.hparams.logits_std_noise > 0:
      num_in_logits = math_ops.cast(
          array_ops.size(logits), dtype=dtypes.float32)
      avg_norm = math_ops.divide(
          linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
      logits_noise = random_ops.random_normal(
          array_ops.shape(logits),
          stddev=self.hparams.logits_std_noise * avg_norm)
      logits = control_flow_ops.cond(
          self.global_step > self.hparams.stop_noise_step, lambda: logits,
          lambda: logits + logits_noise)
    logits = array_ops.reshape(logits,
                               [batch_size * self.num_ops, self.num_groups])
    actions = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
    actions = math_ops.to_int32(actions)
    actions = array_ops.reshape(actions, [batch_size, self.num_ops])
    action_label = array_ops.reshape(actions, [-1])
    log_probs = nn_ops.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_label)
    log_probs = array_ops.reshape(log_probs, [batch_size, -1])
    log_probs = math_ops.reduce_sum(log_probs, 1)
    grouping_actions = actions
    grouping_log_probs = log_probs
    return grouping_actions, grouping_log_probs
Example #22
    def sample(self, time, outputs, state, name=None):
        """sample for SampleEmbeddingHelper."""
        del time, state  # unused by sample_fn
        # Outputs are logits, we sample instead of argmax (greedy).
        if not isinstance(outputs, ops.Tensor):
            raise TypeError("Expected outputs to be a single Tensor, got: %s" %
                            type(outputs))
        if self._softmax_temperature is None:
            logits = outputs
        else:
            #logits = outputs / self._softmax_temperature
            logits = math_ops.divide(outputs, self._softmax_temperature)

        sample_id_sampler = categorical.Categorical(logits=logits)
        sample_ids = sample_id_sampler.sample(seed=self._seed)

        return sample_ids
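
A small NumPy sketch of what dividing the logits by a temperature does to the categorical distribution being sampled (the logit values are assumed for illustration): temperatures above 1 flatten it, temperatures below 1 sharpen it.

import numpy as np

logits = np.array([2.0, 1.0, 0.0])
for temperature in (0.5, 1.0, 2.0):
    scaled = logits / temperature
    probs = np.exp(scaled - scaled.max())
    probs /= probs.sum()
    print(temperature, np.round(probs, 3))
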
Example #23
def _UnsortedSegmentMinOrMaxGrad(op, grad):
  """ Gradient for UnsortedSegmentMin and UnsortedSegmentMax. """
  # Get the number of selected (minimum or maximum) elements in each segment.
  gathered_outputs, zero_clipped_indices, is_positive = \
      _GatherDropNegatives(op.outputs[0], op.inputs[1])
  is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
  is_selected = math_ops.logical_and(is_selected, is_positive)
  num_selected = math_ops.unsorted_segment_sum(
      math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
  # Compute the gradient for each segment. The gradient for the ith segment is
  # divided evenly among the selected elements in that segment.
  weighted_grads = math_ops.divide(grad, num_selected)
  gathered_grads, _, _ = _GatherDropNegatives(weighted_grads, None,
                                              zero_clipped_indices,
                                              is_positive)
  zeros = array_ops.zeros_like(gathered_grads)
  return array_ops.where(is_selected, gathered_grads, zeros), None, None
Example #24
def _optimal_step_size(last_step,
                       error_ratio,
                       safety=0.9,
                       ifactor=10.0,
                       dfactor=0.2,
                       order=5):
    """Calculate the optimal size for the next Runge-Kutta step."""

    error_ratio = math_ops.cast(error_ratio, last_step.dtype)
    dfactor = tf.cond(error_ratio < 1,
                      lambda: tf.constant(1, dtype=tf.float64),
                      lambda: dfactor)
    error_ratio = tf.cast(tf.math.sqrt(error_ratio), last_step.dtype)
    exponent = math_ops.cast(1 / order, last_step.dtype)
    factor = math_ops.maximum(
        1 / ifactor,
        math_ops.minimum(error_ratio**exponent / safety, 1 / dfactor))
    #print('FACTOR', factor)
    return math_ops.divide(last_step, factor)
def mean_squared_norm_loss(
        labels,
        predictions,
        weights=1.0,
        scope=None,
        loss_collection=ops.GraphKeys.LOSSES,
        reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
    with ops.name_scope(scope, "mean_squared_norm_loss",
                        (predictions, labels, weights)) as scope:
        predictions = math_ops.to_float(predictions)
        labels = math_ops.to_float(labels)
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        divisor = tf.maximum(labels, 1.0)
        error = math_ops.square(
            math_ops.divide(math_ops.subtract(predictions, labels), divisor))
        return tf.losses.compute_weighted_loss(error,
                                               weights,
                                               scope,
                                               loss_collection,
                                               reduction=reduction)
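
A plain-NumPy restatement of the per-element error above (example values assumed): the squared error is taken relative to the label magnitude, with the divisor floored at 1 so small labels do not blow it up.

import numpy as np

labels = np.array([10.0, 0.5, 100.0])
preds = np.array([12.0, 0.0, 90.0])
divisor = np.maximum(labels, 1.0)
print(((preds - labels) / divisor) ** 2)  # [0.04, 0.25, 0.01]
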
Example #26
  def __call__(self, step):
    with tf.name_scope(self.name or "Dilera") as name:
      dtype = tf.dtypes.float32

      initial_learning_rate = tf.convert_to_tensor(self.initial_learning_rate, dtype=dtype, name="initial_learning_rate")

      sigma = math_ops.cast(self.sigma, dtype)
      t_step = math_ops.cast(step, dtype)
      # t_step = math_ops.multiply(t_step, t_step)
      dt = tf.constant(1, dtype=dtype)
      t_step = math_ops.add(t_step, tf.constant(1, dtype=dtype))
      Z_t = tf.random.normal([1], mean=0.0, stddev=1.0, dtype=dtype)

      Z_over_T = math_ops.divide(Z_t[0], t_step)
      Sigma_Z_over_T = math_ops.multiply(sigma, Z_over_T)
      Sigma_Z_sqrtDt_over_T = math_ops.multiply(Sigma_Z_over_T, math_ops.sqrt(dt))

      eta_dT = math_ops.multiply(initial_learning_rate, dt)
      newLearningRate  = math_ops.subtract(eta_dT, Sigma_Z_sqrtDt_over_T, name=name)
      return newLearningRate
Example #27
def softmax(logits: ragged_tensor.Ragged, axis=None, name=None):
  """Computes softmax activations.

  Used for multi-class predictions. The sum of all outputs generated by softmax
  is 1.

  This function performs the equivalent of

      softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)

  Example usage:

  >>> softmax = tf.nn.softmax([-1, 0., 1.])
  >>> softmax
  <tf.Tensor: shape=(3,), dtype=float32,
  numpy=array([0.09003057, 0.24472848, 0.66524094], dtype=float32)>
  >>> sum(softmax)
  <tf.Tensor: shape=(), dtype=float32, numpy=1.0>

  Args:
    logits: A non-empty `Tensor`. Must be one of the following types: `half`,
      `float32`, `float64`.
    axis: The dimension softmax would be performed on. The default is -1 which
      indicates the last dimension.
    name: A name for the operation (optional).

  Returns:
    A `Tensor`. Has the same type and shape as `logits`.

  Raises:
    InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
      dimension of `logits`.
  """
  if axis is None:
    axis = -1

  with ops.name_scope(name, 'RaggedSoftmax', [logits]) as name:
    max_input = reduce_max(logits, axis=axis, keepdims=True)
    logits_exp = math_ops.exp(math_ops.subtract(logits, max_input))
    denominator = reduce_sum(logits_exp, axis=axis, keepdims=True)
    return math_ops.divide(logits_exp, denominator)
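
A short usage sketch, assuming tf.nn.softmax dispatches to this ragged implementation in a TF2 environment: each ragged row is normalized independently, regardless of its length.

import tensorflow as tf

rt = tf.ragged.constant([[-1.0, 0.0, 1.0], [2.0, 3.0]])
print(tf.nn.softmax(rt, axis=-1))  # each row sums to 1
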
Example #28
def normalize_for_graph_lstm(tensor):
    """Normalizes Tensor to range [-0.5, 0.5].

    Scales a Tensor uniformly to fit within [-0.5, 0.5]^n. Additionally,
      each dimension is shifted to be centred around [0]^n i.e. the origin,
      in a way that data extends the same distance in positive and negative
      direction. In other words, the mean between maximum and minimum value
      of each dimension is shifted to zero. The undo_scaling op undoes
      scaling, but does not undo shifting. The unnormalize op does both,
      but is currently unused.

    Returns: The normalized Tensor, and an op to undo normalization.

    Example usage:
    ```
    normalized_tensor, undo_scaling = normalize_for_graph_lstm(input_tensor)
    normalized_output_tensor = some_op(normalized_tensor)
    output_tensor = undo_scaling(normalized_output_tensor)
    ```
    """
    # tensor is normalized to range[-0.5, 0.5]
    # this function assumes tensors with shape [ batch_size, number_of_nodes, output_size ]
    assert (len(tensor.shape) == 3)
    # compute maximum and minimum joint position value in each dimension
    max_dim = math_ops.reduce_max(tensor, axis=1, keepdims=True)
    min_dim = math_ops.reduce_min(tensor, axis=1, keepdims=True)
    diff_dim = math_ops.subtract(max_dim, min_dim)
    # get normalizing factor as maximum difference within all dimensions
    max_diff = math_ops.reduce_max(diff_dim, axis=2, keepdims=True)
    normalized_tensor = math_ops.divide(tensor - min_dim - diff_dim / 2,
                                        max_diff)

    # return output rescaled and shifted to original position
    def unnormalize(tensor):
        return math_ops.multiply(tensor, max_diff) + diff_dim / 2 + min_dim

    # return output only rescaled, centered around 0
    def undo_scaling(tensor):
        return math_ops.multiply(tensor, max_diff)

    return normalized_tensor, undo_scaling
def lr_annealing(learning_rate, current_epoch, total_epochs, alpha, beta, name=None):
    """
    Applies learning rate annealing to the initial learning rate
    return lr_p = learning_rate * (1 + alpha * (current_epoch/global_step))^(-beta)

    Args:   learning_rate:
            global_step: number of iterations
            alpha:
            beta:
    """
    with ops.name_scope(name, "Lr_Annealing", [learning_rate, current_epoch, total_epochs, alpha, beta]) as name:
        learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
        dtype = learning_rate.dtype
        current_epoch = math_ops.cast(current_epoch, dtype)
        total_epochs = math_ops.cast(total_epochs, dtype)
        alpha = math_ops.cast(alpha, dtype)
        beta = math_ops.cast(beta, dtype)
        epoch_ratio = math_ops.divide(current_epoch, total_epochs)
        base = math_ops.multiply(alpha, epoch_ratio)
        base = math_ops.add(1., base)
        return math_ops.multiply(learning_rate, math_ops.pow(base, -beta), name=name)
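
A plain-Python restatement of the annealing rule above, with alpha = 10 and beta = 0.75 assumed purely for illustration.

def lr_annealing_demo(learning_rate, current_epoch, total_epochs,
                      alpha=10.0, beta=0.75):
    return learning_rate * (1 + alpha * current_epoch / total_epochs) ** (-beta)

print(lr_annealing_demo(0.01, 0, 100))    # 0.01 at the first epoch
print(lr_annealing_demo(0.01, 100, 100))  # ~0.0017 by the last epoch
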
def cosine_similarity(vec_a, vec_b):
  '''
  Computes the row-wise cosine similarity between tensors 'vec_a' and 'vec_b'.
  This method assumes that rank(vec_a) = rank(vec_b) = 2, i.e. both inputs are
  batches of vectors with shape [batch, dim].

  Arguments:
    vec_a - Rank(2) tensor.
    vec_b - Rank(2) tensor.

  Returns:
    cos_sim - Rank(1) tensor containing the cosine similarity between each
      corresponding row of 'vec_a' and 'vec_b'.
  '''

  dot = math_ops.reduce_sum(vec_a*vec_b, axis=1)

  norm_a = linalg_ops.norm(vec_a, ord=2, axis=1)
  norm_b = linalg_ops.norm(vec_b, ord=2, axis=1)

  # Some padding is added to the denominator to prevent 0/0 errors.
  cos_sim = math_ops.divide(dot, math_ops.add(norm_a*norm_b, 1e-8))

  return cos_sim
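
The same row-wise computation with the public TF2 API (input values assumed), as a quick check of the behaviour described above.

import tensorflow as tf

a = tf.constant([[1.0, 0.0], [1.0, 1.0]])
b = tf.constant([[1.0, 0.0], [1.0, 0.0]])
dot = tf.reduce_sum(a * b, axis=1)
norms = tf.norm(a, axis=1) * tf.norm(b, axis=1)
print(dot / (norms + 1e-8))  # [1.0, ~0.7071]
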
Example #31
def signal_to_noise(y_true, y_pred, mode='snr', data_format=None, epsilon=1e-8):
    '''Signal-to-noise ratio. (metric)
    Calculate the signal-to-noise ratio. It supports different modes.
    Arguments:
        mode:        (1)  snr: mean [ y_true^2 / (y_pred - y_true)^2 ]
                     (2) psnr: mean [ max( y_true^2 ) / (y_pred - y_true)^2 ]
        data_format: 'channels_first' or 'channels_last'. The default setting is generally
                     'channels_last' like other tf.keras APIs.
        epsilon:      used to avoid division by zero.
    Input:
        y_true: label, tensor in any shape.
        y_pred: prediction, tensor in any shape.
    Output:
        scalar, the mean SNR.
    '''
    get_reduced_axes = get_channels(y_true, data_format)
    if mode.casefold() == 'psnr':
        signal = math_ops.reduce_max(gen_math_ops.square(y_true), axis=get_reduced_axes)
    else:
        signal = math_ops.reduce_sum(gen_math_ops.square(y_true), axis=get_reduced_axes)
    noise = math_ops.reduce_sum(gen_math_ops.square(y_true - y_pred), axis=get_reduced_axes) + epsilon
    coeff = (10.0/2.3025851) # 10/log_e(10)
    return coeff*math_ops.reduce_mean(gen_math_ops.log(math_ops.divide(signal, noise)))
Example #32
def _NthElementGrad(op, grad):
  """Return the gradients for NthElement.

  Args:
    op: The NthElementOp for which we need to generate gradients.
    grad: Tensor. The gradients passed to the NthElementOp

  Returns:
    A list of two tensors, the first being the gradient w.r.t. the input,
    the second being the gradient w.r.t. the N (None).
  """
  input = op.inputs[0]  # pylint: disable=redefined-builtin
  output = op.outputs[0]

  # Compute the number of elements which equal to output in each reduction
  # dimension. If there are multiple elements then the gradient will be
  # divided between them.
  indicators = math_ops.cast(
      math_ops.equal(array_ops.expand_dims(output, -1), input), grad.dtype)

  grad = array_ops.expand_dims(grad, -1)
  num_selected = array_ops.expand_dims(math_ops.reduce_sum(indicators, -1), -1)

  return [math_ops.divide(indicators, num_selected) * grad, None]
Example #33
  def _finish(self, state):
    var_dtype = self._variables[0].dtype.base_dtype
    # Update global step.
    global_step = self._get_global_step(state)
    update_global_step = state_ops.assign_add(global_step, 1.)

    # Update the first moment estimate.
    beta1 = state.get_hyper("beta1", dtype=var_dtype)
    moment1 = self._get_moment1(state)
    flat_grad = self._get_flat_grad(state)
    # moment1_t := beta1 * moment1_{t-1} + (1 - beta1) * flat_grad_t
    update_moment1 = moment1.assign(beta1 * moment1 + (1. - beta1) * flat_grad)

    # Update the gradient buffer.
    window = state.get_hyper("window")
    grad_buffer = self._get_grad_buffer(state)
    next_grad_index = math_ops.floormod(
        math_ops.to_int32(update_global_step - 1.), window)
    # grad_buffer[(t-1) % window] := moment1_t
    update_grad_buffer = state_ops.scatter_update(grad_buffer, next_grad_index,
                                                  update_moment1)

    # Compute the update step.
    eps = state.get_hyper("eps", dtype=var_dtype)
    svd_eps = state.get_hyper("svd_eps", dtype=var_dtype)
    sigma_eps = state.get_hyper("sigma_eps", dtype=var_dtype)
    lr = state.get_hyper("lr", dtype=var_dtype)
    denom = math_ops.sqrt(
        math_ops.minimum(
            ops.convert_to_tensor(update_global_step),
            ops.convert_to_tensor(math_ops.cast(window, dtype=var_dtype))))
    moment1_2d = array_ops.expand_dims(update_moment1, -1)

    # m = grad_buffer^T / sqrt(min(t, window))
    # m has shape [model dimension, window], where model dimension is the sum
    # of the dimensions of the flattened variables.
    m = array_ops.transpose(math_ops.divide(update_grad_buffer, denom))

    # sigma, u, _ = SVD(m^Tm + I * svd_eps)
    mm = math_ops.matmul(m, m, transpose_a=True)
    damping = math_ops.cast(linalg_ops.eye(window), dtype=var_dtype) * svd_eps
    sigma, u, _ = linalg_ops.svd(mm + damping)
    sigma_sqrt = math_ops.sqrt(sigma)
    sigma_sqrt_min = math_ops.reduce_min(sigma_sqrt)

    # sigma_sqrt_inv = 1 / (\sqrt{sigma} + sigma_eps) ^ 3
    # We add sigma_eps to alleviate numerical instability.
    # Note that (m^Tm)^(-3/2) = u diag(sigma_sqrt_inv) u^T.
    sigma_sqrt_inv = math_ops.divide(
        math_ops.cast(1.0, dtype=var_dtype),
        math_ops.pow(sigma_sqrt + sigma_eps, 3))

    # In full matrix AdaGrad, the update step computes (mm^T)^(-1/2)g, where the
    # inversion of a model dimension by model dimension matrix is needed. To
    # speed up this computation we calculate the following instead:
    # m(m^Tm)^(-3/2)m^T moment1 = m u diag(sigma_sqrt_inv) u^T m^T moment1.
    new_step = array_ops.expand_dims(
        array_ops.zeros(flat_grad.get_shape(), dtype=var_dtype), -1)
    head = math_ops.matmul(
        m,
        math_ops.matmul(
            u,
            math_ops.matmul(
                array_ops.diag(sigma_sqrt_inv),
                math_ops.matmul(
                    u,
                    math_ops.matmul(m, moment1_2d, transpose_a=True),
                    transpose_a=True))))

    # When inverting (mm^t)^(1/2), we also add epsilon * I regularization for
    # degenerate cases. We expand ((mm^t)^(1/2) + epsilon * I)^(-1) using
    # Woodbury's identity.
    # For full derivation please see paper at
    # https://arxiv.org/pdf/1806.02958.pdf
    tail = moment1_2d - math_ops.matmul(
        m,
        math_ops.matmul(
            u,
            math_ops.matmul(
                array_ops.diag(
                    math_ops.divide(math_ops.cast(1.0, dtype=var_dtype),
                                    sigma)),
                math_ops.matmul(
                    u,
                    math_ops.matmul(m, moment1_2d, transpose_a=True),
                    transpose_a=True))))
    scaled_tail = math_ops.divide(tail, sigma_sqrt_min)

    update_new_step = control_flow_ops.cond(
        sigma_sqrt_min > eps, lambda: math_ops.add(head, scaled_tail),
        lambda: math_ops.add(new_step, head))

    # Update each variable.
    update_step = []
    for var in self._variables:
      dim = self.shape_dict[var.name]
      start_index = self.index_dict[var.name]
      end_index = start_index + dim
      var_update_correct_shape = array_ops.reshape(
          update_new_step[start_index:end_index], var.get_shape())
      var_updated = state_ops.assign_sub(var, lr * var_update_correct_shape)
      update_step.append(var_updated)

    return control_flow_ops.group(update_step)
 def testComplexDiv(self):
   foo = array_ops.constant([1. + 3.j])
   with self.test_session():
     _ = math_ops.divide(foo, 1.).eval()
     _ = math_ops.div(foo, 2.).eval()
 def testDivideName(self):
   with self.test_session():
     op = math_ops.divide(
         array_ops.constant(3), array_ops.constant(4), name="my_cool_divide")
     self.assertEqual(op.name, "my_cool_divide:0")
Example #36
def weighted_moving_average(value,
                            decay,
                            weight,
                            truediv=True,
                            collections=None,
                            name=None):
  """Compute the weighted moving average of `value`.

  Conceptually, the weighted moving average is:
    `moving_average(value * weight) / moving_average(weight)`,
  where a moving average updates by the rule
    `new_value = decay * old_value + (1 - decay) * update`
  Internally, this Op keeps moving average variables of both `value * weight`
  and `weight`.

  Args:
    value: A numeric `Tensor`.
    decay: A float `Tensor` or float value.  The moving average decay.
    weight:  `Tensor` that keeps the current value of a weight. Shape should be
      able to multiply `value`.
    truediv:  Boolean, if `True`, dividing by `moving_average(weight)` is
      floating point division.  If `False`, use division implied by dtypes.
    collections:  List of graph collections keys to add the internal variables
      `value * weight` and `weight` to. Defaults to
      `[GraphKeys.GLOBAL_VARIABLES]`.
    name: Optional name of the returned operation. Defaults to
      "WeightedMovingAvg".

  Returns:
    An Operation that updates and returns the weighted moving average.
  """
  # Unlike assign_moving_average, the weighted moving average doesn't modify
  # user-visible variables. It is the ratio of two internal variables, which are
  # moving averages of the updates.  Thus, the signature of this function is
  # quite different than assign_moving_average.
  if collections is None:
    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
  with variable_scope.variable_scope(name, "WeightedMovingAvg",
                                     [value, weight, decay]) as scope:
    value_x_weight_var = variable_scope.get_variable(
        "value_x_weight",
        shape=value.get_shape(),
        dtype=value.dtype,
        initializer=init_ops.zeros_initializer(),
        trainable=False,
        collections=collections)
    weight_var = variable_scope.get_variable(
        "weight",
        shape=weight.get_shape(),
        dtype=weight.dtype,
        initializer=init_ops.zeros_initializer(),
        trainable=False,
        collections=collections)
    numerator = assign_moving_average(
        value_x_weight_var, value * weight, decay, zero_debias=False)
    denominator = assign_moving_average(
        weight_var, weight, decay, zero_debias=False)

    if truediv:
      return math_ops.truediv(numerator, denominator, name=scope.name)
    else:
      return math_ops.divide(numerator, denominator, name=scope.name)
Example #37
def _embedded_lattices(calibrated_input_tensor,
                       input_dim,
                       output_dim,
                       interpolation_type,
                       monotonic_num_lattices,
                       monotonic_lattice_rank,
                       monotonic_lattice_size,
                       non_monotonic_num_lattices,
                       non_monotonic_lattice_rank,
                       non_monotonic_lattice_size,
                       linear_embedding_calibration_min,
                       linear_embedding_calibration_max,
                       linear_embedding_calibration_num_keypoints,
                       is_monotone=None,
                       lattice_l1_reg=None,
                       lattice_l2_reg=None,
                       lattice_l1_torsion_reg=None,
                       lattice_l2_torsion_reg=None,
                       lattice_l1_laplacian_reg=None,
                       lattice_l2_laplacian_reg=None):
    """Creates an ensemble of lattices with a linear embedding.

  This function constructs the following deep lattice network:
  calibrated_input -> linear_embedding -> calibration -> ensemble of lattices.
  Then the outputs of the ensemble of lattices are averaged and a bias term is
  added to make a final prediction.

  The ensemble of lattices consists of two parts: monotonic lattices and
  non-monotonic lattices. The input to the monotonic lattices is an output of
  the linear_embedding that contains both monotonic and non-monotonic
  calibrated_input. All inputs to the monotonic lattices are set to be monotonic
  to preserve end-to-end monotonicity in the monotonic features.
  The input to the non-monotonic lattices is an output of the linear_embedding
  that only contains non-monotonic calibrated_input. All inputs to the
  non-monotonic lattices are set to be non-monotonic, since we do not need to
  guarantee monotonicity.

  Args:
    calibrated_input_tensor: [batch_size, input_dim] tensor.
    input_dim: (int) input dimension.
    output_dim: (int) output dimension.
    interpolation_type: defines whether the lattice will interpolate using the
      full hypercube or only the simplex ("hyper-triangle") around the point
      being evaluated. Valid values: 'hypercube' or 'simplex'
    monotonic_num_lattices: (int) number of monotonic lattices in the ensemble
      lattices layer.
    monotonic_lattice_rank: (int) number of inputs to each monotonic lattice in
      the ensemble lattices layer.
    monotonic_lattice_size: (int) lattice cell size for each monotonic lattice
      in the ensemble lattices layer.
    non_monotonic_num_lattices: (int) number of non monotonic lattices in the
      ensemble lattices layer.
    non_monotonic_lattice_rank: (int) number of inputs to each non monotonic
      lattice in the ensemble lattices layer.
    non_monotonic_lattice_size: (int) lattice cell size for each non monotonic
      lattice in the ensemble lattices layer.
    linear_embedding_calibration_min: (float) a minimum input keypoints value
      for linear_embedding calibration.
    linear_embedding_calibration_max: (float) a maximum input keypoints value
      for linear_embedding calibration.
    linear_embedding_calibration_num_keypoints: (int) the number of keypoints for
      linear_embedding calibration.
    is_monotone: (bool, list of booleans) if is_monotone[k] == True, then
      calibrated_input_tensor[:, k] is considered to be a monotonic input.
    lattice_l1_reg: (float) lattice l1 regularization amount.
    lattice_l2_reg: (float) lattice l2 regularization amount.
    lattice_l1_torsion_reg: (float) lattice l1 torsion regularization amount.
    lattice_l2_torsion_reg: (float) lattice l2 torsion regularization amount.
    lattice_l1_laplacian_reg: (float) lattice l1 laplacian regularization
      amount.
    lattice_l2_laplacian_reg: (float) lattice l2 laplacian regularization
      amount.
  Returns:
    A tuple of (output_tensor, projection_ops, regularization).
  Raises:
    ValueError: If there is no non-monotonic inputs but
    non_monotonic_num_lattices is not zero.
  """
    projections = []
    regularization = None

    # Explicitly assign number of lattices to zero for any empty cases.
    if not monotonic_num_lattices:
        monotonic_num_lattices = 0
    if not non_monotonic_num_lattices:
        non_monotonic_num_lattices = 0

    # Step 1. Create a linear embedding.
    if monotonic_num_lattices:
        monotonic_embedding_dim = monotonic_num_lattices * monotonic_lattice_rank
    else:
        monotonic_num_lattices = 0
        monotonic_embedding_dim = 0
    if non_monotonic_num_lattices:
        non_monotonic_embedding_dim = (non_monotonic_num_lattices *
                                       non_monotonic_lattice_rank)
    else:
        non_monotonic_num_lattices = 0
        non_monotonic_embedding_dim = 0

    if is_monotone is not None:
        is_monotone = tools.cast_to_list(is_monotone, input_dim, 'is_monotone')
    with variable_scope.variable_scope('linear_embedding'):
        packed_results = monotone_linear_layers.split_monotone_linear_layer(
            calibrated_input_tensor,
            input_dim,
            monotonic_embedding_dim,
            non_monotonic_embedding_dim,
            is_monotone=is_monotone)
        (monotonic_output, _, non_monotonic_output, _, proj,
         _) = packed_results
        if proj is not None:
            projections.append(proj)

    # Step 2. Create ensemble of monotonic lattices.
    if monotonic_num_lattices == 0:
        m_lattice_outputs = None
    else:
        with variable_scope.variable_scope('monotonic_lattices'):
            m_lattice_outputs, projs, reg = _ensemble_lattices_layer(
                monotonic_output,
                monotonic_embedding_dim,
                output_dim,
                interpolation_type,
                linear_embedding_calibration_min,
                linear_embedding_calibration_max,
                linear_embedding_calibration_num_keypoints,
                monotonic_num_lattices,
                monotonic_lattice_rank,
                monotonic_lattice_size,
                is_monotone=True,
                l1_reg=lattice_l1_reg,
                l2_reg=lattice_l2_reg,
                l1_torsion_reg=lattice_l1_torsion_reg,
                l2_torsion_reg=lattice_l2_torsion_reg,
                l1_laplacian_reg=lattice_l1_laplacian_reg,
                l2_laplacian_reg=lattice_l2_laplacian_reg)
            if projs:
                projections += projs
            regularization = tools.add_if_not_none(regularization, reg)

    # Step 3. Construct non-monotonic ensembles.
    if non_monotonic_output is None and non_monotonic_num_lattices > 0:
        raise ValueError(
            'All input signals are monotonic but the number of non monotonic '
            'lattices is not zero.')
    if non_monotonic_num_lattices == 0:
        n_lattice_outputs = None
    else:
        with variable_scope.variable_scope('non_monotonic_lattices'):
            n_lattice_outputs, projs, reg = _ensemble_lattices_layer(
                non_monotonic_output,
                non_monotonic_embedding_dim,
                output_dim,
                interpolation_type,
                linear_embedding_calibration_min,
                linear_embedding_calibration_max,
                linear_embedding_calibration_num_keypoints,
                non_monotonic_num_lattices,
                non_monotonic_lattice_rank,
                non_monotonic_lattice_size,
                is_monotone=False,
                l1_reg=lattice_l1_reg,
                l2_reg=lattice_l2_reg,
                l1_torsion_reg=lattice_l1_torsion_reg,
                l2_torsion_reg=lattice_l2_torsion_reg,
                l1_laplacian_reg=lattice_l1_laplacian_reg,
                l2_laplacian_reg=lattice_l2_laplacian_reg)
            if projs:
                projections += projs
            regularization = tools.add_if_not_none(regularization, reg)

    # Step 4. Take average to make a final prediction.
    with variable_scope.variable_scope('ensemble_average'):
        output = variable_scope.get_variable(
            name='ensemble_bias',
            initializer=[0.0] * output_dim,
            dtype=calibrated_input_tensor.dtype)
        if m_lattice_outputs:
            output += math_ops.divide(math_ops.add_n(m_lattice_outputs),
                                      monotonic_num_lattices)
        if n_lattice_outputs is not None:
            output += math_ops.divide(math_ops.add_n(n_lattice_outputs),
                                      non_monotonic_num_lattices)

    return (output, projections, regularization)
def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
                                 fused_batch_norm):
  """Computes batch norm correction params.

     Before batch normalization is frozen:
     We use batch statistics for batch norm.
       correction_scale = sigma_b/sigma_mv
       correction_recip = 1/correction_scale
       correction_offset = 0

     After batch normalization is frozen:
      correction_scale = sigma_b/sigma_mv
      correction_recip = 1
      correction_offset =  gamma*(mu_b/sigma_b-mu_mv/sigma_mv).

     Batch norm is frozen if global_step > bn_freeze_delay.
     The corrections ensure that:
     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
     smoother training as the scaling on the weights changes slowly, rather than
     jumping across mini-batches.
     b) Changing the values of the corrections allows one to switch between
     using batch statistics and using the moving mean and variance, without
     requiring changes to batch_norm.


  Args:
    context: The scope under which we look for batch norm params
    match: Object containing required batch norm tensors for correction
      computation.
    freeze_batch_norm_delay: Delay in steps at which computation switches
      from regular batch norm to frozen mean and variance.
    fused_batch_norm: Bool, true if fused batch norm is used.

  Returns:
    A tuple of correction_scale, correction_recip, correction_offset
  """

  g = ops.get_default_graph()
  prefix = '' if not context else context + '/'
  with g.name_scope(prefix + 'batch_norm_correction'):
    recip_sigma_mv = math_ops.rsqrt(
        match.moving_variance_tensor + match.batch_epsilon)
    recip_sigma = math_ops.rsqrt(match.variance_tensor + match.batch_epsilon)
    correction_scale = math_ops.divide(
        recip_sigma_mv, recip_sigma, name='scale_compute')
    correction_scale = array_ops.identity(
        correction_scale, name='correction_scale')
    correction_recip = math_ops.reciprocal(
        correction_scale, name='reciprocal_compute')
    correction_offset = math_ops.multiply(
        match.gamma_tensor,
        match.mean_tensor * recip_sigma -
        match.moving_mean_tensor * recip_sigma_mv,
        name='offset_compute')

    if freeze_batch_norm_delay is not None:
      use_mv_avg = math_ops.greater_equal(
          common.CreateOrGetQuantizationStep(),
          freeze_batch_norm_delay,
          name='use_moving_average')
    else:
      use_mv_avg = False

    bn_decay_zero = 0.0
    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())
    bn_decay_var_consumers = list(match.bn_decay_mean_tensor.consumers())

    bn_decay_mean_out = utils.smart_cond(
        use_mv_avg,
        lambda: bn_decay_zero,
        lambda: match.bn_decay_mean_tensor,
        name='freeze_moving_mean')
    graph_editor.reroute_ts(
        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
        can_modify=bn_decay_mean_consumers)

    if fused_batch_norm is False:
      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
      bn_decay_var_out = utils.smart_cond(
          use_mv_avg,
          lambda: bn_decay_zero,
          lambda: match.bn_decay_var_tensor,
          name='freeze_moving_var')
      graph_editor.reroute_ts(
          [bn_decay_var_out], [match.bn_decay_var_tensor],
          can_modify=bn_decay_var_consumers)

    correction_recip = utils.smart_cond(
        use_mv_avg,
        lambda: array_ops.ones(correction_scale.shape),
        lambda: correction_recip,
        name='correction_recip')

    correction_offset = utils.smart_cond(
        use_mv_avg,
        lambda: correction_offset,
        lambda: array_ops.zeros(correction_offset.shape),
        name='correction_offset')
  return correction_scale, correction_recip, correction_offset
 def testComplexDiv(self):
     foo = array_ops.constant([1. + 3.j])
     with self.cached_session():
         _ = math_ops.divide(foo, 1.).eval()
         _ = math_ops.div(foo, 2.).eval()