Example #1
  def testSecondGradient(self):
    with self.test_session() as sess:
      l = constant_op.constant(
          [
              0.0, 0.0, 1.0 / 3, 0.0, 1.0 / 3, 0.0, 0.0, 0.0, 0.0, 0.5 / 3, 0.0,
              0.5 / 3
          ],
          shape=[12],
          dtype=dtypes.float64,
          name="l")
      f = constant_op.constant(
          [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
          shape=[12],
          dtype=dtypes.float64,
          name="f")
      x = nn_ops.softmax_cross_entropy_with_logits(
          labels=l, logits=f, name="xent")
      loss = math_ops.reduce_sum(x)

      gradients = gradients_impl.gradients(loss, [f])[0]

      err = gradient_checker.compute_gradient_error(f, [12], gradients, [12])

      # Check that the second derivative is calculated.
      # (Because of how the xentropy gradient is implemented, its presence is
      # equivalent to a `BatchMatMul` op appearing in the graph.)
      op_names = [
          op.op_def.name for op in sess.graph.get_operations() if op.op_def
      ]
      self.assertIn("BatchMatMul", op_names)

    print("cross entropy hessian err = ", err)
    self.assertLess(err, 5e-8)
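The `BatchMatMul` op the test above asserts on comes from the second derivative of the cross-entropy loss. For a single example with p = softmax(f) and labels l that sum to one, the gradient with respect to the logits is p - l and the Hessian is diag(p) - p pᵀ, which is the kind of per-example outer product a batched matmul would compute. A minimal NumPy sketch (independent of TensorFlow, reusing the constants from the test above) that checks this identity by finite differences:

import numpy as np

def softmax(z):
  e = np.exp(z - z.max())
  return e / e.sum()

# Same single 12-class example as in the test above.
l = np.array([0.0, 0.0, 1/3, 0.0, 1/3, 0.0, 0.0, 0.0, 0.0, 0.5/3, 0.0, 0.5/3])
f = np.array([0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4])

p = softmax(f)
analytic_hessian = np.diag(p) - np.outer(p, p)  # d^2 loss / d f^2

# Finite-difference check, using the analytic gradient p - l of the loss.
eps = 1e-6
numeric_hessian = np.zeros((12, 12))
for j in range(12):
  f_plus, f_minus = f.copy(), f.copy()
  f_plus[j] += eps
  f_minus[j] -= eps
  numeric_hessian[:, j] = ((softmax(f_plus) - l) - (softmax(f_minus) - l)) / (2 * eps)

print(np.max(np.abs(analytic_hessian - numeric_hessian)))  # tiny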
Example #2
  def testGradient(self):
    with self.test_session() as sess:
      l = constant_op.constant(
          [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5],
          shape=[3, 4],
          dtype=dtypes.float64,
          name="l")
      f = constant_op.constant(
          [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
          shape=[3, 4],
          dtype=dtypes.float64,
          name="f")
      x = nn_ops.softmax_cross_entropy_with_logits(
          labels=l, logits=f, name="xent")
      err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3])

      # Check that no extra computation is performed. When only the first
      # derivative is requested, the second derivative must not be computed,
      # so there should be no `BatchMatMul` op in the graph.
      op_names = [
          op.op_def.name for op in sess.graph.get_operations() if op.op_def
      ]
      self.assertNotIn("BatchMatMul", op_names)

    print("cross entropy gradient err = ", err)
    self.assertLess(err, 5e-8)
Example #3
 def benchmarkSingleClass(self):
   for (m, n, p, use_gpu) in itertools.product(
       [128],
       [10, 100, 1000, 10000, 100000],
       [0.001, 0.01, 0.5, 0.99, 1.0],
       [False]):
     k = int(p * n)
     if k == 0:
       continue
     name = "single_class_m_%d_n_%d_k_%g_use_gpu_%s" % (m, n, k, use_gpu)
     device = "/%s:0" % ("gpu" if use_gpu else "cpu")
     with ops.Graph().as_default():
       with ops.device(device):
         labels = constant_op.constant([[1.], [-1.], [0.]],
                                       dtype=dtypes.float32)
         logits = constant_op.constant([[-1.], [0.], [1.]],
                                       dtype=dtypes.float32)
         op = nn_ops.softmax_cross_entropy_with_logits(
             labels=labels, logits=logits)
       with session.Session() as sess:
         r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
         gb_processed_input = m * n / 1.0e9
         throughput = gb_processed_input / r["wall_time"]
         print("Benchmark: %s \t wall_time: %0.03g s \t "
               "Throughput: %0.03g GB/s" % (name, r["wall_time"], throughput))
         sys.stdout.flush()
Example #4
 def _entropy(self):
   logits_2d = array_ops.reshape(
       self.logits, array_ops.pack([-1, self.num_classes]))
   histogram_2d = nn_ops.softmax(logits_2d)
   ret = array_ops.reshape(
       nn_ops.softmax_cross_entropy_with_logits(logits_2d, histogram_2d),
       self.batch_shape())
   ret.set_shape(self.get_batch_shape())
   return ret
Example #5
 def testZeroDimension(self):
   features = np.zeros([0, 2, 4]).astype(np.float32)
   labels = np.zeros([0, 2, 4]).astype(np.float32)
   np_loss, _ = self._npXent(features, labels)
   with self.test_session(use_gpu=True) as sess:
     loss = nn_ops.softmax_cross_entropy_with_logits(
         labels=labels, logits=features)
     tf_loss = sess.run(loss)
   self.assertAllEqual(np_loss, tf_loss)
Example #6
 def _testXentWrapper(self, np_features, np_labels, dim=-1, use_gpu=False):
   np_loss, _ = self._npXent(np_features, np_labels, dim=dim)
   with self.test_session(use_gpu=use_gpu) as sess:
     loss = nn_ops.softmax_cross_entropy_with_logits(
         labels=np_labels, logits=np_features, dim=dim)
     tf_loss = sess.run(loss)
   print("np_loss:", np_loss)
   print("tf_loss:", tf_loss)
   self.assertAllCloseAccordingToType(np_loss, tf_loss)
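Both tests above compare against a NumPy reference `_npXent` that is not shown in these excerpts. As an assumption about what such a reference computes (the real helper's exact signature and return values may differ), a minimal stand-in that returns a per-example loss together with its backprop:

import numpy as np

def np_xent(features, labels, dim=-1):
  """Hypothetical NumPy reference for softmax cross entropy."""
  # Numerically stable softmax along `dim`.
  e = np.exp(features - np.max(features, axis=dim, keepdims=True))
  probs = e / np.sum(e, axis=dim, keepdims=True)
  loss = -np.sum(labels * np.log(probs), axis=dim)
  backprop = probs - labels  # gradient w.r.t. logits when labels sum to 1
  return loss, backprop

features = np.array([[1., 2., 3., 4.], [1., 2., 3., 4.]], dtype=np.float32)
labels = np.array([[0., 0., 0., 1.], [0., 0.5, 0.5, 0.]], dtype=np.float32)
print(np_xent(features, labels)[0])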
Example #7
 def _entropy(self):
     if self.logits.get_shape().ndims == 2:
         logits_2d = self.logits
     else:
         logits_2d = array_ops.reshape(self.logits, [-1, self.num_classes])
     histogram_2d = nn_ops.softmax(logits_2d)
     ret = array_ops.reshape(nn_ops.softmax_cross_entropy_with_logits(logits_2d, histogram_2d), self.batch_shape())
     ret.set_shape(self.get_batch_shape())
     return ret
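In both `_entropy` methods above, feeding the softmax of the logits back in as the labels turns the cross entropy into the Shannon entropy of the distribution, since H(p, p) = -Σ p log p = H(p). A small NumPy check of that identity, independent of the TensorFlow code:

import numpy as np

logits = np.array([0.1, 2.0, -1.0, 0.5])
shifted = logits - logits.max()
log_p = shifted - np.log(np.exp(shifted).sum())  # log softmax
p = np.exp(log_p)

cross_entropy = -np.sum(p * log_p)  # what xent(labels=softmax(logits), logits=logits) computes
entropy = -np.sum(p * np.log(p))    # Shannon entropy of the categorical
print(np.isclose(cross_entropy, entropy))  # True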
Example #8
def sampled_softmax_loss(weights, biases, inputs, labels, num_sampled,
                         num_classes, num_true=1,
                         sampled_values=None,
                         remove_accidental_hits=True,
                         name="sampled_softmax_loss"):
  """Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  At inference time, you can compute full softmax probabilities with the
  expression `tf.nn.softmax(tf.matmul(inputs, weights) + biases)`.

  See our [Candidate Sampling Algorithms Reference]
  (http://www.tensorflow.org/extras/candidate_sampling.pdf)

  Also see Section 3 of http://arxiv.org/abs/1412.2007 for the math.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        True.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  """
  logits, labels = _compute_sampled_logits(
      weights, biases, inputs, labels, num_sampled, num_classes,
      num_true=num_true,
      sampled_values=sampled_values,
      subtract_log_q=True,
      remove_accidental_hits=remove_accidental_hits,
      name=name)
  sampled_losses = nn_ops.softmax_cross_entropy_with_logits(logits, labels)
  # sampled_losses is a [batch_size] tensor.
  return sampled_losses
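The docstring notes that at inference time the full softmax can be computed from the same weights and biases. A minimal sketch of that expression; note that with the shapes listed in the Args section (`weights` of shape `[num_classes, dim]`, unsharded), the weights need to be transposed for the matmul to produce `[batch_size, num_classes]` logits:

import tensorflow as tf

def full_softmax_probs(inputs, weights, biases):
  # inputs: [batch_size, dim], weights: [num_classes, dim], biases: [num_classes]
  logits = tf.matmul(inputs, weights, transpose_b=True) + biases
  return tf.nn.softmax(logits)  # [batch_size, num_classes]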
Example #9
 def entropy(self, name="sample"):
   with ops.name_scope(self.name):
     with ops.op_scope([], name):
       logits_2d = array_ops.reshape(
           self.logits, array_ops.pack([-1, self.num_classes]))
       histogram_2d = nn_ops.softmax(logits_2d)
       ret = array_ops.reshape(
           nn_ops.softmax_cross_entropy_with_logits(logits_2d, histogram_2d),
           self.batch_shape())
       ret.set_shape(self.get_batch_shape())
       return ret
Example #10
def sequence_loss_by_example(logits, targets, weights, num_decoder_symbols,
                             average_across_timesteps=True,
                             softmax_loss_function=None, name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example).

  Args:
    logits: list of 2D Tensors of shape [batch_size x num_decoder_symbols].
    targets: list of 1D batch-sized int32 Tensors of the same length as logits.
    weights: list of 1D batch-sized float-Tensors of the same length as logits.
    num_decoder_symbols: integer, number of decoder symbols (output classes).
    average_across_timesteps: If set, divide the returned cost by the total
      label weight.
    softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch
      to be used instead of the standard softmax (the default if this is None).
    name: optional name for this operation, default: "sequence_loss_by_example".

  Returns:
    1D batch-sized float Tensor: the log-perplexity for each sequence.

  Raises:
    ValueError: if len(logits) is different from len(targets) or len(weights).
  """
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the same "
                     "%d, %d, %d." % (len(logits), len(weights), len(targets)))
  with ops.op_scope(logits + targets + weights, name,
                    "sequence_loss_by_example"):
    batch_size = array_ops.shape(targets[0])[0]
    log_perp_list = []
    length = batch_size * num_decoder_symbols
    for i in xrange(len(logits)):
      if softmax_loss_function is None:
        # TODO(lukaszkaiser): There is no SparseCrossEntropy in TensorFlow, so
        # we need to first cast targets into a dense representation, and as
        # SparseToDense does not accept batched inputs, we need to do this by
        # re-indexing and re-sizing. When TensorFlow adds SparseCrossEntropy,
        # rewrite this method.
        indices = targets[i] + num_decoder_symbols * math_ops.range(batch_size)
        with ops.device("/cpu:0"):  # Sparse-to-dense must be on CPU for now.
          dense = sparse_ops.sparse_to_dense(
              indices, array_ops.expand_dims(length, 0), 1.0,
              0.0)
        target = array_ops.reshape(dense, [-1, num_decoder_symbols])
        crossent = nn_ops.softmax_cross_entropy_with_logits(
            logits[i], target, name="SequenceLoss/CrossEntropy{0}".format(i))
      else:
        crossent = softmax_loss_function(logits[i], targets[i])
      log_perp_list.append(crossent * weights[i])
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
  return log_perps
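Numerically, `sequence_loss_by_example` is a weighted sum of per-timestep cross entropies, optionally divided by the total label weight. A NumPy sketch of the same computation on hypothetical toy data (no TensorFlow):

import numpy as np

def np_sequence_loss_by_example(logits, targets, weights,
                                average_across_timesteps=True):
  # logits: list of [batch, num_symbols]; targets, weights: lists of [batch].
  log_perps = np.zeros_like(weights[0], dtype=np.float64)
  for step_logits, step_targets, step_weights in zip(logits, targets, weights):
    shifted = step_logits - step_logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    # Cross entropy against the target class of each example at this step.
    crossent = -log_probs[np.arange(len(step_targets)), step_targets]
    log_perps += step_weights * crossent
  if average_across_timesteps:
    log_perps /= sum(weights) + 1e-12
  return log_perps

logits = [np.random.randn(2, 5) for _ in range(3)]
targets = [np.array([1, 4]), np.array([0, 2]), np.array([3, 3])]
weights = [np.ones(2), np.ones(2), np.array([1.0, 0.0])]
print(np_sequence_loss_by_example(logits, targets, weights))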
Example #11
 def _entropy(self):
   if self.logits.get_shape().ndims == 2:
     logits_2d = self.logits
   else:
     logits_2d = array_ops.reshape(self.logits, [-1, self.event_size])
   histogram_2d = nn_ops.softmax(logits_2d)
   ret = array_ops.reshape(
       nn_ops.softmax_cross_entropy_with_logits(labels=histogram_2d,
                                                logits=logits_2d),
       self.batch_shape_tensor())
   ret.set_shape(self.batch_shape)
   return ret
Example #12
 def f(inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias):
   features = nn_ops.relu(
       nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias), name="features")
   logits = nn_ops.xw_plus_b(
       features, softmax_weight, softmax_bias, name="logits")
   labels = constant_op.constant(
       label_data.tolist(),
       shape=[batch, classes],
       dtype=dtypes.float64,
       name="labels")
   cost = nn_ops.softmax_cross_entropy_with_logits(
       labels=labels, logits=logits, name="cost")
   return cost
Example #13
 def testGradient(self):
   with self.test_session():
     l = constant_op.constant(
         [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5],
         shape=[3, 4],
         dtype=dtypes.float64,
         name="l")
     f = constant_op.constant(
         [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
         shape=[3, 4],
         dtype=dtypes.float64,
         name="f")
     x = nn_ops.softmax_cross_entropy_with_logits(f, l, name="xent")
     err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3])
   print("cross entropy gradient err = ", err)
   self.assertLess(err, 5e-8)
Example #14
def _sparse_vs_dense_xent_benchmark_dense(labels, logits):
  labels = array_ops.identity(labels)
  logits = array_ops.identity(logits)
  with ops_lib.device("/cpu:0"):  # Sparse-to-dense must be on CPU
    batch_size = array_ops.shape(logits)[0]
    num_entries = array_ops.shape(logits)[1]
    length = batch_size * num_entries
    labels += num_entries * math_ops.range(batch_size)
    target = sparse_ops.sparse_to_dense(labels,
                                        array_ops.stack([length]), 1.0, 0.0)
  target = array_ops.reshape(target, array_ops.stack([-1, num_entries]))
  crossent = nn_ops.softmax_cross_entropy_with_logits(
      logits, target, name="SequenceLoss/CrossEntropy")
  crossent_sum = math_ops.reduce_sum(crossent)
  grads = gradients_impl.gradients([crossent_sum], [logits])[0]

  return (crossent_sum, grads)
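The benchmark above converts sparse integer labels into a dense one-hot target by offsetting each label into a flat index and scattering ones. The same index arithmetic in NumPy, shown as an illustration only (not part of the benchmark):

import numpy as np

labels = np.array([2, 0, 3])  # sparse integer class ids
num_entries = 4
batch_size = labels.shape[0]

flat_indices = labels + num_entries * np.arange(batch_size)
target = np.zeros(batch_size * num_entries)
target[flat_indices] = 1.0
target = target.reshape(batch_size, num_entries)  # dense one-hot rows
print(target)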
Example #15
  def _log_prob(self, x):
    x = self._assert_valid_sample(x)
    # broadcast logits or x if need be.
    logits = self.logits
    if (not x.get_shape().is_fully_defined() or
        not logits.get_shape().is_fully_defined() or
        x.get_shape() != logits.get_shape()):
      logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
      x = array_ops.ones_like(logits, dtype=x.dtype) * x

    logits_shape = array_ops.shape(math_ops.reduce_sum(logits, -1))
    logits_2d = array_ops.reshape(logits, [-1, self.event_size])
    x_2d = array_ops.reshape(x, [-1, self.event_size])
    ret = -nn_ops.softmax_cross_entropy_with_logits(labels=x_2d,
                                                    logits=logits_2d)
    # Reshape back to user-supplied batch and sample dims prior to 2D reshape.
    ret = array_ops.reshape(ret, logits_shape)
    return ret
Example #16
  def testSecondGradient(self):
    with self.test_session():
      l = constant_op.constant([0.0, 0.0, 1.0, 0.0,
                                1.0, 0.0, 0.0, 0.0,
                                0.0, 0.5, 0.0, 0.5], shape=[12],
                               dtype=dtypes.float64, name="l")
      f = constant_op.constant([0.1, 0.2, 0.3, 0.4,
                                0.1, 0.4, 0.9, 1.6,
                                0.1, 0.8, 2.7, 6.4], shape=[12],
                               dtype=dtypes.float64, name="f")
      x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
                                                   name="xent")
      loss = math_ops.reduce_mean(x)

    # Taking the second gradient should fail, since it is not
    # yet supported.
    with self.assertRaisesRegexp(LookupError,
                                 "explicitly disabled"):
      _ = gradients_impl.hessians(loss, [f])
Example #17
  def _log_prob(self, x):
    x = ops.convert_to_tensor(x, name="x")
    # broadcast logits or x if need be.
    logits = self.logits
    if (not x.get_shape().is_fully_defined() or
        not logits.get_shape().is_fully_defined() or
        x.get_shape() != logits.get_shape()):
      logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
      x = array_ops.ones_like(logits, dtype=x.dtype) * x

    logits_shape = array_ops.shape(logits)
    if logits.get_shape().ndims == 2:
      logits_2d = logits
      x_2d = x
    else:
      logits_2d = array_ops.reshape(logits, [-1, self.num_classes])
      x_2d = array_ops.reshape(x, [-1, self.num_classes])
    ret = -nn_ops.softmax_cross_entropy_with_logits(logits_2d, x_2d)
    ret = array_ops.reshape(ret, logits_shape)
    return ret
Example #18
  def _log_prob(self, x):
    x = self._assert_valid_sample(x)
    # broadcast logits or x if need be.
    logits = self.logits
    if (not x.get_shape().is_fully_defined() or
        not logits.get_shape().is_fully_defined() or
        x.get_shape() != logits.get_shape()):
      logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
      x = array_ops.ones_like(logits, dtype=x.dtype) * x

    logits_shape = array_ops.shape(logits)
    if logits.get_shape().ndims == 2:
      logits_2d = logits
      x_2d = x
    else:
      logits_2d = array_ops.reshape(logits, [-1, self.event_size])
      x_2d = array_ops.reshape(x, [-1, self.event_size])
    ret = -nn_ops.softmax_cross_entropy_with_logits(labels=x_2d,
                                                    logits=logits_2d)
    ret = array_ops.reshape(ret, logits_shape)
    return ret
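For a one-hot sample x, the log-probability under the categorical is simply the log-softmax entry of the sampled class, which is why `_log_prob` above can return the negated cross entropy. A quick NumPy check of that identity (toy values, not from the source):

import numpy as np

logits = np.array([0.2, 1.5, -0.3])
x = np.array([0.0, 1.0, 0.0])  # one-hot sample of class 1

shifted = logits - logits.max()
log_p = shifted - np.log(np.exp(shifted).sum())  # log softmax

log_prob_direct = log_p[1]
cross_entropy = -np.sum(x * log_p)
log_prob_via_xent = -cross_entropy
print(np.isclose(log_prob_direct, log_prob_via_xent))  # True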
Example #19
 def first(x):
   l = tensor.Tensor([[0.0]])
   x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=x)
   x = math_ops.reduce_sum(x, tensor.Tensor([0]))
   return x
Example #20
 def loss(x, l):
   return math_ops.reduce_mean(
       nn_ops.softmax_cross_entropy_with_logits(logits=x, labels=l),
       tensor.Tensor([0]))
Example #21
  def _BuildAndTestMiniMNIST(self, param_index, tag):
    # Fix seed to avoid occasional flakiness
    np.random.seed(6)

    # Hyperparameters
    batch = 3
    inputs = 16
    features = 32
    classes = 10

    # Define the parameters
    inp_data = np.random.random_sample(inputs * batch)
    hidden_weight_data = np.random.randn(inputs * features) / np.sqrt(inputs)
    hidden_bias_data = np.random.random_sample(features)
    sm_weight_data = np.random.randn(features * classes) / np.sqrt(features)
    sm_bias_data = np.random.random_sample(classes)

    # special care for labels since they need to be normalized per batch
    label_data = np.random.random(batch * classes).reshape((batch, classes))
    s = label_data.sum(axis=1)
    label_data /= s[:, None]

    with self.session(use_gpu=True):
      # We treat the inputs as "parameters" here
      inp = constant_op.constant(
          inp_data.tolist(),
          shape=[batch, inputs],
          dtype=dtypes.float64,
          name="inp")
      hidden_weight = constant_op.constant(
          hidden_weight_data.tolist(),
          shape=[inputs, features],
          dtype=dtypes.float64,
          name="hidden_weight")
      hidden_bias = constant_op.constant(
          hidden_bias_data.tolist(),
          shape=[features],
          dtype=dtypes.float64,
          name="hidden_bias")
      softmax_weight = constant_op.constant(
          sm_weight_data.tolist(),
          shape=[features, classes],
          dtype=dtypes.float64,
          name="softmax_weight")
      softmax_bias = constant_op.constant(
          sm_bias_data.tolist(),
          shape=[classes],
          dtype=dtypes.float64,
          name="softmax_bias")

      # List all the parameters so that we can test them one at a time
      all_params = [
          inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias
      ]
      param_sizes = [
          [batch, inputs],  # inp
          [inputs, features],  # hidden_weight,
          [features],  # hidden_bias
          [features, classes],  # softmax_weight,
          [classes]
      ]  # softmax_bias

      # Now, build the mini MNIST network
      features = nn_ops.relu(
          nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias), name="features")
      logits = nn_ops.xw_plus_b(
          features, softmax_weight, softmax_bias, name="logits")
      labels = constant_op.constant(
          label_data.tolist(),
          shape=[batch, classes],
          dtype=dtypes.float64,
          name="labels")
      cost = nn_ops.softmax_cross_entropy_with_logits(
          labels=labels, logits=logits, name="cost")

      # Test the gradients.
      err = gradient_checker.compute_gradient_error(
          all_params[param_index],
          param_sizes[param_index],
          cost, [batch],
          delta=1e-5)

    tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
    return err
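`compute_gradient_error` compares an analytic Jacobian with one built from central finite differences (hence `delta=1e-5` above). A simplified NumPy sketch of that kind of check for a scalar loss, included only to illustrate the idea (the real gradient checker works on full Jacobians):

import numpy as np

def numeric_gradient(f, x, delta=1e-5):
  # Central finite differences, one coordinate at a time.
  grad = np.zeros_like(x)
  for i in range(x.size):
    e = np.zeros_like(x)
    e.flat[i] = delta
    grad.flat[i] = (f(x + e) - f(x - e)) / (2 * delta)
  return grad

def softmax(z):
  e = np.exp(z - z.max(axis=-1, keepdims=True))
  return e / e.sum(axis=-1, keepdims=True)

labels = np.array([[0., 1., 0.], [0.5, 0.5, 0.]])
loss = lambda f: -np.sum(labels * np.log(softmax(f)))

f0 = np.random.randn(2, 3)
analytic = softmax(f0) - labels  # gradient of the summed cross entropy
print(np.max(np.abs(analytic - numeric_gradient(loss, f0))))  # small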
Example #22
 def first(x):
     l = constant_op.constant([[0.0]])
     x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=x)
     x = math_ops.reduce_sum(x, constant_op.constant([0]))
     return x
Example #23
 def loss(x, l):
   return math_ops.reduce_mean(
       nn_ops.softmax_cross_entropy_with_logits(logits=x, labels=l),
       constant_op.constant([0]))
Example #24
def sampled_softmax_loss(weights,
                         biases,
                         inputs,
                         labels,
                         num_sampled,
                         num_classes,
                         num_true=1,
                         sampled_values=None,
                         remove_accidental_hits=True,
                         name="sampled_softmax_loss"):
    """Computes and returns the sampled softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only.  It is generally an underestimate of
  the full softmax loss.

  At inference time, you can compute full softmax probabilities with the
  expression `tf.nn.softmax(tf.matmul(inputs, weights) + biases)`.

  See our [Candidate Sampling Algorithms Reference]
  (../../extras/candidate_sampling.pdf)

  Also see Section 3 of http://arxiv.org/abs/1412.2007 for the math.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        [num_classes, dim].  The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        True.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  """
    logits, labels = _compute_sampled_logits(
        weights,
        biases,
        inputs,
        labels,
        num_sampled,
        num_classes,
        num_true=num_true,
        sampled_values=sampled_values,
        subtract_log_q=True,
        remove_accidental_hits=remove_accidental_hits,
        name=name)
    sampled_losses = nn_ops.softmax_cross_entropy_with_logits(logits, labels)
    # sampled_losses is a [batch_size] tensor.
    return sampled_losses
Example #25
 def first(x):
   l = tensor.Tensor([[0.0]])
   x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=x)
   x = math_ops.reduce_sum(x, tensor.Tensor([0]))
   return x
Example #26
 def cost_function(labels, logits, num_classes):
     sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
         labels=labels, logits=logits)
     return sampled_losses
Example #27
 def loss(x, l):
   return math_ops.reduce_mean(
       nn_ops.softmax_cross_entropy_with_logits(logits=x, labels=l),
       tensor.Tensor([0]))
Example #28
def test():
    model = Model()
    vocabulary_size = 10
    embedding_size = 2
    num_sampled = 3  # Number of negative examples to sample.
    num_true = 1

    graph = tf.Graph()

    with graph.as_default():
        # Input data.
        train_dataset = tf.placeholder(tf.int32)
        train_labels = tf.placeholder(tf.int32, shape=[None, num_true])

        # Variables.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

        # Model.
        # Look up embeddings for inputs.
        embed = tf.nn.embedding_lookup(embeddings, train_dataset)
        # Compute the softmax loss, using a sample of the negative labels each time.
        logits, labels = yba_sampled_softmax(model=model,
                                             weights=embeddings,
                                             inputs=embed,
                                             labels=train_labels,
                                             num_sampled=num_sampled,
                                             num_classes=vocabulary_size,
                                             num_true=num_true)
        loss = tf.reduce_mean(
            nn_ops.softmax_cross_entropy_with_logits(labels=labels,
                                                     logits=logits))

        # Optimizer.
        # Note: The optimizer will optimize the softmax_weights AND the embeddings.
        # This is because the embeddings are defined as a variable quantity and the
        # optimizer's `minimize` method will by default modify all variable quantities
        # that contribute to the tensor it is passed.
        # See docs on `tf.train.Optimizer.minimize()` for more details.
        optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()
        print('Initialized')
        average_loss = 0
        for step in range(100):
            # batch_data = np.array(
            #     [[0, 1, 1, 0, 0], [1, 0, 0, 1, 0], [1, 0, 0, 1, 0],
            #      [0, 1, 1, 0, 1], [0, 0, 0, 1, 0]])
            batch_data = np.array(
                [7, 7, 7, 7, 7])
            batch_labels = np.array([[1], [2], [3], [4], [5]])
            feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
            _, l, model_logits, model_labels, sampled_values = \
                session.run([
                    optimizer, loss,
                    model.logits,
                    model.labels,
                    model.sampled_values],
                    feed_dict=feed_dict)
            print('loss', l)
            print('sampled_values', sampled_values)
            if step == 99:
                x = 5
Example #29
 def first(x):
   l = constant_op.constant([[0.0]])
   x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=x)
   x = math_ops.reduce_sum(x, constant_op.constant([0]))
   return x
Example #30
def sequence_loss_by_example(logits,
                             targets,
                             weights,
                             num_decoder_symbols,
                             average_across_timesteps=True,
                             softmax_loss_function=None,
                             name=None):
    """Weighted cross-entropy loss for a sequence of logits (per example).

  Args:
    logits: list of 2D Tensors of shape [batch_size x num_decoder_symbols].
    targets: list of 1D batch-sized int32 Tensors of the same length as logits.
    weights: list of 1D batch-sized float-Tensors of the same length as logits.
    num_decoder_symbols: integer, number of decoder symbols (output classes).
    average_across_timesteps: If set, divide the returned cost by the total
      label weight.
    softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch
      to be used instead of the standard softmax (the default if this is None).
    name: optional name for this operation, default: "sequence_loss_by_example".

  Returns:
    1D batch-sized float Tensor: the log-perplexity for each sequence.

  Raises:
    ValueError: if len(logits) is different from len(targets) or len(weights).
  """
    if len(targets) != len(logits) or len(weights) != len(logits):
        raise ValueError(
            "Lengths of logits, weights, and targets must be the same "
            "%d, %d, %d." % (len(logits), len(weights), len(targets)))
    with ops.name_scope(name, "sequence_loss_by_example",
                        logits + targets + weights):
        batch_size = array_ops.shape(targets[0])[0]
        log_perp_list = []
        length = batch_size * num_decoder_symbols
        for i in xrange(len(logits)):
            if softmax_loss_function is None:
                # TODO(lukaszkaiser): There is no SparseCrossEntropy in TensorFlow, so
                # we need to first cast targets into a dense representation, and as
                # SparseToDense does not accept batched inputs, we need to do this by
                # re-indexing and re-sizing. When TensorFlow adds SparseCrossEntropy,
                # rewrite this method.
                indices = targets[i] + num_decoder_symbols * math_ops.range(
                    batch_size)
                with ops.device(
                        "/cpu:0"):  # Sparse-to-dense must be on CPU for now.
                    dense = sparse_ops.sparse_to_dense(
                        indices, array_ops.expand_dims(length, 0), 1.0, 0.0)
                target = array_ops.reshape(dense, [-1, num_decoder_symbols])
                crossent = nn_ops.softmax_cross_entropy_with_logits(
                    logits=logits[i],
                    labels=target,
                    name="SequenceLoss/CrossEntropy{0}".format(i))
            else:
                crossent = softmax_loss_function(logits[i], targets[i])
            log_perp_list.append(crossent * weights[i])
        log_perps = math_ops.add_n(log_perp_list)
        if average_across_timesteps:
            total_size = math_ops.add_n(weights)
            total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
            log_perps /= total_size
    return log_perps
Example #31
 def loss(x, l):
     return math_ops.reduce_mean(
         nn_ops.softmax_cross_entropy_with_logits(logits=x, labels=l),
         constant_op.constant([0]))
Example #32
    def _BuildAndTestMiniMNIST(self, param_index, tag):
        # Fix seed to avoid occasional flakiness
        np.random.seed(6)

        # Hyperparameters
        batch = 3
        inputs = 16
        features = 32
        classes = 10

        # Define the parameters
        inp_data = np.random.random_sample(inputs * batch)
        hidden_weight_data = np.random.randn(
            inputs * features) / np.sqrt(inputs)
        hidden_bias_data = np.random.random_sample(features)
        sm_weight_data = np.random.randn(
            features * classes) / np.sqrt(features)
        sm_bias_data = np.random.random_sample(classes)

        # special care for labels since they need to be normalized per batch
        label_data = np.random.random(batch * classes).reshape(
            (batch, classes))
        s = label_data.sum(axis=1)
        label_data /= s[:, None]

        with self.test_session(use_gpu=True):
            # We treat the inputs as "parameters" here
            inp = constant_op.constant(inp_data.tolist(),
                                       shape=[batch, inputs],
                                       dtype=dtypes.float64,
                                       name="inp")
            hidden_weight = constant_op.constant(hidden_weight_data.tolist(),
                                                 shape=[inputs, features],
                                                 dtype=dtypes.float64,
                                                 name="hidden_weight")
            hidden_bias = constant_op.constant(hidden_bias_data.tolist(),
                                               shape=[features],
                                               dtype=dtypes.float64,
                                               name="hidden_bias")
            softmax_weight = constant_op.constant(sm_weight_data.tolist(),
                                                  shape=[features, classes],
                                                  dtype=dtypes.float64,
                                                  name="softmax_weight")
            softmax_bias = constant_op.constant(sm_bias_data.tolist(),
                                                shape=[classes],
                                                dtype=dtypes.float64,
                                                name="softmax_bias")

            # List all the parameters so that we can test them one at a time
            all_params = [
                inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias
            ]
            param_sizes = [
                [batch, inputs],  # inp
                [inputs, features],  # hidden_weight,
                [features],  # hidden_bias
                [features, classes],  # softmax_weight,
                [classes]
            ]  # softmax_bias

            # Now, build the mini MNIST network
            features = nn_ops.relu(nn_ops.xw_plus_b(inp, hidden_weight,
                                                    hidden_bias),
                                   name="features")
            logits = nn_ops.xw_plus_b(features,
                                      softmax_weight,
                                      softmax_bias,
                                      name="logits")
            labels = constant_op.constant(label_data.tolist(),
                                          shape=[batch, classes],
                                          dtype=dtypes.float64,
                                          name="labels")
            cost = nn_ops.softmax_cross_entropy_with_logits(labels=labels,
                                                            logits=logits,
                                                            name="cost")

            # Test the gradients.
            err = gradient_checker.compute_gradient_error(
                all_params[param_index],
                param_sizes[param_index],
                cost, [batch],
                delta=1e-5)

        tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
        return err
Example #33
    def __init__(self, sess, config, data_feed, log_dir):

        vocab_size = len(data_feed.vocab)
        self.data_feed = data_feed

        with tf.name_scope("io"):
            self.inputs = tf.placeholder(dtype=tf.int32,
                                         shape=(None, None),
                                         name="input_seq")
            self.input_lens = tf.placeholder(dtype=tf.int32,
                                             shape=(None, ),
                                             name="seq_len")
            self.da_labels = tf.placeholder(dtype=tf.int32,
                                            shape=(None, ),
                                            name="dialog_acts")
            self.senti_labels = tf.placeholder(
                dtype=tf.float32,
                shape=(None, data_feed.feature_size[data_feed.SENTI_ID]),
                name="sentiments")

            self.learning_rate = tf.Variable(float(config.init_lr),
                                             trainable=False)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * config.lr_decay)

        max_sent_len = array_ops.shape(self.inputs)[1]
        batch_size = array_ops.shape(self.inputs)[0]

        with variable_scope.variable_scope("word-embedding"):
            embedding = tf.get_variable("embedding",
                                        [vocab_size, config.embed_size],
                                        dtype=tf.float32)
            input_embedding = embedding_ops.embedding_lookup(
                embedding,
                tf.squeeze(tf.reshape(self.inputs, [-1, 1]), squeeze_dims=[1]))

            input_embedding = tf.reshape(input_embedding,
                                         [-1, max_sent_len, config.embed_size])

        with variable_scope.variable_scope("rnn"):
            if config.cell_type == "gru":
                cell = rnn_cell.GRUCell(config.cell_size)
            elif config.cell_type == "lstm":
                cell = rnn_cell.LSTMCell(config.cell_size,
                                         use_peepholes=False,
                                         forget_bias=1.0)
            elif config.cell_type == "rnn":
                cell = rnn_cell.BasicRNNCell(config.cell_size)
            else:
                raise ValueError("unknown RNN type")

            if config.keep_prob < 1.0:
                cell = rnn_cell.DropoutWrapper(
                    cell,
                    output_keep_prob=config.keep_prob,
                    input_keep_prob=config.keep_prob)

            if config.num_layer > 1:
                cell = rnn_cell.MultiRNNCell([cell] * config.num_layer,
                                             state_is_tuple=True)

            # enc_last_state will be the same as the true last state
            outputs, _ = tf.nn.dynamic_rnn(
                cell,
                input_embedding,
                dtype=tf.float32,
                sequence_length=self.input_lens,
            )
            # get the TRUE last outputs
            last_outputs = tf.reduce_sum(
                tf.mul(
                    outputs,
                    tf.expand_dims(
                        tf.one_hot(self.input_lens - 1, max_sent_len), -1)), 1)

            self.dialog_acts = self.fnn(
                last_outputs, data_feed.feature_size[data_feed.DA_ID], [100],
                "dialog_act_fnn")
            self.sentiments = self.fnn(
                last_outputs, data_feed.feature_size[data_feed.SENTI_ID],
                [100], "setiment_fnn")

        self.loss = tf.reduce_sum(nn_ops.sparse_softmax_cross_entropy_with_logits(self.dialog_acts, self.da_labels)) \
                    + tf.reduce_sum(nn_ops.softmax_cross_entropy_with_logits(self.sentiments, self.senti_labels))
        self.loss /= tf.to_float(batch_size)

        tf.scalar_summary("entropy_loss", self.loss)
        self.summary_op = tf.merge_all_summaries()

        # weight decay
        tvars = tf.trainable_variables()
        for v in tvars:
            print("Trainable %s" % v.name)
        # optimization
        if config.op == "adam":
            print("Use Adam")
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
        elif config.op == "rmsprop":
            print("Use RMSProp")
            optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        else:
            print("Use SGD")
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)

        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                          config.grad_clip)
        self.train_ops = optimizer.apply_gradients(zip(grads, tvars))
        self.saver = tf.train.Saver(tf.all_variables(),
                                    write_version=tf.train.SaverDef.V2)

        if log_dir is not None:
            train_log_dir = os.path.join(log_dir, "train")
            print("Save summary to %s" % log_dir)
            self.train_summary_writer = tf.train.SummaryWriter(
                train_log_dir, sess.graph)