Example #1
    def testSampledSoftmaxLoss(self):
        # A simple test to verify the numerics.

        def _SoftmaxCrossEntropyWithLogits(logits, targets):
            # logits, targets: float arrays of the same shape.
            assert logits.shape == targets.shape
            stable_exp_logits = np.exp(logits - np.amax(logits, axis=1, keepdims=True))
            pred = stable_exp_logits / np.sum(stable_exp_logits, 1, keepdims=True)
            return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)

        weights, biases, hidden_acts, sharded_weights = self._GenerateTestInputs()
        labels = [0, 1, 2]
        true_w, true_b = weights[labels], biases[labels]
        sampled = [1, 0, 2, 3]
        num_sampled = len(sampled)
        true_exp = np.full([self._batch_size, 1], fill_value=0.5, dtype=np.float32)
        sampled_exp = np.full([num_sampled], fill_value=0.5, dtype=np.float32)
        sampled_w, sampled_b = weights[sampled], biases[sampled]
        test_sampled_vals = (sampled, true_exp, sampled_exp)

        with self.test_session():
            logits_np, labels_np = self._ComputeSampledLogitsNP(
                true_w, true_b, sampled_w, sampled_b, hidden_acts, true_expected=true_exp, sampled_expected=sampled_exp
            )
            sampled_softmax_loss_np = _SoftmaxCrossEntropyWithLogits(logits_np, labels_np)

            labels_tf = constant_op.constant(labels, shape=(self._batch_size, 1))
            weights_tf = constant_op.constant(weights)
            biases_tf = constant_op.constant(biases)
            inputs_tf = constant_op.constant(hidden_acts)

            sampled_softmax_loss_tf = nn.sampled_softmax_loss(
                weights_tf,
                biases_tf,
                inputs_tf,
                labels_tf,
                num_sampled=1,
                num_classes=self._num_classes,
                num_true=1,
                sampled_values=test_sampled_vals,
                remove_accidental_hits=False,
            )

            self.assertAllClose(sampled_softmax_loss_np, sampled_softmax_loss_tf.eval(), 1e-4)

            # Test with sharded weights
            sampled_softmax_loss_tf = nn.sampled_softmax_loss(
                [constant_op.constant(shard) for shard in sharded_weights],
                biases_tf,
                inputs_tf,
                labels_tf,
                num_sampled=1,
                num_classes=self._num_classes,
                num_true=1,
                sampled_values=test_sampled_vals,
                remove_accidental_hits=False,
            )

            self.assertAllClose(sampled_softmax_loss_np, sampled_softmax_loss_tf.eval(), 1e-4)
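For reference, the `_SoftmaxCrossEntropyWithLogits` helper above is plain NumPy and can be run on its own. A minimal, self-contained sketch with made-up toy logits and one-hot targets (illustrative only, not part of the test):

```python
import numpy as np

def softmax_cross_entropy_with_logits(logits, targets):
    # Numerically stable softmax: subtract the row-wise max before exponentiating.
    stable_exp = np.exp(logits - np.amax(logits, axis=1, keepdims=True))
    pred = stable_exp / np.sum(stable_exp, axis=1, keepdims=True)
    # Per-example cross-entropy; the small epsilon guards against log(0).
    return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)

logits = np.array([[2.0, 1.0, 0.1],
                   [0.5, 2.5, 0.3]], dtype=np.float32)
targets = np.array([[1.0, 0.0, 0.0],
                    [0.0, 1.0, 0.0]], dtype=np.float32)
print(softmax_cross_entropy_with_logits(logits, targets))
# Each entry equals -log of the softmax probability assigned to the target class.
```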
Example #2
  def _testCompareWithNN(self, weights, biases, partition_strategy):
    with ops.Graph().as_default():
      loss = sampling_ops.rank_sampled_softmax_loss(
          weights=weights(),
          biases=biases(),
          labels=self._labels(),
          inputs=self._inputs(),
          num_sampled=self._num_sampled,
          num_resampled=self._num_resampled,
          num_classes=self._num_classes,
          num_true=self._num_true,
          sampled_values=self._sampled_values,
          resampling_temperature=1.,
          remove_accidental_hits=self._remove_accidental_hits,
          partition_strategy=partition_strategy)
      loss_nn = nn.sampled_softmax_loss(
          weights=weights(),
          biases=biases(),
          labels=self._labels(),
          inputs=self._inputs(),
          num_sampled=self._num_resampled,
          num_classes=self._num_classes,
          num_true=self._num_true,
          sampled_values=self._resampled_values,
          remove_accidental_hits=self._remove_accidental_hits,
          partition_strategy=partition_strategy)
      with self.cached_session() as sess:
        loss_val = sess.run(loss)
        loss_nn_val = sess.run(loss_nn)

    self.assertAllClose(loss_val, loss_nn_val)
Example #3
  def _testCompareWithNN(self, weights, biases, partition_strategy):
    with ops.Graph().as_default():
      loss = sampling_ops.rank_sampled_softmax_loss(
          weights=weights(),
          biases=biases(),
          labels=self._labels(),
          inputs=self._inputs(),
          num_sampled=self._num_sampled,
          num_resampled=self._num_resampled,
          num_classes=self._num_classes,
          num_true=self._num_true,
          sampled_values=self._sampled_values,
          resampling_temperature=1.,
          remove_accidental_hits=self._remove_accidental_hits,
          partition_strategy=partition_strategy)
      loss_nn = nn.sampled_softmax_loss(
          weights=weights(),
          biases=biases(),
          labels=self._labels(),
          inputs=self._inputs(),
          num_sampled=self._num_resampled,
          num_classes=self._num_classes,
          num_true=self._num_true,
          sampled_values=self._resampled_values,
          remove_accidental_hits=self._remove_accidental_hits,
          partition_strategy=partition_strategy)
      with self.test_session() as sess:
        loss_val = sess.run(loss)
        loss_nn_val = sess.run(loss_nn)

    self.assertAllClose(loss_val, loss_nn_val)
Example #4
    def testSampledSoftmaxLoss(self):
        # A simple test to verify the numerics.

        def _SoftmaxCrossEntropyWithLogits(logits, targets):
            # logits, targets: float arrays of the same shape.
            assert logits.shape == targets.shape
            stable_exp_logits = np.exp(logits -
                                       np.amax(logits, axis=1, keepdims=True))
            pred = stable_exp_logits / np.sum(
                stable_exp_logits, 1, keepdims=True)
            return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)

        weights, biases, hidden_acts = self._GenerateTestInputs()
        labels = [0, 1, 2]
        true_w, true_b = weights[labels], biases[labels]
        sampled = [1, 0, 2, 3]
        num_sampled = len(sampled)
        true_exp = np.full([self._batch_size, 1],
                           fill_value=0.5,
                           dtype=np.float32)
        sampled_exp = np.full([num_sampled], fill_value=0.5, dtype=np.float32)
        sampled_w, sampled_b = weights[sampled], biases[sampled]
        test_sampled_vals = (sampled, true_exp, sampled_exp)

        with self.test_session():
            logits_np, labels_np = self._ComputeSampledLogitsNP(
                true_w,
                true_b,
                sampled_w,
                sampled_b,
                hidden_acts,
                true_expected=true_exp,
                sampled_expected=sampled_exp)
            sampled_softmax_loss_np = _SoftmaxCrossEntropyWithLogits(
                logits_np, labels_np)

            labels_tf = constant_op.constant(labels,
                                             shape=(self._batch_size, 1))
            weights_tf = constant_op.constant(weights)
            biases_tf = constant_op.constant(biases)
            inputs_tf = constant_op.constant(hidden_acts)

            sampled_softmax_loss_tf = nn.sampled_softmax_loss(
                weights_tf,
                biases_tf,
                inputs_tf,
                labels_tf,
                num_sampled=1,
                num_classes=self._num_classes,
                num_true=1,
                sampled_values=test_sampled_vals,
                remove_accidental_hits=False)

            self.assertAllClose(sampled_softmax_loss_np,
                                sampled_softmax_loss_tf.eval(), 1e-4)
Example #5
  def _testCompareWithNNTemperature(self, temperature, resampled):
    weights = [[1., 2.], [3., 4.]]  # two sampled classes
    inputs = [[6., -5. / 2.], [-11., 21. / 2.]]
    # Let w0, w1 = weights of sampled classes (biases set to 0 for simplicity)
    # Let x0, x1 = inputs
    # logits:
    #   w0.x0 = 1
    #   w0.x1 = 10
    #   w1.x0 = 8
    #   w1.x1 = 9
    # Resampling 1 class with temperature = t will pick the larger of:
    #   exp(1/t) + exp(10/t)  ==> w0, for values of t < 2.12
    #   exp(8/t) + exp(9/t)   ==> w1, for values of t > 2.13
    num_sampled = 2
    num_resampled = 1
    num_classes = 2
    num_true = 1
    sampled_values = [0, 1], [[1.], [1.]], [1., 1.]
    resampled_values = [resampled], [[1.], [1.]], [1.]
    remove_accidental_hits = False
    with ops.Graph().as_default():
      weights = constant_op.constant(weights)
      biases = constant_op.constant([0., 0.])
      labels = constant_op.constant([[0], [1]], dtype=dtypes.int64)
      inputs = constant_op.constant(inputs)
      loss = sampling_ops.rank_sampled_softmax_loss(
          weights=weights,
          biases=biases,
          labels=labels,
          inputs=inputs,
          num_sampled=num_sampled,
          num_resampled=num_resampled,
          num_classes=num_classes,
          num_true=num_true,
          sampled_values=sampled_values,
          resampling_temperature=constant_op.constant(temperature),
          remove_accidental_hits=remove_accidental_hits,
          partition_strategy='div')
      loss_nn = nn.sampled_softmax_loss(
          weights=weights,
          biases=biases,
          labels=labels,
          inputs=inputs,
          num_sampled=num_resampled,
          num_classes=num_classes,
          num_true=num_true,
          sampled_values=resampled_values,
          remove_accidental_hits=remove_accidental_hits,
          partition_strategy='div')
      with self.cached_session() as sess:
        loss_val = sess.run(loss)
        loss_nn_val = sess.run(loss_nn)

    self.assertAllClose(loss_val, loss_nn_val)
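To see where the comment's crossover between t ≈ 2.12 and t ≈ 2.13 comes from, each sampled class is scored by LogSumExp over the batch of its logits divided by the temperature. A small NumPy check using the logits listed in the comment (a sketch for illustration, not part of the test):

```python
import numpy as np

# Logits of the two sampled classes against the two inputs, per the comment:
#   w0.x0 = 1, w0.x1 = 10, w1.x0 = 8, w1.x1 = 9
logits = np.array([[1., 8.],    # input x0
                   [10., 9.]])  # input x1

def resampling_scores(t):
    # Score per sampled class: LogSumExp over the batch of logits / t.
    return np.log(np.sum(np.exp(logits / t), axis=0))

for t in (1.0, 2.0, 2.12, 2.13, 3.0):
    s0, s1 = resampling_scores(t)
    print("t=%.2f  score(w0)=%.3f  score(w1)=%.3f  keep %s"
          % (t, s0, s1, "w0" if s0 > s1 else "w1"))
# Prints "keep w0" for t <= 2.12 and "keep w1" for t >= 2.13.
```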
Example #6
  def _testCompareWithNNTemperature(self, temperature, resampled):
    weights = [[1., 2.], [3., 4.]]  # two sampled classes
    inputs = [[6., -5. / 2.], [-11., 21. / 2.]]
    # Let w0, w1 = weights of sampled classes (biases set to 0 for simplicity)
    # Let x0, x1 = inputs
    # logits:
    #   w0.x0 = 1
    #   w0.x1 = 10
    #   w1.x0 = 8
    #   w1.x1 = 9
    # Resampling 1 class with temperature = t will pick the larger of:
    #   exp(1/t) + exp(10/t)  ==> w0, for values of t < 2.12
    #   exp(8/t) + exp(9/t)   ==> w1, for values of t > 2.13
    num_sampled = 2
    num_resampled = 1
    num_classes = 2
    num_true = 1
    sampled_values = [0, 1], [[1.], [1.]], [1., 1.]
    resampled_values = [resampled], [[1.], [1.]], [1.]
    remove_accidental_hits = False
    with ops.Graph().as_default():
      weights = constant_op.constant(weights)
      biases = constant_op.constant([0., 0.])
      labels = constant_op.constant([[0], [1]], dtype=dtypes.int64)
      inputs = constant_op.constant(inputs)
      loss = sampling_ops.rank_sampled_softmax_loss(
          weights=weights,
          biases=biases,
          labels=labels,
          inputs=inputs,
          num_sampled=num_sampled,
          num_resampled=num_resampled,
          num_classes=num_classes,
          num_true=num_true,
          sampled_values=sampled_values,
          resampling_temperature=constant_op.constant(temperature),
          remove_accidental_hits=remove_accidental_hits,
          partition_strategy='div')
      loss_nn = nn.sampled_softmax_loss(
          weights=weights,
          biases=biases,
          labels=labels,
          inputs=inputs,
          num_sampled=num_resampled,
          num_classes=num_classes,
          num_true=num_true,
          sampled_values=resampled_values,
          remove_accidental_hits=remove_accidental_hits,
          partition_strategy='div')
      with self.test_session() as sess:
        loss_val = sess.run(loss)
        loss_nn_val = sess.run(loss_nn)

    self.assertAllClose(loss_val, loss_nn_val)
Example #7
def rank_sampled_softmax_loss(weights,
                              biases,
                              labels,
                              inputs,
                              num_sampled,
                              num_resampled,
                              num_classes,
                              num_true,
                              sampled_values,
                              resampling_temperature,
                              remove_accidental_hits,
                              partition_strategy,
                              name=None):
  """Computes softmax loss using rank-based adaptive resampling.

  This has been shown to improve rank loss after training compared to
  `tf.nn.sampled_softmax_loss`. For a description of the algorithm and some
  experimental results, please see: [TAPAS: Two-pass Approximate Adaptive
  Sampling for Softmax](https://arxiv.org/abs/1707.03073).

  Sampling follows two phases:
  * In the first phase, `num_sampled` classes are selected using
    `tf.nn.learned_unigram_candidate_sampler` or supplied `sampled_values`.
    The logits are calculated on those sampled classes. This phase is
    similar to `tf.nn.sampled_softmax_loss`.
  * In the second phase, the `num_resampled` classes with highest predicted
    probability are kept. Probabilities are
    `LogSumExp(logits / resampling_temperature)`, where the sum is over
    `inputs`.

  The `resampling_temperature` parameter controls the "adaptiveness" of the
  resampling. At lower temperatures, resampling is more adaptive because it
  picks more candidates close to the predicted classes. A common strategy is
  to decrease the temperature as training proceeds.

  See `tf.nn.sampled_softmax_loss` for more documentation on sampling and
  for typical default values for some of the parameters.

  This operation is for training only. It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = rank_sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  Args:
    weights: A `Tensor` or `PartitionedVariable` of shape `[num_classes, dim]`,
        or a list of `Tensor` objects whose concatenation along dimension 0
        has shape [num_classes, dim]. The (possibly-sharded) class embeddings.
    biases: A `Tensor` or `PartitionedVariable` of shape `[num_classes]`.
        The (possibly-sharded) class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes. Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
        activations of the input network.
    num_sampled: An `int`. The number of classes to randomly sample per batch.
    num_resampled: An `int`. The number of classes to select from the
        `num_sampled` classes using the adaptive resampling algorithm. Must be
        less than `num_sampled`.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        If None, default to `nn.learned_unigram_candidate_sampler`.
    resampling_temperature: A scalar `Tensor` with the temperature parameter
        for the adaptive resampling algorithm.
    remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
        where a sampled class equals one of the target classes.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  Raises:
    ValueError: If `num_sampled <= num_resampled`.
  """
  if num_sampled > num_classes:
    raise ValueError("num_sampled ({}) cannot be greater than num_classes ({})".
                     format(num_sampled, num_classes))
  if num_sampled <= num_resampled:
    raise ValueError("num_resampled ({}) must be less than num_sampled ({})".
                     format(num_resampled, num_sampled))
  if partition_strategy not in ("div", "mod"):
    raise ValueError(
        "unsupported partition_strategy ({})".format(partition_strategy))
  with ops.name_scope(name, "rank_sampled_softmax_loss", [
      weights, biases, labels, inputs, sampled_values, resampling_temperature
  ]) as name:
    if not sampled_values:
      sampled_values = nn.learned_unigram_candidate_sampler(
          true_classes=labels,
          num_true=num_true,
          num_sampled=num_sampled,
          unique=True,
          range_max=num_classes)
    # From sampled_values, select the top num_resampled values using the
    # adaptive rank resampling strategy.
    resampled_values = _rank_resample(weights, biases, inputs, sampled_values,
                                      num_resampled, resampling_temperature,
                                      partition_strategy)
    return nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        num_sampled=num_resampled,
        num_classes=num_classes,
        num_true=num_true,
        sampled_values=resampled_values,
        remove_accidental_hits=remove_accidental_hits,
        partition_strategy=partition_strategy,
        name=name)
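`_rank_resample` itself is not shown in this example. Per the docstring, it keeps the `num_resampled` sampled classes whose `LogSumExp(logits / resampling_temperature)` over the batch is highest. A rough NumPy sketch of that selection step under those assumptions (not the actual TensorFlow implementation):

```python
import numpy as np

def rank_resample_np(weights, biases, inputs, sampled, num_resampled, temperature):
    """Keep the num_resampled sampled classes with the highest
    LogSumExp(logits / temperature) score over the batch (sketch only)."""
    sampled = np.asarray(sampled)
    # Logits restricted to the sampled classes: [batch_size, num_sampled].
    sampled_logits = np.dot(inputs, weights[sampled].T) + biases[sampled]
    # Numerically stable LogSumExp over the batch dimension, per sampled class.
    z = sampled_logits / temperature
    m = z.max(axis=0)
    scores = np.log(np.sum(np.exp(z - m), axis=0)) + m
    # Keep the indices of the top-scoring sampled classes.
    return sampled[np.argsort(-scores)[:num_resampled]]

# Toy values matching the temperature test above (hypothetical usage).
weights = np.array([[1., 2.], [3., 4.]])
biases = np.zeros(2)
inputs = np.array([[6., -2.5], [-11., 10.5]])
print(rank_resample_np(weights, biases, inputs, [0, 1], 1, temperature=1.0))  # [0]
print(rank_resample_np(weights, biases, inputs, [0, 1], 1, temperature=5.0))  # [1]
```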
Example #8
def rank_sampled_softmax_loss(weights,
                              biases,
                              labels,
                              inputs,
                              num_sampled,
                              num_resampled,
                              num_classes,
                              num_true,
                              sampled_values,
                              resampling_temperature,
                              remove_accidental_hits,
                              partition_strategy,
                              name=None):
    """Computes softmax loss using rank-based adaptive resampling.

  This has been shown to improve rank loss after training compared to
  `tf.nn.sampled_softmax_loss`. For a description of the algorithm and some
  experimental results, please see: [TAPAS: Two-pass Approximate Adaptive
  Sampling for Softmax](https://arxiv.org/abs/1707.03073).

  Sampling follows two phases:
  * In the first phase, `num_sampled` classes are selected using
    `tf.nn.learned_unigram_candidate_sampler` or supplied `sampled_values`.
    The logits are calculated on those sampled classes. This phase is
    similar to `tf.nn.sampled_softmax_loss`.
  * In the second phase, the `num_resampled` classes with highest predicted
    probability are kept. Probabilities are
    `LogSumExp(logits / resampling_temperature)`, where the sum is over
    `inputs`.

  The `resampling_temperature` parameter controls the "adaptiveness" of the
  resampling. At lower temperatures, resampling is more adaptive because it
  picks more candidates close to the predicted classes. A common strategy is
  to decrease the temperature as training proceeds.

  See `tf.nn.sampled_softmax_loss` for more documentation on sampling and
  for typical default values for some of the parameters.

  This operation is for training only. It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the full
  softmax loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = rank_sampled_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    labels_one_hot = tf.one_hot(labels, n_classes)
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits)
  ```

  Args:
    weights: A `Tensor` or `PartitionedVariable` of shape `[num_classes, dim]`,
        or a list of `Tensor` objects whose concatenation along dimension 0
        has shape [num_classes, dim]. The (possibly-sharded) class embeddings.
    biases: A `Tensor` or `PartitionedVariable` of shape `[num_classes]`.
        The (possibly-sharded) class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes. Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`. The forward
        activations of the input network.
    num_sampled: An `int`. The number of classes to randomly sample per batch.
    num_resampled: An `int`. The number of classes to select from the
        `num_sampled` classes using the adaptive resampling algorithm. Must be
        less than `num_sampled`.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        If None, default to `nn.learned_unigram_candidate_sampler`.
    resampling_temperature: A scalar `Tensor` with the temperature parameter
        for the adaptive resampling algorithm.
    remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
        where a sampled class equals one of the target classes.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.

  Raises:
    ValueError: If `num_sampled <= num_resampled`.
  """
    if num_sampled > num_classes:
        raise ValueError(
            "num_sampled ({}) cannot be greater than num_classes ({})".format(
                num_sampled, num_classes))
    if num_sampled <= num_resampled:
        raise ValueError(
            "num_resampled ({}) must be less than num_sampled ({})".format(
                num_resampled, num_sampled))
    if partition_strategy not in ("div", "mod"):
        raise ValueError(
            "unsupported partition_strategy ({})".format(partition_strategy))
    with ops.name_scope(name, "rank_sampled_softmax_loss", [
            weights, biases, labels, inputs, sampled_values,
            resampling_temperature
    ]) as name:
        if not sampled_values:
            sampled_values = nn.learned_unigram_candidate_sampler(
                true_classes=labels,
                num_true=num_true,
                num_sampled=num_sampled,
                unique=True,
                range_max=num_classes)
        # From sampled_values, select the top num_resampled values using the
        # adaptive rank resampling strategy.
        resampled_values = _rank_resample(weights, biases, inputs,
                                          sampled_values, num_resampled,
                                          resampling_temperature,
                                          partition_strategy)
        return nn.sampled_softmax_loss(
            weights=weights,
            biases=biases,
            labels=labels,
            inputs=inputs,
            num_sampled=num_resampled,
            num_classes=num_classes,
            num_true=num_true,
            sampled_values=resampled_values,
            remove_accidental_hits=remove_accidental_hits,
            partition_strategy=partition_strategy,
            name=name)
Example #10
def dnn_sampled_softmax_classifier_model_fn(features, target_indices,
                                            mode, params):
  """model_fn that uses candidate sampling.

  Args:
    features: Single Tensor or dict of Tensor (depends on data passed to `fit`)
    target_indices: A single Tensor of shape [batch_size, n_labels] containing
      the target indices.
    mode: Represents if this is training, evaluation or prediction. See `ModeKeys`.
    params: A dict of hyperparameters that are listed below.
      hidden_units- List of hidden units per layer. All layers are fully
        connected. Ex. `[64, 32]` means first layer has 64 nodes and second one
        has 32.
      feature_columns- An iterable containing all the feature columns used by
        the model. All items in the set should be instances of classes derived
        from `FeatureColumn`.
      n_classes- number of target classes. It must be greater than 2.
      n_samples- number of sample target classes. Needs to be tuned - A good
        starting point could be 2% of n_classes.
      n_labels- number of labels in each example.
      top_k- The number of classes to predict.
      optimizer- An instance of `tf.Optimizer` used to train the model. If
        `None`, will use an Adagrad optimizer.
      dropout- When not `None`, the probability we will drop out a given
        coordinate.
      gradient_clip_norm- A float > 0. If provided, gradients are
        clipped to their global norm with this clipping ratio. See
        tf.clip_by_global_norm for more details.
      num_ps_replicas- The number of parameter server replicas.

  Returns:
    predictions: A single Tensor or a dict of Tensors.
    loss: A scalar containing the loss of the step.
    train_op: The op for training.
  """

  hidden_units = params["hidden_units"]
  feature_columns = params["feature_columns"]
  n_classes = params["n_classes"]
  n_samples = params["n_samples"]
  n_labels = params["n_labels"]
  top_k = params["top_k"]
  optimizer = params["optimizer"]
  dropout = params["dropout"]
  gradient_clip_norm = params["gradient_clip_norm"]
  num_ps_replicas = params["num_ps_replicas"]

  parent_scope = "dnn_ss"

  # Setup the input layer partitioner.
  input_layer_partitioner = (
      partitioned_variables.min_max_variable_partitioner(
          max_partitions=num_ps_replicas,
          min_slice_size=64 << 20))

  # Create the input layer.
  with variable_scope.variable_scope(
      parent_scope + "/input_from_feature_columns",
      features.values(),
      partitioner=input_layer_partitioner) as scope:
    net = layers.input_from_feature_columns(
        features,
        feature_columns,
        weight_collections=[parent_scope],
        scope=scope)

  # Setup the hidden layer partitioner.
  hidden_layer_partitioner = (
      partitioned_variables.min_max_variable_partitioner(
          max_partitions=num_ps_replicas))

  final_hidden_layer_dim = None
  # Create hidden layers using fully_connected.
  for layer_id, num_hidden_units in enumerate(hidden_units):
    with variable_scope.variable_scope(
        parent_scope + "/hiddenlayer_%d" % layer_id, [net],
        partitioner=hidden_layer_partitioner) as scope:
      net = layers.fully_connected(net,
                                   num_hidden_units,
                                   variables_collections=[parent_scope],
                                   scope=scope)
      final_hidden_layer_dim = num_hidden_units
      # Add dropout if it is enabled.
      if dropout is not None and mode == estimator.ModeKeys.TRAIN:
        net = layers.dropout(net, keep_prob=(1.0 - dropout))

  # Create the weights and biases for the logit layer.
  with variable_scope.variable_scope(
      parent_scope + "/logits", [net],
      partitioner=hidden_layer_partitioner) as scope:
    dtype = net.dtype.base_dtype
    weights_shape = [n_classes, final_hidden_layer_dim]
    weights = variables.model_variable(
        "weights",
        shape=weights_shape,
        dtype=dtype,
        initializer=initializers.xavier_initializer(),
        trainable=True,
        collections=[parent_scope])
    biases = variables.model_variable(
        "biases",
        shape=[n_classes,],
        dtype=dtype,
        initializer=init_ops.zeros_initializer,
        trainable=True,
        collections=[parent_scope])

  if mode == estimator.ModeKeys.TRAIN:
    # Call the candidate sampling APIs and calculate the loss.
    sampled_values = nn.learned_unigram_candidate_sampler(
        true_classes=math_ops.to_int64(target_indices),
        num_true=n_labels,
        num_sampled=n_samples,
        unique=True,
        range_max=n_classes)

    sampled_softmax_loss = nn.sampled_softmax_loss(
        weights=weights,
        biases=biases,
        inputs=net,
        labels=math_ops.to_int64(target_indices),
        num_sampled=n_samples,
        num_classes=n_classes,
        num_true=n_labels,
        sampled_values=sampled_values)

    loss = math_ops.reduce_mean(sampled_softmax_loss, name="loss")

    train_op = optimizers.optimize_loss(
        loss=loss, global_step=contrib_framework.get_global_step(),
        learning_rate=_DEFAULT_LEARNING_RATE,
        optimizer=_get_optimizer(optimizer), clip_gradients=gradient_clip_norm,
        name=parent_scope)
    return None, loss, train_op

  elif mode == estimator.ModeKeys.EVAL:
    logits = nn.bias_add(standard_ops.matmul(net, array_ops.transpose(weights)),
                         biases)
    predictions = {}
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)
    _, predictions[_TOP_K] = nn.top_k(logits, top_k)

    # Since the targets have multiple labels, setup the target probabilities
    # as 1.0/n_labels for each of the labels.
    target_one_hot = array_ops.one_hot(
        indices=target_indices,
        depth=n_classes,
        on_value=1.0 / n_labels)
    target_one_hot = math_ops.reduce_sum(
        input_tensor=target_one_hot,
        reduction_indices=[1])

    loss = math_ops.reduce_mean(
        nn.softmax_cross_entropy_with_logits(logits, target_one_hot))

    return predictions, loss, None

  elif mode == estimator.ModeKeys.INFER:
    logits = nn.bias_add(standard_ops.matmul(net, array_ops.transpose(weights)),
                         biases)
    predictions = {}
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)
    _, predictions[_TOP_K] = nn.top_k(logits, top_k)

    return predictions, None, None
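In the EVAL branch above, the multi-label targets are spread as a probability of 1.0/n_labels over each example's labels: a one-hot tensor with on_value = 1.0/n_labels is built per label and then summed over the label axis. A small NumPy sketch of that construction with made-up indices (illustrative only):

```python
import numpy as np

def multilabel_targets(target_indices, n_classes, n_labels):
    # One-hot encode each of the n_labels indices with on_value = 1/n_labels,
    # then sum over the label axis so each row sums to 1.0.
    batch_size = len(target_indices)
    one_hot = np.zeros((batch_size, n_labels, n_classes), dtype=np.float32)
    for i, labels in enumerate(target_indices):
        for j, cls in enumerate(labels):
            one_hot[i, j, cls] = 1.0 / n_labels
    return one_hot.sum(axis=1)

# Two examples, each with 2 labels out of 5 classes (hypothetical values).
print(multilabel_targets([[0, 3], [1, 4]], n_classes=5, n_labels=2))
# [[0.5 0.  0.  0.5 0. ]
#  [0.  0.5 0.  0.  0.5]]
```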