def evaluate_multiclass(self, predictions, weights):
  """Evaluates the multiclass zero-one loss on the given predictions.

  Given a rank-2 `Tensor` of predictions with shape (n, k), where n is the
  number of examples and k is the number of classes, and another rank-2
  `Tensor` of weights with shape (m, k), where m is broadcastable to n, this
  method will return a `Tensor` of shape (n,) where the ith element is:

  ```python
  maximum_prediction[i] = max(predictions[i, :])
  maximum_weights[i] = [
      weights[i, j] for j in range(k)
      if predictions[i, j] >= maximum_prediction[i]]
  zero_one_loss[i] = sum(maximum_weights[i]) / len(maximum_weights[i])
  ```

  Args:
    predictions: a `Tensor` of shape (n, k), where n is the number of
      examples and k is the number of classes.
    weights: a `Tensor` of shape (m, k), where m is broadcastable to n. This
      `Tensor` is *not* necessarily non-negative.

  Returns:
    A `Tensor` of shape (n,) and dtype=predictions.dtype, containing the
    zero-one losses for each example.

  Raises:
    TypeError: if "predictions" is not a floating-point `Tensor`, or
      "weights" is not a `Tensor`.
    ValueError: if "predictions" and "weights" have different numbers of
      columns (i.e. if the number of classes is inconsistent).
  """
  num_classes = helpers.get_num_columns_of_2d_tensor(
      predictions, name="multiclass predictions")
  weights_num_classes = helpers.get_num_columns_of_2d_tensor(
      weights, name="weights")
  if weights_num_classes != num_classes:
    raise ValueError(
        "weights must have the same number of columns as "
        "predictions ({} vs. {}): did you specify num_classes "
        "correctly when you created your context?".format(
            weights_num_classes, num_classes))

  dtype = predictions.dtype.base_dtype
  if not dtype.is_floating:
    raise TypeError("multiclass predictions must be floating-point")

  thresholded_predictions = tf.cast(
      predictions >= tf.reduce_max(predictions, axis=1, keepdims=True),
      dtype=dtype)
  thresholded_predictions /= tf.reduce_sum(
      thresholded_predictions, axis=1, keepdims=True)

  return tf.reduce_sum(
      tf.cast(weights, dtype=dtype) * thresholded_predictions, axis=1)
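
# A minimal usage sketch (not part of the library), assuming a hypothetical
# instance `zero_one_loss` of the class defining evaluate_multiclass() above,
# evaluated eagerly:
#
#   predictions = tf.constant([[1.0, 3.0, 3.0],
#                              [2.0, 0.0, -1.0]])
#   weights = tf.constant([[0.0, 1.0, 0.0],
#                          [0.25, 0.2, 0.9]])
#   # Row 0 ties at 3.0 in columns 1 and 2, so the loss averages those two
#   # weights: (1.0 + 0.0) / 2 = 0.5. Row 1's unique argmax is column 0,
#   # giving weights[1, 0] = 0.25.
#   zero_one_loss.evaluate_multiclass(predictions, weights)  # ~[0.5, 0.25]
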
def evaluate_multiclass(self, predictions, weights):
  """Evaluates the multiclass softmax loss on the given predictions.

  Given a rank-2 `Tensor` of predictions with shape (n, k), where n is the
  number of examples and k is the number of classes, and another rank-2
  `Tensor` of weights with shape (m, k), where m is broadcastable to n, this
  method will return a `Tensor` of shape (n,) where the ith element is:

  ```python
  softmax_loss[i] = sum_j (
      weights[i, j] * (
          exp(predictions[i, j]) / sum_k exp(predictions[i, k])
      )
  )
  ```

  Args:
    predictions: a `Tensor` of shape (n, k), where n is the number of
      examples and k is the number of classes.
    weights: a `Tensor` of shape (m, k), where m is broadcastable to n. This
      `Tensor` is *not* necessarily non-negative.

  Returns:
    A `Tensor` of shape (n,) and dtype=predictions.dtype, containing the
    softmax losses for each example.

  Raises:
    TypeError: if "predictions" is not a floating-point `Tensor`, or
      "weights" is not a `Tensor`.
    ValueError: if "predictions" and "weights" have different numbers of
      columns (i.e. if the number of classes is inconsistent).
  """
  num_classes = helpers.get_num_columns_of_2d_tensor(
      predictions, name="multiclass predictions")
  weights_num_classes = helpers.get_num_columns_of_2d_tensor(
      weights, name="weights")
  if weights_num_classes != num_classes:
    raise ValueError(
        "weights must have the same number of columns as "
        "predictions ({} vs. {}): did you specify num_classes "
        "correctly when you created your context?".format(
            weights_num_classes, num_classes))

  dtype = predictions.dtype.base_dtype
  if not dtype.is_floating:
    raise TypeError("multiclass predictions must be floating-point")

  # Subtract the per-row maximum before exponentiating: the softmax is
  # shift-invariant, and this keeps the exponentials from overflowing.
  maximum_predictions = tf.reduce_max(predictions, axis=1, keepdims=True)
  numerators = tf.exp(predictions - maximum_predictions)
  denominators = tf.reduce_sum(numerators, axis=1, keepdims=True)
  probabilities = numerators / denominators

  weights = tf.cast(weights, dtype=dtype)
  return tf.reduce_sum(weights * probabilities, axis=1)
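
# A minimal usage sketch (not part of the library), assuming a hypothetical
# instance `softmax_loss` of the class defining evaluate_multiclass() above:
#
#   predictions = tf.constant([[0.0, 0.0],
#                              [10.0, -10.0]])
#   weights = tf.constant([[0.0, 1.0]])  # broadcast across both rows
#   # The loss is the expectation of the weights under the softmax
#   # distribution: row 0 is uniform, giving 0.5; row 1 puts almost all of
#   # its mass on column 0 (weight 0.0), giving ~0.0.
#   softmax_loss.evaluate_multiclass(predictions, weights)  # ~[0.5, 0.0]
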
def test_get_num_columns_of_2d_tensor(self):
  """Tests the "get_num_columns_of_2d_tensor" function."""
  # Trying to get the number of columns from a non-tensor should fail.
  with self.assertRaises(TypeError):
    _ = helpers.get_num_columns_of_2d_tensor([[1, 2], [3, 4]])

  # Trying to get the number of columns from a rank-1 tensor should fail.
  tensor = tf.convert_to_tensor([1, 2, 3, 4])
  with self.assertRaises(ValueError):
    _ = helpers.get_num_columns_of_2d_tensor(tensor)

  # Make sure that we successfully get the number of columns.
  tensor = tf.convert_to_tensor([[1, 2, 3]])
  self.assertEqual(3, helpers.get_num_columns_of_2d_tensor(tensor))
def evaluate_binary_classification(self, predictions, weights):
  """Evaluates the hinge loss on the given predictions.

  Given a rank-1 `Tensor` of predictions with shape (n,), where n is the
  number of examples, and a rank-2 `Tensor` of weights with shape (m, 2),
  where m is broadcastable to n, this method will return a `Tensor` of shape
  (n,) where the ith element is:

  ```python
  hinge_loss[i] = constant_weights[i] +
      (weights[i, 0] - constant_weights[i]) * max{0, margin + predictions[i]} +
      (weights[i, 1] - constant_weights[i]) * max{0, margin - predictions[i]}
  ```

  where constant_weights[i] = min{weights[i, 0], weights[i, 1]} contains the
  minimum weights.

  You can think of weights[:, 0] as being the per-example costs associated
  with making a positive prediction, and weights[:, 1] as those for a
  negative prediction.

  Args:
    predictions: a `Tensor` of shape (n,), where n is the number of examples.
    weights: a `Tensor` of shape (m, 2), where m is broadcastable to n. This
      `Tensor` is *not* necessarily non-negative.

  Returns:
    A `Tensor` of shape (n,) and dtype=predictions.dtype, containing the
    hinge losses for each example.

  Raises:
    TypeError: if "predictions" is not a floating-point `Tensor`, or
      "weights" is not a `Tensor`.
    ValueError: if "predictions" is not rank-1, or "weights" is not a rank-2
      `Tensor` with exactly two columns.
  """
  predictions = _convert_to_binary_classification_predictions(predictions)
  columns = helpers.get_num_columns_of_2d_tensor(weights, name="weights")
  if columns != 2:
    raise ValueError("weights must have two columns")

  dtype = predictions.dtype.base_dtype
  zero = tf.zeros(1, dtype=dtype)

  positive_weights = tf.cast(weights[:, 0], dtype=dtype)
  negative_weights = tf.cast(weights[:, 1], dtype=dtype)
  constant_weights = tf.minimum(positive_weights, negative_weights)
  positive_weights -= constant_weights
  negative_weights -= constant_weights

  is_positive = tf.maximum(zero, self._margin + predictions)
  is_negative = tf.maximum(zero, self._margin - predictions)

  return constant_weights + (
      positive_weights * is_positive + negative_weights * is_negative)
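
# A minimal usage sketch (not part of the library), assuming a hypothetical
# instance `hinge_loss` of the class above, constructed with margin=1.0:
#
#   predictions = tf.constant([2.0, 0.5, -1.0])
#   weights = tf.constant([[0.0, 1.0]])  # a positive prediction costs 0
#   # Here constant_weights = 0, so the loss reduces to the familiar
#   # max{0, 1 - predictions[i]} for a positive example.
#   hinge_loss.evaluate_binary_classification(predictions, weights)
#   # ~[0.0, 0.5, 2.0]
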
def evaluate_binary_classification(self, predictions, weights):
  """Evaluates the zero-one loss on the given predictions.

  Given a rank-1 `Tensor` of predictions with shape (n,), where n is the
  number of examples, and a rank-2 `Tensor` of weights with shape (m, 2),
  where m is broadcastable to n, this method will return a `Tensor` of shape
  (n,) where the ith element is:

  ```python
  zero_one_loss[i] =
      weights[i, 0] * 1{predictions[i] > 0} +
      0.5 * (weights[i, 0] + weights[i, 1]) * 1{predictions[i] == 0} +
      weights[i, 1] * 1{predictions[i] < 0}
  ```

  where 1{} is an indicator function.

  You can think of weights[:, 0] as being the per-example costs associated
  with making a positive prediction, and weights[:, 1] as those for a
  negative prediction.

  Args:
    predictions: a `Tensor` of shape (n,), where n is the number of examples.
    weights: a `Tensor` of shape (m, 2), where m is broadcastable to n. This
      `Tensor` is *not* necessarily non-negative.

  Returns:
    A `Tensor` of shape (n,) and dtype=predictions.dtype, containing the
    zero-one losses for each example.

  Raises:
    TypeError: if "predictions" is not a floating-point `Tensor`, or
      "weights" is not a `Tensor`.
    ValueError: if "predictions" is not rank-1, or "weights" is not a rank-2
      `Tensor` with exactly two columns.
  """
  predictions = _convert_to_binary_classification_predictions(predictions)
  columns = helpers.get_num_columns_of_2d_tensor(weights, name="weights")
  if columns != 2:
    raise ValueError("weights must have two columns")

  dtype = predictions.dtype.base_dtype
  positive_weights = tf.cast(weights[:, 0], dtype=dtype)
  negative_weights = tf.cast(weights[:, 1], dtype=dtype)

  # Expressing the loss in terms of sign(predictions) yields
  # positive_weights when the sign is +1, negative_weights when it is -1,
  # and their average when it is 0, exactly as in the docstring formula.
  sign = tf.sign(predictions)
  return 0.5 * ((positive_weights + negative_weights) +
                sign * (positive_weights - negative_weights))
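
# A minimal usage sketch (not part of the library), assuming a hypothetical
# instance `zero_one_loss` of the class above:
#
#   predictions = tf.constant([2.0, -1.0, 0.0])
#   weights = tf.constant([[0.0, 1.0]])  # a positive prediction costs 0
#   # A positive prediction pays weights[:, 0], a negative one pays
#   # weights[:, 1], and a zero prediction pays their average.
#   zero_one_loss.evaluate_binary_classification(predictions, weights)
#   # [0.0, 1.0, 0.5]
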
def evaluate_binary_classification(self, predictions, weights):
  """Evaluates the cross-entropy loss on the given predictions.

  Given a rank-1 `Tensor` of predictions with shape (n,), where n is the
  number of examples, and a rank-2 `Tensor` of weights with shape (m, 2),
  where m is broadcastable to n, this method will return a `Tensor` of shape
  (n,) where the ith element is:

  ```python
  softmax_cross_entropy_loss[i] = (
      constant_weights[i] +
      (weights[i, 0] - constant_weights[i]) * log(1 + exp(predictions[i])) +
      (weights[i, 1] - constant_weights[i]) * log(1 + exp(-predictions[i]))
  )
  ```

  where constant_weights[i] = min{weights[i, 0], weights[i, 1]} contains the
  minimum weights.

  You can think of weights[:, 0] as being the per-example costs associated
  with making a positive prediction, and weights[:, 1] as those for a
  negative prediction.

  Args:
    predictions: a `Tensor` of shape (n,), where n is the number of examples.
    weights: a `Tensor` of shape (m, 2), where m is broadcastable to n. This
      `Tensor` is *not* necessarily non-negative.

  Returns:
    A `Tensor` of shape (n,) and dtype=predictions.dtype, containing the
    softmax cross-entropy losses for each example.

  Raises:
    TypeError: if "predictions" is not a floating-point `Tensor`, or
      "weights" is not a `Tensor`.
    ValueError: if "predictions" is not rank-1, or "weights" is not a rank-2
      `Tensor` with exactly two columns.
  """
  predictions = _convert_to_binary_classification_predictions(predictions)
  columns = helpers.get_num_columns_of_2d_tensor(weights, name="weights")
  if columns != 2:
    raise ValueError("weights must have two columns")

  dtype = predictions.dtype.base_dtype
  zeros = tf.zeros_like(predictions)

  positive_weights = tf.cast(weights[:, 0], dtype=dtype)
  negative_weights = tf.cast(weights[:, 1], dtype=dtype)
  constant_weights = tf.minimum(positive_weights, negative_weights)
  positive_weights -= constant_weights
  negative_weights -= constant_weights

  # We use tf.where() instead of tf.abs() and tf.maximum() since, if we
  # didn't, then we would have zero gradients whenever predictions=0, with
  # the consequence that optimization could get "stuck".
  condition = (predictions <= zeros)
  absolute_predictions = tf.where(condition, -predictions, predictions)
  intermediate = tf.math.log(1 + tf.exp(-absolute_predictions))
  # Notice that:
  #   is_positive = log(1 + exp(-|predictions|)) + max{0, predictions}
  #               = log(1 + exp(predictions))                if predictions <= 0
  #               = log(1 + exp(-predictions)) + predictions if predictions >= 0
  #               = log(1 + exp(predictions))
  # Likewise:
  #   is_negative = log(1 + exp(-predictions))
  # The reason for representing these in terms of "intermediate" is to
  # improve numerical accuracy.
  is_positive = intermediate + tf.where(condition, zeros, predictions)
  is_negative = intermediate + tf.where(condition, -predictions, zeros)

  return constant_weights + (
      positive_weights * is_positive + negative_weights * is_negative)
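
# A minimal usage sketch (not part of the library), assuming a hypothetical
# instance `cross_entropy_loss` of the class above:
#
#   predictions = tf.constant([0.0, 10.0])
#   weights = tf.constant([[0.0, 1.0]])  # i.e. a "positive" example
#   # With these weights the loss reduces to log(1 + exp(-predictions[i])),
#   # the usual logistic loss: log(2) ~= 0.693 at zero, and ~0 at 10.
#   cross_entropy_loss.evaluate_binary_classification(predictions, weights)
#   # ~[0.693, 0.0]
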
def evaluate_multiclass(self, predictions, weights):
  """Evaluates the multiclass cross-entropy loss on the given predictions.

  Given a rank-2 `Tensor` of predictions with shape (n, k), where n is the
  number of examples and k is the number of classes, and another rank-2
  `Tensor` of weights with shape (m, k), where m is broadcastable to n, this
  method will return a `Tensor` of shape (n,) where the ith element is:

  ```python
  max_weight[i] = max_j weights[i, j]
  softmax_cross_entropy_loss[i] = max_weight[i] - sum_j (
      (max_weight[i] - weights[i, j]) * (
          1 + log( exp(predictions[i, j]) / sum_k exp(predictions[i, k]) )
      )
  )
  ```

  The reason this formulation was chosen is that it can be derived from the
  softmax loss using the inequality -log(p) >= 1-p. Indeed, the difference
  between the two losses is entirely due to the slop in this inequality. In
  particular, it satisfies the following properties:

  1. It's shift invariant: adding a constant to every weight will shift the
     loss by the same constant.
  2. It's scale invariant: multiplying every weight by a constant will scale
     the loss by the same constant.
  3. When there are only two classes, it's equivalent to the binary softmax
     cross-entropy loss implemented in evaluate_binary_classification().
  4. When the weights represent an expected misclassification rate (i.e.
     weights[i, j] >= 0, and sum_j weights[i, j] = 1), it's equivalent to
     the usual multiclass softmax cross-entropy misclassification loss.
  5. It's convex in the predictions, and upper bounds the multiclass softmax
     loss.

  Args:
    predictions: a `Tensor` of shape (n, k), where n is the number of
      examples and k is the number of classes.
    weights: a `Tensor` of shape (m, k), where m is broadcastable to n. This
      `Tensor` is *not* necessarily non-negative.

  Returns:
    A `Tensor` of shape (n,) and dtype=predictions.dtype, containing the
    softmax cross-entropy losses for each example.

  Raises:
    TypeError: if "predictions" is not a floating-point `Tensor`, or
      "weights" is not a `Tensor`.
    ValueError: if "predictions" and "weights" have different numbers of
      columns (i.e. if the number of classes is inconsistent).
  """
  num_classes = helpers.get_num_columns_of_2d_tensor(
      predictions, name="multiclass predictions")
  weights_num_classes = helpers.get_num_columns_of_2d_tensor(
      weights, name="weights")
  if weights_num_classes != num_classes:
    raise ValueError(
        "weights must have the same number of columns as "
        "predictions ({} vs. {}): did you specify num_classes "
        "correctly when you created your context?".format(
            weights_num_classes, num_classes))

  dtype = predictions.dtype.base_dtype
  if not dtype.is_floating:
    raise TypeError("multiclass predictions must be floating-point")

  # Compute log-probabilities from a numerically-stable softmax (subtracting
  # the per-row maximum doesn't change the result).
  maximum_predictions = tf.reduce_max(predictions, axis=1, keepdims=True)
  numerators = tf.exp(predictions - maximum_predictions)
  denominators = tf.reduce_sum(numerators, axis=1, keepdims=True)
  log_probabilities = tf.math.log(numerators / denominators)

  weights = tf.cast(weights, dtype=dtype)
  maximum_weights = tf.reduce_max(weights, axis=1, keepdims=True)
  return (tf.squeeze(maximum_weights) - tf.reduce_sum(
      (maximum_weights - weights) * (1 + log_probabilities), axis=1))
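
# A minimal sketch (not part of the library), assuming a hypothetical
# instance `cross_entropy_loss` of the class above:
#
#   predictions = tf.constant([[0.0, 0.0, 0.0]])
#   weights = tf.constant([[1.0, 0.0, 1.0]])  # class 1 is the "true" class
#   # Here max_weight = 1 and only the j=1 term of the sum survives, so the
#   # loss works out to -log(probabilities[0, 1]) = -log(1/3) ~= 1.0986.
#   cross_entropy_loss.evaluate_multiclass(predictions, weights)  # ~[1.0986]
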
def evaluate_multiclass(self, predictions, weights):
  """Evaluates the multiclass hinge loss on the given predictions.

  Given a rank-2 `Tensor` of predictions with shape (n, k), where n is the
  number of examples and k is the number of classes, and another rank-2
  `Tensor` of weights with shape (m, k), where m is broadcastable to n, this
  method will return a `Tensor` of shape (n,) where the ith element is:

  ```python
  hinge_loss[i] = weights[i, 0] + sum_{j=0}^{num_classes - 2} (
      (weights[i, j+1] - weights[i, j]) * max_{l=j+1}^{num_classes-1}
          max{0, margin + predictions[i, l] - mean_{k=0}^j predictions[i, k]}
  )
  ```

  where we've assumed (without loss of generality) that the weights and
  predictions are ordered in such a way that weights[i, j] <=
  weights[i, j+1]. In the implementation, of course, we cannot simply assume
  this, and actually perform a sort.

  This is admittedly a somewhat strange-seeming formulation, and it's
  complicated and expensive to implement. The reason it was chosen is that
  it satisfies the following properties:

  1. It's shift invariant: adding a constant to every weight will shift the
     loss by the same constant.
  2. It's scale invariant: multiplying every weight by a constant will scale
     the loss by the same constant.
  3. When there are only two classes, it's equivalent to the binary hinge
     loss implemented in evaluate_binary_classification().
  4. When the weights represent a misclassification rate (i.e.
     weights[i, 0] = 0 and weights[i, j] = 1 for j > 0, assuming the weights
     are sorted), it's equivalent to the usual multiclass hinge
     misclassification loss.
  5. It's convex in the predictions, and upper bounds the multiclass 0-1
     loss when margin >= 1.

  Args:
    predictions: a `Tensor` of shape (n, k), where n is the number of
      examples and k is the number of classes.
    weights: a `Tensor` of shape (m, k), where m is broadcastable to n. This
      `Tensor` is *not* necessarily non-negative.

  Returns:
    A `Tensor` of shape (n,) and dtype=predictions.dtype, containing the
    hinge losses for each example.

  Raises:
    TypeError: if "predictions" is not a floating-point `Tensor`, or
      "weights" is not a `Tensor`.
    ValueError: if "predictions" and "weights" have different numbers of
      columns (i.e. if the number of classes is inconsistent).
  """
  num_classes = helpers.get_num_columns_of_2d_tensor(
      predictions, name="multiclass predictions")
  weights_num_classes = helpers.get_num_columns_of_2d_tensor(
      weights, name="weights")
  if weights_num_classes != num_classes:
    raise ValueError(
        "weights must have the same number of columns as "
        "predictions ({} vs. {}): did you specify num_classes "
        "correctly when you created your context?".format(
            weights_num_classes, num_classes))

  dtype = predictions.dtype.base_dtype
  if not dtype.is_floating:
    raise TypeError("multiclass predictions must be floating-point")

  zero = tf.zeros(1, dtype=dtype)
  weights_rows = tf.shape(weights)[0]
  predictions_rows = tf.shape(predictions)[0]

  # We start out by finding a permutation for each row that will cause the
  # weights to be nondecreasing.
  weights_permutation = tf.argsort(weights, axis=1)
  # This won't work if predictions_rows isn't divisible by weights_rows
  # (tf.stack() below will fail), but we require weights to be broadcastable
  # to predictions (usually, weights_rows will either be 1, or equal to
  # predictions_rows).
  predictions_permutation = tf.tile(
      weights_permutation, [predictions_rows // weights_rows, 1])

  # First we create a Tensor of shape [weights_rows, num_classes, 2], for
  # which:
  #   weights_indices[i, j, 0] = i
  #   weights_indices[i, j, 1] = weights_permutation[i, j]
  # Next, we use gather_nd to re-organize the weights such that:
  #   new_weights[i, j] = old_weights[i, weights_permutation[i, j]]
  weights_iota = tf.range(weights_rows)
  weights_iota = tf.expand_dims(weights_iota, axis=-1)
  weights_iota = tf.tile(weights_iota, [1, num_classes])
  weights_indices = tf.stack([weights_iota, weights_permutation], axis=2)
  weights = tf.gather_nd(tf.cast(weights, dtype=dtype), weights_indices)

  # Next we create a Tensor of shape [predictions_rows, num_classes, 2], for
  # which:
  #   predictions_indices[i, j, 0] = i
  #   predictions_indices[i, j, 1] = predictions_permutation[i, j]
  # Then, we use gather_nd to re-organize the predictions such that:
  #   new_predictions[i, j] =
  #       old_predictions[i, predictions_permutation[i, j]]
  predictions_iota = tf.range(predictions_rows)
  predictions_iota = tf.expand_dims(predictions_iota, axis=-1)
  predictions_iota = tf.tile(predictions_iota, [1, num_classes])
  predictions_indices = tf.stack(
      [predictions_iota, predictions_permutation], axis=2)
  predictions = tf.gather_nd(predictions, predictions_indices)

  # At this point, every row of weights and predictions has been sorted in
  # such a way that the weights are nondecreasing. We wish to calculate the
  # following:
  #   result[i] = weights[i, 0] + \sum_{j=0}^{num_classes - 2} (
  #       (weights[i, j+1] - weights[i, j]) * max_{l=j+1}^{num_classes-1}
  #       max{0, margin + predictions[i, l] - mean_{k=0}^j predictions[i, k]}
  #   )
  # Notice that the innermost max is a hinge.
  result = weights[:, 0]
  for ii in range(num_classes - 1):
    scale = weights[:, ii + 1] - weights[:, ii]
    # The "included" predictions are those in the above max over l, and the
    # "excluded" predictions are those in the above mean over k.
    included = predictions[:, (ii + 1):num_classes]
    included = tf.reduce_max(included, axis=1)
    excluded = predictions[:, 0:(ii + 1)]
    excluded = tf.reduce_mean(excluded, axis=1)
    result += scale * tf.maximum(zero, self._margin + included - excluded)

  return result
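
# A minimal usage sketch (not part of the library), assuming a hypothetical
# instance `hinge_loss` of the class above, constructed with margin=1.0:
#
#   predictions = tf.constant([[2.0, 0.5, -1.0],
#                              [0.0, 1.0, 0.0]])
#   weights = tf.constant([[0.0, 1.0, 1.0]])  # class 0 is the "true" class
#   # With these 0/1 costs the formula collapses to the familiar multiclass
#   # hinge max{0, 1 + max_{j != 0} predictions[i, j] - predictions[i, 0]}:
#   # row 0 gives max{0, 1 + 0.5 - 2.0} = 0.0, and row 1 gives
#   # max{0, 1 + 1.0 - 0.0} = 2.0.
#   hinge_loss.evaluate_multiclass(predictions, weights)  # [0.0, 2.0]
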