def test_sort_by_scores_3d(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    example_feature = [[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]],
                       [[10., 20., 30.], [40., 50., 60.], [70., 80., 90.]]]
    with tf.compat.v1.Session() as sess:
      sorted_example_feature = sess.run(
          utils.sort_by_scores(scores, [example_feature])[0])
      self.assertAllEqual(
          sorted_example_feature,
          [[[4., 5., 6.], [7., 8., 9.], [1., 2., 3.]],
           [[70., 80., 90.], [40., 50., 60.], [10., 20., 30.]]])
      sorted_example_feature = sess.run(
          utils.sort_by_scores(scores, [example_feature], topn=2)[0])
      self.assertAllEqual(sorted_example_feature,
                          [[[4., 5., 6.], [7., 8., 9.]],
                           [[70., 80., 90.], [40., 50., 60.]]])
      sorted_example_feature = sess.run(
          utils.sort_by_scores([scores[0]], [[example_feature[0]]])[0])
      self.assertAllEqual(
          sorted_example_feature,
          [[[4., 5., 6.], [7., 8., 9.], [1., 2., 3.]]])
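# A minimal sketch (not the library implementation) of what
# `utils.sort_by_scores` computes in the tests above: sort each list by
# descending score and gather every feature tensor in that order. The real
# utility also supports `mask` and `shuffle_ties`, which this sketch omits.
import tensorflow as tf

def sort_by_scores_sketch(scores, features_list, topn=None):
  """Returns each tensor in `features_list` reordered by descending score."""
  scores = tf.convert_to_tensor(scores)
  k = tf.shape(scores)[1] if topn is None else topn
  _, indices = tf.math.top_k(scores, k=k, sorted=True)
  return [tf.gather(tf.convert_to_tensor(f), indices, batch_dims=1)
          for f in features_list]

# Mirrors the first assertion in `test_sort_by_scores` below:
print(sort_by_scores_sketch([[1., 3., 2.]], [[[1, 2, 3]]])[0])  # -> [[2 3 1]]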
def test_sort_by_scores_shuffle_ties(self):
  with tf.Graph().as_default():
    tf.compat.v1.set_random_seed(589)
    scores = [[2., 1., 1.]]
    names = [['a', 'b', 'c']]
    with tf.compat.v1.Session() as sess:
      sorted_names = sess.run(
          utils.sort_by_scores(scores, [names], shuffle_ties=False))[0]
      self.assertAllEqual(sorted_names, [[b'a', b'b', b'c']])
      sorted_names = sess.run(
          utils.sort_by_scores(scores, [names], shuffle_ties=True, seed=2))[0]
      self.assertAllEqual(sorted_names, [[b'a', b'c', b'b']])
def _sort_and_normalize(labels, logits, weights=None):
  """Sorts `labels` and `logits` and normalizes `weights`.

  Args:
    labels: A `Tensor` of the same shape as `logits` representing graded
      relevance.
    logits: A `Tensor` with shape [batch_size, list_size]. Each value is the
      ranking score of the corresponding item.
    weights: A scalar, a `Tensor` with shape [batch_size, 1], or a `Tensor`
      with the same shape as `labels`.

  Returns:
    A tuple of (sorted_labels, sorted_logits, sorted_weights).
  """
  labels = tf.convert_to_tensor(value=labels)
  logits = tf.convert_to_tensor(value=logits)
  logits.get_shape().assert_has_rank(2)
  logits.get_shape().assert_is_compatible_with(labels.get_shape())
  weights = 1.0 if weights is None else tf.convert_to_tensor(value=weights)
  weights = tf.ones_like(labels) * weights
  topn = tf.shape(input=logits)[1]
  # Only sort entries with valid labels that are >= 0. Entries with invalid
  # labels get a score just below the per-list minimum logit, so they sink to
  # the bottom of the sorted list.
  scores = tf.where(
      tf.greater_equal(labels, 0.), logits,
      -1e-6 * tf.ones_like(logits) +
      tf.reduce_min(input_tensor=logits, axis=1, keepdims=True))
  sorted_labels, sorted_logits, sorted_weights = utils.sort_by_scores(
      scores, [labels, logits, weights], topn=topn)
  return sorted_labels, sorted_logits, sorted_weights
def _compute_impl(self, labels, predictions, weights, mask):
  """See `_RankingMetric`."""
  topn = tf.shape(predictions)[1] if self._topn is None else self._topn
  # Relevance = 1.0 when labels >= 1.0.
  relevance = tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32)
  sorted_relevance, sorted_weights = utils.sort_by_scores(
      predictions, [relevance, weights], topn=topn, mask=mask)
  per_list_relevant_counts = tf.cumsum(sorted_relevance, axis=1)
  per_list_cutoffs = tf.cumsum(tf.ones_like(sorted_relevance), axis=1)
  per_list_precisions = tf.math.divide_no_nan(per_list_relevant_counts,
                                              per_list_cutoffs)
  total_precision = tf.reduce_sum(
      input_tensor=per_list_precisions * sorted_weights * sorted_relevance,
      axis=1,
      keepdims=True)
  # Compute the total relevance regardless of self._topn.
  total_relevance = tf.reduce_sum(
      input_tensor=weights * relevance, axis=1, keepdims=True)
  per_list_map = tf.math.divide_no_nan(total_precision, total_relevance)
  # per_list_weights are computed from the whole list to avoid the problem of
  # 0 when there is no relevant example in topn.
  per_list_weights = _per_example_weights_to_per_list_weights(
      weights, relevance)
  return per_list_map, per_list_weights
def compute_unreduced_loss(self, labels, logits):
  """See `_RankingLoss`."""
  is_valid = utils.is_label_valid(labels)
  # Reset the invalid labels to 0 and reset the invalid logits to a logit
  # with ~= 0 contribution.
  labels = tf.compat.v1.where(is_valid, labels, tf.zeros_like(labels))
  logits = tf.compat.v1.where(is_valid, logits,
                              tf.math.log(_EPSILON) * tf.ones_like(logits))
  scores = tf.compat.v1.where(
      is_valid, labels,
      tf.reduce_min(input_tensor=labels, axis=1, keepdims=True) -
      1e-6 * tf.ones_like(labels))
  # Use a fixed ops-level seed so that the randomness is controlled by the
  # graph-level seed.
  sorted_labels, sorted_logits = utils.sort_by_scores(
      scores, [labels, logits], shuffle_ties=True, seed=37)
  raw_max = tf.reduce_max(input_tensor=sorted_logits, axis=1, keepdims=True)
  sorted_logits = sorted_logits - raw_max
  sums = tf.cumsum(tf.exp(sorted_logits), axis=1, reverse=True)
  sums = tf.math.log(sums) - sorted_logits
  if self._lambda_weight is not None and isinstance(self._lambda_weight,
                                                    ListMLELambdaWeight):
    batch_size, list_size = tf.unstack(tf.shape(input=sorted_labels))
    sums *= self._lambda_weight.individual_weights(
        sorted_labels,
        tf.tile(tf.expand_dims(tf.range(list_size) + 1, 0), [batch_size, 1]))
  negative_log_likelihood = tf.reduce_sum(
      input_tensor=sums, axis=1, keepdims=True)
  return negative_log_likelihood, tf.ones_like(negative_log_likelihood)
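# A worked sketch of the ListMLE term computed above, assuming all labels are
# valid and no lambda weight is used. The `raw_max` shift is a numerical
# stability detail that cancels in exact arithmetic and is omitted here:
# sort logits by descending label, then accumulate the Plackett-Luce
# log-softmax terms from the end of the list.
import tensorflow as tf

labels = tf.constant([[2., 0., 1.]])
logits = tf.constant([[0.5, 1.0, 0.2]])
order = tf.argsort(labels, axis=1, direction='DESCENDING')
sorted_logits = tf.gather(logits, order, batch_dims=1)  # [[0.5, 0.2, 1.0]]
# log(sum_{j >= i} exp(s_j)) - s_i for each position i.
sums = tf.math.log(tf.cumsum(tf.exp(sorted_logits), axis=1, reverse=True))
nll = tf.reduce_sum(sums - sorted_logits, axis=1, keepdims=True)
print(nll)  # Negative log-likelihood of the label-induced ordering.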
def average_relevance_position(labels, predictions, weights=None, name=None):
  """Computes average relevance position (ARP).

  This could also be named average_relevance_rank, but that acronym would be
  easily confused with mean_reciprocal_rank. This name is more distinctive
  and has been used historically for binary relevance as
  average_click_position.

  Args:
    labels: A `Tensor` of the same shape as `predictions`.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    name: A string used as the name for this metric.

  Returns:
    A metric for the weighted average relevance position.
  """
  with ops.name_scope(name, 'average_relevance_position',
                      (labels, predictions, weights)):
    _, list_size = array_ops.unstack(array_ops.shape(predictions))
    labels, predictions, weights, topn = _prepare_and_validate_params(
        labels, predictions, weights, list_size)
    sorted_labels, sorted_weights = utils.sort_by_scores(
        predictions, [labels, weights], topn=topn)
    relevance = sorted_labels * sorted_weights
    position = math_ops.to_float(math_ops.range(1, topn + 1))
    # TODO(xuanhui): Consider adding a cap position topn + 1 when there are
    # no relevant examples.
    return math_ops.reduce_mean(
        position * array_ops.ones_like(relevance) * relevance)
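# A small numeric check of the ARP reduction above with unit weights: each
# item contributes its 1-based rank times its (weighted) relevance, averaged
# over all entries.
import tensorflow as tf

sorted_labels = tf.constant([[0., 1., 1.]])   # labels after sorting by score
sorted_weights = tf.ones_like(sorted_labels)
relevance = sorted_labels * sorted_weights
position = tf.cast(tf.range(1, 4), tf.float32)
print(tf.reduce_mean(position * relevance))   # (1*0 + 2*1 + 3*1) / 3 = 5/3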
def precision(labels, predictions, weights=None, topn=None, name=None):
  """Computes precision as a weighted average over relevant examples.

  Args:
    labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means
      a relevant example.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    topn: A cutoff for how many examples to consider for this metric.
    name: A string used as the name for this metric.

  Returns:
    A metric for the weighted precision of the batch.
  """
  with ops.name_scope(name, 'precision', (labels, predictions, weights)):
    labels, predictions, weights, topn = _prepare_and_validate_params(
        labels, predictions, weights, topn)
    sorted_labels, sorted_weights = utils.sort_by_scores(
        predictions, [labels, weights], topn=topn)
    # Relevance = 1.0 when labels >= 1.0.
    relevance = math_ops.to_float(math_ops.greater_equal(sorted_labels, 1.0))
    per_list_precision = _safe_div(
        math_ops.reduce_sum(relevance * sorted_weights, 1, keepdims=True),
        math_ops.reduce_sum(
            array_ops.ones_like(relevance) * sorted_weights, 1,
            keepdims=True))
    # per_list_weights are computed from the whole list to avoid the problem
    # of 0 when there is no relevant example in topn.
    per_list_weights = _per_example_weights_to_per_list_weights(
        weights, math_ops.to_float(math_ops.greater_equal(labels, 1.0)))
    return math_ops.reduce_mean(per_list_precision * per_list_weights)
def _per_list_precision(labels, predictions, weights, topn):
  """Computes the precision for each query in the batch.

  Args:
    labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means
      a relevant example.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    topn: A cutoff for how many examples to consider for this metric.

  Returns:
    A `Tensor` of size [batch_size, 1] containing the precision of each query
    respectively.
  """
  sorted_labels, sorted_weights = utils.sort_by_scores(
      predictions, [labels, weights], topn=topn)
  # Relevance = 1.0 when labels >= 1.0.
  relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32)
  per_list_precision = tf.compat.v1.math.divide_no_nan(
      tf.reduce_sum(
          input_tensor=relevance * sorted_weights, axis=1, keepdims=True),
      tf.reduce_sum(
          input_tensor=tf.ones_like(relevance) * sorted_weights,
          axis=1,
          keepdims=True))
  return per_list_precision
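# A quick unweighted check of the precision computation above: with unit
# weights it reduces to the fraction of the top-n sorted items whose label
# is >= 1.
import tensorflow as tf

sorted_labels = tf.constant([[1., 0., 2.]])   # top-3 labels after sorting
relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), tf.float32)
print(tf.reduce_mean(relevance, axis=1, keepdims=True))  # [[2/3]]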
def discounted_cumulative_gain(labels,
                               predictions,
                               weights=None,
                               topn=None,
                               name=None):
  """Computes discounted cumulative gain (DCG).

  Args:
    labels: A `Tensor` of the same shape as `predictions`.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    topn: A cutoff for how many examples to consider for this metric.
    name: A string used as the name for this metric.

  Returns:
    A metric for the weighted discounted cumulative gain of the batch.
  """
  with ops.name_scope(name, 'discounted_cumulative_gain',
                      (labels, predictions, weights)):
    labels, predictions, weights, topn = _prepare_and_validate_params(
        labels, predictions, weights, topn)
    sorted_labels, sorted_weights = utils.sort_by_scores(
        predictions, [labels, weights], topn=topn)
    # Multiplying by log1p(1.0) = ln(2) converts the natural-log discount of
    # _discounted_cumulative_gain into a log-base-2 discount.
    dcg = _discounted_cumulative_gain(sorted_labels,
                                      sorted_weights) * math_ops.log1p(1.0)
    per_list_weights = _per_example_weights_to_per_list_weights(
        weights=weights,
        relevance=math_ops.pow(2.0, math_ops.to_float(labels)) - 1.0)
    return math_ops.reduce_mean(
        _safe_div(dcg, per_list_weights) * per_list_weights)
def compute_unreduced_loss(self, labels, logits, weights):
  """See `_RankingLoss`."""
  is_label_valid = utils.is_label_valid(labels)
  # Reset the invalid labels to 0 and reset the invalid logits to a logit
  # with ~= 0 contribution.
  labels = tf.where(is_label_valid, labels, tf.zeros_like(labels))
  logits = tf.where(is_label_valid, logits,
                    tf.math.log(_EPSILON) * tf.ones_like(logits))
  weights = 1.0 if weights is None else tf.convert_to_tensor(value=weights)
  weights = tf.squeeze(weights)
  # Shuffle labels and logits to add randomness to the sort.
  shuffled_indices = utils.shuffle_valid_indices(is_label_valid, self._seed)
  shuffled_labels = tf.gather_nd(labels, shuffled_indices)
  shuffled_logits = tf.gather_nd(logits, shuffled_indices)
  sorted_labels, sorted_logits = utils.sort_by_scores(
      shuffled_labels, [shuffled_labels, shuffled_logits])
  raw_max = tf.reduce_max(input_tensor=sorted_logits, axis=1, keepdims=True)
  sorted_logits = sorted_logits - raw_max
  sums = tf.cumsum(tf.exp(sorted_logits), axis=1, reverse=True)
  sums = tf.math.log(sums) - sorted_logits
  if self._lambda_weight is not None and isinstance(self._lambda_weight,
                                                    ListMLELambdaWeight):
    sums *= self._lambda_weight.individual_weights(sorted_labels)
  negative_log_likelihood = tf.reduce_sum(input_tensor=sums, axis=1)
  return negative_log_likelihood, weights
def mean_reciprocal_rank(labels, predictions, weights=None, name=None):
  """Computes mean reciprocal rank (MRR).

  Args:
    labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means
      a relevant example.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    name: A string used as the name for this metric.

  Returns:
    A metric for the weighted mean reciprocal rank of the batch.
  """
  with ops.name_scope(name, 'mean_reciprocal_rank',
                      (labels, predictions, weights)):
    _, list_size = array_ops.unstack(array_ops.shape(predictions))
    labels, predictions, weights, topn = _prepare_and_validate_params(
        labels, predictions, weights, list_size)
    sorted_labels, = utils.sort_by_scores(predictions, [labels], topn=topn)
    # Relevance = 1.0 when labels >= 1.0 to accommodate graded relevance.
    relevance = math_ops.to_float(math_ops.greater_equal(sorted_labels, 1.0))
    reciprocal_rank = 1.0 / math_ops.to_float(math_ops.range(1, topn + 1))
    # MRR has a shape of [batch_size, 1].
    mrr = math_ops.reduce_max(
        relevance * reciprocal_rank, axis=1, keepdims=True)
    return math_ops.reduce_mean(mrr * array_ops.ones_like(weights) * weights)
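# A worked sketch of the MRR reduction above, ignoring weights: the
# reciprocal rank of the highest-ranked relevant item, obtained as the max
# over relevance / rank.
import tensorflow as tf

sorted_labels = tf.constant([[0., 0., 1.]])   # first relevant item at rank 3
relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), tf.float32)
reciprocal_rank = 1.0 / tf.cast(tf.range(1, 4), tf.float32)
mrr = tf.reduce_max(relevance * reciprocal_rank, axis=1, keepdims=True)
print(mrr)  # [[0.3333]]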
def _per_list_precision(labels, predictions, topn, mask):
  """Computes the precision for each query in the batch.

  Args:
    labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means
      a relevant example.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    topn: A cutoff for how many examples to consider for this metric.
    mask: A `Tensor` of the same shape as predictions indicating which
      entries are valid for computing the metric.

  Returns:
    A `Tensor` of size [batch_size, 1] containing the precision of each query
    respectively.
  """
  sorted_labels = utils.sort_by_scores(
      predictions, [labels], topn=topn, mask=mask)[0]
  # Relevance = 1.0 when labels >= 1.0.
  relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32)
  if topn is None:
    topn = tf.shape(relevance)[1]
  # Cap the cutoff by the number of valid entries in each list.
  valid_topn = tf.minimum(
      topn,
      tf.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=1, keepdims=True))
  per_list_precision = tf.compat.v1.math.divide_no_nan(
      tf.reduce_sum(input_tensor=relevance, axis=1, keepdims=True),
      tf.cast(valid_topn, dtype=tf.float32))
  return per_list_precision
def _per_list_recall(labels, predictions, topn, mask):
  """Computes the recall@k for each query in the batch.

  Args:
    labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means
      a relevant example.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    topn: A cutoff for how many examples to consider for this metric.
    mask: A mask indicating which entries are valid for computing the metric.

  Returns:
    A `Tensor` of size [batch_size, 1] containing the recall of each query
    respectively.
  """
  sorted_labels = utils.sort_by_scores(
      predictions, [labels], topn=topn, mask=mask)[0]
  topn_positives = tf.cast(
      tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32)
  labels = tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32)
  per_list_recall = tf.compat.v1.math.divide_no_nan(
      tf.reduce_sum(input_tensor=topn_positives, axis=1, keepdims=True),
      tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True))
  return per_list_recall
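# A toy check of the recall@k computed above, assuming a fully valid list:
# relevant items retrieved in the top-k divided by all relevant items.
import tensorflow as tf

labels = tf.constant([[1., 0., 1., 1.]])      # 3 relevant items in total
sorted_labels = tf.constant([[1., 1.]])       # top-2 labels after sorting
topn_positives = tf.reduce_sum(
    tf.cast(tf.greater_equal(sorted_labels, 1.0), tf.float32),
    axis=1, keepdims=True)
total_positives = tf.reduce_sum(
    tf.cast(tf.greater_equal(labels, 1.0), tf.float32),
    axis=1, keepdims=True)
print(tf.math.divide_no_nan(topn_positives, total_positives))  # [[2/3]]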
def compute(self, labels, predictions, weights):
  """See `_RankingMetric`."""
  labels, predictions, weights, topn = _prepare_and_validate_params(
      labels, predictions, weights, self._topn)
  sorted_labels, sorted_weights = utils.sort_by_scores(
      predictions, [labels, weights], topn=topn)
  # Relevance = 1.0 when labels >= 1.0.
  sorted_relevance = tf.cast(
      tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32)
  per_list_relevant_counts = tf.cumsum(sorted_relevance, axis=1)
  per_list_cutoffs = tf.cumsum(tf.ones_like(sorted_relevance), axis=1)
  per_list_precisions = tf.math.divide_no_nan(per_list_relevant_counts,
                                              per_list_cutoffs)
  total_precision = tf.reduce_sum(
      input_tensor=per_list_precisions * sorted_weights * sorted_relevance,
      axis=1,
      keepdims=True)
  total_relevance = tf.reduce_sum(
      input_tensor=sorted_weights * sorted_relevance, axis=1, keepdims=True)
  per_list_map = tf.math.divide_no_nan(total_precision, total_relevance)
  # per_list_weights are computed from the whole list to avoid the problem of
  # 0 when there is no relevant example in topn.
  per_list_weights = _per_example_weights_to_per_list_weights(
      weights, tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32))
  return per_list_map, per_list_weights
def _compute_per_list_metric(self, labels, predictions, weights, topn, mask):
  """See `_DivRankingMetric`."""
  sorted_labels = utils.sort_by_scores(
      predictions, [labels], topn=topn, mask=mask)[0]
  # relevance shape = [batch_size, topn].
  relevance = tf.reduce_sum(
      tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32),
      axis=-1)
  # num_subtopics shape = [batch_size, 1].
  num_subtopics = tf.reduce_sum(
      tf.cast(
          tf.reduce_any(tf.greater_equal(labels, 1.0), axis=1, keepdims=True),
          dtype=tf.float32),
      axis=-1)
  if topn is None:
    topn = tf.shape(relevance)[1]
  # valid_topn shape = [batch_size, 1].
  valid_topn = tf.minimum(
      topn,
      tf.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=1, keepdims=True))
  return tf.compat.v1.math.divide_no_nan(
      tf.reduce_sum(input_tensor=relevance, axis=1, keepdims=True),
      tf.reduce_sum(
          input_tensor=tf.cast(valid_topn, dtype=tf.float32) * num_subtopics,
          axis=1,
          keepdims=True))
def _compute_impl(self, labels, predictions, weights, mask):
  """See `_RankingMetric`."""
  topn = tf.shape(predictions)[1] if self._topn is None else self._topn
  # Relevance = 1.0 when labels >= 1.0 to accommodate graded relevance.
  relevance = tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32)
  irrelevance = tf.cast(mask, tf.float32) - relevance
  total_relevance = tf.reduce_sum(relevance, axis=1, keepdims=True)
  total_irrelevance = tf.reduce_sum(irrelevance, axis=1, keepdims=True)
  sorted_relevance, sorted_irrelevance = utils.sort_by_scores(
      predictions, [relevance, irrelevance], mask=mask, topn=topn)
  numerator = tf.minimum(
      tf.cumsum(sorted_irrelevance, axis=1), total_relevance)
  denominator = tf.minimum(
      total_irrelevance,
      total_relevance) if self._use_trec_version else total_relevance
  bpref = tf.math.divide_no_nan(
      tf.reduce_sum(
          ((1. - tf.math.divide_no_nan(numerator, denominator)) *
           sorted_relevance),
          axis=1,
          keepdims=True), total_relevance)
  per_list_weights = _per_example_weights_to_per_list_weights(
      weights=weights,
      relevance=tf.cast(tf.greater_equal(relevance, 1.0), dtype=tf.float32))
  return bpref, per_list_weights
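# A numeric sketch of the bpref term above for one fully valid list with the
# non-TREC denominator (denominator = total relevance): each relevant item is
# penalized by the number of irrelevant items ranked ahead of it, capped at
# the number of relevant items.
import tensorflow as tf

sorted_relevance = tf.constant([[1., 0., 1.]])    # ranks 1 and 3 relevant
sorted_irrelevance = 1.0 - sorted_relevance
total_relevance = tf.reduce_sum(sorted_relevance, axis=1, keepdims=True)  # 2
numerator = tf.minimum(tf.cumsum(sorted_irrelevance, axis=1), total_relevance)
bpref = tf.math.divide_no_nan(
    tf.reduce_sum(
        (1. - tf.math.divide_no_nan(numerator, total_relevance)) *
        sorted_relevance,
        axis=1, keepdims=True),
    total_relevance)
print(bpref)  # (1 + (1 - 1/2)) / 2 = [[0.75]]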
def inverse_max_dcg(labels,
                    gain_fn=lambda labels: tf.pow(2.0, labels) - 1.,
                    rank_discount_fn=lambda rank: 1. / tf.math.log1p(rank),
                    topn=None):
  """Computes the inverse of max DCG.

  Args:
    labels: A `Tensor` with shape [batch_size, list_size]. Each value is the
      graded relevance of the corresponding item.
    gain_fn: A gain function. By default this is set to: 2^label - 1.
    rank_discount_fn: A discount function. By default this is set to:
      1/log(1+rank).
    topn: An integer as the cutoff of examples in the sorted list.

  Returns:
    A `Tensor` with shape [batch_size, 1].
  """
  ideal_sorted_labels, = utils.sort_by_scores(labels, [labels], topn=topn)
  rank = tf.range(tf.shape(input=ideal_sorted_labels)[1]) + 1
  discounted_gain = gain_fn(ideal_sorted_labels) * rank_discount_fn(
      tf.cast(rank, dtype=tf.float32))
  discounted_gain = tf.reduce_sum(
      input_tensor=discounted_gain, axis=1, keepdims=True)
  return tf.compat.v1.where(
      tf.greater(discounted_gain, 0.), 1. / discounted_gain,
      tf.zeros_like(discounted_gain))
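# A hand check of `inverse_max_dcg` with its default gain and discount,
# assuming the labels are already sorted in decreasing order: max DCG is
# sum_i (2^label_i - 1) / log(1 + rank_i), and the function returns its
# reciprocal.
import tensorflow as tf

ideal_sorted_labels = tf.constant([[2., 1., 0.]])
rank = tf.cast(tf.range(1, 4), tf.float32)
gains = tf.pow(2.0, ideal_sorted_labels) - 1.0   # [3, 1, 0]
discounts = 1.0 / tf.math.log1p(rank)
max_dcg = tf.reduce_sum(gains * discounts, axis=1, keepdims=True)
print(1.0 / max_dcg)  # Matches inverse_max_dcg on these labels.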
def test_sort_by_scores_with_mask_and_shuffle_ties(self):
  with tf.Graph().as_default():
    tf.random.set_seed(42)
    scores = [[0., math.inf, 0., -math.inf, -math.inf]]
    names = [['a', 'b', 'c', 'd', 'e']]
    mask = [[True, False, True, True, False]]
    with tf.compat.v1.Session() as sess:
      result = utils.sort_by_scores(
          scores, [names], mask=mask, shuffle_ties=True, seed=13)
      sorted_names = sess.run(result)[0]
      self.assertAllEqual(sorted_names, [[b'a', b'c', b'd', b'b', b'e']])
      result = utils.sort_by_scores(
          scores, [names], mask=mask, shuffle_ties=True, seed=17)
      sorted_names = sess.run(result)[0]
      self.assertAllEqual(sorted_names, [[b'c', b'a', b'd', b'e', b'b']])
def compute(self, labels, predictions, weights):
  """See `_RankingMetric`."""
  labels, predictions, weights, topn = _prepare_and_validate_params(
      labels, predictions, weights, self._topn)
  sorted_labels, sorted_weights = utils.sort_by_scores(
      predictions, [labels, weights], topn=topn)
  dcg = _discounted_cumulative_gain(sorted_labels, sorted_weights)
  # Sort by the weighted labels to get the ideal ranking.
  ideal_sorted_labels, ideal_sorted_weights = utils.sort_by_scores(
      weights * labels, [labels, weights], topn=topn)
  ideal_dcg = _discounted_cumulative_gain(ideal_sorted_labels,
                                          ideal_sorted_weights)
  per_list_ndcg = tf.compat.v1.math.divide_no_nan(dcg, ideal_dcg)
  per_list_weights = _per_example_weights_to_per_list_weights(
      weights=weights,
      relevance=tf.pow(2.0, tf.cast(labels, dtype=tf.float32)) - 1.0)
  return tf.compat.v1.metrics.mean(per_list_ndcg, per_list_weights)
def test_sort_by_scores_with_mask(self):
  with tf.Graph().as_default():
    scores = [[0., math.inf, 2., -math.inf, 1.]]
    names = [['a', 'b', 'c', 'd', 'e']]
    mask_1 = [[True, False, True, True, False]]
    mask_2 = [[False, True, False, True, True]]
    with tf.compat.v1.Session() as sess:
      sorted_names = sess.run(
          utils.sort_by_scores(
              scores, [names], mask=mask_1, shuffle_ties=False))[0]
      self.assertAllEqual(sorted_names, [[b'c', b'a', b'd', b'b', b'e']])
      sorted_names = sess.run(
          utils.sort_by_scores(
              scores, [names], mask=mask_2, shuffle_ties=False))[0]
      self.assertAllEqual(sorted_names, [[b'b', b'e', b'd', b'a', b'c']])
      sorted_names = sess.run(
          utils.sort_by_scores(scores, [names], shuffle_ties=False))[0]
      self.assertAllEqual(sorted_names, [[b'b', b'c', b'e', b'a', b'd']])
def test_sort_by_scores(self):
  scores = [[1., 3., 2.], [1., 2., 3.]]
  positions = [[1, 2, 3], [4, 5, 6]]
  names = [['a', 'b', 'c'], ['d', 'e', 'f']]
  with tf.compat.v1.Session() as sess:
    sorted_positions, sorted_names = sess.run(
        utils.sort_by_scores(scores, [positions, names]))
    self.assertAllEqual(sorted_positions, [[2, 3, 1], [6, 5, 4]])
    self.assertAllEqual(sorted_names,
                        [[b'b', b'c', b'a'], [b'f', b'e', b'd']])
    sorted_positions, sorted_names = sess.run(
        utils.sort_by_scores(scores, [positions, names], topn=2))
    self.assertAllEqual(sorted_positions, [[2, 3], [6, 5]])
    self.assertAllEqual(sorted_names, [[b'b', b'c'], [b'f', b'e']])
    sorted_positions, sorted_names = sess.run(
        utils.sort_by_scores([scores[0]], [[positions[0]], [names[0]]]))
    self.assertAllEqual(sorted_positions, [[2, 3, 1]])
    self.assertAllEqual(sorted_names, [[b'b', b'c', b'a']])
def _compute_per_list_metric(self, labels, predictions, weights, topn):
  """See `_DivRankingMetric`."""
  sorted_labels, sorted_weights = utils.sort_by_scores(
      predictions, [labels, weights], topn=topn, seed=self._seed)
  alpha_dcg = _discounted_cumulative_gain(sorted_labels, sorted_weights,
                                          self._gain_fn,
                                          self._rank_discount_fn)
  per_list_weights = self._compute_per_list_weights(weights, labels)
  return tf.compat.v1.math.divide_no_nan(alpha_dcg, per_list_weights)
def compute(self, labels, predictions, weights):
  """See `_RankingMetric`."""
  list_size = tf.shape(input=predictions)[1]
  labels, predictions, weights, topn = _prepare_and_validate_params(
      labels, predictions, weights, list_size)
  sorted_labels, sorted_weights = utils.sort_by_scores(
      predictions, [labels, weights], topn=topn)
  relevance = sorted_labels * sorted_weights
  position = tf.cast(tf.range(1, topn + 1), dtype=tf.float32)
  # TODO: Consider adding a cap position topn + 1 when there are no relevant
  # examples.
  return position * tf.ones_like(relevance), relevance
def compute(self, labels, predictions, weights):
  """See `_RankingMetric`."""
  labels, predictions, weights, topn = _prepare_and_validate_params(
      labels, predictions, weights, self._topn)
  sorted_labels, sorted_weights = utils.sort_by_scores(
      predictions, [labels, weights], topn=topn)
  dcg = _discounted_cumulative_gain(sorted_labels, sorted_weights,
                                    self._gain_fn, self._rank_discount_fn)
  per_list_weights = _per_example_weights_to_per_list_weights(
      weights=weights,
      relevance=self._gain_fn(tf.cast(labels, dtype=tf.float32)))
  per_list_dcg = tf.compat.v1.math.divide_no_nan(dcg, per_list_weights)
  return per_list_dcg, per_list_weights
def _compute_impl(self, labels, predictions, weights, mask):
  """See `_RankingMetric`."""
  topn = tf.shape(predictions)[1] if self._topn is None else self._topn
  sorted_labels, sorted_weights = utils.sort_by_scores(
      predictions, [labels, weights], topn=topn, mask=mask)
  dcg = _discounted_cumulative_gain(sorted_labels, sorted_weights,
                                    self._gain_fn, self._rank_discount_fn)
  # Sort by the weighted labels to get the ideal ranking.
  ideal_sorted_labels, ideal_sorted_weights = utils.sort_by_scores(
      weights * labels, [labels, weights], topn=topn, mask=mask)
  ideal_dcg = _discounted_cumulative_gain(ideal_sorted_labels,
                                          ideal_sorted_weights,
                                          self._gain_fn,
                                          self._rank_discount_fn)
  per_list_ndcg = tf.compat.v1.math.divide_no_nan(dcg, ideal_dcg)
  per_list_weights = _per_example_weights_to_per_list_weights(
      weights=weights,
      relevance=self._gain_fn(tf.cast(labels, dtype=tf.float32)))
  return per_list_ndcg, per_list_weights
def mean_average_precision(labels,
                           predictions,
                           weights=None,
                           topn=None,
                           name=None):
  """Computes mean average precision (MAP).

  The implementation of MAP is based on Equation (1.7) in the following:
  Liu, T-Y "Learning to Rank for Information Retrieval" found at
  https://www.nowpublishers.com/article/DownloadSummary/INR-016

  Args:
    labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means
      a relevant example.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    topn: A cutoff for how many examples to consider for this metric.
    name: A string used as the name for this metric.

  Returns:
    A metric for the mean average precision.
  """
  with tf.compat.v1.name_scope(name, 'mean_average_precision',
                               (labels, predictions, weights)):
    labels, predictions, weights, topn = _prepare_and_validate_params(
        labels, predictions, weights, topn)
    sorted_labels, sorted_weights = utils.sort_by_scores(
        predictions, [labels, weights], topn=topn)
    # Relevance = 1.0 when labels >= 1.0.
    sorted_relevance = tf.cast(
        tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32)
    per_list_relevant_counts = tf.cumsum(sorted_relevance, axis=1)
    per_list_cutoffs = tf.cumsum(tf.ones_like(sorted_relevance), axis=1)
    per_list_precisions = tf.math.divide_no_nan(per_list_relevant_counts,
                                                per_list_cutoffs)
    total_precision = tf.reduce_sum(
        input_tensor=per_list_precisions * sorted_weights * sorted_relevance,
        axis=1,
        keepdims=True)
    total_relevance = tf.reduce_sum(
        input_tensor=sorted_weights * sorted_relevance, axis=1, keepdims=True)
    per_list_map = tf.math.divide_no_nan(total_precision, total_relevance)
    # per_list_weights are computed from the whole list to avoid the problem
    # of 0 when there is no relevant example in topn.
    per_list_weights = _per_example_weights_to_per_list_weights(
        weights, tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32))
    return tf.compat.v1.metrics.mean(per_list_map, per_list_weights)
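# A worked example of the (unweighted) average precision computed above for
# one list whose sorted relevance is [1, 0, 1]: precision@1 = 1,
# precision@3 = 2/3, so AP = (1 + 2/3) / 2 = 5/6.
import tensorflow as tf

sorted_relevance = tf.constant([[1., 0., 1.]])
cutoffs = tf.cumsum(tf.ones_like(sorted_relevance), axis=1)     # [1, 2, 3]
precisions = tf.math.divide_no_nan(
    tf.cumsum(sorted_relevance, axis=1), cutoffs)               # [1, .5, 2/3]
ap = tf.math.divide_no_nan(
    tf.reduce_sum(precisions * sorted_relevance, axis=1, keepdims=True),
    tf.reduce_sum(sorted_relevance, axis=1, keepdims=True))
print(ap)  # [[0.8333]]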
def compute(self, labels, predictions, weights):
  """See `_RankingMetric`."""
  labels, predictions, weights, topn = _prepare_and_validate_params(
      labels, predictions, weights, self._topn)
  sorted_labels, sorted_weights = utils.sort_by_scores(
      predictions, [labels, weights], topn=topn)
  dcg = _discounted_cumulative_gain(sorted_labels,
                                    sorted_weights) * tf.math.log1p(1.0)
  per_list_weights = _per_example_weights_to_per_list_weights(
      weights=weights,
      relevance=tf.pow(2.0, tf.cast(labels, dtype=tf.float32)) - 1.0)
  return tf.compat.v1.metrics.mean(
      _safe_div(dcg, per_list_weights), per_list_weights)
def _inverse_max_dcg(self, labels):
  """Computes the inverse of max DCG."""
  ideal_sorted_labels, = utils.sort_by_scores(
      labels, [labels], topn=self._topn)
  rank = math_ops.range(array_ops.shape(ideal_sorted_labels)[1]) + 1
  discounted_gain = self._gain_fn(
      ideal_sorted_labels) * self._rank_discount_fn(math_ops.to_float(rank))
  discounted_gain = math_ops.reduce_sum(discounted_gain, 1, keepdims=True)
  return array_ops.where(
      math_ops.greater(discounted_gain, 0.), 1. / discounted_gain,
      array_ops.zeros_like(discounted_gain))
def bilingual_lexical_induction(labels, predictions, features):
  """Computes the BLI metric.

  We do not repeat all the input validations here, as they were already
  performed for the previous metrics.
  """
  if FLAGS.query_relevance_type == "binary":
    ground_truth = 2
  else:
    ground_truth = FLAGS.query_size
  # Get the label of the word the model ranks highest.
  sorted_labels = utils.sort_by_scores(predictions, [labels], topn=1)[0]
  # Check whether that label equals the ground truth.
  relevance = tf.cast(tf.equal(sorted_labels, ground_truth), dtype=tf.float32)
  return tf.compat.v1.metrics.mean(relevance)
def normalized_discounted_cumulative_gain(labels,
                                          predictions,
                                          weights=None,
                                          topn=None,
                                          name=None):
  """Computes normalized discounted cumulative gain (NDCG).

  Args:
    labels: A `Tensor` of the same shape as `predictions`.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    topn: A cutoff for how many examples to consider for this metric.
    name: A string used as the name for this metric.

  Returns:
    A metric for the weighted normalized discounted cumulative gain of the
    batch.
  """
  with tf.compat.v1.name_scope(name, 'normalized_discounted_cumulative_gain',
                               (labels, predictions, weights)):
    labels, predictions, weights, topn = _prepare_and_validate_params(
        labels, predictions, weights, topn)
    sorted_labels, sorted_weights = utils.sort_by_scores(
        predictions, [labels, weights], topn=topn)
    dcg = _discounted_cumulative_gain(sorted_labels, sorted_weights)
    # Sort by the weighted labels to get the ideal ranking.
    ideal_sorted_labels, ideal_sorted_weights = utils.sort_by_scores(
        weights * labels, [labels, weights], topn=topn)
    ideal_dcg = _discounted_cumulative_gain(ideal_sorted_labels,
                                            ideal_sorted_weights)
    per_list_ndcg = _safe_div(dcg, ideal_dcg)
    per_list_weights = _per_example_weights_to_per_list_weights(
        weights=weights,
        relevance=tf.pow(2.0, tf.cast(labels, dtype=tf.float32)) - 1.0)
    return tf.compat.v1.metrics.mean(per_list_ndcg, per_list_weights)
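# A toy check of the NDCG ratio above with unit weights, assuming the default
# gain 2^label - 1 and discount 1 / log(1 + rank) (the log base cancels in
# the ratio): DCG of the score-induced order divided by DCG of the ideal,
# label-sorted order.
import tensorflow as tf

def dcg(sorted_labels):
  rank = tf.cast(tf.range(1, tf.shape(sorted_labels)[1] + 1), tf.float32)
  gains = tf.pow(2.0, sorted_labels) - 1.0
  return tf.reduce_sum(gains / tf.math.log1p(rank), axis=1, keepdims=True)

score_sorted_labels = tf.constant([[1., 0., 2.]])   # order induced by scores
ideal_sorted_labels = tf.sort(score_sorted_labels, axis=1,
                              direction='DESCENDING')
print(tf.math.divide_no_nan(dcg(score_sorted_labels),
                            dcg(ideal_sorted_labels)))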