def test_make_mean_reciprocal_rank_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    # Note that scores are ranked in descending order.
    # ranks = [[3, 1, 2], [3, 2, 1]]
    labels = [[0., 0., 1.], [0., 1., 2.]]
    # Note that the definition of MRR only uses the highest ranked
    # relevant item, where an item is relevant if its label is > 0.
    rel_rank = [2, 1]
    weights = [[1., 2., 3.], [4., 5., 6.]]
    num_queries = len(scores)
    weights_feature_name = 'weights'
    features = {weights_feature_name: weights}
    m = metrics_lib.make_ranking_metric_fn(metrics_lib.RankingMetricKey.MRR)
    m_w = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.MRR,
        weights_feature_name=weights_feature_name)
    m_2 = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.MRR, topn=1)
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), 0.5),
        (m(labels, scores, features), (0.5 + 1.0) / 2),
        (m_w(labels, scores, features),
         (3. * 0.5 + (6. + 5.) / 2. * 1.) / (3. + (6. + 5.) / 2.)),
        (m_2(labels, scores, features),
         (sum([0., 1. / rel_rank[1], 0.]) / num_queries)),
    ])
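
# For reference, the MRR expectations above encode this arithmetic: the
# metric is 1 / rank of the highest-ranked relevant item (label > 0), or 0
# if no relevant item is ranked within `topn`. A plain-Python sketch
# (illustrative only; `_reciprocal_rank` is a hypothetical name):
def _reciprocal_rank(relevances, scores, topn=None):
  """Returns the reciprocal rank of the first relevant item, else 0."""
  order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
  cutoff = len(order) if topn is None else topn
  for rank, idx in enumerate(order[:cutoff], start=1):
    if relevances[idx] > 0:
      return 1. / rank
  return 0.


# E.g. _reciprocal_rank([0., 0., 1.], [1., 3., 2.]) == 0.5, the first
# expectation above, and topn=1 keeps only the second query's value of 1.
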
def test_make_discounted_cumulative_gain_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    labels = [[0., 0., 1.], [0., 1., 2.]]
    weights = [[1., 1., 1.], [2., 2., 1.]]
    weights_feature_name = 'weights'
    features = {weights_feature_name: weights}
    m = metrics_lib.make_ranking_metric_fn(metrics_lib.RankingMetricKey.DCG)
    m_w = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.DCG,
        weights_feature_name=weights_feature_name)
    expected_dcg_1 = _dcg(0., 1) + _dcg(1., 2) + _dcg(0., 3)
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), expected_dcg_1),
    ])
    expected_dcg_2 = _dcg(2., 1) + _dcg(1., 2)
    expected_dcg_2_weighted = _dcg(2., 1) + _dcg(1., 2) * 2.
    expected_weight_2 = ((4 - 1) * 1. + (2 - 1) * 2.) / (4 - 1 + 2 - 1)
    self._check_metrics([
        (m(labels, scores, features),
         (expected_dcg_1 + expected_dcg_2) / 2.0),
        (m_w(labels, scores, features),
         (expected_dcg_1 + expected_dcg_2_weighted) /
         (1. + expected_weight_2)),
    ])
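
# The `_dcg` helper used throughout the DCG/NDCG tests is not shown in this
# excerpt. A minimal sketch consistent with how it is called, assuming the
# standard exponential gain and log2 rank discount as defaults and an
# `import math` at module top:
def _dcg(label,
         rank,
         weight=1.0,
         gain_fn=lambda l: math.pow(2.0, l) - 1.0,
         rank_discount_fn=lambda r: 1.0 / math.log(r + 1.0, 2.0)):
  """Returns a single item's weighted, discounted gain at 1-based `rank`."""
  return weight * gain_fn(label) * rank_discount_fn(rank)
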
def test_make_bpref_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    # Note that scores are ranked in descending order.
    # ranks = [[3, 1, 2], [3, 2, 1]]
    labels = [[0., 0., 1.], [1., 0., 2.]]
    weights = [[1., 2., 3.], [4., 5., 6.]]
    weights_feature_name = 'weights'
    features = {weights_feature_name: weights}
    # BPref = 1 / R * SUM_r (1 - |n ranked higher than r| / min(R, N)),
    # where r ranges over relevant items, n over judged non-relevant items,
    # R is the number of relevant items, and N the number of judged
    # non-relevant items.
    m = metrics_lib.make_ranking_metric_fn(metrics_lib.RankingMetricKey.BPREF)
    m_w = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.BPREF,
        weights_feature_name=weights_feature_name)
    m_2 = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.BPREF, topn=1)
    m_alt = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.BPREF, use_trec_version=False)
    self._check_metrics([
        (m([labels[0]], [scores[0]], features),
         1. / 2. * (1. - 1. / 1.)),  # = 0.
        (m(labels, scores, features),
         (1. / 2. * (1. - 1. / 1.) +
          (1. / 2. * ((1. - 0. / 1.) + (1. - 1. / 1.)))) / 2),  # = 0.25
        (m_w(labels, scores, features),
         (3. * (1. / 2. * (1. - 1. / 1.)) +
          5. * (1. / 2. * ((1. - 0. / 1.) + (1. - 1. / 1.)))) / (3. + 5.)),
        (m_2(labels, scores, features),
         (0. + (1. / 2. * (1. - 0. / 1.))) / 2.),
        (m_alt(labels, scores, features),
         (1. / 2. * (1. - 1. / 1.) +
          (1. / 2. * ((1. - 0. / 2.) + (1. - 1. / 2.)))) / 2),  # = 0.375
    ])
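
# The expected values above can be reproduced with a plain-Python reference.
# This is an illustrative sketch, not the metrics_lib implementation, and
# `_bpref` is a hypothetical name: the TREC version caps the penalty
# denominator at min(R, N), the alternative version uses R.
def _bpref(relevances, scores, topn=None, trec_version=True):
  """Returns BPref for a single ranked list."""
  order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
  num_rel = sum(1 for rel in relevances if rel > 0)
  num_nonrel = sum(1 for rel in relevances if rel == 0)
  denom = min(num_rel, num_nonrel) if trec_version else num_rel
  cutoff = len(order) if topn is None else topn
  total = 0.
  nonrel_above = 0
  for idx in order[:cutoff]:
    if relevances[idx] > 0:
      penalty = min(nonrel_above, denom) / denom if denom else 0.
      total += 1. - penalty
    else:
      nonrel_above += 1
  return total / num_rel if num_rel else 0.


# E.g. _bpref([1., 0., 2.], [1., 2., 3.]) == 0.5 and the non-TREC version
# gives 0.75, reproducing the second-query terms in the expectations above.
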
def test_make_hits_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    # Note that scores are ranked in descending order.
    # ranks = [[3, 1, 2], [3, 2, 1]]
    labels = [[0., 0., 1.], [0., 1., 1.]]
    # Note that the definition of Hits considers an item relevant
    # if its label is >= 1.0.
    weights = [[1., 2., 3.], [4., 5., 6.]]
    num_queries = len(scores)
    weights_feature_name = 'weights'
    features = {weights_feature_name: weights}
    m = metrics_lib.make_ranking_metric_fn(metrics_lib.RankingMetricKey.HITS)
    m_2 = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.HITS, topn=1)
    m_w = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.HITS,
        topn=1,
        weights_feature_name=weights_feature_name)
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), 1.0),
        (m_2(labels, scores, features), ((0. + 1.) / num_queries)),
        (m_w(labels, scores, features),
         (3. * 0. + (6. + 5.) / 2. * 1.) / (3. + (6. + 5.) / 2.)),
    ])
def test_make_discounted_cumulative_gain_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    # Note that scores are ranked in descending order.
    ranks = [[3, 1, 2], [3, 2, 1]]
    labels = [[0., 0., 1.], [0., 1., 2.]]
    weights = [[1., 1., 1.], [2., 2., 1.]]
    weights_feature_name = 'weights'
    features = {weights_feature_name: weights}
    m = metrics_lib.make_ranking_metric_fn(metrics_lib.RankingMetricKey.DCG)
    m_w = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.DCG,
        weights_feature_name=weights_feature_name)
    expected_dcg_1 = _dcg(0., 1) + _dcg(1., 2) + _dcg(0., 3)
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), expected_dcg_1),
    ])
    expected_dcg_2 = _dcg(2., 1) + _dcg(1., 2)
    expected_dcg_2_weighted = _dcg(2., 1) + _dcg(1., 2) * 2.
    expected_weight_2 = ((4 - 1) * 1. + (2 - 1) * 2.) / (4 - 1 + 2 - 1)
    self._check_metrics([
        (m(labels, scores, features),
         (expected_dcg_1 + expected_dcg_2) / 2.0),
        (m_w(labels, scores, features),
         (expected_dcg_1 + expected_dcg_2_weighted) /
         (1. + expected_weight_2)),
    ])
    # Testing different gain and discount functions.
    gain_fn = lambda rel: rel
    rank_discount_fn = lambda rank: rank

    def mod_dcg_fn(l, r):
      return _dcg(l, r, gain_fn=gain_fn, rank_discount_fn=rank_discount_fn)

    m_mod = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.DCG,
        gain_fn=gain_fn,
        rank_discount_fn=rank_discount_fn)
    list_size = len(scores[0])
    expected_modified_dcg_1 = sum([
        mod_dcg_fn(labels[0][ind], ranks[0][ind]) for ind in range(list_size)
    ])
    self._check_metrics([
        (m_mod([labels[0]], [scores[0]], features), expected_modified_dcg_1),
    ])
def test_make_mean_reciprocal_rank_fn(self):
  scores = [[1., 3., 2.], [1., 2., 3.]]
  labels = [[0., 0., 1.], [0., 1., 2.]]
  weights = [[1., 2., 3.], [4., 5., 6.]]
  weights_feature_name = 'weights'
  features = {weights_feature_name: weights}
  m = metrics.make_ranking_metric_fn(metrics.RankingMetricKey.MRR)
  m_w = metrics.make_ranking_metric_fn(
      metrics.RankingMetricKey.MRR,
      weights_feature_name=weights_feature_name)
  self._check_metrics([
      (m([labels[0]], [scores[0]], features), 0.5),
      (m(labels, scores, features), (0.5 + 1.0) / 2),
      (m_w(labels, scores, features), (6. * 0.5 + 15. * 1.) / (6. + 15.)),
  ])
def test_make_precision_fn(self):
  scores = [[1., 3., 2.], [1., 2., 3.]]
  labels = [[0., 0., 1.], [0., 1., 2.]]
  features = {}
  m = metrics.make_ranking_metric_fn(metrics.RankingMetricKey.PRECISION)
  m_top_1 = metrics.make_ranking_metric_fn(
      metrics.RankingMetricKey.PRECISION, topn=1)
  m_top_2 = metrics.make_ranking_metric_fn(
      metrics.RankingMetricKey.PRECISION, topn=2)
  self._check_metrics([
      (m([labels[0]], [scores[0]], features), 1. / 3.),
      (m_top_1([labels[0]], [scores[0]], features), 0. / 1.),
      (m_top_2([labels[0]], [scores[0]], features), 1. / 2.),
      (m(labels, scores, features), (1. / 3. + 2. / 3.) / 2.),
  ])
def test_make_average_relevance_position_fn(self):
  scores = [[1., 3., 2.], [1., 2., 3.]]
  labels = [[0., 0., 1.], [0., 1., 2.]]
  weights = [[1., 2., 3.], [4., 5., 6.]]
  weights_feature_name = 'weights'
  features = {weights_feature_name: weights}
  m = metrics.make_ranking_metric_fn(metrics.RankingMetricKey.ARP)
  m_w = metrics.make_ranking_metric_fn(
      metrics.RankingMetricKey.ARP,
      weights_feature_name=weights_feature_name)
  self._check_metrics([
      (m([labels[0]], [scores[0]], features), 2.),
      (m(labels, scores, features), (1. * 2. + 2. * 1. + 1. * 2.) / 4.),
      (m_w(labels, scores, features),
       (3. * 1. * 2. + 6. * 2. * 1. + 5. * 1. * 2.) / (3. + 12. + 5.)),
  ])
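
# The ARP expectations above follow this arithmetic: each item contributes
# weight * label * rank to the numerator and weight * label to the
# denominator. A plain-Python sketch (illustrative only; `_arp` is a
# hypothetical name):
def _arp(labels_batch, scores_batch, weights_batch=None):
  """Returns the (weighted) average relevance position over a batch."""
  if weights_batch is None:
    weights_batch = [[1.] * len(labels) for labels in labels_batch]
  num = 0.
  den = 0.
  for labels, scores, weights in zip(labels_batch, scores_batch,
                                     weights_batch):
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    for rank, idx in enumerate(order, start=1):
      num += weights[idx] * labels[idx] * rank
      den += weights[idx] * labels[idx]
  return num / den if den else 0.
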
def test_multi_dim_weighted_eval(self):
  weights_feature_name = self._default_weights_feature_name
  metric_fns = {
      'metric/precision@1':
          metrics_lib.make_ranking_metric_fn(
              metrics_lib.RankingMetricKey.PRECISION, topn=1),
  }
  head = ranking_head.create_ranking_head(
      loss_fn=_make_loss_fn(weights_feature_name),
      eval_metric_fns=metric_fns)
  weights = self._default_weights
  # Create estimator spec.
  spec = head.create_estimator_spec(
      features={weights_feature_name: weights},
      mode=tf.estimator.ModeKeys.EVAL,
      logits=self._default_logits,
      labels=self._default_labels)
  expected_metrics = [
      'labels_mean',
      'logits_mean',
      'metric/precision@1',
  ]
  with self.cached_session() as sess:
    _initialize_variables(self, spec.scaffold)
    update_ops = {
        k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops
    }
    loss, metrics = sess.run((spec.loss, update_ops))
    self.assertAllClose(self._default_weighted_loss, loss)
    self.assertItemsEqual(expected_metrics, metrics.keys())
def _eval_metric_fns(self):
  """Returns a dict from name to metric functions."""
  metric_fns = {}
  metric_fns.update({
      "metric/ndcg_%d" % topn: metrics.make_ranking_metric_fn(
          metrics.RankingMetricKey.NDCG, topn=topn) for topn in [5, 10]
  })
  metric_fns.update({
      "metric/mrr_%d" % topn: metrics.make_ranking_metric_fn(
          metrics.RankingMetricKey.MRR, topn=topn) for topn in [10]
  })
  metric_fns.update({
      "metric/%s" % name: metrics.make_ranking_metric_fn(name)
      for name in [metrics.RankingMetricKey.MRR,
                   metrics.RankingMetricKey.NDCG]
  })
  return metric_fns
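
# Assuming the key strings are RankingMetricKey.MRR == 'mrr' and
# RankingMetricKey.NDCG == 'ndcg', the dict above ends up with the keys
# metric/ndcg_5, metric/ndcg_10, metric/mrr_10, metric/mrr, and metric/ndcg
# (the last two evaluate over the full list, i.e. no topn cutoff).
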
def test_make_recall_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    labels = [[1., 0., 1.], [0., 1., 2.]]
    features = {}
    m = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.RECALL)
    m_top_1 = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.RECALL, topn=1)
    m_top_2 = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.RECALL, topn=2)
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), 2. / 2.),
        (m_top_1([labels[0]], [scores[0]], features), 0. / 2.),
        (m_top_2([labels[0]], [scores[0]], features), 1. / 2.),
        (m_top_2(labels, scores, features), (1. / 2. + 2. / 2.) / 2.),
    ])
def test_eval(self):
  with tf.Graph().as_default():
    metric_fns = {
        'metric/precision@1':
            metrics_lib.make_ranking_metric_fn(
                metrics_lib.RankingMetricKey.PRECISION, topn=1),
    }
    head1 = ranking_head.create_ranking_head(
        loss_fn=_make_loss_fn(), eval_metric_fns=metric_fns, name='head1')
    head2 = ranking_head.create_ranking_head(
        loss_fn=_make_loss_fn(), eval_metric_fns=metric_fns, name='head2')
    multi_head = ranking_head.create_multi_ranking_head([head1, head2])
    logits = {
        'head1': tf.convert_to_tensor(value=[[1., 3.], [1., 2.]]),
        'head2': tf.convert_to_tensor(value=[[2., 3.], [2., 2.]]),
    }
    labels = {
        'head1': tf.convert_to_tensor(value=[[0., 1.], [0., 2.]]),
        'head2': tf.convert_to_tensor(value=[[0., 1.], [0., 2.]]),
    }
    spec = multi_head.create_estimator_spec(
        features={},
        mode=tf.estimator.ModeKeys.EVAL,
        logits=logits,
        labels=labels)
    expected_metrics = [
        'head1/labels_mean',
        'head1/logits_mean',
        'head1/metric/precision@1',
        'head2/labels_mean',
        'head2/logits_mean',
        'head2/metric/precision@1',
    ]
    # Assert spec contains expected tensors.
    self.assertIsNotNone(spec.loss)
    self.assertIsNone(spec.train_op)
    self.assertIsNone(spec.export_outputs)
    self.assertCountEqual(spec.eval_metric_ops.keys(), expected_metrics)
    # Assert predictions, loss, and metrics.
    with self.cached_session() as sess:
      _initialize_variables(self, spec.scaffold)
      self.assertIsNone(spec.scaffold.summary_op)
      update_ops = {
          k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops
      }
      loss, metrics = sess.run((spec.loss, update_ops))
      self.assertAllClose(loss, 10.)
      self.assertItemsEqual(metrics.keys(), expected_metrics)
def test_make_precision_ia_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    labels = [[[0., 0.], [0., 0.], [1., 0.]],
              [[0., 0.], [1., 0.], [1., 1.]]]
    features = {}
    m = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.PRECISION_IA)
    m_top_1 = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.PRECISION_IA, topn=1)
    m_top_2 = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.PRECISION_IA, topn=2)
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), 1. / 3.),
        (m_top_1([labels[0]], [scores[0]], features), 0. / 1.),
        (m_top_2([labels[0]], [scores[0]], features), 1. / 2.),
        (m(labels, scores, features), (1. / 3. + 3. / 6.) / 2.),
    ])
def test_make_ordered_pair_accuracy_fn(self):
  scores = [[1., 3., 2.], [1., 2., 3.]]
  labels = [[0., 0., 1.], [0., 1., 2.]]
  m = metrics.make_ranking_metric_fn(
      metrics.RankingMetricKey.ORDERED_PAIR_ACCURACY)
  self._check_metrics([
      (m([labels[0]], [scores[0]], {}), 1. / 2.),
      (m([labels[1]], [scores[1]], {}), 1.),
      (m(labels, scores, {}), (1. + 3.) / (2. + 3.)),
  ])
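
# The ordered-pair-accuracy expectations count, over all pairs (i, j) with
# labels[i] > labels[j], the fraction where scores[i] > scores[j]. A
# plain-Python sketch of that counting (illustrative; `_opa_counts` is a
# hypothetical name):
def _opa_counts(labels, scores):
  """Returns (correctly ordered pairs, total comparable pairs)."""
  correct = total = 0
  for i in range(len(labels)):
    for j in range(len(labels)):
      if labels[i] > labels[j]:
        total += 1
        correct += int(scores[i] > scores[j])
  return correct, total


# E.g. _opa_counts([0., 0., 1.], [1., 3., 2.]) == (1, 2), giving 1. / 2.
# above; the batched case pools counts across queries: (1 + 3) / (2 + 3).
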
def _get_metric_pair(key, weight=None, topn=None):
  """Helper function to construct metric name and function."""
  name = "".join([
      "metric/",
      "weighted_" if weight else "",
      key,
      "_%s" % topn if topn else "",
  ])
  return name, metrics.make_ranking_metric_fn(
      key, weights_feature_name=weight, topn=topn)
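
# Example (hypothetical values, assuming the key string is 'ndcg'):
#   _get_metric_pair('ndcg', weight='weights', topn=5)
#   -> ('metric/weighted_ndcg_5', <metric fn>)
#   _get_metric_pair('ndcg')
#   -> ('metric/ndcg', <metric fn>)
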
def test_make_mean_average_precision_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    # Note that scores are ranked in descending order, so the ranks are
    # [[3, 1, 2], [3, 2, 1]].
    labels = [[0., 0., 1.], [0., 1., 2.]]
    rels = [[0, 0, 1], [0, 1, 1]]
    features = {}
    m = metrics_lib.make_ranking_metric_fn(metrics_lib.RankingMetricKey.MAP)
    m_top_1 = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.MAP, topn=1)
    m_top_2 = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.MAP, topn=2)
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), _ap(rels[0], scores[0])),
        (m_top_1([labels[0]], [scores[0]], features),
         _ap(rels[0], scores[0], topn=1)),
        (m_top_2([labels[0]], [scores[0]], features),
         _ap(rels[0], scores[0], topn=2)),
        (m(labels, scores, features),
         sum(_ap(rels[i], scores[i]) for i in range(2)) / 2.),
    ])
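
# The MAP test compares against a module-level `_ap` helper not shown in
# this excerpt. A sketch consistent with how it is called (sum of
# precision@k over the ranks k of relevant items within `topn`, normalized
# by the number of relevant items; assumed, not the library implementation):
def _ap(relevances, scores, topn=None):
  """Returns average precision for a single ranked list."""
  order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
  cutoff = len(order) if topn is None else topn
  num_rel_seen = 0
  precision_sum = 0.
  for rank, idx in enumerate(order[:cutoff], start=1):
    if relevances[idx] > 0:
      num_rel_seen += 1
      precision_sum += num_rel_seen / rank
  num_rel = sum(1 for rel in relevances if rel > 0)
  return precision_sum / num_rel if num_rel else 0.
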
def test_make_normalized_discounted_cumulative_gain_fn(self):
  scores = [[1., 3., 2.], [1., 2., 3.]]
  labels = [[0., 0., 1.], [0., 1., 2.]]
  weights = [[1., 2., 3.], [4., 5., 6.]]
  weights_feature_name = 'weights'
  features = {weights_feature_name: weights[0]}
  m = metrics_lib.make_ranking_metric_fn(metrics_lib.RankingMetricKey.NDCG)
  expected_ndcg = (_dcg(0., 1) + _dcg(1., 2) + _dcg(0., 3)) / (
      _dcg(1., 1) + _dcg(0., 2) + _dcg(0., 3))
  self._check_metrics([
      (m([labels[0]], [scores[0]], features), expected_ndcg),
  ])
  expected_ndcg_1 = (_dcg(0., 1) + _dcg(1., 2) + _dcg(0., 3)) / (
      _dcg(1., 1) + _dcg(0., 2) + _dcg(0., 3))
  expected_ndcg_2 = 1.0
  expected_ndcg = (expected_ndcg_1 + expected_ndcg_2) / 2.0
  self._check_metrics([
      (m(labels, scores, features), expected_ndcg),
  ])
  # With weights.
  m_top = metrics_lib.make_ranking_metric_fn(
      metrics_lib.RankingMetricKey.NDCG,
      weights_feature_name=weights_feature_name,
      topn=1)
  m_weight = metrics_lib.make_ranking_metric_fn(
      metrics_lib.RankingMetricKey.NDCG,
      weights_feature_name=weights_feature_name)
  self._check_metrics([
      (m_top([labels[0]], [scores[0]], features),
       _dcg(0., 1, 2.) / _dcg(1., 1, 3.)),
      (m_weight([labels[0]], [scores[0]], features),
       (_dcg(0., 1, 2.) + _dcg(1., 2, 3.) + _dcg(0., 3, 1.)) /
       (_dcg(1., 1, 3.) + _dcg(0., 2, 1.) + _dcg(0., 3, 2.))),
  ])
def test_eval(self):
  with tf.Graph().as_default():
    metric_fns = {
        'metric/precision@1':
            metrics_lib.make_ranking_metric_fn(
                metrics_lib.RankingMetricKey.PRECISION, topn=1),
    }
    head = ranking_head.create_ranking_head(
        loss_fn=_make_loss_fn(), eval_metric_fns=metric_fns)
    # Create estimator spec.
    spec = head.create_estimator_spec(
        features=self._default_features_dict,
        mode=tf.estimator.ModeKeys.EVAL,
        logits=self._default_logits,
        labels=self._default_labels)
    expected_metrics = [
        'labels_mean',
        'logits_mean',
        'metric/precision@1',
    ]
    # Assert spec contains expected tensors.
    self.assertIsNotNone(spec.loss)
    self.assertIsNone(spec.train_op)
    self.assertIsNone(spec.export_outputs)
    self.assertItemsEqual(expected_metrics, spec.eval_metric_ops.keys())
    # Assert predictions, loss, and metrics.
    with self.cached_session() as sess:
      _initialize_variables(self, spec.scaffold)
      self.assertIsNone(spec.scaffold.summary_op)
      update_ops = {
          k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops
      }
      loss, metrics = sess.run((spec.loss, update_ops))
      self.assertAllClose(self._default_loss, loss)
      self.assertItemsEqual(expected_metrics, metrics.keys())
def test_make_normalized_discounted_cumulative_gain_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    # Note that scores are ranked in descending order.
    ranks = [[3, 1, 2], [3, 2, 1]]
    labels = [[0., 0., 1.], [0., 1., 2.]]
    weights = [[1., 2., 3.], [4., 5., 6.]]
    weights_3d = [[[1.], [2.], [3.]], [[4.], [5.], [6.]]]
    list_weights = [1., 0.]
    list_weights_2d = [[1.], [0.]]
    weights_feature_name = 'weights'
    weights_invalid_feature_name = 'weights_invalid'
    weights_3d_feature_name = 'weights_3d'
    list_weights_name = 'list_weights'
    list_weights_2d_name = 'list_weights_2d'
    features = {
        weights_feature_name: [weights[0]],
        weights_invalid_feature_name: weights[0],
        weights_3d_feature_name: [weights_3d[0]],
        list_weights_name: list_weights,
        list_weights_2d_name: list_weights_2d
    }
    m = metrics_lib.make_ranking_metric_fn(metrics_lib.RankingMetricKey.NDCG)
    expected_ndcg = (_dcg(0., 1) + _dcg(1., 2) + _dcg(0., 3)) / (
        _dcg(1., 1) + _dcg(0., 2) + _dcg(0., 3))
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), expected_ndcg),
    ])
    expected_ndcg_1 = (_dcg(0., 1) + _dcg(1., 2) + _dcg(0., 3)) / (
        _dcg(1., 1) + _dcg(0., 2) + _dcg(0., 3))
    expected_ndcg_2 = 1.0
    expected_ndcg = (expected_ndcg_1 + expected_ndcg_2) / 2.0
    self._check_metrics([
        (m(labels, scores, features), expected_ndcg),
    ])
    # With item-wise weights.
    m_top = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=weights_feature_name,
        topn=1)
    m_weight = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=weights_feature_name)
    m_weights_3d = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=weights_3d_feature_name)
    self._check_metrics([
        (m_top([labels[0]], [scores[0]], features),
         _dcg(0., 1, 2.) / _dcg(1., 1, 3.)),
        (m_weight([labels[0]], [scores[0]], features),
         (_dcg(0., 1, 2.) + _dcg(1., 2, 3.) + _dcg(0., 3, 1.)) /
         (_dcg(1., 1, 3.) + _dcg(0., 2, 1.) + _dcg(0., 3, 2.))),
        (m_weights_3d([labels[0]], [scores[0]], features),
         (_dcg(0., 1, 2.) + _dcg(1., 2, 3.) + _dcg(0., 3, 1.)) /
         (_dcg(1., 1, 3.) + _dcg(0., 2, 1.) + _dcg(0., 3, 2.))),
    ])
    with self.assertRaises(ValueError):
      m_weight_invalid = metrics_lib.make_ranking_metric_fn(
          metrics_lib.RankingMetricKey.NDCG,
          weights_feature_name=weights_invalid_feature_name)
      m_weight_invalid([labels[0]], [scores[0]], features)
    # With list-wise weights.
    m_list_weight = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=list_weights_name)
    m_list_weight_2d = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=list_weights_2d_name)
    self._check_metrics([
        (m_list_weight(labels, scores, features),
         (_dcg(0., 1, 2.) + _dcg(1., 2, 3.) + _dcg(0., 3, 1.)) /
         (_dcg(1., 1, 3.) + _dcg(0., 2, 1.) + _dcg(0., 3, 2.))),
        (m_list_weight_2d(labels, scores, features),
         (_dcg(0., 1, 2.) + _dcg(1., 2, 3.) + _dcg(0., 3, 1.)) /
         (_dcg(1., 1, 3.) + _dcg(0., 2, 1.) + _dcg(0., 3, 2.))),
    ])
    # Testing different gain and discount functions.
    gain_fn = lambda rel: rel
    rank_discount_fn = lambda rank: 1. / rank

    def mod_dcg_fn(l, r):
      return _dcg(l, r, gain_fn=gain_fn, rank_discount_fn=rank_discount_fn)

    m_mod = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        gain_fn=gain_fn,
        rank_discount_fn=rank_discount_fn)
    list_size = len(scores[0])
    expected_modified_dcg_1 = sum([
        mod_dcg_fn(labels[0][ind], ranks[0][ind]) for ind in range(list_size)
    ])
    self._check_metrics([
        (m_mod([labels[0]], [scores[0]], features), expected_modified_dcg_1),
    ])
def test_make_alpha_discounted_cumulative_gain_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    # Note that scores are ranked in descending order.
    # ranks = [[3, 1, 2], [3, 2, 1]]
    labels = [[[0., 0.], [0., 1.], [0., 1.]],
              [[0., 0.], [1., 0.], [1., 1.]]]
    # cum_labels = [[[0., 2.], [0., 0.], [0., 1.]],
    #               [[2., 1.], [1., 1.], [0., 0.]]]
    weights = [[1., 2., 3.], [4., 5., 6.]]
    weights_3d = [[[1.], [2.], [3.]], [[4.], [5.], [6.]]]
    list_weights = [1., 0.]
    list_weights_2d = [[1.], [0.]]
    weights_feature_name = 'weights'
    weights_invalid_feature_name = 'weights_invalid'
    weights_3d_feature_name = 'weights_3d'
    list_weights_name = 'list_weights'
    list_weights_2d_name = 'list_weights_2d'
    features = {
        weights_feature_name: [weights[0]],
        weights_invalid_feature_name: weights[0],
        weights_3d_feature_name: [weights_3d[0]],
        list_weights_name: list_weights,
        list_weights_2d_name: list_weights_2d
    }
    m = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.ALPHA_DCG)
    expected_alphadcg = (_alpha_dcg([0., 1.], [0., 0.], 1) +
                         _alpha_dcg([0., 1.], [0., 1.], 2) +
                         _alpha_dcg([0., 0.], [0., 2.], 3))
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), expected_alphadcg),
    ])
    expected_alphadcg_1 = (_alpha_dcg([0., 1.], [0., 0.], 1) +
                           _alpha_dcg([0., 1.], [0., 1.], 2) +
                           _alpha_dcg([0., 0.], [0., 2.], 3))
    expected_alphadcg_2 = (_alpha_dcg([1., 1.], [0., 0.], 1) +
                           _alpha_dcg([1., 0.], [1., 1.], 2) +
                           _alpha_dcg([0., 0.], [2., 1.], 3))
    expected_alphadcg = (expected_alphadcg_1 + expected_alphadcg_2) / 2.0
    self._check_metrics([
        (m(labels, scores, features), expected_alphadcg),
    ])
    # With item-wise weights.
    m_top = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.ALPHA_DCG,
        weights_feature_name=weights_feature_name,
        topn=1)
    m_weight = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.ALPHA_DCG,
        weights_feature_name=weights_feature_name)
    m_weights_3d = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.ALPHA_DCG,
        weights_feature_name=weights_3d_feature_name)
    self._check_metrics([
        (m_top([labels[0]], [scores[0]], features),
         _alpha_dcg([0., 1.], [0., 0.], 1, 2.) / 2.5),
        (m_weight([labels[0]], [scores[0]], features),
         (_alpha_dcg([0., 1.], [0., 0.], 1, 2.) +
          _alpha_dcg([0., 1.], [0., 1.], 2, 3.) +
          _alpha_dcg([0., 0.], [0., 2.], 3, 1.)) / 2.5),
        (m_weights_3d([labels[0]], [scores[0]], features),
         (_alpha_dcg([0., 1.], [0., 0.], 1, 2.) +
          _alpha_dcg([0., 1.], [0., 1.], 2, 3.) +
          _alpha_dcg([0., 0.], [0., 2.], 3, 1.)) / 2.5),
    ])
    with self.assertRaises(ValueError):
      m_weight_invalid = metrics_lib.make_ranking_metric_fn(
          metrics_lib.RankingMetricKey.ALPHA_DCG,
          weights_feature_name=weights_invalid_feature_name)
      m_weight_invalid([labels[0]], [scores[0]], features)
    # With list-wise weights.
    m_list_weight = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.ALPHA_DCG,
        weights_feature_name=list_weights_name)
    m_list_weight_2d = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.ALPHA_DCG,
        weights_feature_name=list_weights_2d_name)
    self._check_metrics([
        (m_list_weight(labels, scores, features),
         (_alpha_dcg([0., 1.], [0., 0.], 1, 1.) +
          _alpha_dcg([0., 1.], [0., 1.], 2, 1.) +
          _alpha_dcg([0., 0.], [0., 2.], 3, 1.))),
        (m_list_weight_2d(labels, scores, features),
         (_alpha_dcg([0., 1.], [0., 0.], 1, 1.) +
          _alpha_dcg([0., 1.], [0., 1.], 2, 1.) +
          _alpha_dcg([0., 0.], [0., 2.], 3, 1.))),
    ])
    # Test different gain and discount functions.
    alpha = 0.2
    rank_discount_fn = lambda rank: 1. / rank
    mod_alpha_dcg_fn = functools.partial(
        _alpha_dcg, alpha=alpha, rank_discount_fn=rank_discount_fn)
    m_mod = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.ALPHA_DCG,
        rank_discount_fn=rank_discount_fn,
        alpha=alpha)
    expected_modified_alphadcg_1 = (
        mod_alpha_dcg_fn([0., 1.], [0., 0.], 1) +
        mod_alpha_dcg_fn([0., 1.], [0., 1.], 2) +
        mod_alpha_dcg_fn([0., 0.], [0., 2.], 3))
    self._check_metrics([
        (m_mod([labels[0]], [scores[0]], features),
         expected_modified_alphadcg_1),
    ])
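
# The `_alpha_dcg` helper used above is not shown in this excerpt. A sketch
# consistent with how it is called: `labels` holds one item's per-subtopic
# judgments, `cum_labels` the per-subtopic counts accumulated at earlier
# ranks, and each subtopic's gain is damped by (1 - alpha) per prior
# occurrence (assumed defaults; requires `import math`):
def _alpha_dcg(labels,
               cum_labels,
               rank,
               weight=1.0,
               alpha=0.5,
               rank_discount_fn=lambda r: 1.0 / math.log(r + 1.0, 2.0)):
  """Returns a single item's weighted alpha-DCG contribution at `rank`."""
  gain = sum(l * (1. - alpha)**c for l, c in zip(labels, cum_labels))
  return weight * gain * rank_discount_fn(rank)
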
def test_make_normalized_discounted_cumulative_gain_fn(self):
  with tf.Graph().as_default():
    scores = [[1., 3., 2.], [1., 2., 3.]]
    labels = [[0., 0., 1.], [0., 1., 2.]]
    weights = [[1., 2., 3.], [4., 5., 6.]]
    weights_3d = [[[1.], [2.], [3.]], [[4.], [5.], [6.]]]
    list_weights = [1., 0.]
    list_weights_2d = [[1.], [0.]]
    weights_feature_name = 'weights'
    weights_invalid_feature_name = 'weights_invalid'
    weights_3d_feature_name = 'weights_3d'
    list_weights_name = 'list_weights'
    list_weights_2d_name = 'list_weights_2d'
    features = {
        weights_feature_name: [weights[0]],
        weights_invalid_feature_name: weights[0],
        weights_3d_feature_name: [weights_3d[0]],
        list_weights_name: list_weights,
        list_weights_2d_name: list_weights_2d
    }
    m = metrics_lib.make_ranking_metric_fn(metrics_lib.RankingMetricKey.NDCG)
    expected_ndcg = (_dcg(0., 1) + _dcg(1., 2) + _dcg(0., 3)) / (
        _dcg(1., 1) + _dcg(0., 2) + _dcg(0., 3))
    self._check_metrics([
        (m([labels[0]], [scores[0]], features), expected_ndcg),
    ])
    expected_ndcg_1 = (_dcg(0., 1) + _dcg(1., 2) + _dcg(0., 3)) / (
        _dcg(1., 1) + _dcg(0., 2) + _dcg(0., 3))
    expected_ndcg_2 = 1.0
    expected_ndcg = (expected_ndcg_1 + expected_ndcg_2) / 2.0
    self._check_metrics([
        (m(labels, scores, features), expected_ndcg),
    ])
    # With item-wise weights.
    m_top = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=weights_feature_name,
        topn=1)
    m_weight = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=weights_feature_name)
    m_weights_3d = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=weights_3d_feature_name)
    self._check_metrics([
        (m_top([labels[0]], [scores[0]], features),
         _dcg(0., 1, 2.) / _dcg(1., 1, 3.)),
        (m_weight([labels[0]], [scores[0]], features),
         (_dcg(0., 1, 2.) + _dcg(1., 2, 3.) + _dcg(0., 3, 1.)) /
         (_dcg(1., 1, 3.) + _dcg(0., 2, 1.) + _dcg(0., 3, 2.))),
        (m_weights_3d([labels[0]], [scores[0]], features),
         (_dcg(0., 1, 2.) + _dcg(1., 2, 3.) + _dcg(0., 3, 1.)) /
         (_dcg(1., 1, 3.) + _dcg(0., 2, 1.) + _dcg(0., 3, 2.))),
    ])
    with self.assertRaises(ValueError):
      m_weight_invalid = metrics_lib.make_ranking_metric_fn(
          metrics_lib.RankingMetricKey.NDCG,
          weights_feature_name=weights_invalid_feature_name)
      m_weight_invalid([labels[0]], [scores[0]], features)
    # With list-wise weights.
    m_list_weight = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=list_weights_name)
    m_list_weight_2d = metrics_lib.make_ranking_metric_fn(
        metrics_lib.RankingMetricKey.NDCG,
        weights_feature_name=list_weights_2d_name)
    self._check_metrics([
        (m_list_weight(labels, scores, features),
         (_dcg(0., 1, 2.) + _dcg(1., 2, 3.) + _dcg(0., 3, 1.)) /
         (_dcg(1., 1, 3.) + _dcg(0., 2, 1.) + _dcg(0., 3, 2.))),
        (m_list_weight_2d(labels, scores, features),
         (_dcg(0., 1, 2.) + _dcg(1., 2, 3.) + _dcg(0., 3, 1.)) /
         (_dcg(1., 1, 3.) + _dcg(0., 2, 1.) + _dcg(0., 3, 2.))),
    ])