def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_metrics = got[0]
    self.assertEqual(got_slice_key, ())
    ndcg1_key = metric_types.MetricKey(
        name='ndcg', sub_key=metric_types.SubKey(top_k=1))
    ndcg2_key = metric_types.MetricKey(
        name='ndcg', sub_key=metric_types.SubKey(top_k=2))
    # Query1 (weight=1): (p=0.8, g=0.5) (p=0.2, g=1.0)
    # Query2 (weight=2): (p=0.9, g=1.0) (p=0.5, g=0.5) (p=0.1, g=0.1)
    # Query3 (weight=3): (p=0.9, g=1.0)
    #
    # DCG@1: 0.5, 1.0, 1.0
    # NDCG@1: 0.5, 1.0, 1.0
    # Average NDCG@1: (1 * 0.5 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.92
    #
    # DCG@2: (0.5 + 1.0/log(3)), (1.0 + 0.5/log(3)), 1.0
    # NDCG@2: (0.5 + 1.0/log(3)) / (1.0 + 0.5/log(3)),
    #         (1.0 + 0.5/log(3)) / (1.0 + 0.5/log(3)),
    #         1.0
    # Average NDCG@2: (1 * 0.860 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.97
    self.assertDictElementsAlmostEqual(
        got_metrics, {
            ndcg1_key: 0.9166667,
            ndcg2_key: 0.9766198
        },
        places=5)
  except AssertionError as err:
    raise util.BeamAssertException(err)

def _macro_average_sub_keys(
    sub_key: Optional[metric_types.SubKey],
    class_weights: Dict[int, float]) -> Iterable[metric_types.SubKey]:
  """Returns sub-keys required in order to compute macro average sub-key.

  Args:
    sub_key: SubKey associated with macro_average or weighted_macro_average.
    class_weights: Class weights associated with sub-key.

  Raises:
    ValueError: If an invalid sub-key is passed or class weights are required
      but not passed.
  """
  if not sub_key:
    if not class_weights:
      raise ValueError(
          'class_weights are required in order to compute macro average over '
          'all classes: sub_key={}, class_weights={}'.format(
              sub_key, class_weights))
    return [metric_types.SubKey(class_id=i) for i in class_weights.keys()]
  elif sub_key.top_k:
    return [metric_types.SubKey(k=i + 1) for i in range(sub_key.top_k)]
  else:
    raise ValueError('invalid sub_key for performing macro averaging: '
                     'sub_key={}'.format(sub_key))

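# Usage sketch (not part of the original module); assumes the TFMA
# metric_types module is importable as below and that _macro_average_sub_keys
# is in scope. It illustrates the two cases handled above.
from tensorflow_model_analysis.metrics import metric_types

# Macro averaging over all classes: the class ids come from class_weights.
class_sub_keys = _macro_average_sub_keys(None, {0: 1.0, 1: 1.0, 2: 1.0})
# -> [SubKey(class_id=0), SubKey(class_id=1), SubKey(class_id=2)]

# Macro averaging over a top_k sub-key expands into k=1..top_k sub-keys.
top_k_sub_keys = _macro_average_sub_keys(metric_types.SubKey(top_k=3), {})
# -> [SubKey(k=1), SubKey(k=2), SubKey(k=3)]
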
def _create_sub_keys(
    spec: config.MetricsSpec) -> Optional[List[metric_types.SubKey]]:
  """Creates subkeys associated with spec."""
  sub_keys = None
  if spec.HasField('binarize'):
    sub_keys = []
    if spec.binarize.class_ids.values:
      for v in spec.binarize.class_ids.values:
        sub_keys.append(metric_types.SubKey(class_id=v))
    if spec.binarize.k_list.values:
      for v in spec.binarize.k_list.values:
        sub_keys.append(metric_types.SubKey(k=v))
    if spec.binarize.top_k_list.values:
      for v in spec.binarize.top_k_list.values:
        sub_keys.append(metric_types.SubKey(top_k=v))
    if spec.aggregate.micro_average:
      # Micro averaging is performed by flattening the labels and predictions
      # and treating them as independent pairs. This is done by default by
      # most metrics whenever binarization is not used. If micro-averaging and
      # binarization are used, then we need to create an empty subkey to
      # ensure the overall aggregate key is still computed. Note that the
      # class_weights should always be passed to all metric calculations to
      # ensure they are taken into account when flattening is required.
      sub_keys.append(None)
  return sub_keys  # pytype: disable=bad-return-type

def _create_sub_keys(
    spec: config.MetricsSpec
) -> Dict[Optional[metric_types.AggregationType],
          List[Optional[metric_types.SubKey]]]:
  """Creates sub keys per aggregation type."""
  result = {}
  if spec.HasField('binarize'):
    sub_keys = []
    if spec.binarize.class_ids.values:
      for v in spec.binarize.class_ids.values:
        sub_keys.append(metric_types.SubKey(class_id=v))
    if spec.binarize.k_list.values:
      for v in spec.binarize.k_list.values:
        sub_keys.append(metric_types.SubKey(k=v))
    if spec.binarize.top_k_list.values:
      for v in spec.binarize.top_k_list.values:
        sub_keys.append(metric_types.SubKey(top_k=v))
    if sub_keys:
      result[None] = sub_keys
  if spec.HasField('aggregate'):
    sub_keys = []
    for top_k in spec.aggregate.top_k_list.values:
      sub_keys.append(metric_types.SubKey(top_k=top_k))
    if not sub_keys:
      sub_keys = [None]
    result[_aggregation_type(spec)] = sub_keys
  return result if result else {None: [None]}

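# Usage sketch (not part of the original module); assumes the TFMA config
# protos, metric_types, and the _create_sub_keys / _aggregation_type helpers
# above are in scope. The expected result shown is approximate.
spec = config.MetricsSpec(
    binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
    aggregate=config.AggregationOptions(macro_average=True))
sub_keys_by_aggregation_type = _create_sub_keys(spec)
# Roughly: {None: [SubKey(class_id=0), SubKey(class_id=1)],
#           AggregationType(macro_average=True): [None]}
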
def check_metrics(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_metrics = got[0]
    example_count_key = metric_types.MetricKey(name='example_count')
    weighted_example_count_key = metric_types.MetricKey(
        name='weighted_example_count')
    label_key_class_0 = metric_types.MetricKey(
        name='mean_label', sub_key=metric_types.SubKey(class_id=0))
    label_key_class_1 = metric_types.MetricKey(
        name='mean_label', sub_key=metric_types.SubKey(class_id=1))
    label_key_class_2 = metric_types.MetricKey(
        name='mean_label', sub_key=metric_types.SubKey(class_id=2))
    self.assertEqual(got_slice_key, ())
    self.assertDictElementsAlmostEqual(
        got_metrics, {
            example_count_key: 4,
            weighted_example_count_key: (1.0 + 2.0 + 3.0 + 4.0),
            label_key_class_0: (1 * 1.0 + 0 * 2.0 + 0 * 3.0 + 0 * 4.0) /
                               (1.0 + 2.0 + 3.0 + 4.0),
            label_key_class_1: (0 * 1.0 + 1 * 2.0 + 0 * 3.0 + 1 * 4.0) /
                               (1.0 + 2.0 + 3.0 + 4.0),
            label_key_class_2: (0 * 1.0 + 0 * 2.0 + 1 * 3.0 + 0 * 4.0) /
                               (1.0 + 2.0 + 3.0 + 4.0)
        })
  except AssertionError as err:
    raise util.BeamAssertException(err)

def _metric_keys(metrics: Iterable[tf.keras.metrics.Metric], model_name: Text,
                 output_names: Iterable[Text]) -> List[metric_types.MetricKey]:
  """Returns metric keys for given metrics."""
  # We need to use the metric name to determine the associated output because
  # keras does not provide an API (see b/149780822). Keras names its metrics
  # using the following format:
  #   <output_name>_[weighted]_<metric_name>
  result = []
  for metric in metrics:
    sub_key = None
    if hasattr(metric, 'class_id') and metric.class_id is not None:
      sub_key = metric_types.SubKey(class_id=metric.class_id)
    elif hasattr(metric, 'top_k') and metric.top_k is not None:
      sub_key = metric_types.SubKey(top_k=metric.top_k)
    for output_name in output_names or []:
      if metric.name.startswith(output_name + '_'):
        # TODO(b/171559113): Output prefixes used to be added multiple times.
        # Remove this while loop after the last TF version with the issue is
        # no longer supported.
        name = metric.name
        while name.startswith(output_name + '_'):
          name = name[len(output_name) + 1:]
        result.append(
            metric_types.MetricKey(
                name=name,
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key))
        break
    else:
      result.append(
          metric_types.MetricKey(
              name=metric.name, model_name=model_name, sub_key=sub_key))
  return result

def test_partition_slices_with_metric_sub_key(self):
  metrics = self._get_metrics()
  # Set sub_key.
  for metric in metrics:
    for kv in metric.metric_keys_and_values:
      kv.key.sub_key.MergeFrom(metric_types.SubKey(class_id=0).to_proto())
  result = auto_slicing_util.partition_slices(
      metrics,
      metric_key=metric_types.MetricKey(
          name='accuracy', sub_key=metric_types.SubKey(class_id=0)),
      comparison_type='LOWER')
  self.assertCountEqual([s.slice_key for s in result[0]],
                        [(('age', '[1.0, 6.0)'),)])
  self.assertCountEqual([s.slice_key for s in result[1]],
                        [(('age', '[6.0, 12.0)'),),
                         (('age', '[12.0, 18.0)'),),
                         (('country', 'USA'),),
                         (('country', 'USA'), ('age', '[12.0, 18.0)'))])
  result = auto_slicing_util.partition_slices(
      metrics,
      metric_key=metric_types.MetricKey(
          name='accuracy', sub_key=metric_types.SubKey(class_id=0)),
      comparison_type='HIGHER')
  self.assertCountEqual([s.slice_key for s in result[0]],
                        [(('age', '[12.0, 18.0)'),),
                         (('country', 'USA'),),
                         (('country', 'USA'), ('age', '[12.0, 18.0)'))])
  self.assertCountEqual([s.slice_key for s in result[1]],
                        [(('age', '[1.0, 6.0)'),),
                         (('age', '[6.0, 12.0)'),)])

def testToComputations(self):
  computations = metric_specs.to_computations(
      metric_specs.specs_from_metrics(
          {
              'output_name': [
                  tf.keras.metrics.MeanSquaredError('mse'),
                  calibration.MeanLabel('mean_label')
              ]
          },
          model_names=['model_name'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True)),
      config.EvalConfig())

  keys = []
  for m in computations:
    for k in m.keys:
      if not k.name.startswith('_'):
        keys.append(k)
  self.assertLen(keys, 8)
  self.assertIn(metric_types.MetricKey(name='example_count'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name',
          output_name='output_name'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse', model_name='model_name', output_name='output_name'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name'), keys)

def testSubKeyStr(self):
  self.assertEqual(str(metric_types.SubKey(class_id=1)), 'classId:1')
  self.assertEqual(str(metric_types.SubKey(top_k=2)), 'topK:2')
  self.assertEqual(str(metric_types.SubKey(k=3)), 'k:3')
  with self.assertRaises(
      NotImplementedError,
      msg=('A non-existent SubKey should be represented as None, not as '
           'SubKey(None, None, None).')):
    str(metric_types.SubKey())

def testPlotKeyFromProto(self):
  plot_keys = [
      metric_types.PlotKey(name=''),
      metric_types.PlotKey(
          name='',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)),
      metric_types.MetricKey(
          name='',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(top_k=2))
  ]
  for key in plot_keys:
    got_key = metric_types.PlotKey.from_proto(key.to_proto())
    self.assertEqual(key, got_key, '{} != {}'.format(key, got_key))

def get_metrics_for_all_slices(
    self,
    output_name: Text = '',
    class_id: Optional[int] = None,
    k: Optional[int] = None,
    top_k: Optional[int] = None) -> Dict[Text, MetricsByTextKey]:
  """Get metric names and values for every slice.

  Args:
    output_name: The name of the output (optional, only used for multi-output
      models).
    class_id: Used with multi-class metrics to identify a specific class ID.
    k: Used with multi-class metrics to identify the kth predicted value.
    top_k: Used with multi-class and ranking metrics to identify top-k
      predicted values.

  Returns:
    Dictionary mapping slices to metric names and values.
  """
  if class_id or k or top_k:
    sub_key = str(metric_types.SubKey(class_id, k, top_k))
  else:
    sub_key = ''

  sliced_metrics = {}
  for slicing_metric in self.slicing_metrics:
    slice_name = slicing_metric[0]
    metrics = slicing_metric[1][output_name][sub_key]
    sliced_metrics[slice_name] = {
        metric_name: metric_value
        for metric_name, metric_value in metrics.items()
    }
  return sliced_metrics  # pytype: disable=bad-return-type

def _create_sub_keys(
    spec: config.MetricsSpec) -> Optional[List[metric_types.SubKey]]:
  """Creates subkeys associated with spec."""
  sub_keys = None
  if spec.HasField('binarize'):
    sub_keys = []
    if spec.binarize.class_ids:
      for v in spec.binarize.class_ids:
        sub_keys.append(metric_types.SubKey(class_id=v))
    if spec.binarize.k_list:
      for v in spec.binarize.k_list:
        sub_keys.append(metric_types.SubKey(k=v))
    if spec.binarize.top_k_list:
      for v in spec.binarize.top_k_list:
        sub_keys.append(metric_types.SubKey(top_k=v))
  return sub_keys

def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_metrics = got[0]
    self.assertEqual(got_slice_key, ())
    self.assertLen(got_metrics, 1)
    key = metric_types.MetricKey(
        name='_binary_confusion_matrices_[-inf]',
        sub_key=metric_types.SubKey(top_k=3))
    self.assertIn(key, got_metrics)
    got_matrices = got_metrics[key]
    self.assertEqual(
        got_matrices,
        binary_confusion_matrices.Matrices(
            thresholds=[float('-inf')],
            tp=[2.0],
            fp=[10.0],
            tn=[6.0],
            fn=[2.0],
            tp_examples=[],
            tn_examples=[],
            fp_examples=[],
            fn_examples=[]))
  except AssertionError as err:
    raise util.BeamAssertException(err)

def testTFMetricWithClassID(self):
  computation = tf_metric_wrapper.tf_metric_computations(
      [tf.keras.metrics.MeanSquaredError(name='mse')],
      sub_key=metric_types.SubKey(class_id=1),
      example_weighted=False)[0]

  example1 = {
      'labels': [2],
      'predictions': [0.5, 0.0, 0.5],
      'example_weights': [0.1]  # ignored, example_weighted=False
  }
  example2 = {
      'labels': [0],
      'predictions': [0.2, 0.5, 0.3],
      'example_weights': [0.2]  # ignored, example_weighted=False
  }
  example3 = {
      'labels': [1],
      'predictions': [0.2, 0.3, 0.5],
      'example_weights': [0.3]  # ignored, example_weighted=False
  }
  example4 = {
      'labels': [1],
      'predictions': [0.0, 0.9, 0.1],
      'example_weights': [0.4]  # ignored, example_weighted=False
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        mse_key = metric_types.MetricKey(
            name='mse',
            sub_key=metric_types.SubKey(class_id=1),
            example_weighted=False)
        self.assertDictElementsAlmostEqual(got_metrics, {
            mse_key: 0.1875,
        })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_metrics = got[0]
    self.assertEqual(got_slice_key, ())
    if class_id:
      sub_key = metric_types.SubKey(class_id=class_id)
    else:
      sub_key = metric_types.SubKey(class_id=metric.get_config()['class_id'])
    key = metric_types.MetricKey(
        name=metric.name, sub_key=sub_key, example_weighted=True)
    self.assertDictElementsAlmostEqual(
        got_metrics, {key: expected_value}, places=5)
  except AssertionError as err:
    raise util.BeamAssertException(err)

def testMetricKeyFromProto(self):
  metric_keys = [
      metric_types.MetricKey(name='metric_name'),
      metric_types.MetricKey(
          name='metric_name',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True),
      metric_types.MetricKey(
          name='metric_name',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(top_k=2),
          aggregation_type=metric_types.AggregationType(micro_average=True))
  ]
  for key in metric_keys:
    got_key = metric_types.MetricKey.from_proto(key.to_proto())
    self.assertEqual(key, got_key, '{} != {}'.format(key, got_key))

def get_attributions_for_slice(
    self,
    slice_name: slicer.SliceKeyType = (),
    metric_name: str = '',
    output_name: str = '',
    class_id: Optional[int] = None,
    k: Optional[int] = None,
    top_k: Optional[int] = None) -> Union[AttributionsByFeatureKey, None]:
  """Get attribution feature names and values for a slice.

  Args:
    slice_name: A tuple of the form (column, value), indicating which slice to
      get attributions from. Optional; if excluded, use the overall slice.
    metric_name: Name of the metric to get attributions for. Optional if only
      one metric is used.
    output_name: The name of the output. Optional, only used for multi-output
      models.
    class_id: Used with multi-class models to identify a specific class ID.
    k: Used with multi-class models to identify the kth predicted value.
    top_k: Used with multi-class models to identify top-k attribution values.

  Returns:
    Dictionary containing feature keys and values for the specified slice.

  Raises:
    ValueError: If metric_name is required but not provided.
  """
  if class_id or k or top_k:
    sub_key = str(metric_types.SubKey(class_id, k, top_k))
  else:
    sub_key = ''

  def equals_slice_name(slice_key):
    if not slice_key:
      return not slice_name
    else:
      return slice_key == slice_name

  for sliced_attributions in self.attributions:
    slice_key = sliced_attributions[0]
    slice_val = sliced_attributions[1]
    if equals_slice_name(slice_key):
      if metric_name:
        return slice_val[output_name][sub_key][metric_name]
      elif len(slice_val[output_name][sub_key]) == 1:
        return list(slice_val[output_name][sub_key].values())[0]
      else:
        raise ValueError(
            'metric_name must be one of the following: {}'.format(
                slice_val[output_name][sub_key].keys()))
  # If the slice could not be found, return None.
  return None

def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_plots = got[0]
    self.assertEqual(got_slice_key, ())
    self.assertLen(got_plots, 1)
    key = metric_types.PlotKey(
        name='_calibration_histogram_10000',
        sub_key=metric_types.SubKey(top_k=2),
        example_weighted=True)
    self.assertIn(key, got_plots)
    got_histogram = got_plots[key]
    self.assertLen(got_histogram, 5)
    self.assertEqual(
        got_histogram[0],
        calibration_histogram.Bucket(
            bucket_id=0,
            weighted_labels=3.0 + 4.0,
            weighted_predictions=(2 * 1.0 * float('-inf') +
                                  2 * 2.0 * float('-inf') +
                                  2 * 3.0 * float('-inf') +
                                  2 * 4.0 * float('-inf') + -0.1 * 4.0),
            weighted_examples=(1.0 * 2.0 + 2.0 * 2.0 + 3.0 * 2.0 + 4.0 * 3.0)))
    self.assertEqual(
        got_histogram[1],
        calibration_histogram.Bucket(
            bucket_id=2001,
            weighted_labels=0.0 + 0.0,
            weighted_predictions=0.2 + 3 * 0.2,
            weighted_examples=1.0 + 3.0))
    self.assertEqual(
        got_histogram[2],
        calibration_histogram.Bucket(
            bucket_id=5001,
            weighted_labels=1.0 + 0.0 * 3.0,
            weighted_predictions=0.5 * 1.0 + 0.5 * 3.0,
            weighted_examples=1.0 + 3.0))
    self.assertEqual(
        got_histogram[3],
        calibration_histogram.Bucket(
            bucket_id=8001,
            weighted_labels=0.0 * 2.0 + 1.0 * 2.0,
            weighted_predictions=0.8 * 2.0 + 0.8 * 2.0,
            weighted_examples=2.0 + 2.0))
    self.assertEqual(
        got_histogram[4],
        calibration_histogram.Bucket(
            bucket_id=10001,
            weighted_labels=0.0 * 4.0,
            weighted_predictions=1.1 * 4.0,
            weighted_examples=4.0))
  except AssertionError as err:
    raise util.BeamAssertException(err)

def _verify_and_update_sub_key(model_name: Text, output_name: Text,
                               sub_key: metric_types.SubKey,
                               metric: _TFMetricOrLoss):
  """Verifies the multi-class metric key matches settings used by the metric."""
  if hasattr(metric, _CLASS_ID_KEY) and metric.class_id is not None:
    if sub_key and sub_key.class_id != metric.class_id:
      raise ValueError(
          '{} tf.keras.metric has class_id = {}, but the metric is being '
          'added using sub_key = {}: model_name={}, output_name={}'.format(
              metric.name, metric.class_id, sub_key, model_name, output_name))
    return metric_types.SubKey(class_id=metric.class_id)
  elif hasattr(metric, _TOP_K_KEY) and metric.top_k is not None:
    if sub_key and sub_key.top_k != metric.top_k:
      raise ValueError(
          '{} tf.keras.metric has top_k = {}, but the metric is being added '
          'using sub_key = {}: model_name={}, output_name={}'.format(
              metric.name, metric.top_k, sub_key, model_name, output_name))
    return metric_types.SubKey(top_k=metric.top_k)
  else:
    return sub_key

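# Usage sketch (not part of the original module); assumes tf, metric_types and
# _verify_and_update_sub_key are in scope. The model/output names are
# illustrative placeholders.
precision_at_3 = tf.keras.metrics.Precision(top_k=3)

# With no sub_key given, the sub-key is derived from the metric's own setting.
derived = _verify_and_update_sub_key('model', 'output', None, precision_at_3)
# -> SubKey(top_k=3)

# A conflicting sub_key (e.g. SubKey(top_k=5)) would raise a ValueError.
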
def testMetricKeyStrForMetricKeyWithAllFields(self):
  self.assertEqual(
      str(
          metric_types.MetricKey(
              name='metric_name',
              model_name='model_name',
              output_name='output_name',
              sub_key=metric_types.SubKey(class_id=1),
              is_diff=True)),
      'name: "metric_name" output_name: "output_name" ' +
      'sub_key: { class_id: { value: 1 } } model_name: "model_name" ' +
      'is_diff: true')

def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_metrics = got[0]
    self.assertEqual(got_slice_key, ())
    key = metric_types.MetricKey(
        name=metric_name, sub_key=metric_types.SubKey(top_k=top_k))
    self.assertDictElementsAlmostEqual(
        got_metrics, {key: expected_value}, places=5)
  except AssertionError as err:
    raise util.BeamAssertException(err)

def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_metrics = got[0]
    self.assertEqual(got_slice_key, ())
    mse_key = metric_types.MetricKey(
        name='mse', sub_key=metric_types.SubKey(class_id=1))
    self.assertDictElementsAlmostEqual(got_metrics, {
        mse_key: 0.1875,
    })
  except AssertionError as err:
    raise util.BeamAssertException(err)

def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_plots = got[0]
    self.assertEqual(got_slice_key, ())
    self.assertLen(got_plots, 1)
    key = metric_types.PlotKey(
        name='_calibration_histogram_10000',
        sub_key=metric_types.SubKey(k=2),
        example_weighted=True)
    self.assertIn(key, got_plots)
    got_histogram = got_plots[key]
    self.assertLen(got_histogram, 5)
    self.assertEqual(
        got_histogram[0],
        calibration_histogram.Bucket(
            bucket_id=0,
            weighted_labels=0.0 * 4.0,
            weighted_predictions=-0.2 * 4.0,
            weighted_examples=4.0))
    self.assertEqual(
        got_histogram[1],
        calibration_histogram.Bucket(
            bucket_id=1001,
            weighted_labels=1.0 + 7 * 1.0,
            weighted_predictions=0.1 + 7 * 0.1,
            weighted_examples=1.0 + 7.0))
    self.assertEqual(
        got_histogram[2],
        calibration_histogram.Bucket(
            bucket_id=4001,
            weighted_labels=1.0 * 3.0 + 0.0 * 5.0,
            weighted_predictions=0.4 * 3.0 + 0.4 * 5.0,
            weighted_examples=3.0 + 5.0))
    self.assertEqual(
        got_histogram[3],
        calibration_histogram.Bucket(
            bucket_id=7001,
            weighted_labels=0.0 * 2.0 + 0.0 * 6.0,
            weighted_predictions=0.7 * 2.0 + 0.7 * 6.0,
            weighted_examples=2.0 + 6.0))
    self.assertEqual(
        got_histogram[4],
        calibration_histogram.Bucket(
            bucket_id=10001,
            weighted_labels=0.0 * 8.0,
            weighted_predictions=1.05 * 8.0,
            weighted_examples=8.0))
  except AssertionError as err:
    raise util.BeamAssertException(err)

def testStandardMetricInputsWithClassIDToNumpy(self):
  example = metric_types.StandardMetricInputs(
      label={'output_name': np.array([2])},
      prediction={'output_name': np.array([0, 0.5, 0.3, 0.9])},
      example_weight={'output_name': np.array([1.0])})
  got_label, got_pred, got_example_weight = next(
      metric_util.to_label_prediction_example_weight(
          example,
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=2)))

  self.assertAllClose(got_label, np.array([1.0]))
  self.assertAllClose(got_pred, np.array([0.3]))
  self.assertAllClose(got_example_weight, np.array([1.0]))

def testStandardMetricInputsWithTopKToNumpy(self):
  example = metric_types.StandardMetricInputs(
      {'output_name': np.array([1])},
      {'output_name': np.array([0, 0.5, 0.3, 0.9])},
      {'output_name': np.array([1.0])})
  got_label, got_pred, got_example_weight = (
      metric_util.to_label_prediction_example_weight(
          example,
          output_name='output_name',
          sub_key=metric_types.SubKey(top_k=2)))

  self.assertAllClose(got_label, np.array([0.0, 1.0]))
  self.assertAllClose(got_pred, np.array([0.9, 0.5]))
  self.assertAllClose(got_example_weight, np.array([1.0]))

def testStandardMetricInputsWithTopKToNumpy(self):
  example = metric_types.StandardMetricInputs(
      label={'output_name': np.array([1])},
      prediction={'output_name': np.array([0, 0.5, 0.3, 0.9])},
      example_weight={'output_name': np.array([1.0])})
  iterable = metric_util.to_label_prediction_example_weight(
      example,
      output_name='output_name',
      sub_key=metric_types.SubKey(top_k=2))

  for expected_label, expected_prediction in zip((0.0, 1.0), (0.9, 0.5)):
    got_label, got_pred, got_example_weight = next(iterable)
    self.assertAllClose(got_label, np.array([expected_label]))
    self.assertAllClose(got_pred, np.array([expected_prediction]))
    self.assertAllClose(got_example_weight, np.array([1.0]))

def _ndcg(gain_key: str,
          top_k_list: Optional[List[int]] = None,
          name: str = NDCG_NAME,
          eval_config: Optional[config_pb2.EvalConfig] = None,
          model_names: Optional[List[str]] = None,
          output_names: Optional[List[str]] = None,
          sub_keys: Optional[List[metric_types.SubKey]] = None,
          example_weighted: bool = False,
          query_key: str = '') -> metric_types.MetricComputations:
  """Returns metric computations for NDCG."""
  if not query_key:
    raise ValueError('a query_key is required to use NDCG metric')
  # Guard against sub_keys being None before filtering out None entries.
  sub_keys = [k for k in (sub_keys or []) if k is not None]
  if top_k_list:
    for k in top_k_list:
      if not any(sub_key.top_k == k for sub_key in sub_keys):
        sub_keys.append(metric_types.SubKey(top_k=k))
  if not sub_keys or any(sub_key.top_k is None for sub_key in sub_keys):
    raise ValueError(
        'top_k values are required to use NDCG metric: {}'.format(sub_keys))
  computations = []
  for model_name in model_names if model_names else ['']:
    for output_name in output_names if output_names else ['']:
      keys = []
      for sub_key in sub_keys:
        keys.append(
            metric_types.MetricKey(
                name,
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key,
                example_weighted=example_weighted))
      computations.append(
          metric_types.MetricComputation(
              keys=keys,
              preprocessor=metric_types.FeaturePreprocessor(
                  feature_keys=[query_key, gain_key]),
              combiner=_NDCGCombiner(
                  metric_keys=keys,
                  eval_config=eval_config,
                  model_name=model_name,
                  output_name=output_name,
                  example_weighted=example_weighted,
                  query_key=query_key,
                  gain_key=gain_key)))
  return computations

def testStandardMetricInputsWithTopKAndAggregationTypeToNumpy(self):
  example = metric_types.StandardMetricInputs(
      labels={'output_name': np.array([1])},
      predictions={'output_name': np.array([0, 0.5, 0.3, 0.9])},
      example_weights={'output_name': np.array([1.0])})
  iterator = metric_util.to_label_prediction_example_weight(
      example,
      output_name='output_name',
      sub_key=metric_types.SubKey(top_k=2),
      aggregation_type=metric_types.AggregationType(micro_average=True))

  for expected_label, expected_prediction in zip((1.0, 0.0), (0.5, 0.9)):
    got_label, got_pred, got_example_weight = next(iterator)
    self.assertAllClose(got_label, np.array([expected_label]))
    self.assertAllClose(got_pred, np.array([expected_prediction]))
    self.assertAllClose(got_example_weight, np.array([1.0]))

def testMacroAverage(self):
  metric_name = 'test'
  class_ids = [0, 1, 2]
  sub_keys = [metric_types.SubKey(class_id=i) for i in class_ids]
  sub_key_values = [0.1, 0.2, 0.3]
  computations = aggregation.macro_average(
      metric_name,
      sub_keys,
      eval_config=config_pb2.EvalConfig(),
      class_weights={
          0: 1.0,
          1: 1.0,
          2: 1.0
      })
  metric = computations[0]

  sub_metrics = {}
  for sub_key, value in zip(sub_keys, sub_key_values):
    key = metric_types.MetricKey(name=metric_name, sub_key=sub_key)
    sub_metrics[key] = value

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([((), sub_metrics)])
        | 'ComputeMetric' >> beam.Map(lambda x: (x[0], metric.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        expected_value = (0.1 + 0.2 + 0.3) / 3.0
        self.assertDictElementsAlmostEqual(
            got_metrics, {key: expected_value}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_metrics = got[0]
    self.assertEqual(got_slice_key, ())
    expected = {}
    for name, value in expected_values.items():
      sub_key = None
      if '@' in name:
        sub_key = metric_types.SubKey(top_k=int(name.split('@')[1]))
      key = metric_types.MetricKey(name=name, sub_key=sub_key)
      expected[key] = value
    self.assertDictElementsAlmostEqual(got_metrics, expected)
  except AssertionError as err:
    raise util.BeamAssertException(err)