def testRunModelAnalysisWithPlots(self):
  model_location = self._exportEvalSavedModel(
      fixed_prediction_estimator.simple_fixed_prediction_estimator)
  examples = [
      self._makeExample(prediction=0.0, label=1.0),
      self._makeExample(prediction=0.7, label=0.0),
      self._makeExample(prediction=0.8, label=1.0),
      self._makeExample(prediction=1.0, label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  eval_result = model_eval_lib.run_model_analysis(
      model_location,
      data_location,
      add_metrics_callbacks=[post_export_metrics.auc_plots()])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected_metrics = {
      (): {
          metric_keys.EXAMPLE_COUNT: 4.0,
      }
  }
  expected_plots = {
      (): {
          metric_keys.AUC_PLOTS_MATRICES: [(8001,
                                            [2, 1, 0, 1, 1.0 / 1.0,
                                             1.0 / 3.0])],
      }
  }
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected_metrics)
  self.assertPlotsAlmostEqual(eval_result.plots, expected_plots)
def testEvaluateWithPlots(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(prediction=0.0, label=1.0)
    example2 = self._makeExample(prediction=0.7, label=0.0)
    example3 = self._makeExample(prediction=0.8, label=1.0)
    example4 = self._makeExample(prediction=1.0, label=1.0)

    metrics, plots = (
        pipeline
        | beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | evaluate.Evaluate(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=[
                post_export_metrics.example_count(),
                post_export_metrics.auc_plots()
            ]))

    def check_metrics(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                metric_keys.EXAMPLE_COUNT: 4.0,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_metrics, label='metrics')

    def check_plots(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictMatrixRowsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                metric_keys.AUC_PLOTS_MATRICES: [
                    (8001, [2, 1, 0, 1, 1.0 / 1.0, 1.0 / 3.0])
                ],
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(plots, check_plots, label='plots')
def testAucPlotsUnweighted(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  examples = [
      self._makeExample(prediction=0.0000, label=0.0000),
      self._makeExample(prediction=0.0000, label=1.0000),
      self._makeExample(prediction=0.7000, label=1.0000),
      self._makeExample(prediction=0.8000, label=0.0000),
      self._makeExample(prediction=1.0000, label=1.0000),
  ]

  def check_result(got):  # pylint: disable=invalid-name
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertIn(metric_keys.AUC_PLOTS_MATRICES, value)
      matrices = value[metric_keys.AUC_PLOTS_MATRICES]
      #             |      | ---------- Threshold -----------
      #  true label | pred | -1e-6 | 0.0 | 0.7 | 0.8 | 1.0
      #      -      | 0.0  |  FP   | TN  | TN  | TN  | TN
      #      +      | 0.0  |  TP   | FN  | FN  | FN  | FN
      #      +      | 0.7  |  TP   | TP  | FN  | FN  | FN
      #      -      | 0.8  |  FP   | FP  | FP  | TN  | TN
      #      +      | 1.0  |  TP   | TP  | TP  | TP  | FN
      self.assertSequenceAlmostEqual(matrices[0],
                                     [0, 0, 2, 3, 3.0 / 5.0, 1.0])
      self.assertSequenceAlmostEqual(matrices[1],
                                     [1, 1, 1, 2, 2.0 / 3.0, 2.0 / 3.0])
      self.assertSequenceAlmostEqual(matrices[7001],
                                     [2, 1, 1, 1, 1.0 / 2.0, 1.0 / 3.0])
      self.assertSequenceAlmostEqual(matrices[8001],
                                     [2, 2, 0, 1, 1.0 / 1.0, 1.0 / 3.0])
      self.assertSequenceAlmostEqual(matrices[10001],
                                     [3, 2, 0, 0, float('nan'), 0.0])
      self.assertIn(metric_keys.AUC_PLOTS_THRESHOLDS, value)
      thresholds = value[metric_keys.AUC_PLOTS_THRESHOLDS]
      self.assertAlmostEqual(0.0, thresholds[1])
      self.assertAlmostEqual(0.001, thresholds[11])
      self.assertAlmostEqual(0.005, thresholds[51])
      self.assertAlmostEqual(0.010, thresholds[101])
      self.assertAlmostEqual(0.100, thresholds[1001])
      self.assertAlmostEqual(0.800, thresholds[8001])
      self.assertAlmostEqual(1.000, thresholds[10001])
    except AssertionError as err:
      raise util.BeamAssertException(err)

  self._runTestWithCustomCheck(
      examples,
      eval_export_dir, [post_export_metrics.auc_plots()],
      custom_plots_check=check_result)
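# A minimal sketch (not part of the test above) of how each row of
# AUC_PLOTS_MATRICES can be derived by hand. The row layout
# [fn, tn, fp, tp, precision, recall] is inferred from the expected values
# asserted above; confusion_matrix_row is an illustrative helper, not a
# TFMA API.


def confusion_matrix_row(examples, threshold):
  """Returns [fn, tn, fp, tp, precision, recall] for one threshold."""
  tp = sum(1 for pred, label in examples if pred > threshold and label == 1.0)
  fp = sum(1 for pred, label in examples if pred > threshold and label == 0.0)
  fn = sum(1 for pred, label in examples if pred <= threshold and label == 1.0)
  tn = sum(1 for pred, label in examples if pred <= threshold and label == 0.0)
  precision = float(tp) / (tp + fp) if tp + fp else float('nan')
  recall = float(tp) / (tp + fn) if tp + fn else float('nan')
  return [fn, tn, fp, tp, precision, recall]


# For the five (prediction, label) pairs in testAucPlotsUnweighted, a
# threshold of 0.8 yields [2, 2, 0, 1, 1.0, 1.0 / 3.0], matching the
# matrices[8001] assertion above.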
def main():
  tf.logging.set_verbosity(tf.logging.INFO)
  args = parse_arguments()
  tfma_result = run_tfma(
      input_csv=args.input_csv,
      tfma_run_dir=args.tfma_run_dir,
      eval_model_base_dir=args.eval_model_dir,
      slice_spec=ALL_SPECS,
      working_dir=args.tfma_run_dir,
      mode=args.mode,
      project=args.project,
      setup_file=args.setup_file,
      add_metrics_callbacks=[
          post_export_metrics.calibration_plot_and_prediction_histogram(),
          post_export_metrics.auc_plots()
      ])
def process_tfma(eval_result_dir,
                 schema_file,
                 input_csv=None,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    eval_result_dir: A directory where the evaluation result should be written
      to.
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    input_csv: A path to a csv file which should be the input for evaluation.
      This can only be set if big_query_table is None.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to
      the beam pipeline.

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """
  if input_csv == big_query_table and input_csv is None:
    raise ValueError(
        'one of --input_csv or --big_query_table should be provided.')

  slice_spec = [
      slicer.SingleSliceSpec(),
      slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]

  schema = taxi.read_schema(schema_file)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_csv:
      csv_coder = taxi.make_csv_coder(schema)
      raw_data = (
          pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              input_csv, skip_header_lines=1)
          | 'ParseCSV' >> beam.Map(csv_coder.decode))
    else:
      assert big_query_table
      query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
      raw_feature_spec = taxi.get_raw_feature_spec(schema)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'CleanData' >> beam.Map(
              lambda x: taxi.clean_raw_data_dict(x, raw_feature_spec)))

    # Examples must be in clean tf-example format.
    coder = taxi.make_proto_coder(schema)

    _ = (
        raw_data
        | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
            eval_saved_model_path=eval_model_dir,
            slice_spec=slice_spec,
            add_metrics_callbacks=[
                post_export_metrics.calibration_plot_and_prediction_histogram(),
                post_export_metrics.auc_plots()
            ],
            output_path=eval_result_dir))
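# A minimal usage sketch for process_tfma, assuming CSV input. The paths and
# the runner flag below are hypothetical placeholders; only the keyword
# arguments come from the signature above.
#
#   process_tfma(
#       eval_result_dir='/tmp/tfma_result',
#       schema_file='/tmp/schema.pbtxt',
#       input_csv='/tmp/eval_data.csv',
#       eval_model_dir='/tmp/eval_model',
#       pipeline_args=['--runner=DirectRunner'])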
def testAucPlotSerialization(self):
  # Plot data for a model evaluated on two examples:
  #   {prediction: 0.3, true_label: +}
  #   {prediction: 0.7, true_label: -}
  #
  # These plots were generated by hand. For this test to make sense, they
  # must actually match the kind of output TFMA produces.
  tfma_plots = {
      metric_keys.AUC_PLOTS_MATRICES:
          np.array([
              [0.0, 0.0, 1.0, 1.0, 0.5, 1.0],
              [0.0, 0.0, 1.0, 1.0, 0.5, 1.0],
              [1.0, 0.0, 1.0, 0.0, 0.0, 0.0],
              [1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
          ]),
      metric_keys.AUC_PLOTS_THRESHOLDS:
          np.array([1e-6, 0, 0.5, 1.0]),
  }
  expected_plot_data = """
      confusion_matrix_at_thresholds {
        matrices {
          threshold: 1e-6
          true_positives: 1.0
          false_positives: 1.0
          true_negatives: 0.0
          false_negatives: 0.0
          precision: 0.5
          recall: 1.0
        }
      }
      confusion_matrix_at_thresholds {
        matrices {
          threshold: 0
          true_positives: 1.0
          false_positives: 1.0
          true_negatives: 0.0
          false_negatives: 0.0
          precision: 0.5
          recall: 1.0
        }
      }
      confusion_matrix_at_thresholds {
        matrices {
          threshold: 0.5
          true_positives: 0.0
          false_positives: 1.0
          true_negatives: 0.0
          false_negatives: 1.0
          precision: 0.0
          recall: 0.0
        }
      }
      confusion_matrix_at_thresholds {
        matrices {
          threshold: 1.0
          true_positives: 0.0
          false_positives: 0.0
          true_negatives: 1.0
          false_negatives: 1.0
          precision: 0.0
          recall: 0.0
        }
      }
  """
  plot_data = metrics_for_slice_pb2.PlotData()
  auc_plots = post_export_metrics.auc_plots()
  auc_plots.populate_plots_and_pop(tfma_plots, plot_data)
  self.assertProtoEquals(expected_plot_data, plot_data)
  self.assertNotIn(metric_keys.AUC_PLOTS_MATRICES, tfma_plots)
  self.assertNotIn(metric_keys.AUC_PLOTS_THRESHOLDS, tfma_plots)
def main():
  tf.logging.set_verbosity(tf.logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--eval_model_dir',
      help='Input path to the model which will be evaluated.')
  parser.add_argument(
      '--eval_result_dir',
      help='Output directory in which the model analysis result is written.')
  parser.add_argument(
      '--big_query_table',
      help='BigQuery path to input examples which will be evaluated.')
  parser.add_argument(
      '--input_csv',
      help='CSV file containing raw data which will be evaluated.')
  parser.add_argument(
      '--max_eval_rows',
      help='Maximum number of rows to evaluate on.',
      default=None,
      type=int)
  known_args, pipeline_args = parser.parse_known_args()

  if known_args.eval_result_dir:
    eval_result_dir = known_args.eval_result_dir
  else:
    eval_result_dir = tempfile.mkdtemp()

  slice_spec = [
      slicer.SingleSliceSpec(),
      slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if known_args.input_csv:
      csv_coder = taxi.make_csv_coder()
      raw_data = (
          pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              known_args.input_csv, skip_header_lines=1)
          | 'ParseCSV' >> beam.Map(csv_coder.decode))
    elif known_args.big_query_table:
      query = taxi.make_sql(
          known_args.big_query_table, known_args.max_eval_rows, for_eval=True)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True)))
    else:
      raise ValueError(
          'one of --input_csv or --big_query_table should be provided.')

    # Examples must be in clean tf-example format.
    raw_feature_spec = taxi.get_raw_feature_spec()
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    coder = example_proto_coder.ExampleProtoCoder(raw_schema)

    _ = (
        raw_data
        | 'CleanData' >> beam.Map(taxi.clean_raw_data_dict)
        | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
            eval_saved_model_path=known_args.eval_model_dir,
            slice_spec=slice_spec,
            add_metrics_callbacks=[
                post_export_metrics.calibration_plot_and_prediction_histogram(),
                post_export_metrics.auc_plots()
            ],
            output_path=eval_result_dir,
            desired_batch_size=100))
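# Example invocation of this script (the file name and paths are hypothetical;
# the flags are the ones defined by the argument parser above):
#
#   python chicago_taxi_tfma.py \
#     --eval_model_dir=/tmp/eval_model \
#     --input_csv=/tmp/eval_data.csv \
#     --eval_result_dir=/tmp/tfma_result \
#     --max_eval_rows=1000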