def compare_eval_results(output_uri: Text, expected_uri: Text,
                         threshold: float, metrics: List[Text]) -> bool:
    """Checks selected metrics of two EvalResults against each other.

    Both results are loaded with `tfma.load_eval_result` and regrouped with
    `_group_metric_by_slice`. Only the group stored under the empty-tuple key
    is inspected, which is the entry this function treats as the overall
    dataset.

    Args:
      output_uri: pipeline output artifact uri.
      expected_uri: recorded pipeline output artifact uri.
      threshold: a float larger than 0, passed on to
        `_compare_relative_difference`.
      metrics: metric names to compare. Metrics of the output that are not
        listed here are skipped.

    Returns:
      True when every compared metric is accepted by
      `_compare_relative_difference`. False as soon as one is rejected; the
      rejected metric name is logged as a warning.
    """
    actual_result = tfma.load_eval_result(output_uri)
    recorded_result = tfma.load_eval_result(expected_uri)
    actual_by_slice = _group_metric_by_slice(actual_result)
    recorded_by_slice = _group_metric_by_slice(recorded_result)

    overall_key = ()
    for name, actual_value in actual_by_slice[overall_key].items():
        if name in metrics:
            # The recorded side is only indexed for metrics that are compared.
            recorded_value = recorded_by_slice[overall_key][name]
            if _compare_relative_difference(actual_value, recorded_value,
                                            threshold):
                continue
            logging.warning('Check following metric: %s', name)
            return False
    return True
def _generate_blessing_result(self, eval_examples_uri: Text,
                              slice_spec: List[tfma.slicer.SingleSliceSpec],
                              current_model_dir: Text,
                              blessed_model_dir: Text) -> bool:
    """Evaluates the current (and, if given, blessed) model and decides blessing.

    A Beam pipeline reads the TFRecord files under `eval_examples_uri` and
    writes TFMA results for each model into sub-directories of
    `self._temp_path`. The results are loaded only after the pipeline context
    has exited.

    Args:
      eval_examples_uri: location of the evaluation examples (TFRecord files).
      slice_spec: slice specs handed to `tfma.ExtractEvaluateAndWriteResults`.
      current_model_dir: directory of the model under evaluation.
      blessed_model_dir: directory of the previously blessed model, or None
        when there is none.

    Returns:
      False if the current model fails `self._pass_threshold`. Otherwise True
      when there is no blessed model or when `self._compare_eval_result`
      accepts the current model over the blessed one, and False when it does
      not.
    """
    current_result_dir = os.path.join(
        self._temp_path, constants.CURRENT_MODEL_EVAL_RESULT_PATH)
    blessed_result_dir = os.path.join(
        self._temp_path, constants.BLESSED_MODEL_EVAL_RESULT_PATH)

    with self._make_beam_pipeline() as pipeline:
        examples = (
            pipeline
            | 'ReadData' >> beam.io.ReadFromTFRecord(
                file_pattern=io_utils.all_files_pattern(eval_examples_uri)))

        candidate_model = tfma.default_eval_shared_model(
            eval_saved_model_path=path_utils.eval_model_path(
                current_model_dir))
        _ = (
            examples
            | 'EvalCurrentModel' >> tfma.ExtractEvaluateAndWriteResults(
                eval_shared_model=candidate_model,
                slice_spec=slice_spec,
                output_path=current_result_dir))

        if blessed_model_dir is not None:
            baseline_model = tfma.default_eval_shared_model(
                eval_saved_model_path=path_utils.eval_model_path(
                    blessed_model_dir))
            _ = (
                examples
                | 'EvalBlessedModel' >> tfma.ExtractEvaluateAndWriteResults(
                    eval_shared_model=baseline_model,
                    slice_spec=slice_spec,
                    output_path=blessed_result_dir))

    absl.logging.info(
        'all files in current_model_eval_result_path: [%s]',
        str(tf.io.gfile.listdir(current_result_dir)))
    current_result = tfma.load_eval_result(output_path=current_result_dir)

    if not self._pass_threshold(current_result):
        absl.logging.info('Current model does not pass threshold.')
        return False
    absl.logging.info('Current model passes threshold.')

    if blessed_model_dir is None:
        absl.logging.info('No blessed model yet.')
        return True

    absl.logging.info(
        'all files in blessed_model_eval_result: [%s]',
        str(tf.io.gfile.listdir(blessed_result_dir)))
    blessed_result = tfma.load_eval_result(output_path=blessed_result_dir)

    if not self._compare_eval_result(current_result, blessed_result):
        absl.logging.info('Current model worse than blessed model.')
        return False
    absl.logging.info('Current model better than blessed model.')
    return True
def _generate_blessing_result(self, eval_examples_uri, slice_spec,
                              current_model_dir, blessed_model_dir):
    """Runs TFMA on the current and blessed models and decides blessing.

    The Beam pipeline is built from `self._get_beam_pipeline_args()`. It reads
    TFRecord examples from `eval_examples_uri` and writes one TFMA result per
    model below `self._temp_path`. Results are read back once the pipeline
    context has exited.

    Args:
      eval_examples_uri: location of the evaluation examples (TFRecord files).
      slice_spec: slice specs handed to `tfma.ExtractEvaluateAndWriteResults`.
      current_model_dir: directory of the model under evaluation.
      blessed_model_dir: directory of the blessed model, or None if absent.

    Returns:
      False when the current model fails `self._pass_threshold`; True when it
      passes and there is no blessed model; otherwise True or False depending
      on whether `self._compare_eval_result` accepts the current model.
    """
    current_out = os.path.join(self._temp_path,
                               CURRENT_MODEL_EVAL_RESULT_PATH)
    blessed_out = os.path.join(self._temp_path,
                               BLESSED_MODEL_EVAL_RESULT_PATH)

    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
        records = (
            pipeline
            | 'ReadData' >> beam.io.ReadFromTFRecord(
                file_pattern=io_utils.all_files_pattern(eval_examples_uri)))

        shared_current = tfma.default_eval_shared_model(
            eval_saved_model_path=path_utils.eval_model_path(
                current_model_dir))
        _ = (
            records
            | 'EvalCurrentModel' >> tfma.ExtractEvaluateAndWriteResults(
                eval_shared_model=shared_current,
                slice_spec=slice_spec,
                output_path=current_out))

        if blessed_model_dir is not None:
            shared_blessed = tfma.default_eval_shared_model(
                eval_saved_model_path=path_utils.eval_model_path(
                    blessed_model_dir))
            _ = (
                records
                | 'EvalBlessedModel' >> tfma.ExtractEvaluateAndWriteResults(
                    eval_shared_model=shared_blessed,
                    slice_spec=slice_spec,
                    output_path=blessed_out))

    current_eval = tfma.load_eval_result(output_path=current_out)
    if not self._pass_threshold(current_eval):
        tf.logging.info('Current model does not pass threshold.')
        return False
    tf.logging.info('Current model passes threshold.')

    if blessed_model_dir is None:
        tf.logging.info('No blessed model yet.')
        return True

    blessed_eval = tfma.load_eval_result(output_path=blessed_out)
    if not self._compare_eval_result(current_eval, blessed_eval):
        tf.logging.info('Current model worse than blessed model.')
        return False
    tf.logging.info('Current model better than blessed model.')
    return True
def test_example_model(self):
    """Trains and evaluates the example model, then checks the sliced metrics."""
    train_records = self._write_tf_records(self._create_data())
    model = example_model.train_model(self._model_dir, train_records, LABEL,
                                      TEXT_FEATURE, FEATURE_MAP)

    validation_records = self._write_tf_records(self._create_data())
    result_dir = os.path.join(self._model_dir, 'tfma_eval_result')
    example_model.evaluate_model(model, validation_records, result_dir, SLICE,
                                 LABEL, FEATURE_MAP)

    known_slices = ['Overall', 'slice:slice3', 'slice:slice1', 'slice:slice2']
    results = tfma.load_eval_result(result_dir)
    self.assertLen(results.slicing_metrics, 4)

    # Every slice must report the metric below with a value inside [0, 1].
    # NOTE(review): the metric key literal looks mangled by an e-mail
    # obfuscation filter; it is kept exactly as found. Confirm against the
    # original test file.
    for raw_key, metrics in results.slicing_metrics:
        self.assertIn(slicer.stringify_slice_key(raw_key), known_slices)
        rate = metrics['']['']['post_export_metrics/[email protected]'][
            'doubleValue']
        self.assertGreaterEqual(1.0, rate)
        self.assertLessEqual(0.0, rate)
def generate_static_html_output(output_dir, slicing_columns):
    """Writes a static HTML page with one slicing-metrics view per column.

    The eval result is loaded from `output_dir`, and the page is written back
    into `output_dir` under `_OUTPUT_HTML_FILE`. A UI-metadata JSON document
    that points at the page is written to `/mlpipeline-ui-metadata.json`.

    Args:
      output_dir: directory holding the TFMA eval result; also receives the
        generated HTML file.
      slicing_columns: iterable of slicing column names; one view is rendered
        for each.
    """
    eval_result = tfma.load_eval_result(output_path=output_dir)
    views = [
        tfma.view.render_slicing_metrics(eval_result, slicing_column=column)
        for column in slicing_columns
    ]

    embedded = embed_data(views=views)
    manager_state = json.dumps(embedded['manager_state'])
    serialized_specs = [json.dumps(spec) for spec in embedded['view_specs']]
    widgets_html = "".join(
        _SINGLE_WIDGET_TEMPLATE.format(position, spec)
        for position, spec in enumerate(serialized_specs))

    page = _STATIC_HTML_TEMPLATE.format(
        manager_state=manager_state, widget_views=widgets_html)
    html_path = os.path.join(output_dir, _OUTPUT_HTML_FILE)
    file_io.write_string_to_file(html_path, page)

    ui_metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'gcs',
            'source': html_path,
        }]
    }
    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as metadata_file:
        json.dump(ui_metadata, metadata_file)
def _load_evaluation(self, file_path: Text) -> Dict[str, Any]:
    """Returns evaluations for a benchmark run.

    This method makes following assumptions:
      1. `tf.enable_v2_behavior()` was called beforehand.
      2. file_path points to a dir containing artifacts of single output model.

    Args:
      file_path: A root directory where pipeline's evaluation artifacts are
        stored.

    Returns:
      An evaluation metrics dictionary mapping each metric name to its
      'doubleValue' entry (None when a metric has no such entry). If no
      evaluations found then returns an empty dictionary.

    Raises:
      ValueError: if the first slice's metrics are not keyed by the empty
        output name and empty multi-class key, i.e. the result does not look
        like that of a single output model.
    """
    # We assume this is a single output model, hence the following keys are "".
    output_name = ''
    multi_class_key = ''

    eval_result = tfma.load_eval_result(file_path)
    # Honour the documented contract: nothing evaluated means an empty dict,
    # not an IndexError from indexing an empty list.
    if not eval_result.slicing_metrics:
        return {}

    # Slicing_metric is a tuple, index 0 is slice, index 1 is its value.
    _, metrics_dict = eval_result.slicing_metrics[0]
    if (output_name not in metrics_dict or
            multi_class_key not in metrics_dict[output_name]):
        raise ValueError(
            'Evaluation can only be loaded for single output model.')

    metrics_dict = metrics_dict[output_name][multi_class_key]
    return {k: v.get('doubleValue') for k, v in metrics_dict.items()}
def compare_eval_results(output_uri: Text, expected_uri: Text,
                         threshold: float) -> bool:
    """Checks all metrics of two EvalResults against each other.

    Both results are loaded with `tfma.load_eval_result` and regrouped with
    `_group_metric_by_slice`. Only the group stored under the empty-tuple key
    is inspected, which is the entry this function treats as the overall
    dataset.

    Args:
      output_uri: pipeline output artifact uri.
      expected_uri: recorded pipeline output artifact uri.
      threshold: a float between 0 and 1, passed on to
        `_compare_relative_difference`.

    Returns:
      True when every metric of the output is accepted by
      `_compare_relative_difference` against the recorded value; False as soon
      as one is rejected.
    """
    actual_result = tfma.load_eval_result(output_uri)
    recorded_result = tfma.load_eval_result(expected_uri)
    actual_by_slice = _group_metric_by_slice(actual_result)
    recorded_by_slice = _group_metric_by_slice(recorded_result)

    # all() stops at the first rejected metric, like an early `return False`.
    return all(
        _compare_relative_difference(actual_value,
                                     recorded_by_slice[()][name], threshold)
        for name, actual_value in actual_by_slice[()].items())
def compare_tfma_analysis(self, model_id, other_model_id):
    """Compares TFMA metrics for `model_id` and `other_model_id`.

    Args:
      model_id: A `int` indicating the id of a `TFXArtifactTypes.MODEL`
        artifact.
      other_model_id: A `int` indicating the id of another
        `TFXArtifactTypes.MODEL` artifact.

    Returns:
      A TimeSeriesViewer object if in Jupyter notebook; None if in Colab.
      Also None when a MODEL_EVAL artifact cannot be found for either model.
    """
    first_eval = self.get_dest_artifact_of_type(
        model_id, TFXArtifactTypes.MODEL_EVAL)
    second_eval = self.get_dest_artifact_of_type(
        other_model_id, TFXArtifactTypes.MODEL_EVAL)
    if not (first_eval and second_eval):
        return None

    loaded = [
        tfma.load_eval_result(first_eval.uri),
        tfma.load_eval_result(second_eval.uri),
    ]
    combined = tfma.make_eval_results(loaded,
                                      tfma.constants.MODEL_CENTRIC_MODE)
    return tfma.view.render_time_series(
        combined, tfma.slicer.slicer.SingleSliceSpec())
def read_metrics_eval_result(
        metrics_artifact_uri: Text) -> Optional[tfma.EvalResult]:
    """Loads the TFMA eval result stored at an evaluator output path.

    Args:
      metrics_artifact_uri: the output artifact path of a TFMA component.

    Returns:
      The loaded TFMA EvalResult when it carries slicing metrics. None, after
      logging a warning, when its `slicing_metrics` is empty.
    """
    eval_result = tfma.load_eval_result(metrics_artifact_uri)
    if eval_result.slicing_metrics:
        return eval_result

    logging.warning('Cannot load eval results from: %s', metrics_artifact_uri)
    return None
def display_tfma_analysis(self, model_id, slicing_column=None):
    """Displays TFMA metrics for `model_id` sliced by `slicing_column`.

    Args:
      model_id: A `int` indicating the id of a `TFXArtifactTypes.MODEL`
        artifact.
      slicing_column: (Optional) A `str` indicating the slicing column for the
        TFMA metrics.

    Returns:
      A SlicingMetricsViewer object if in Jupyter notebook; None if in Colab.
      Also None when no MODEL_EVAL artifact is found for `model_id`.
    """
    eval_artifact = self.get_dest_artifact_of_type(
        model_id, TFXArtifactTypes.MODEL_EVAL)
    if not eval_artifact:
        return None

    eval_result = tfma.load_eval_result(eval_artifact.uri)
    return tfma.view.render_slicing_metrics(
        eval_result, slicing_column=slicing_column)
def _get_evaluation_result(self, request):
    """Responds with UI input built from the eval result recorded for a run.

    The eval result directory is read from the first string value of the first
    tensor this plugin stored for the requested run. The result is loaded with
    TFMA and converted with `widget_view.convert_eval_result_to_ui_input`.

    Args:
      request: the incoming request; its `run` query argument names the run.

    Returns:
      A JSON response carrying the converted data, or an empty list when a
      KeyError or a proto ParseError occurs while fetching it.
    """
    run = request.args.get('run')
    try:
        run = six.ensure_text(run)
    except (UnicodeDecodeError, AttributeError, TypeError):
        # six.ensure_text raises TypeError for values that are neither text
        # nor bytes, e.g. None when the `run` argument is missing. Keep the
        # raw value and let the lookup below decide.
        pass

    try:
        eval_result_output_dir = six.ensure_text(
            self._multiplexer.Tensors(run, FairnessIndicatorsPlugin.plugin_name)
            [0].tensor_proto.string_val[0], 'utf-8')
        eval_result = tfma.load_eval_result(output_path=eval_result_output_dir)
        # TODO(b/141283811): Allow users to choose different model output names
        # and class keys in case of multi-output and multi-class model.
        data = widget_view.convert_eval_result_to_ui_input(eval_result)
    except (KeyError, json_format.ParseError) as error:
        logging.info('Error while fetching evaluation data, %s', error)
        data = []
    return http_util.Respond(request, data, content_type='application/json')
def _get_evaluation_result_from_remote_path(self, request):
    """Responds with UI input built from an eval result at a given path.

    The eval result is loaded from the directory containing
    `evaluation_output_path`, using the file format reported by
    `self._get_output_file_format` for that path.

    Args:
      request: the incoming request; its `evaluation_output_path` query
        argument points at the evaluation output file.

    Returns:
      A JSON response carrying the converted slicing metrics, or an empty list
      when the argument is missing or when a KeyError or a proto ParseError
      occurs while loading.
    """
    evaluation_output_path = request.args.get('evaluation_output_path')
    try:
        evaluation_output_path = six.ensure_text(evaluation_output_path)
    except (UnicodeDecodeError, AttributeError, TypeError):
        # six.ensure_text raises TypeError for values that are neither text
        # nor bytes, e.g. None when the argument is missing.
        pass

    data = []
    if evaluation_output_path is None:
        # Without a path there is nothing to load; answer like any other
        # fetch failure instead of failing inside os.path.dirname(None).
        logging.info('Error while fetching evaluation data, %s',
                     'missing evaluation_output_path argument')
    else:
        try:
            eval_result = tfma.load_eval_result(
                os.path.dirname(evaluation_output_path),
                output_file_format=self._get_output_file_format(
                    evaluation_output_path))
            data = widget_view.convert_slicing_metrics_to_ui_input(
                eval_result.slicing_metrics)
        except (KeyError, json_format.ParseError) as error:
            logging.info('Error while fetching evaluation data, %s', error)
            data = []
    return http_util.Respond(request, data, content_type='application/json')
def _annotate_eval_results(self, model_card: ModelCard) -> ModelCard:
    """Annotates a model card with info from TFMA evaluation results.

    Eval results are recorded on the model card through
    `tfx_util.annotate_eval_result_metrics`, and plots are added through
    `graphics.annotate_eval_result_plots`.

    Results are read from the TFMA source and from the MLMD store, whichever
    is configured. Using both may cause duplicates to be recorded. If neither
    is provided, this function is a no-op.

    Args:
      model_card: The model card object to annotate with TFMA EvalResult
        metrics.

    Returns:
      The model_card with eval result metrics annotated.
    """
    tfma_source = self._source and self._source.tfma
    if tfma_source:
        for result_path in tfma_source.eval_result_paths:
            loaded = tfma.load_eval_result(
                output_path=result_path,
                output_file_format=tfma_source.file_format)
            if not loaded:
                logging.info('EvalResult not found at path %s', result_path)
                continue

            logging.info('EvalResult found at path %s', result_path)
            # Filter only when an include or exclude list was configured.
            if tfma_source.metrics_include or tfma_source.metrics_exclude:
                loaded = tfx_util.filter_metrics(
                    loaded, tfma_source.metrics_include,
                    tfma_source.metrics_exclude)
            tfx_util.annotate_eval_result_metrics(model_card, loaded)
            graphics.annotate_eval_result_plots(model_card, loaded)

    if self._store:
        artifacts = tfx_util.get_metrics_artifacts_for_model(
            self._store, self._artifact_with_model_uri.id)
        for artifact in artifacts:
            stored = tfx_util.read_metrics_eval_result(artifact.uri)
            if stored is None:
                continue
            tfx_util.annotate_eval_result_metrics(model_card, stored)
            graphics.annotate_eval_result_plots(model_card, stored)

    return model_card
def read_metrics_eval_result(
        metrics_artifact_uri: str,
        output_file_format: Optional[str] = None
) -> Optional[tfma.EvalResult]:
    """Loads the TFMA eval result stored at an evaluator output path.

    Args:
      metrics_artifact_uri: the output artifact path of a TFMA component.
      output_file_format: an optional file format of the payload, passed on to
        `tfma.load_eval_result`.

    Returns:
      The loaded TFMA EvalResult when it carries slicing metrics. None, after
      logging a warning, when its `slicing_metrics` is empty.
    """
    eval_result = tfma.load_eval_result(
        output_path=metrics_artifact_uri,
        output_file_format=output_file_format)
    if eval_result.slicing_metrics:
        return eval_result

    logging.warning('Cannot load eval results from: %s', metrics_artifact_uri)
    return None
def __init__(self, **params):
    """Collects eval metrics and hyperparameters of succeeded training pipelines.

    Builds `self.results` (one row per pipeline and slice, passed through
    `parse_metrics`) and `self.hparam_info` (one row per pipeline), then
    offers the distinct pipeline names through the `pipeline_run_selector`
    parameter.

    Args:
      **params: forwarded unchanged to the parent class initializer.
    """
    super(Application, self).__init__(**params)

    metric_rows = []
    hparam_rows = []

    repo: Repository = Repository.get_instance()

    # All training pipelines in this workspace.
    training_pipelines: List[TrainingPipeline] = repo.get_pipelines_by_type(
        [TrainingPipeline.PIPELINE_TYPE])

    for pipeline in training_pipelines:
        # Querying the status slows the comparison down, but it is needed to
        # refresh the status of each run.
        if pipeline.get_status() != PipelineStatusTypes.Succeeded.name:
            continue

        evaluator_uri = pipeline.get_artifacts_uri_by_component(
            GDPComponent.Evaluator.name)[0]
        evaluation = tfma.load_eval_result(evaluator_uri)

        for slice_key, slice_metrics in evaluation.slicing_metrics:
            row = {
                'pipeline_name': '{}'.format(pipeline.name),
                'slice_name': slice_key[0][0] if slice_key else '',
                'slice_value': slice_key[0][1] if slice_key else '',
            }
            row.update({
                f'metric_{k}': slice_metrics[k][''] for k in slice_metrics
            })
            metric_rows.append(row)

        # NOTE(review): hyperparameters are gathered only for succeeded
        # pipelines here; the nesting was inferred from flattened source, so
        # confirm it against the original indentation.
        hparams = pipeline.get_hyperparameters()
        hparams['pipeline_name'] = pipeline.name
        hparam_rows.append(hparams)

    self.results = pd.DataFrame([parse_metrics(row) for row in metric_rows])
    self.hparam_info = pd.DataFrame(hparam_rows)

    # Offer every distinct pipeline name in the run selector.
    self.param.pipeline_run_selector.objects = self.results[
        'pipeline_name'].unique()
def run_tfma(slice_spec, eval_model_base_dir, tfma_run_dir, input_csv,
             working_dir, mode, project, setup_file,
             add_metrics_callbacks=None):
    """Does model analysis, using the given spec of how to 'slice', and returns
    an EvalResult that can be used with TFMA visualization functions.

    Args:
      slice_spec: slicing spec forwarded to `tfma.EvaluateAndWriteResults`.
      eval_model_base_dir: directory whose first listed entry is used as the
        eval saved model path.
      tfma_run_dir: output path the analysis is written to and the result is
        loaded from.
      input_csv: text input to read; the first line is skipped as a header.
      working_dir: base directory; its `tmp` sub-directory is used as the
        temp location.
      mode: 'local' to run with the DirectRunner, 'cloud' to run with the
        DataFlowRunner.
      project: project name placed into the pipeline options.
      setup_file: setup file placed into the pipeline options in cloud mode.
      add_metrics_callbacks: optional callbacks forwarded to
        `tfma.EvaluateAndWriteResults`.

    Returns:
      The EvalResult loaded from `tfma_run_dir` after the pipeline has run.

    Raises:
      RuntimeError: if no model directory without 'temp' in its path could be
        listed under `eval_model_base_dir` within the retry budget.
      ValueError: if `mode` is neither 'local' nor 'cloud'.
    """
    print("eval model base dir: %s" % eval_model_base_dir)

    # Make sure the model dir exists before proceeding, as sometimes it takes
    # a few seconds to become available after training completes.
    eval_model_dir = None
    sleeptime = 5
    for _ in range(20):
        try:
            candidate = os.path.join(
                eval_model_base_dir,
                file_io.list_directory(eval_model_base_dir)[0])
            print("eval model dir: %s" % candidate)
            if 'temp' not in candidate:
                eval_model_dir = candidate
                break
        except Exception as e:  # best effort: listing can fail until synced
            print(e)
        print("Sleeping %s seconds to sync with GCS..." % sleeptime)
        time.sleep(sleeptime)
        # NOTE(review): the back-off doubles without an upper bound.
        sleeptime *= 2

    if eval_model_dir is None:
        # Fail clearly here rather than with a NameError (or by evaluating a
        # temp directory) further down.
        raise RuntimeError(
            "No exported eval model found under %s." % eval_model_base_dir)

    schema = taxi.read_schema('schema.pbtxt')
    temp_dir = os.path.join(working_dir, 'tmp')

    if mode == 'local':
        print("mode == local")
        options = {'project': project}
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DirectRunner'
    elif mode == 'cloud':
        print("mode == cloud")
        options = {
            'job_name': 'tfma-' + str(uuid.uuid4()),
            'temp_location': temp_dir,
            'project': project,
            'save_main_session': True,
            'setup_file': setup_file,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataFlowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as pipeline:
        with beam_impl.Context(temp_dir=temp_dir):
            csv_coder = taxi.make_csv_coder(schema)
            raw_data = (
                pipeline
                | 'ReadFromText' >> beam.io.ReadFromText(
                    input_csv, skip_header_lines=1)
                | 'ParseCSV' >> beam.Map(csv_coder.decode))

            # Examples must be in clean tf-example format.
            coder = taxi.make_proto_coder(schema)
            raw_data = (
                raw_data
                | 'ToSerializedTFExample' >> beam.Map(coder.encode))

            _ = raw_data | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
                eval_saved_model_path=eval_model_dir,
                slice_spec=slice_spec,
                output_path=tfma_run_dir,
                add_metrics_callbacks=add_metrics_callbacks,
                display_only_data_location=input_csv)

    # The results are read only after the pipeline context has exited.
    return tfma.load_eval_result(output_path=tfma_run_dir)
def run_tfma(slice_spec, eval_model_base_dir, tfma_run_dir, input_csv,
             working_dir, mode, project, setup_file,
             add_metrics_callbacks=None):
    """Does model analysis, using the given spec of how to 'slice', and returns
    an EvalResult that can be used with TFMA visualization functions.

    Args:
      slice_spec: slicing spec forwarded to `tfma.EvaluateAndWriteResults`.
      eval_model_base_dir: directory whose first listed entry is used as the
        eval saved model path.
      tfma_run_dir: output path the analysis is written to and the result is
        loaded from.
      input_csv: text input to read; the first line is skipped as a header.
      working_dir: base directory; its `tmp` sub-directory is used as the
        temp location.
      mode: 'local' to run with the DirectRunner, 'cloud' to run with the
        DataFlowRunner.
      project: project name placed into the pipeline options.
      setup_file: setup file placed into the pipeline options in cloud mode.
      add_metrics_callbacks: optional callbacks forwarded to
        `tfma.EvaluateAndWriteResults`.

    Returns:
      The EvalResult loaded from `tfma_run_dir` after the pipeline has run.

    Raises:
      RuntimeError: if no model directory without 'temp' in its path could be
        listed under `eval_model_base_dir` within the retry budget.
      ValueError: if `mode` is neither 'local' nor 'cloud'.
    """
    print("eval model base dir: %s" % eval_model_base_dir)

    # Make sure the model dir exists before proceeding, as sometimes it takes
    # a few seconds to become available after training completes.
    eval_model_dir = None
    sleeptime = 5
    for _ in range(20):
        try:
            candidate = os.path.join(
                eval_model_base_dir,
                file_io.list_directory(eval_model_base_dir)[0])
            print("eval model dir: %s" % candidate)
            if 'temp' not in candidate:
                eval_model_dir = candidate
                break
        except Exception as e:  # best effort: listing can fail until synced
            print(e)
        print("Sleeping %s seconds to sync with GCS..." % sleeptime)
        time.sleep(sleeptime)
        # NOTE(review): the back-off doubles without an upper bound.
        sleeptime *= 2

    if eval_model_dir is None:
        # Fail clearly here rather than with a NameError (or by evaluating a
        # temp directory) further down.
        raise RuntimeError(
            "No exported eval model found under %s." % eval_model_base_dir)

    schema = taxi.read_schema('schema.pbtxt')
    temp_dir = os.path.join(working_dir, 'tmp')

    if mode == 'local':
        print("mode == local")
        options = {'project': project}
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DirectRunner'
    elif mode == 'cloud':
        print("mode == cloud")
        options = {
            'job_name': 'tfma-' + str(uuid.uuid4()),
            'temp_location': temp_dir,
            'project': project,
            'save_main_session': True,
            'setup_file': setup_file,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataFlowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as pipeline:
        with beam_impl.Context(temp_dir=temp_dir):
            csv_coder = taxi.make_csv_coder(schema)
            raw_data = (
                pipeline
                | 'ReadFromText' >> beam.io.ReadFromText(
                    input_csv, skip_header_lines=1)
                | 'ParseCSV' >> beam.Map(csv_coder.decode))

            # Examples must be in clean tf-example format.
            coder = taxi.make_proto_coder(schema)
            raw_data = (
                raw_data
                | 'ToSerializedTFExample' >> beam.Map(coder.encode))

            _ = raw_data | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
                eval_saved_model_path=eval_model_dir,
                slice_spec=slice_spec,
                output_path=tfma_run_dir,
                add_metrics_callbacks=add_metrics_callbacks,
                display_only_data_location=input_csv)

    # The results are read only after the pipeline context has exited.
    return tfma.load_eval_result(output_path=tfma_run_dir)
def display(self, artifact: types.Artifact):
    """Renders the default slicing-metrics view for an evaluation artifact.

    The eval result is loaded from `artifact.uri`. The value returned by the
    renderer is not passed back to the caller.

    Args:
      artifact: artifact whose `uri` points at a TFMA eval result.
    """
    # TODO(ccy): add comment instructing user to use the TFMA library directly
    # in order to render non-default slicing metric views.
    tfma.view.render_slicing_metrics(tfma.load_eval_result(artifact.uri))