def _assertSchemaEqual(self, expected_schema, actual_schema):
  schema_reader = io_utils.SchemaReader()
  expected_schema_proto = schema_reader.read(
      os.path.join(expected_schema.uri, executor._DEFAULT_FILE_NAME))
  actual_schema_proto = schema_reader.read(
      os.path.join(actual_schema.uri, executor._DEFAULT_FILE_NAME))
  self.assertProtoEquals(expected_schema_proto, actual_schema_proto)
def _provide_schema(self, input_dict, exec_properties) -> schema_pb2.Schema:
  """Generates schema from either schema or statistics."""
  # TODO(zhitaoli): Move constants between this file and component.py to a
  # constants.py.
  stats = input_dict.get('stats') or input_dict.get('statistics')
  schema = input_dict.get('schema')
  if bool(stats) == bool(schema):
    raise ValueError('Exactly one of schema or stats must be provided')
  if schema:
    schema_uri = artifact_utils.get_single_uri(schema)
    absl.logging.info('Schema is provided. Reading from %s.' % schema_uri)
    schema_reader = io_utils.SchemaReader()
    try:
      return schema_reader.read(os.path.join(schema_uri, _DEFAULT_FILE_NAME))
    except tf.errors.NotFoundError:
      raise ValueError(
          'Schema is provided, but failed to read from %s.' % schema_uri)
  train_stats_uri = io_utils.get_only_uri_in_dir(
      artifact_utils.get_split_uri(stats, 'train'))
  infer_feature_shape = exec_properties['infer_feature_shape']
  return tfdv.infer_schema(
      tfdv.load_statistics(train_stats_uri), infer_feature_shape)
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow ExampleValidator executor entrypoint.

  This validates the statistics on the 'eval' split against the schema.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of 'ExampleStatisticsPath' type which should contain
        split 'eval'. Stats on other splits are ignored.
      - schema: A list of 'SchemaPath' type which should contain a single
        schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'ExampleValidationPath' artifact of size one. It
        will include a single pbtxt file which contains all anomalies found.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  tf.logging.info('Validating schema against the computed statistics.')
  schema = io_utils.SchemaReader().read(
      io_utils.get_only_uri_in_dir(
          types.get_single_uri(input_dict['schema'])))
  stats = tfdv.load_statistics(
      io_utils.get_only_uri_in_dir(
          types.get_split_uri(input_dict['stats'], 'eval')))
  output_uri = types.get_single_uri(output_dict['output'])
  anomalies = tfdv.validate_statistics(stats, schema)
  io_utils.write_pbtxt_file(
      os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
  tf.logging.info(
      'Validation complete. Anomalies written to {}.'.format(output_uri))
def testSuccess(self):
  _executor.Executor().Do({}, self.output_dict, self.exec_properties)

  reader = io_utils.SchemaReader()
  expected_proto = reader.read(self.source_file_path)
  imported_proto = reader.read(
      os.path.join(self.tmp_dir, schema_gen_executor.DEFAULT_FILE_NAME))
  self.assertEqual(expected_proto, imported_proto)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]):
  self._log_startup(input_dict, output_dict, exec_properties)

  schema = io_utils.SchemaReader().read(
      io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
  groups = group_stats_and_examples(input_dict)
  for examples, datasets in groups:
    datasets = DatasetFeatureStatisticsList(datasets=list(datasets.values()))
    partitions = lists_to_partitions(
        datasets, schema, examples, partition_fn(datasets, schema, examples))
    for partition in partitions:
      output_uri = os.path.join(
          artifact_utils.get_single_uri(output_dict[PARTITIONS_KEY]),
          partition.name)
      io_utils.write_pbtxt_file(
          os.path.join(output_uri, 'schema.pbtxt'), partition.schema)
      for i in range(len(partition.statistics.datasets)):
        dataset = partition.statistics.datasets[i]
        example_splits = partition.example_splits[i]
        io_utils.write_tfrecord_file(
            os.path.join(output_uri, example_splits.split, 'stats_tfrecord'),
            dataset)
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """ImportSchemaGen executor entrypoint.

  This generates a Schema artifact from the given schema_file.

  Args:
    input_dict: Should be empty.
    output_dict: Output dict from key to a list of artifacts, including:
      - schema: A list of 'Schema' artifact of size one.
    exec_properties: A dict of execution properties, includes:
      - schema_file: Source schema file path.

  Returns:
    None
  """
  source_file_path = exec_properties.get(
      standard_component_specs.SCHEMA_FILE_KEY)
  if not source_file_path:
    raise ValueError('Schema file path is missing in exec_properties.')
  output_uri = os.path.join(
      artifact_utils.get_single_uri(
          output_dict[standard_component_specs.SCHEMA_KEY]),
      schema_gen_executor.DEFAULT_FILE_NAME)

  # Check whether the input file has a proper schema proto.
  _ = io_utils.SchemaReader().read(source_file_path)

  io_utils.copy_file(source_file_path, output_uri)
  logging.info('Copied a schema file from %s to %s.', source_file_path,
               output_uri)
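A hypothetical invocation sketch of the executor above, assuming the method lives on a class named Executor; the artifact URI and schema file path are placeholders, not from the original code. It illustrates that the executor parses the source file purely as a validity check and then copies it unchanged into the output Schema artifact.

from tfx.types import standard_artifacts

schema_artifact = standard_artifacts.Schema()
schema_artifact.uri = '/tmp/pipeline/ImportSchemaGen/schema/1'  # placeholder

# 'schema' and 'schema_file' mirror standard_component_specs.SCHEMA_KEY and
# standard_component_specs.SCHEMA_FILE_KEY.
Executor().Do(
    input_dict={},
    output_dict={'schema': [schema_artifact]},
    exec_properties={'schema_file': '/tmp/my_schema.pbtxt'})  # placeholder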
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  self._log_startup(input_dict, output_dict, exec_properties)

  logging.info('Validating schema against the computed statistics.')

  split_uris: List[Text] = []
  for artifact in input_dict[executor.STATISTICS_KEY]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      split_uris.append(split)

  label_inputs = {
      labels.STATS:
          tfdv.load_statistics(
              io_utils.get_only_uri_in_dir(
                  artifact_utils.get_split_uri(
                      input_dict[executor.STATISTICS_KEY], split_uris[0]))),
      labels.SCHEMA:
          io_utils.SchemaReader().read(
              io_utils.get_only_uri_in_dir(
                  artifact_utils.get_single_uri(
                      input_dict[executor.SCHEMA_KEY])))
  }
  output_uri = artifact_utils.get_single_uri(
      output_dict[executor.ANOMALIES_KEY])
  label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
  self._Validate(label_inputs, label_outputs)
  logging.info(
      'Validation complete. Anomalies written to {}.'.format(output_uri))
def parse_schema(input_dict: Dict[Text, List[types.Artifact]]):
  schema = input_dict.get(SCHEMA_KEY, None)
  if not schema:
    return schema
  else:
    schema_path = os.path.join(
        artifact_utils.get_single_uri(schema), _DEFAULT_FILE_NAME)
    schema_reader = io_utils.SchemaReader()
    parsed_schema = schema_reader.read(schema_path)
    return parsed_schema
def _GetSchema(self, schema_path: Text) -> schema_pb2.Schema:
  """Gets a tf.metadata schema.

  Args:
    schema_path: Path to schema file.

  Returns:
    A tf.metadata schema.
  """
  schema_reader = io_utils.SchemaReader()
  return schema_reader.read(schema_path)
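For readers unfamiliar with the helper, a minimal sketch of what such a schema read amounts to, assuming the schema is stored as a text-format proto (schema.pbtxt); this is an illustration of the idea, not the library's implementation.

import tensorflow as tf
from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import schema_pb2

def read_schema_pbtxt(schema_path: str) -> schema_pb2.Schema:
  """Parses a text-format Schema proto from the given path."""
  schema = schema_pb2.Schema()
  contents = tf.io.gfile.GFile(schema_path, 'r').read()
  text_format.Parse(contents, schema)
  return schema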
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """TensorFlow ExampleValidator executor entrypoint.

  This validates the statistics on the 'eval' split against the schema.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of type `standard_artifacts.ExampleStatistics` which
        should contain the 'eval' split. Stats on other splits are ignored.
      - schema: A list of type `standard_artifacts.Schema` which should
        contain a single schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'ExampleValidationPath' artifact of size one. It
        will include a single pbtxt file which contains all anomalies found.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  absl.logging.info('Validating schema against the computed statistics.')
  label_inputs = {
      labels.STATS:
          tfdv.load_statistics(
              io_utils.get_only_uri_in_dir(
                  artifact_utils.get_split_uri(input_dict[STATISTICS_KEY],
                                               'eval'))),
      labels.SCHEMA:
          io_utils.SchemaReader().read(
              io_utils.get_only_uri_in_dir(
                  artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
  }
  output_uri = artifact_utils.get_single_uri(output_dict[ANOMALIES_KEY])
  label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
  self._Validate(label_inputs, label_outputs)
  absl.logging.info(
      'Validation complete. Anomalies written to {}.'.format(output_uri))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of StatsOptions.
        When a schema is provided as an input, the StatsOptions value should
        not also contain a schema.

  Raises:
    ValueError when a schema is provided both as an input and as part of the
    StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  stats_options = options.StatsOptions()
  if STATS_OPTIONS_JSON_KEY in exec_properties:
    stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
      stats_options.schema = schema

  split_uris = []
  for artifact in input_dict[EXAMPLES_KEY]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))
  with self._make_beam_pipeline() as p:
    for split, uri in split_uris:
      absl.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(uri)
      output_uri = artifact_utils.get_split_uri(output_dict[STATISTICS_KEY],
                                                split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      _ = (
          p
          | 'ReadData.' + split >>
          beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
          | 'GenerateStatistics.' + split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of StatsOptions.
        When a schema is provided as an input, the StatsOptions value should
        not also contain a schema.

  Raises:
    ValueError when a schema is provided both as an input and as part of the
    StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  stats_options = options.StatsOptions()
  if STATS_OPTIONS_JSON_KEY in exec_properties:
    stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
      stats_options.schema = schema

  split_uris = []
  for artifact in input_dict[EXAMPLES_KEY]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))
  with self._make_beam_pipeline() as p:
    for split, uri in split_uris:
      absl.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(uri)
      tfxio_kwargs = {'file_pattern': input_uri}
      # TODO(b/151624179): clean this up after tfx_bsl is released with the
      # below flag.
      if getattr(tfxio, 'TFXIO_HAS_TELEMETRY', False):
        tfxio_kwargs['telemetry_descriptors'] = _TELEMETRY_DESCRIPTORS
      input_tfxio = tf_example_record.TFExampleRecord(**tfxio_kwargs)
      output_uri = artifact_utils.get_split_uri(output_dict[STATISTICS_KEY],
                                                split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      data = p | 'TFXIORead[{}]'.format(split) >> input_tfxio.BeamSource()
      # TODO(b/153368237): Clean this up after a release post tfx 0.21.
      if not getattr(tfdv, 'TFDV_ACCEPT_RECORD_BATCH', False):
        data |= 'RecordBatchToTable[{}]'.format(split) >> beam.Map(
            lambda rb: pa.Table.from_batches([rb]))
      _ = (
          data
          | 'GenerateStatistics[{}]'.format(split) >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput[{}]'.format(split) >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model: exported model.
      - examples: examples for eval the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - evaluation: model evaluation results.
    exec_properties: A dict of execution properties.
      - eval_config: JSON string of tfma.EvalConfig.
      - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
        instance, providing the way to slice the data. Deprecated, use
        eval_config.slicing_specs instead.
      - example_splits: JSON-serialized list of names of splits on which the
        metrics are computed. Default behavior (when example_splits is set to
        None) is using the 'eval' split.

  Returns:
    None
  """
  if standard_component_specs.EXAMPLES_KEY not in input_dict:
    raise ValueError('EXAMPLES_KEY is missing from input dict.')
  if standard_component_specs.EVALUATION_KEY not in output_dict:
    raise ValueError('EVALUATION_KEY is missing from output dict.')
  if standard_component_specs.MODEL_KEY in input_dict and len(
      input_dict[standard_component_specs.MODEL_KEY]) > 1:
    raise ValueError('There can be only one candidate model, there are %d.' %
                     (len(input_dict[standard_component_specs.MODEL_KEY])))
  if standard_component_specs.BASELINE_MODEL_KEY in input_dict and len(
      input_dict[standard_component_specs.BASELINE_MODEL_KEY]) > 1:
    raise ValueError(
        'There can be only one baseline model, there are %d.' %
        (len(input_dict[standard_component_specs.BASELINE_MODEL_KEY])))

  self._log_startup(input_dict, output_dict, exec_properties)

  # Add fairness indicator metric callback if necessary.
  fairness_indicator_thresholds = json_utils.loads(
      exec_properties.get(
          standard_component_specs.FAIRNESS_INDICATOR_THRESHOLDS_KEY, 'null'))
  add_metrics_callbacks = None
  if fairness_indicator_thresholds:
    add_metrics_callbacks = [
        tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
            thresholds=fairness_indicator_thresholds),
    ]

  output_uri = artifact_utils.get_single_uri(
      output_dict[constants.EVALUATION_KEY])

  # Make sure user packages get propagated to the remote Beam worker.
  unused_module_path, extra_pip_packages = udf_utils.decode_user_module_key(
      exec_properties.get(standard_component_specs.MODULE_PATH_KEY, None))
  for pip_package_path in extra_pip_packages:
    local_pip_package_path = io_utils.ensure_local(pip_package_path)
    self._beam_pipeline_args.append(
        '--extra_package=%s' % local_pip_package_path)

  eval_shared_model_fn = udf_utils.try_get_fn(
      exec_properties=exec_properties,
      fn_name='custom_eval_shared_model') or tfma.default_eval_shared_model

  run_validation = False
  models = []
  if (standard_component_specs.EVAL_CONFIG_KEY in exec_properties and
      exec_properties[standard_component_specs.EVAL_CONFIG_KEY]):
    slice_spec = None
    has_baseline = bool(
        input_dict.get(standard_component_specs.BASELINE_MODEL_KEY))
    eval_config = tfma.EvalConfig()
    proto_utils.json_to_proto(
        exec_properties[standard_component_specs.EVAL_CONFIG_KEY], eval_config)
    # rubber_stamp is always assumed true, i.e., change threshold will always
    # be ignored when a baseline model is missing.
    if hasattr(tfma, 'utils'):
      eval_config = tfma.utils.update_eval_config_with_defaults(
          eval_config, has_baseline=has_baseline, rubber_stamp=True)
      tfma.utils.verify_eval_config(eval_config)
    else:
      # TODO(b/171992041): Replaced by tfma.utils.
      eval_config = tfma.update_eval_config_with_defaults(
          eval_config, has_baseline=has_baseline, rubber_stamp=True)
      tfma.verify_eval_config(eval_config)
    # Do not validate model when there is no thresholds configured. This is to
    # avoid accidentally blessing models when users forget to set thresholds.
    run_validation = bool(
        tfma.metrics.metric_thresholds_from_metrics_specs(
            eval_config.metrics_specs, eval_config=eval_config))
    if len(eval_config.model_specs) > 2:
      raise ValueError(
          """Cannot support more than two models. There are %d models in this
           eval_config.""" % (len(eval_config.model_specs)))
    # Extract model artifacts.
    for model_spec in eval_config.model_specs:
      if standard_component_specs.MODEL_KEY not in input_dict:
        if not model_spec.prediction_key:
          raise ValueError(
              'model_spec.prediction_key required if model not provided')
        continue
      if model_spec.is_baseline:
        model_artifact = artifact_utils.get_single_instance(
            input_dict[standard_component_specs.BASELINE_MODEL_KEY])
      else:
        model_artifact = artifact_utils.get_single_instance(
            input_dict[standard_component_specs.MODEL_KEY])
      # TODO(b/171992041): tfma.get_model_type replaced by tfma.utils.
      if ((hasattr(tfma, 'utils') and
           tfma.utils.get_model_type(model_spec) == tfma.TF_ESTIMATOR) or
          hasattr(tfma, 'get_model_type') and
          tfma.get_model_type(model_spec) == tfma.TF_ESTIMATOR):
        model_path = path_utils.eval_model_path(
            model_artifact.uri, path_utils.is_old_model_artifact(model_artifact))
      else:
        model_path = path_utils.serving_model_path(
            model_artifact.uri, path_utils.is_old_model_artifact(model_artifact))
      logging.info('Using %s as %s model.', model_path, model_spec.name)
      models.append(
          eval_shared_model_fn(
              eval_saved_model_path=model_path,
              model_name=model_spec.name,
              eval_config=eval_config,
              add_metrics_callbacks=add_metrics_callbacks))
  else:
    eval_config = None
    assert (standard_component_specs.FEATURE_SLICING_SPEC_KEY in exec_properties
            and exec_properties[standard_component_specs.FEATURE_SLICING_SPEC_KEY]
           ), 'both eval_config and feature_slicing_spec are unset.'
    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    proto_utils.json_to_proto(
        exec_properties[standard_component_specs.FEATURE_SLICING_SPEC_KEY],
        feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)
    model_artifact = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.MODEL_KEY])
    model_path = path_utils.eval_model_path(
        model_artifact.uri, path_utils.is_old_model_artifact(model_artifact))
    logging.info('Using %s for model eval.', model_path)
    models.append(
        eval_shared_model_fn(
            eval_saved_model_path=model_path,
            model_name='',
            eval_config=None,
            add_metrics_callbacks=add_metrics_callbacks))

  eval_shared_model = models[0] if len(models) == 1 else models
  schema = None
  if standard_component_specs.SCHEMA_KEY in input_dict:
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(
                input_dict[standard_component_specs.SCHEMA_KEY])))

  # Load and deserialize example splits from execution properties.
  example_splits = json_utils.loads(
      exec_properties.get(standard_component_specs.EXAMPLE_SPLITS_KEY, 'null'))
  if not example_splits:
    example_splits = ['eval']
    logging.info("The 'example_splits' parameter is not set, using 'eval' "
                 'split.')

  logging.info('Evaluating model.')
  # TempPipInstallContext is needed here so that subprocesses (which
  # may be created by the Beam multi-process DirectRunner) can find the
  # needed dependencies.
  # TODO(b/187122662): Move this to the ExecutorOperator or Launcher.
  with udf_utils.TempPipInstallContext(extra_pip_packages):
    with self._make_beam_pipeline() as pipeline:
      examples_list = []
      tensor_adapter_config = None
      # pylint: disable=expression-not-assigned
      if tfma.is_batched_input(eval_shared_model, eval_config):
        tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
            examples=input_dict[standard_component_specs.EXAMPLES_KEY],
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
            schema=schema,
            raw_record_column_name=tfma_constants.ARROW_INPUT_COLUMN)
        # TODO(b/161935932): refactor after TFXIO supports multiple patterns.
        for split in example_splits:
          split_uris = artifact_utils.get_split_uris(
              input_dict[standard_component_specs.EXAMPLES_KEY], split)
          for index in range(len(split_uris)):
            split_uri = split_uris[index]
            file_pattern = io_utils.all_files_pattern(split_uri)
            tfxio = tfxio_factory(file_pattern)
            data = (
                pipeline
                | f'ReadFromTFRecordToArrow[{split}][{index}]' >>
                tfxio.BeamSource())
            examples_list.append(data)
        if schema is not None:
          # Use last tfxio as TensorRepresentations and ArrowSchema are fixed.
          tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
              arrow_schema=tfxio.ArrowSchema(),
              tensor_representations=tfxio.TensorRepresentations())
      else:
        for split in example_splits:
          split_uris = artifact_utils.get_split_uris(
              input_dict[standard_component_specs.EXAMPLES_KEY], split)
          for index in range(len(split_uris)):
            split_uri = split_uris[index]
            file_pattern = io_utils.all_files_pattern(split_uri)
            data = (
                pipeline
                | f'ReadFromTFRecord[{split}][{index}]' >>
                beam.io.ReadFromTFRecord(file_pattern=file_pattern))
            examples_list.append(data)

      custom_extractors = udf_utils.try_get_fn(
          exec_properties=exec_properties, fn_name='custom_extractors')
      extractors = None
      if custom_extractors:
        extractors = custom_extractors(
            eval_shared_model=eval_shared_model,
            eval_config=eval_config,
            tensor_adapter_config=tensor_adapter_config)

      (examples_list
       | 'FlattenExamples' >> beam.Flatten()
       | 'ExtractEvaluateAndWriteResults' >>
       (tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=models[0] if len(models) == 1 else models,
           eval_config=eval_config,
           extractors=extractors,
           output_path=output_uri,
           slice_spec=slice_spec,
           tensor_adapter_config=tensor_adapter_config)))
  logging.info('Evaluation complete. Results written to %s.', output_uri)

  if not run_validation:
    # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
    logging.info('No threshold configured, will not validate model.')
    return
  # Set up blessing artifact
  blessing = artifact_utils.get_single_instance(
      output_dict[standard_component_specs.BLESSING_KEY])
  blessing.set_string_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
      artifact_utils.get_single_uri(
          input_dict[standard_component_specs.MODEL_KEY]))
  blessing.set_int_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
      input_dict[standard_component_specs.MODEL_KEY][0].id)
  if input_dict.get(standard_component_specs.BASELINE_MODEL_KEY):
    baseline_model = input_dict[standard_component_specs.BASELINE_MODEL_KEY][0]
    blessing.set_string_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY, baseline_model.uri)
    blessing.set_int_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY, baseline_model.id)
  if 'current_component_id' in exec_properties:
    blessing.set_string_custom_property(
        'component_id', exec_properties['current_component_id'])

  # Check validation result and write BLESSED file accordingly.
  logging.info('Checking validation results.')
  validation_result = tfma.load_validation_result(output_uri)
  if validation_result.validation_ok:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.BLESSED_VALUE)
  else:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.NOT_BLESSED_VALUE)
  logging.info('Blessing result %s written to %s.',
               validation_result.validation_ok, blessing.uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of StatsOptions.
        When a schema is provided as an input, the StatsOptions value should
        not also contain a schema.

  Raises:
    ValueError when a schema is provided both as an input and as part of the
    StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  stats_options = options.StatsOptions()
  if STATS_OPTIONS_JSON_KEY in exec_properties:
    stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
      stats_options.schema = schema

  split_and_tfxio = []
  for artifact in input_dict[EXAMPLES_KEY]:
    tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
        examples=artifact, telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_and_tfxio.append(
          (split, tfxio_factory(io_utils.all_files_pattern(uri))))
  with self._make_beam_pipeline() as p:
    for split, tfxio in split_and_tfxio:
      absl.logging.info('Generating statistics for split {}'.format(split))
      output_uri = artifact_utils.get_split_uri(output_dict[STATISTICS_KEY],
                                                split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      data = p | 'TFXIORead[{}]'.format(split) >> tfxio.BeamSource()
      _ = (
          data
          | 'GenerateStatistics[{}]'.format(split) >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput[{}]'.format(split) >>
          stats_api.WriteStatisticsToTFRecord(output_path))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  # Check the inputs
  if constants.EXAMPLES not in input_dict:
    raise ValueError(f'{constants.EXAMPLES} is missing from inputs')
  examples_artifact = input_dict[constants.EXAMPLES]
  input_uri = artifact_utils.get_single_uri(examples_artifact)

  if len(zenml_path_utils.list_dir(input_uri)) == 0:
    raise AssertionError(
        'ZenML can not run the evaluation as the provided input '
        'configuration does not point towards any data. Specifically, '
        'if you are using the agnostic evaluator, please make sure '
        'that you are using a proper test_fn in your trainer step to '
        'write these results.')
  else:
    # Check the outputs
    if constants.EVALUATION not in output_dict:
      raise ValueError(f'{constants.EVALUATION} is missing from outputs')
    evaluation_artifact = output_dict[constants.EVALUATION]
    output_uri = artifact_utils.get_single_uri(evaluation_artifact)

    # Resolve the schema
    schema = None
    if constants.SCHEMA in input_dict:
      schema_artifact = input_dict[constants.SCHEMA]
      schema_uri = artifact_utils.get_single_uri(schema_artifact)
      reader = io_utils.SchemaReader()
      schema = reader.read(io_utils.get_only_uri_in_dir(schema_uri))

    # Create the step with the schema attached if provided
    source = exec_properties[StepKeys.SOURCE]
    args = exec_properties[StepKeys.ARGS]
    c = source_utils.load_source_path_class(source)
    evaluator_step: BaseEvaluatorStep = c(**args)

    # Check the execution parameters
    eval_config = evaluator_step.build_config()
    eval_config = tfma.update_eval_config_with_defaults(eval_config)
    tfma.verify_eval_config(eval_config)

    # Resolve the model
    if constants.MODEL in input_dict:
      model_artifact = input_dict[constants.MODEL]
      model_uri = artifact_utils.get_single_uri(model_artifact)
      model_path = path_utils.serving_model_path(model_uri)
      model_fn = try_get_fn(
          evaluator_step.CUSTOM_MODULE,
          'custom_eval_shared_model') or tfma.default_eval_shared_model
      eval_shared_model = model_fn(
          model_name='',  # TODO: Fix with model names
          eval_saved_model_path=model_path,
          eval_config=eval_config)
    else:
      eval_shared_model = None

    self._log_startup(input_dict, output_dict, exec_properties)

    # Main pipeline
    logging.info('Evaluating model.')
    with self._make_beam_pipeline() as pipeline:
      examples_list = []
      tensor_adapter_config = None

      if tfma.is_batched_input(eval_shared_model, eval_config):
        tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
            examples=[artifact_utils.get_single_instance(examples_artifact)],
            telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
            schema=schema,
            raw_record_column_name=tfma_constants.ARROW_INPUT_COLUMN)
        for split in evaluator_step.splits:
          file_pattern = io_utils.all_files_pattern(
              artifact_utils.get_split_uri(examples_artifact, split))
          tfxio = tfxio_factory(file_pattern)
          data = (
              pipeline
              | 'ReadFromTFRecordToArrow[%s]' % split >> tfxio.BeamSource())
          examples_list.append(data)
        if schema is not None:
          tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
              arrow_schema=tfxio.ArrowSchema(),
              tensor_representations=tfxio.TensorRepresentations())
      else:
        for split in evaluator_step.splits:
          file_pattern = io_utils.all_files_pattern(
              artifact_utils.get_split_uri(examples_artifact, split))
          data = (
              pipeline
              | 'ReadFromTFRecord[%s]' % split >>
              beam.io.ReadFromTFRecord(file_pattern=file_pattern))
          examples_list.append(data)

      # Resolve custom extractors
      custom_extractors = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                     'custom_extractors')
      extractors = None
      if custom_extractors:
        extractors = custom_extractors(
            eval_shared_model=eval_shared_model,
            eval_config=eval_config,
            tensor_adapter_config=tensor_adapter_config)

      # Resolve custom evaluators
      custom_evaluators = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                     'custom_evaluators')
      evaluators = None
      if custom_evaluators:
        evaluators = custom_evaluators(
            eval_shared_model=eval_shared_model,
            eval_config=eval_config,
            tensor_adapter_config=tensor_adapter_config)

      # Extract, evaluate and write
      (examples_list
       | 'FlattenExamples' >> beam.Flatten()
       | 'ExtractEvaluateAndWriteResults' >>
       tfma.ExtractEvaluateAndWriteResults(
           eval_config=eval_config,
           eval_shared_model=eval_shared_model,
           output_path=output_uri,
           extractors=extractors,
           evaluators=evaluators,
           tensor_adapter_config=tensor_adapter_config))

    logging.info('Evaluation complete. Results written to %s.', output_uri)
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """TensorFlow ExampleValidator executor entrypoint.

  This validates statistics against the schema.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - statistics: A list of type `standard_artifacts.ExampleStatistics`
        generated by StatisticsGen.
      - schema: A list of type `standard_artifacts.Schema` which should
        contain a single schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'standard_artifacts.ExampleAnomalies' of size one.
        It will include a single binary proto file which contains all
        anomalies found.
    exec_properties: A dict of execution properties.
      - exclude_splits: JSON-serialized list of names of splits that the
        example validator should not validate.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Load and deserialize exclude splits from execution properties.
  exclude_splits = json_utils.loads(
      exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                          'null')) or []
  if not isinstance(exclude_splits, list):
    raise ValueError('exclude_splits in execution properties needs to be a '
                     'list. Got %s instead.' % type(exclude_splits))
  # Setup output splits.
  stats_artifact = artifact_utils.get_single_instance(
      input_dict[standard_component_specs.STATISTICS_KEY])
  stats_split_names = artifact_utils.decode_split_names(
      stats_artifact.split_names)
  split_names = [
      split for split in stats_split_names if split not in exclude_splits
  ]
  anomalies_artifact = artifact_utils.get_single_instance(
      output_dict[standard_component_specs.ANOMALIES_KEY])
  anomalies_artifact.split_names = artifact_utils.encode_split_names(
      split_names)

  schema = io_utils.SchemaReader().read(
      io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(
              input_dict[standard_component_specs.SCHEMA_KEY])))

  for split in artifact_utils.decode_split_names(stats_artifact.split_names):
    if split in exclude_splits:
      continue

    logging.info(
        'Validating schema against the computed statistics for split %s.',
        split)
    stats_uri = io_utils.get_only_uri_in_dir(
        artifact_utils.get_split_uri([stats_artifact], split))
    if artifact_utils.is_artifact_version_older_than(
        stats_artifact, artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE):  # pylint: disable=protected-access
      stats = tfdv.load_statistics(stats_uri)
    else:
      stats = tfdv.load_stats_binary(stats_uri)
    label_inputs = {
        standard_component_specs.STATISTICS_KEY: stats,
        standard_component_specs.SCHEMA_KEY: schema
    }
    output_uri = artifact_utils.get_split_uri(
        output_dict[standard_component_specs.ANOMALIES_KEY], split)
    label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
    self._Validate(label_inputs, label_outputs)
    logging.info('Validation complete for split %s. Anomalies written to %s.',
                 split, output_uri)
def get_schema_proto(artifact_uri: Text):
  schema_path = os.path.join(artifact_uri, 'schema.pbtxt')
  schema = io_utils.SchemaReader().read(schema_path)
  return schema
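A short usage sketch (the URI below is a placeholder, not from the original code): the helper expects the artifact directory to contain a schema.pbtxt and returns the parsed schema_pb2.Schema, which can then be inspected.

# Hypothetical usage; '/pipeline_root/SchemaGen/schema/3' is a placeholder URI.
schema = get_schema_proto('/pipeline_root/SchemaGen/schema/3')
for feature in schema.feature:
  print(feature.name, feature.type)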
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - examples: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - statistics: A list of type `standard_artifacts.ExampleStatistics`.
        This should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of StatsOptions.
        When a schema is provided as an input, the StatsOptions value should
        not also contain a schema.
      - exclude_splits: JSON-serialized list of names of splits where
        statistics and sample should not be generated.

  Raises:
    ValueError when a schema is provided both as an input and as part of the
    StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Load and deserialize exclude splits from execution properties.
  exclude_splits = json_utils.loads(
      exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                          'null')) or []
  if not isinstance(exclude_splits, list):
    raise ValueError('exclude_splits in execution properties needs to be a '
                     'list. Got %s instead.' % type(exclude_splits))
  # Setup output splits.
  examples = artifact_utils.get_single_instance(
      input_dict[standard_component_specs.EXAMPLES_KEY])
  examples_split_names = artifact_utils.decode_split_names(
      examples.split_names)
  split_names = [
      split for split in examples_split_names if split not in exclude_splits
  ]
  statistics_artifact = artifact_utils.get_single_instance(
      output_dict[standard_component_specs.STATISTICS_KEY])
  statistics_artifact.split_names = artifact_utils.encode_split_names(
      split_names)

  stats_options = options.StatsOptions()
  stats_options_json = exec_properties.get(
      standard_component_specs.STATS_OPTIONS_JSON_KEY)
  if stats_options_json:
    # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
    # json_utils
    stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(standard_component_specs.SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(
                  input_dict[standard_component_specs.SCHEMA_KEY])))
      stats_options.schema = schema

  split_and_tfxio = []
  tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
      examples=[examples], telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
  for split in artifact_utils.decode_split_names(examples.split_names):
    if split in exclude_splits:
      continue

    uri = artifact_utils.get_split_uri([examples], split)
    split_and_tfxio.append(
        (split, tfxio_factory(io_utils.all_files_pattern(uri))))
  with self._make_beam_pipeline() as p:
    for split, tfxio in split_and_tfxio:
      logging.info('Generating statistics for split %s.', split)
      output_uri = artifact_utils.get_split_uri(
          output_dict[standard_component_specs.STATISTICS_KEY], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      data = p | 'TFXIORead[%s]' % split >> tfxio.BeamSource()
      _ = (
          data
          | 'GenerateStatistics[%s]' % split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput[%s]' % split >>
          stats_api.WriteStatisticsToBinaryFile(output_path))
      logging.info('Statistics for split %s written to %s.', split, output_uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """TensorFlow ExampleValidator executor entrypoint.

  This validates the statistics on the 'eval' split against the schema.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of type `standard_artifacts.ExampleStatistics` which
        should contain the 'eval' split. Stats on other splits are ignored.
      - schema: A list of type `standard_artifacts.Schema` which should
        contain a single schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'ExampleValidationPath' artifact of size one. It
        will include a single pbtxt file which contains all anomalies found.
    exec_properties: A dict of execution properties.
      - exclude_splits: JSON-serialized list of names of splits that the
        example validator should not validate.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Load and deserialize exclude splits from execution properties.
  exclude_splits = json_utils.loads(
      exec_properties.get(EXCLUDE_SPLITS_KEY, 'null')) or []
  if not isinstance(exclude_splits, list):
    raise ValueError('exclude_splits in execution properties needs to be a '
                     'list. Got %s instead.' % type(exclude_splits))
  schema = io_utils.SchemaReader().read(
      io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
  stats_artifact = artifact_utils.get_single_instance(
      input_dict[STATISTICS_KEY])
  for split in artifact_utils.decode_split_names(stats_artifact.split_names):
    if split in exclude_splits:
      continue

    logging.info(
        'Validating schema against the computed statistics for split %s.',
        split)
    label_inputs = {
        labels.STATS:
            tfdv.load_statistics(
                io_utils.get_only_uri_in_dir(
                    os.path.join(stats_artifact.uri, split))),
        labels.SCHEMA: schema
    }
    output_uri = artifact_utils.get_split_uri(output_dict[ANOMALIES_KEY],
                                              split)
    label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
    self._Validate(label_inputs, label_outputs)
    logging.info('Validation complete for split %s. Anomalies written to %s.',
                 split, output_uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model_exports: exported model.
      - examples: examples for eval the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: model evaluation results.
    exec_properties: A dict of execution properties.
      - eval_config: JSON string of tfma.EvalConfig.
      - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
        instance, providing the way to slice the data. Deprecated, use
        eval_config.slicing_specs instead.

  Returns:
    None
  """
  if constants.EXAMPLES_KEY not in input_dict:
    raise ValueError('EXAMPLES_KEY is missing from input dict.')
  if constants.MODEL_KEY not in input_dict:
    raise ValueError('MODEL_KEY is missing from input dict.')
  if constants.EVALUATION_KEY not in output_dict:
    raise ValueError('EVALUATION_KEY is missing from output dict.')
  if len(input_dict[constants.MODEL_KEY]) > 1:
    raise ValueError('There can be only one candidate model, there are {}.'.format(
        len(input_dict[constants.MODEL_KEY])))
  if constants.BASELINE_MODEL_KEY in input_dict and len(
      input_dict[constants.BASELINE_MODEL_KEY]) > 1:
    raise ValueError('There can be only one baseline model, there are {}.'.format(
        len(input_dict[constants.BASELINE_MODEL_KEY])))

  self._log_startup(input_dict, output_dict, exec_properties)

  # Add fairness indicator metric callback if necessary.
  fairness_indicator_thresholds = exec_properties.get(
      'fairness_indicator_thresholds', None)
  add_metrics_callbacks = None
  if fairness_indicator_thresholds:
    # Need to import the following module so that the fairness indicator
    # post-export metric is registered.
    import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
    add_metrics_callbacks = [
        tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
            thresholds=fairness_indicator_thresholds),
    ]

  output_uri = artifact_utils.get_single_uri(
      output_dict[constants.EVALUATION_KEY])

  run_validation = False
  models = []
  if 'eval_config' in exec_properties and exec_properties['eval_config']:
    slice_spec = None
    has_baseline = bool(input_dict.get(constants.BASELINE_MODEL_KEY))
    eval_config = tfma.EvalConfig()
    json_format.Parse(exec_properties['eval_config'], eval_config)
    eval_config = tfma.update_eval_config_with_defaults(
        eval_config,
        maybe_add_baseline=has_baseline,
        maybe_remove_baseline=not has_baseline)
    tfma.verify_eval_config(eval_config)
    # Do not validate model when there is no thresholds configured. This is to
    # avoid accidentally blessing models when users forget to set thresholds.
    run_validation = bool(
        tfma.metrics.metric_thresholds_from_metrics_specs(
            eval_config.metrics_specs))
    if len(eval_config.model_specs) > 2:
      raise ValueError(
          """Cannot support more than two models. There are {} models in this
           eval_config.""".format(len(eval_config.model_specs)))
    # Extract model artifacts.
    for model_spec in eval_config.model_specs:
      if model_spec.is_baseline:
        model_uri = artifact_utils.get_single_uri(
            input_dict[constants.BASELINE_MODEL_KEY])
      else:
        model_uri = artifact_utils.get_single_uri(
            input_dict[constants.MODEL_KEY])
      if tfma.get_model_type(model_spec) == tfma.TF_ESTIMATOR:
        model_path = path_utils.eval_model_path(model_uri)
      else:
        model_path = path_utils.serving_model_path(model_uri)
      absl.logging.info('Using {} as {} model.'.format(
          model_path, model_spec.name))
      models.append(
          tfma.default_eval_shared_model(
              model_name=model_spec.name,
              eval_saved_model_path=model_path,
              add_metrics_callbacks=add_metrics_callbacks,
              eval_config=eval_config))
  else:
    eval_config = None
    assert ('feature_slicing_spec' in exec_properties and
            exec_properties['feature_slicing_spec']
           ), 'both eval_config and feature_slicing_spec are unset.'
    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    json_format.Parse(exec_properties['feature_slicing_spec'],
                      feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)
    model_uri = artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY])
    model_path = path_utils.eval_model_path(model_uri)
    absl.logging.info('Using {} for model eval.'.format(model_path))
    models.append(
        tfma.default_eval_shared_model(
            eval_saved_model_path=model_path,
            add_metrics_callbacks=add_metrics_callbacks))

  file_pattern = io_utils.all_files_pattern(
      artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY], 'eval'))
  eval_shared_model = models[0] if len(models) == 1 else models
  schema = None
  if constants.SCHEMA_KEY in input_dict:
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[constants.SCHEMA_KEY])))

  absl.logging.info('Evaluating model.')
  with self._make_beam_pipeline() as pipeline:
    # pylint: disable=expression-not-assigned
    if _USE_TFXIO:
      tensor_adapter_config = None
      if tfma.is_batched_input(eval_shared_model, eval_config):
        tfxio = tf_example_record.TFExampleRecord(
            file_pattern=file_pattern,
            schema=schema,
            raw_record_column_name=tfma.BATCHED_INPUT_KEY)
        if schema is not None:
          tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
              arrow_schema=tfxio.ArrowSchema(),
              tensor_representations=tfxio.TensorRepresentations())
        data = pipeline | 'ReadFromTFRecordToArrow' >> tfxio.BeamSource()
      else:
        data = pipeline | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
            file_pattern=file_pattern)
      (data
       | 'ExtractEvaluateAndWriteResults' >>
       tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=models[0] if len(models) == 1 else models,
           eval_config=eval_config,
           output_path=output_uri,
           slice_spec=slice_spec,
           tensor_adapter_config=tensor_adapter_config))
    else:
      data = pipeline | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
          file_pattern=file_pattern)
      (data
       | 'ExtractEvaluateAndWriteResults' >>
       tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=models[0] if len(models) == 1 else models,
           eval_config=eval_config,
           output_path=output_uri,
           slice_spec=slice_spec))
  absl.logging.info(
      'Evaluation complete. Results written to {}.'.format(output_uri))

  if not run_validation:
    # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
    absl.logging.info('No threshold configured, will not validate model.')
    return
  # Set up blessing artifact
  blessing = artifact_utils.get_single_instance(
      output_dict[constants.BLESSING_KEY])
  blessing.set_string_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
      artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY]))
  blessing.set_int_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
      input_dict[constants.MODEL_KEY][0].id)
  if input_dict.get(constants.BASELINE_MODEL_KEY):
    baseline_model = input_dict[constants.BASELINE_MODEL_KEY][0]
    blessing.set_string_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY,
        baseline_model.uri)
    blessing.set_int_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY, baseline_model.id)
  if 'current_component_id' in exec_properties:
    blessing.set_string_custom_property(
        'component_id', exec_properties['current_component_id'])

  # Check validation result and write BLESSED file accordingly.
  absl.logging.info('Checking validation results.')
  validation_result = tfma.load_validation_result(output_uri)
  if validation_result.validation_ok:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.BLESSED_VALUE)
  else:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.NOT_BLESSED_VALUE)
  absl.logging.info('Blessing result {} written to {}.'.format(
      validation_result.validation_ok, blessing.uri))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Main execution logic for the Sequencer component.

  :param input_dict: input channels
  :param output_dict: output channels
  :param exec_properties: the execution properties defined in the spec
  """
  source = exec_properties[StepKeys.SOURCE]
  args = exec_properties[StepKeys.ARGS]

  c = source_utils.load_source_path_class(source)

  # Get the schema
  schema_path = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict[constants.SCHEMA]))
  schema = io_utils.SchemaReader().read(schema_path)

  # TODO: Getting the statistics might help the future implementations
  sequence_step: BaseSequencerStep = c(schema=schema,
                                       statistics=None,
                                       **args)

  # Get split names
  input_artifact = artifact_utils.get_single_instance(
      input_dict[constants.INPUT_EXAMPLES])
  split_names = artifact_utils.decode_split_names(input_artifact.split_names)

  # Create output artifact
  output_artifact = artifact_utils.get_single_instance(
      output_dict[constants.OUTPUT_EXAMPLES])
  output_artifact.split_names = artifact_utils.encode_split_names(split_names)

  with self._make_beam_pipeline() as p:
    for s in split_names:
      input_uri = io_utils.all_files_pattern(
          artifact_utils.get_split_uri(input_dict[constants.INPUT_EXAMPLES],
                                       s))
      output_uri = artifact_utils.get_split_uri(
          output_dict[constants.OUTPUT_EXAMPLES], s)
      output_path = os.path.join(output_uri, self._DEFAULT_FILENAME)

      # Read and decode the data
      data = (
          p
          | 'Read_' + s >> beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'Decode_' + s >> tf_example_decoder.DecodeTFExample()
          | 'ToDataFrame_' + s >> beam.ParDo(utils.ConvertToDataframe()))

      # Window into sessions
      s_data = (
          data
          | 'AddCategory_' + s >> beam.ParDo(
              sequence_step.get_category_do_fn())
          | 'AddTimestamp_' + s >> beam.ParDo(
              sequence_step.get_timestamp_do_fn())
          | 'Sessions_' + s >> beam.WindowInto(sequence_step.get_window()))

      # Combine and transform
      p_data = (
          s_data
          | 'Combine_' + s >> beam.CombinePerKey(
              sequence_step.get_combine_fn()))

      # Write the results
      _ = (
          p_data
          | 'Global_' + s >> beam.WindowInto(GlobalWindows())
          | 'RemoveKey_' + s >> beam.ParDo(RemoveKey())
          | 'ToExample_' + s >> beam.Map(utils.df_to_example)
          | 'Serialize_' + s >> beam.Map(utils.serialize)
          | 'Write_' + s >> beam.io.WriteToTFRecord(
              output_path, file_name_suffix='.gz'))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - model_exports: exported model.
      - examples: examples for eval the model.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: model evaluation results.
    exec_properties: A dict of execution properties.
      - eval_config: JSON string of tfma.EvalConfig.
      - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
        instance, providing the way to slice the data. Deprecated, use
        eval_config.slicing_specs instead.
      - example_splits: JSON-serialized list of names of splits on which the
        metrics are computed. Default behavior (when example_splits is set to
        None) is using the 'eval' split.

  Returns:
    None
  """
  if constants.EXAMPLES_KEY not in input_dict:
    raise ValueError('EXAMPLES_KEY is missing from input dict.')
  if constants.MODEL_KEY not in input_dict:
    raise ValueError('MODEL_KEY is missing from input dict.')
  if constants.EVALUATION_KEY not in output_dict:
    raise ValueError('EVALUATION_KEY is missing from output dict.')
  if len(input_dict[constants.MODEL_KEY]) > 1:
    raise ValueError('There can be only one candidate model, there are %d.' %
                     (len(input_dict[constants.MODEL_KEY])))
  if constants.BASELINE_MODEL_KEY in input_dict and len(
      input_dict[constants.BASELINE_MODEL_KEY]) > 1:
    raise ValueError('There can be only one baseline model, there are %d.' %
                     (len(input_dict[constants.BASELINE_MODEL_KEY])))

  self._log_startup(input_dict, output_dict, exec_properties)

  # Add fairness indicator metric callback if necessary.
  fairness_indicator_thresholds = exec_properties.get(
      'fairness_indicator_thresholds', None)
  add_metrics_callbacks = None
  if fairness_indicator_thresholds:
    add_metrics_callbacks = [
        tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
            thresholds=fairness_indicator_thresholds),
    ]

  output_uri = artifact_utils.get_single_uri(
      output_dict[constants.EVALUATION_KEY])

  eval_shared_model_fn = udf_utils.try_get_fn(
      exec_properties=exec_properties,
      fn_name='custom_eval_shared_model') or tfma.default_eval_shared_model

  run_validation = False
  models = []
  if 'eval_config' in exec_properties and exec_properties['eval_config']:
    slice_spec = None
    has_baseline = bool(input_dict.get(constants.BASELINE_MODEL_KEY))
    eval_config = tfma.EvalConfig()
    json_format.Parse(exec_properties['eval_config'], eval_config)
    eval_config = tfma.update_eval_config_with_defaults(
        eval_config,
        maybe_add_baseline=has_baseline,
        maybe_remove_baseline=not has_baseline)
    tfma.verify_eval_config(eval_config)
    # Do not validate model when there is no thresholds configured. This is to
    # avoid accidentally blessing models when users forget to set thresholds.
    run_validation = bool(
        tfma.metrics.metric_thresholds_from_metrics_specs(
            eval_config.metrics_specs))
    if len(eval_config.model_specs) > 2:
      raise ValueError(
          """Cannot support more than two models. There are %d models in this
           eval_config.""" % (len(eval_config.model_specs)))
    # Extract model artifacts.
    for model_spec in eval_config.model_specs:
      if model_spec.is_baseline:
        model_uri = artifact_utils.get_single_uri(
            input_dict[constants.BASELINE_MODEL_KEY])
      else:
        model_uri = artifact_utils.get_single_uri(
            input_dict[constants.MODEL_KEY])
      if tfma.get_model_type(model_spec) == tfma.TF_ESTIMATOR:
        model_path = path_utils.eval_model_path(model_uri)
      else:
        model_path = path_utils.serving_model_path(model_uri)
      logging.info('Using %s as %s model.', model_path, model_spec.name)
      models.append(
          eval_shared_model_fn(
              eval_saved_model_path=model_path,
              model_name=model_spec.name,
              eval_config=eval_config,
              add_metrics_callbacks=add_metrics_callbacks))
  else:
    eval_config = None
    assert ('feature_slicing_spec' in exec_properties and
            exec_properties['feature_slicing_spec']
           ), 'both eval_config and feature_slicing_spec are unset.'
    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    json_format.Parse(exec_properties['feature_slicing_spec'],
                      feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)
    model_uri = artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY])
    model_path = path_utils.eval_model_path(model_uri)
    logging.info('Using %s for model eval.', model_path)
    models.append(
        eval_shared_model_fn(
            eval_saved_model_path=model_path,
            model_name='',
            eval_config=None,
            add_metrics_callbacks=add_metrics_callbacks))

  eval_shared_model = models[0] if len(models) == 1 else models
  schema = None
  if constants.SCHEMA_KEY in input_dict:
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[constants.SCHEMA_KEY])))

  # Load and deserialize example splits from execution properties.
  example_splits = json_utils.loads(
      exec_properties.get(constants.EXAMPLE_SPLITS_KEY, 'null'))
  if not example_splits:
    example_splits = ['eval']
    logging.info("The 'example_splits' parameter is not set, using 'eval' "
                 'split.')

  logging.info('Evaluating model.')
  with self._make_beam_pipeline() as pipeline:
    examples_list = []
    tensor_adapter_config = None
    # pylint: disable=expression-not-assigned
    if _USE_TFXIO and tfma.is_batched_input(eval_shared_model, eval_config):
      tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
          examples=[
              artifact_utils.get_single_instance(
                  input_dict[constants.EXAMPLES_KEY])
          ],
          telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
          schema=schema,
          raw_record_column_name=tfma_constants.ARROW_INPUT_COLUMN)
      # TODO(b/161935932): refactor after TFXIO supports multiple patterns.
      for split in example_splits:
        file_pattern = io_utils.all_files_pattern(
            artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                         split))
        tfxio = tfxio_factory(file_pattern)
        data = (
            pipeline
            | 'ReadFromTFRecordToArrow[%s]' % split >> tfxio.BeamSource())
        examples_list.append(data)
      if schema is not None:
        # Use last tfxio as TensorRepresentations and ArrowSchema are fixed.
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfxio.ArrowSchema(),
            tensor_representations=tfxio.TensorRepresentations())
    else:
      for split in example_splits:
        file_pattern = io_utils.all_files_pattern(
            artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                         split))
        data = (
            pipeline
            | 'ReadFromTFRecord[%s]' % split >>
            beam.io.ReadFromTFRecord(file_pattern=file_pattern))
        examples_list.append(data)

    custom_extractors = udf_utils.try_get_fn(
        exec_properties=exec_properties, fn_name='custom_extractors')
    extractors = None
    if custom_extractors:
      extractors = custom_extractors(
          eval_shared_model=eval_shared_model,
          eval_config=eval_config,
          tensor_adapter_config=tensor_adapter_config)

    (examples_list
     | 'FlattenExamples' >> beam.Flatten()
     | 'ExtractEvaluateAndWriteResults' >>
     tfma.ExtractEvaluateAndWriteResults(
         eval_shared_model=models[0] if len(models) == 1 else models,
         eval_config=eval_config,
         extractors=extractors,
         output_path=output_uri,
         slice_spec=slice_spec,
         tensor_adapter_config=tensor_adapter_config))
  logging.info('Evaluation complete. Results written to %s.', output_uri)

  if not run_validation:
    # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
    logging.info('No threshold configured, will not validate model.')
    return
  # Set up blessing artifact
  blessing = artifact_utils.get_single_instance(
      output_dict[constants.BLESSING_KEY])
  blessing.set_string_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
      artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY]))
  blessing.set_int_custom_property(
      constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
      input_dict[constants.MODEL_KEY][0].id)
  if input_dict.get(constants.BASELINE_MODEL_KEY):
    baseline_model = input_dict[constants.BASELINE_MODEL_KEY][0]
    blessing.set_string_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY,
        baseline_model.uri)
    blessing.set_int_custom_property(
        constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY, baseline_model.id)
  if 'current_component_id' in exec_properties:
    blessing.set_string_custom_property(
        'component_id', exec_properties['current_component_id'])

  # Check validation result and write BLESSED file accordingly.
  logging.info('Checking validation results.')
  validation_result = tfma.load_validation_result(output_uri)
  if validation_result.validation_ok:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.BLESSED_VALUE)
  else:
    io_utils.write_string_file(
        os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME), '')
    blessing.set_int_custom_property(constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                                     constants.NOT_BLESSED_VALUE)
  logging.info('Blessing result %s written to %s.',
               validation_result.validation_ok, blessing.uri)