def compare_stats_for_examples(self, examples_id, other_examples_id,
                               name='', other_name=''):
  """Compares stats for `examples_id` and `other_examples_id`.

  Args:
    examples_id: An `int` indicating the id of one `TFXArtifactTypes.EXAMPLES`
      artifact.
    other_examples_id: An `int` indicating the id of another
      `TFXArtifactTypes.EXAMPLES` artifact.
    name: (Optional) A `str` indicating the label to use for stats of
      `examples_id`.
    other_name: (Optional) A `str` indicating the label to use for stats of
      `other_examples_id`.
  """
  stats_artifact, other_stats_artifact = (
      self.get_dest_artifact_of_type(examples_id,
                                     TFXArtifactTypes.EXAMPLE_STATS),
      self.get_dest_artifact_of_type(other_examples_id,
                                     TFXArtifactTypes.EXAMPLE_STATS))
  if stats_artifact and other_stats_artifact:
    tfdv.visualize_statistics(
        tfdv.load_statistics(stats_artifact.uri),
        rhs_statistics=tfdv.load_statistics(other_stats_artifact.uri),
        lhs_name=name,
        rhs_name=other_name)
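# A standalone sketch of the same comparison outside the artifact-lookup helper
# above: load two statistics files and render them side by side with TFDV.
# The file paths are hypothetical placeholders, not from the snippet above.
import tensorflow_data_validation as tfdv

train_stats = tfdv.load_statistics('/tmp/train_stats.tfrecord')  # hypothetical path
eval_stats = tfdv.load_statistics('/tmp/eval_stats.tfrecord')    # hypothetical path
tfdv.visualize_statistics(
    lhs_statistics=train_stats,
    rhs_statistics=eval_stats,
    lhs_name='TRAIN',
    rhs_name='EVAL')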
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """TensorFlow SchemaGen executor entrypoint.

  This infers the schema using tensorflow_data_validation on the precomputed
  stats of 'train' split.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - 'stats': A list of 'ExampleStatistics' type which must contain split
        'train'. Stats on other splits are ignored.
      - 'statistics': Synonym for 'stats'.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'Schema' artifact of size one.
    exec_properties: A dict of execution properties, includes:
      - infer_feature_shape: Whether or not to infer the shape of the feature.
      - exclude_splits: Names of splits that will not be taken into
        consideration when auto-generating a schema.

  Returns:
    None
  """
  # TODO(zhitaoli): Move constants between this file and component.py to a
  # constants.py.
  infer_feature_shape = exec_properties.get(INFER_FEATURE_SHAPE_KEY)

  # Load and deserialize exclude splits from execution properties.
  exclude_splits = json_utils.loads(
      exec_properties.get(EXCLUDE_SPLITS_KEY, 'null')) or []
  if not isinstance(exclude_splits, list):
    raise ValueError('exclude_splits in execution properties needs to be a '
                     'list. Got %s instead.' % type(exclude_splits))

  # Only one schema is generated for all splits.
  schema = None
  stats_artifact = artifact_utils.get_single_instance(
      input_dict[STATISTICS_KEY])
  for split in artifact_utils.decode_split_names(stats_artifact.split_names):
    if split in exclude_splits:
      continue

    logging.info('Processing schema from statistics for split %s.', split)
    stats_uri = io_utils.get_only_uri_in_dir(
        os.path.join(stats_artifact.uri, split))
    if not schema:
      schema = tfdv.infer_schema(
          tfdv.load_statistics(stats_uri), infer_feature_shape)
    else:
      schema = tfdv.update_schema(
          schema, tfdv.load_statistics(stats_uri), infer_feature_shape)

  output_uri = os.path.join(
      artifact_utils.get_single_uri(output_dict[SCHEMA_KEY]),
      _DEFAULT_FILE_NAME)
  io_utils.write_pbtxt_file(output_uri, schema)
  logging.info('Schema written to %s.', output_uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """TensorFlow SchemaGen executor entrypoint.

  This infers the schema using tensorflow_data_validation on the precomputed
  stats of 'train' split.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - 'stats': A list of 'ExampleStatistics' type which must contain split
        'train'. Stats on other splits are ignored.
      - 'statistics': Synonym for 'stats'.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'Schema' artifact of size one.
    exec_properties: A dict of execution properties, includes:
      - infer_feature_shape: Whether or not to infer the shape of the feature.

  Returns:
    None
  """
  # TODO(zhitaoli): Move constants between this file and component.py to a
  # constants.py.
  train_stats_uri = io_utils.get_only_uri_in_dir(
      artifact_utils.get_split_uri(input_dict['stats'], 'train'))
  output_uri = os.path.join(
      artifact_utils.get_single_uri(output_dict['output']),
      _DEFAULT_FILE_NAME)
  infer_feature_shape = exec_properties['infer_feature_shape']

  absl.logging.info('Inferring schema from statistics.')
  schema = tfdv.infer_schema(
      tfdv.load_statistics(train_stats_uri), infer_feature_shape)
  io_utils.write_pbtxt_file(output_uri, schema)
  absl.logging.info('Schema written to %s.' % output_uri)
def _validate_stats_output(self, stats_path):
  self.assertTrue(tf.io.gfile.exists(stats_path))
  stats = tfdv.load_statistics(stats_path)
  self.assertLen(stats.datasets, 1)
  data_set = stats.datasets[0]
  self.assertGreater(data_set.num_examples, 0)
  self.assertNotEmpty(data_set.features)
def test_e2e(self, stats_options, expected_stats_pbtxt,
             expected_schema_pbtxt):
  tfxio = tf_sequence_example_record.TFSequenceExampleRecord(
      self._input_file, ['tfdv', 'test'])
  stats_file = os.path.join(self._output_dir, 'stats')
  with beam.Pipeline() as p:
    _ = (p
         | 'TFXIORead' >> tfxio.BeamSource()
         | 'GenerateStats' >> tfdv.GenerateStatistics(stats_options)
         | 'WriteStats' >> tfdv.WriteStatisticsToTFRecord(stats_file))

  actual_stats = tfdv.load_statistics(stats_file)
  test_util.make_dataset_feature_stats_list_proto_equal_fn(
      self,
      text_format.Parse(expected_stats_pbtxt,
                        statistics_pb2.DatasetFeatureStatisticsList()))(
                            [actual_stats])
  actual_schema = tfdv.infer_schema(actual_stats, infer_feature_shape=True)
  if hasattr(actual_schema, 'generate_legacy_feature_spec'):
    actual_schema.ClearField('generate_legacy_feature_spec')
  self._assert_schema_equal(
      actual_schema,
      text_format.Parse(expected_schema_pbtxt, schema_pb2.Schema()))
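# A minimal sketch of the same stats-generation flow without an explicit Beam
# pipeline, using TFDV's convenience wrapper (which runs a local Beam pipeline
# internally). The input path is a hypothetical placeholder.
import tensorflow_data_validation as tfdv

stats = tfdv.generate_statistics_from_tfrecord(
    data_location='/tmp/data/examples.tfrecord')  # hypothetical path
schema = tfdv.infer_schema(stats, infer_feature_shape=True)
tfdv.visualize_statistics(stats)  # renders in a notebook environment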
def _provide_schema(self, input_dict, exec_properties) -> schema_pb2.Schema:
  """Generates schema from either schema or statistics."""
  # TODO(zhitaoli): Move constants between this file and component.py to a
  # constants.py.
  stats = input_dict.get('stats') or input_dict.get('statistics')
  schema = input_dict.get('schema')

  if bool(stats) == bool(schema):
    raise ValueError('Exactly one of schema or stats must be provided')

  if schema:
    schema_uri = artifact_utils.get_single_uri(schema)
    absl.logging.info('Schema is provided. Reading from %s.' % schema_uri)
    schema_reader = io_utils.SchemaReader()
    try:
      return schema_reader.read(
          os.path.join(schema_uri, _DEFAULT_FILE_NAME))
    except tf.errors.NotFoundError:
      raise ValueError(
          'Schema is provided, but failed to read from %s.' % schema_uri)

  train_stats_uri = io_utils.get_only_uri_in_dir(
      artifact_utils.get_split_uri(stats, 'train'))
  infer_feature_shape = exec_properties['infer_feature_shape']
  return tfdv.infer_schema(
      tfdv.load_statistics(train_stats_uri), infer_feature_shape)
def _validate_stats_output(self, stats_path):
  self.assertTrue(tf.gfile.Exists(stats_path))
  stats = tfdv.load_statistics(stats_path)
  self.assertEqual(1, len(stats.datasets))
  data_set = stats.datasets[0]
  self.assertGreater(data_set.num_examples, 0)
  self.assertNotEqual(0, len(data_set.features))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  self._log_startup(input_dict, output_dict, exec_properties)

  logging.info('Validating schema against the computed statistics.')
  split_uris: List[Text] = []
  for artifact in input_dict[executor.STATISTICS_KEY]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      split_uris.append(split)

  label_inputs = {
      labels.STATS:
          tfdv.load_statistics(
              io_utils.get_only_uri_in_dir(
                  artifact_utils.get_split_uri(
                      input_dict[executor.STATISTICS_KEY], split_uris[0]))),
      labels.SCHEMA:
          io_utils.SchemaReader().read(
              io_utils.get_only_uri_in_dir(
                  artifact_utils.get_single_uri(
                      input_dict[executor.SCHEMA_KEY])))
  }
  output_uri = artifact_utils.get_single_uri(
      output_dict[executor.ANOMALIES_KEY])
  label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
  self._Validate(label_inputs, label_outputs)
  logging.info(
      'Validation complete. Anomalies written to {}.'.format(output_uri))
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow SchemaGen executor entrypoint.

  This infers the schema using tensorflow_data_validation on the precomputed
  stats of 'train' split.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of 'ExampleStatisticsPath' type which should contain
        split 'train'. Stats on other splits are ignored.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'SchemaPath' artifact of size one.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  # TODO(zhitaoli): Move constants between this file and component.py to a
  # constants.py.
  train_stats_uri = io_utils.get_only_uri_in_dir(
      types.get_split_uri(input_dict['stats'], 'train'))
  output_uri = os.path.join(
      types.get_single_uri(output_dict['output']), _DEFAULT_FILE_NAME)

  infer_feature_shape = False
  tf.logging.info('Inferring schema from statistics.')
  schema = tfdv.infer_schema(
      tfdv.load_statistics(train_stats_uri), infer_feature_shape)
  io_utils.write_pbtxt_file(output_uri, schema)
  tf.logging.info('Schema written to {}.'.format(output_uri))
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow ExampleValidator executor entrypoint.

  This validates the statistics on the 'eval' split against the schema.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of 'ExampleStatisticsPath' type which should contain
        split 'eval'. Stats on other splits are ignored.
      - schema: A list of 'SchemaPath' type which should contain a single
        schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'ExampleValidationPath' artifact of size one. It
        will include a single pbtxt file which contains all anomalies found.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  tf.logging.info('Validating schema against the computed statistics.')
  schema = io_utils.SchemaReader().read(
      io_utils.get_only_uri_in_dir(
          types.get_single_uri(input_dict['schema'])))
  stats = tfdv.load_statistics(
      io_utils.get_only_uri_in_dir(
          types.get_split_uri(input_dict['stats'], 'eval')))
  output_uri = types.get_single_uri(output_dict['output'])

  anomalies = tfdv.validate_statistics(stats, schema)
  io_utils.write_pbtxt_file(
      os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
  tf.logging.info(
      'Validation complete. Anomalies written to {}.'.format(output_uri))
def parse_statistics(
    split_name: Text,
    statistics: List[Artifact]) -> statistics_pb2.DatasetFeatureStatisticsList:
  stats_uri = io_utils.get_only_uri_in_dir(
      artifact_utils.get_split_uri(statistics, split_name))
  stats = tfdv.load_statistics(stats_uri)
  return stats
def display_stats_for_examples(self, examples_id):
  """Displays stats for `examples_id`.

  Args:
    examples_id: An `int` indicating the id of a `TFXArtifactTypes.EXAMPLES`
      artifact.
  """
  stats_artifact = self.get_dest_artifact_of_type(
      examples_id, TFXArtifactTypes.EXAMPLE_STATS)
  if stats_artifact:
    stats = tfdv.load_statistics(
        os.path.join(stats_artifact.uri, 'stats_tfrecord'))
    tfdv.visualize_statistics(stats)
    print("display schema")
    tfdv.display_schema(tfdv.infer_schema(statistics=stats))
def display(self, artifact: types.Artifact):
  from IPython.core.display import display  # pylint: disable=g-import-not-at-top
  from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top
  for split in artifact_utils.decode_split_names(artifact.split_names):
    display(HTML('<div><b>%r split:</b></div><br/>' % split))
    stats_path = os.path.join(artifact.uri, split, 'stats_tfrecord')
    stats = tfdv.load_statistics(stats_path)
    tfdv.visualize_statistics(stats)
def validate_stats_against_schema(self):
  # type: () -> bool
  stats = tfdv.load_statistics(self.stats_path)
  self.anomalies = tfdv.validate_statistics(stats, self.schema)
  if len(self.anomalies.anomaly_info.items()) > 0:
    logger.error("Anomalies found in training dataset...")
    logger.error(str(self.anomalies.anomaly_info.items()))
    self.upload_anomalies()
    return False
  else:
    logger.info("No anomalies found")
    return True
def parse_schema_from_stats(cls, stats_path):
  # type: (str) -> Tuple[Dict[str, Union[tf.FixedLenFeature, tf.VarLenFeature, tf.SparseFeature]], Schema]  # noqa: E501
  """
  Returns TensorFlow Feature Spec and parsed tf.metadata Schema for given
  tf.metadata DatasetFeatureStatisticsList.

  :param stats_path: tf.metadata DatasetFeatureStatisticsList path
  """
  import tensorflow_data_validation as tfdv
  stats = tfdv.load_statistics(stats_path)
  schema = tfdv.infer_schema(stats)
  return schema_to_feature_spec(schema), schema
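# The schema_to_feature_spec helper above is project-specific. A commonly used
# equivalent, assuming tensorflow_transform is available, is
# schema_utils.schema_as_feature_spec, which parses a feature spec out of a
# tf.metadata Schema; treat the exact import path as an assumption. The stats
# path below is a hypothetical placeholder.
import tensorflow_data_validation as tfdv
from tensorflow_transform.tf_metadata import schema_utils

stats = tfdv.load_statistics('/tmp/train_stats.tfrecord')  # hypothetical path
schema = tfdv.infer_schema(stats)
feature_spec = schema_utils.schema_as_feature_spec(schema).feature_spec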
def infer_schema(stats_path, schema_path):
  """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
  print('Inferring schema from statistics.')
  schema = tfdv.infer_schema(
      tfdv.load_statistics(stats_path), infer_feature_shape=False)
  print(text_format.MessageToString(schema))

  print('## Statistics ##')
  stats = tfdv.load_statistics(stats_path)
  # print(text_format.MessageToString(stats))
  for d in stats.datasets:
    for f in d.features:
      print(f.name)

  print('Writing schema to output path.')
  file_io.write_string_to_file(schema_path,
                               text_format.MessageToString(schema))
def display(self, artifact: types.Artifact):
  from IPython.core.display import display  # pylint: disable=g-import-not-at-top
  from IPython.core.display import HTML  # pylint: disable=g-import-not-at-top
  for split in artifact_utils.decode_split_names(artifact.split_names):
    display(HTML('<div><b>%r split:</b></div><br/>' % split))
    stats_path = io_utils.get_only_uri_in_dir(
        artifact_utils.get_split_uri([artifact], split))
    if artifact_utils.is_artifact_version_older_than(
        artifact, artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE):  # pylint: disable=protected-access
      stats = tfdv.load_statistics(stats_path)
    else:
      stats = tfdv.load_stats_binary(stats_path)
    tfdv.visualize_statistics(stats)
def infer_schema(stats_path, schema_path):
  """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
  print('Inferring schema from statistics.')
  schema = tfdv.infer_schema(
      tfdv.load_statistics(stats_path), infer_feature_shape=False)
  print(text_format.MessageToString(schema))

  print('Writing schema to output path.')
  tfdv.write_schema_text(schema, schema_path)
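# A minimal round-trip sketch (assumed paths, not from the snippet above): a
# schema written with tfdv.write_schema_text can be reloaded with
# tfdv.load_schema_text and used to validate freshly computed statistics.
import tensorflow_data_validation as tfdv

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')           # hypothetical path
new_stats = tfdv.load_statistics('/tmp/eval_stats.tfrecord')  # hypothetical path
anomalies = tfdv.validate_statistics(new_stats, schema)
tfdv.display_anomalies(anomalies)  # renders a table in a notebook environment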
def infer_schema(stats_path, schema_path):
  """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
  # Inferring schema from statistics.
  schema = tfdv.infer_schema(
      tfdv.load_statistics(stats_path), infer_feature_shape=False)

  # Writing schema to output path.
  file_io.write_string_to_file(schema_path,
                               text_format.MessageToString(schema))
def display_stats_for_examples(self, examples_id, split='train'):
  """Displays stats for `examples_id`.

  Args:
    examples_id: An `int` indicating the id of a `TFXArtifactTypes.EXAMPLES`
      artifact.
    split: A `string` specifying the split name. Defaults to 'train'.
  """
  stats_artifact = self.get_dest_artifact_of_type(
      examples_id, TFXArtifactTypes.EXAMPLE_STATS)
  if stats_artifact:
    tfdv.visualize_statistics(
        tfdv.load_statistics(
            os.path.join(stats_artifact.uri, split, 'stats_tfrecord')))
def infer_schema(stats_path, schema_path):
  """Infers a schema from stats in stats_path.

  Args:
    stats_path: Location of the stats used to infer the schema.
    schema_path: Location where the inferred schema is materialized.
  """
  print('Inferring schema from statistics.')
  schema = tfdv.infer_schema(
      tfdv.load_statistics(stats_path), infer_feature_shape=False)
  print(text_format.MessageToString(schema))

  print('Writing schema to output path.')
  file_io.write_string_to_file(schema_path,
                               text_format.MessageToString(schema))
def tfdv_detect_drift(
    stats_older_path: str, stats_new_path: str
) -> NamedTuple('Outputs', [('drift', str)]):

  import logging
  import time
  import tensorflow_data_validation as tfdv
  import tensorflow_data_validation.statistics.stats_impl

  logging.getLogger().setLevel(logging.INFO)
  logging.info('stats_older_path: %s', stats_older_path)
  logging.info('stats_new_path: %s', stats_new_path)

  if stats_older_path == 'none':
    return ('true', )

  stats1 = tfdv.load_statistics(stats_older_path)
  stats2 = tfdv.load_statistics(stats_new_path)

  schema1 = tfdv.infer_schema(statistics=stats1)
  tfdv.get_feature(
      schema1,
      'duration').drift_comparator.jensen_shannon_divergence.threshold = 0.01
  drift_anomalies = tfdv.validate_statistics(
      statistics=stats2, schema=schema1, previous_statistics=stats1)
  logging.info('drift analysis results: %s', drift_anomalies.drift_skew_info)

  from google.protobuf.json_format import MessageToDict
  d = MessageToDict(drift_anomalies)
  val = d['driftSkewInfo'][0]['driftMeasurements'][0]['value']
  thresh = d['driftSkewInfo'][0]['driftMeasurements'][0]['threshold']
  logging.info('value %s and threshold %s', val, thresh)
  res = 'true'
  if val < thresh:
    res = 'false'
  logging.info('train decision: %s', res)
  return (res, )
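# A hedged alternative sketch (not part of the component above): reading the
# drift measurements from the Anomalies proto directly instead of going through
# MessageToDict. Field names follow tensorflow_metadata's anomalies.proto
# (drift_skew_info -> drift_measurements -> value/threshold); verify them
# against the installed tensorflow_metadata version.
from tensorflow_metadata.proto.v0 import anomalies_pb2


def drift_exceeds_threshold(anomalies: anomalies_pb2.Anomalies) -> bool:
  """Returns True if any feature's drift measurement reaches its threshold."""
  for info in anomalies.drift_skew_info:
    for measurement in info.drift_measurements:
      if measurement.value >= measurement.threshold:
        return True
  return False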
def validate_stats(stats_path, schema_path, anomalies_path):
  """Validates the statistics against the schema and materializes anomalies.

  Args:
    stats_path: Location of the stats to validate.
    schema_path: Location of the schema to be used for validation.
    anomalies_path: Location where the detected anomalies are materialized.
  """
  # Validating schema against the computed statistics.
  schema = my_metadata.read_schema(schema_path)
  stats = tfdv.load_statistics(stats_path)
  anomalies = tfdv.validate_statistics(stats, schema)

  # Writing anomalies to the anomalies path.
  file_io.write_string_to_file(anomalies_path,
                               text_format.MessageToString(anomalies))
def validate_stats(stats_path, schema_path, anomalies_path):
  """Validates the statistics against the schema and materializes anomalies.

  Args:
    stats_path: Location of the stats to validate.
    schema_path: Location of the schema to be used for validation.
    anomalies_path: Location where the detected anomalies are materialized.
  """
  print('Validating schema against the computed statistics.')
  schema = tfdv.load_schema_text(schema_path)
  stats = tfdv.load_statistics(stats_path)
  anomalies = tfdv.validate_statistics(stats, schema)

  print('Detected the following anomalies:')
  print(text_format.MessageToString(anomalies))

  print('Writing anomalies to anomalies path.')
  file_io.write_string_to_file(anomalies_path,
                               text_format.MessageToString(anomalies))
def validate_stats(stats_path, schema_path, anomalies_path):
  """Validates the statistics against the schema and materializes anomalies.

  Args:
    stats_path: Location of the stats to validate.
    schema_path: Location of the schema to be used for validation.
    anomalies_path: Location where the detected anomalies are materialized.
  """
  print('Validating schema against the computed statistics.')
  schema = taxi.read_schema(schema_path)
  stats = tfdv.load_statistics(stats_path)
  anomalies = tfdv.validate_statistics(stats, schema)

  print('Detected the following anomalies:')
  print(text_format.MessageToString(anomalies))

  print('Writing anomalies to anomalies path.')
  file_io.write_string_to_file(anomalies_path,
                               text_format.MessageToString(anomalies))
def group_stats_and_examples(
    input_dict: Dict[Text, List[types.Artifact]]
) -> List[Tuple[types.Artifact, Dict[Text, DatasetFeatureStatistics]]]:
  result = []
  examples_list = input_dict[EXAMPLES_KEY]
  if len(examples_list) > 1:
    raise ValueError('only one examples artifact expected')
  examples = examples_list[0]
  group = {}
  for split in artifact_utils.decode_split_names(examples.split_names):
    statistics = tfdv.load_statistics(
        io_utils.get_only_uri_in_dir(
            types.artifact_utils.get_split_uri(input_dict[STATISTICS_KEY],
                                               split)))
    if len(statistics.datasets) != 1:
      raise ValueError('one statistics set expected')
    group[split] = statistics.datasets[0]
  result.append((examples, group))
  return result
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """TensorFlow ExampleValidator executor entrypoint.

  This validates the statistics on the 'eval' split against the schema.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of type `standard_artifacts.ExampleStatistics` which
        should contain the 'eval' split. Stats on other splits are ignored.
      - schema: A list of type `standard_artifacts.Schema` which should
        contain a single schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'ExampleValidationPath' artifact of size one. It
        will include a single pbtxt file which contains all anomalies found.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  absl.logging.info('Validating schema against the computed statistics.')
  label_inputs = {
      labels.STATS:
          tfdv.load_statistics(
              io_utils.get_only_uri_in_dir(
                  artifact_utils.get_split_uri(input_dict[STATISTICS_KEY],
                                               'eval'))),
      labels.SCHEMA:
          io_utils.SchemaReader().read(
              io_utils.get_only_uri_in_dir(
                  artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
  }
  output_uri = artifact_utils.get_single_uri(output_dict[ANOMALIES_KEY])
  label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
  self._Validate(label_inputs, label_outputs)
  absl.logging.info(
      'Validation complete. Anomalies written to {}.'.format(output_uri))
def validate_stats_against_schema(self,
                                  environment=None,
                                  previous_statistics=None,
                                  serving_statistics=None,
                                  ):
  # type: (str, DatasetFeatureStatisticsList, DatasetFeatureStatisticsList) -> bool
  stats = tfdv.load_statistics(self.stats_path)
  self.anomalies = tfdv.validate_statistics(
      stats,
      self.schema,
      environment=environment,
      previous_statistics=previous_statistics,
      serving_statistics=serving_statistics,
  )
  if len(self.anomalies.anomaly_info.items()) > 0:
    logger.error("Anomalies found in training dataset...")
    logger.error(str(self.anomalies.anomaly_info.items()))
    self.upload_anomalies()
    return False
  else:
    logger.info("No anomalies found")
    return True
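# A minimal sketch of how the `environment` argument used above interacts with
# a schema, loosely following the TFDV get-started guide. The 'tips' feature
# and the file paths are assumptions for illustration only.
import tensorflow_data_validation as tfdv

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')                   # hypothetical path
serving_stats = tfdv.load_statistics('/tmp/serving_stats.tfrecord')   # hypothetical path

# Features belong to all listed environments unless stated otherwise.
schema.default_environment.append('TRAINING')
schema.default_environment.append('SERVING')

# Mark a label-like feature as absent from the SERVING environment so its
# absence in serving data is not reported as an anomaly.
tfdv.get_feature(schema, 'tips').not_in_environment.append('SERVING')

serving_anomalies = tfdv.validate_statistics(
    serving_stats, schema, environment='SERVING')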
def Do(self, input_dict, output_dict, exec_properties):
  """TensorFlow ExampleValidator executor entrypoint.

  This validates the statistics on the 'eval' split against the schema.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - stats: A list of 'ExampleStatisticsPath' type which should contain
        split 'eval'. Stats on other splits are ignored.
      - schema: A list of 'SchemaPath' type which should contain a single
        schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'ExampleValidationPath' artifact of size one. It
        will include a single pbtxt file which contains all anomalies found.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  tf.logging.info('Validating schema against the computed statistics.')
  schema = io_utils.SchemaReader().read(
      io_utils.get_only_uri_in_dir(
          types.get_single_uri(input_dict['schema'])))
  stats = tfdv.load_statistics(
      io_utils.get_only_uri_in_dir(
          types.get_split_uri(input_dict['stats'], 'eval')))
  output_uri = types.get_single_uri(output_dict['output'])

  anomalies = tfdv.validate_statistics(stats, schema)
  io_utils.write_pbtxt_file(
      os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
  tf.logging.info(
      'Validation complete. Anomalies written to {}.'.format(output_uri))
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """TensorFlow SchemaGen executor entrypoint.

  This infers the schema using tensorflow_data_validation on the precomputed
  stats of 'train' split.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - 'statistics': A list of 'ExampleStatistics' type which must contain
        split 'train'.
    output_dict: Output dict from key to a list of artifacts, including:
      - schema: A list of 'Schema' artifact of size one.
    exec_properties: A dict of execution properties, includes:
      - infer_feature_shape: Whether or not to infer the shape of the feature.
      - exclude_splits: Names of splits that will not be taken into
        consideration when auto-generating a schema.

  Returns:
    None
  """
  infer_feature_shape = bool(
      exec_properties.get(standard_component_specs.INFER_FEATURE_SHAPE_KEY,
                          True))

  # Load and deserialize exclude splits from execution properties.
  exclude_splits = json_utils.loads(
      exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                          'null')) or []
  if not isinstance(exclude_splits, list):
    raise ValueError('exclude_splits in execution properties needs to be a '
                     'list. Got %s instead.' % type(exclude_splits))

  # Only one schema is generated for all splits.
  schema = None
  stats_artifact = artifact_utils.get_single_instance(
      input_dict[standard_component_specs.STATISTICS_KEY])
  for split in artifact_utils.decode_split_names(stats_artifact.split_names):
    if split in exclude_splits:
      continue

    logging.info('Processing schema from statistics for split %s.', split)
    stats_uri = io_utils.get_only_uri_in_dir(
        artifact_utils.get_split_uri([stats_artifact], split))
    if artifact_utils.is_artifact_version_older_than(
        stats_artifact, artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE):  # pylint: disable=protected-access
      stats = tfdv.load_statistics(stats_uri)
    else:
      stats = tfdv.load_stats_binary(stats_uri)
    if not schema:
      schema = tfdv.infer_schema(stats, infer_feature_shape)
    else:
      schema = tfdv.update_schema(schema, stats, infer_feature_shape)

  output_uri = os.path.join(
      artifact_utils.get_single_uri(
          output_dict[standard_component_specs.SCHEMA_KEY]),
      DEFAULT_FILE_NAME)
  io_utils.write_pbtxt_file(output_uri, schema)
  logging.info('Schema written to %s.', output_uri)
def Do(self, input_dict: Dict[Text, List[Artifact]],
       output_dict: Dict[Text, List[Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Generate MetaFeatures for meta training datasets.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - statistics: output from StatisticsGen component.
    output_dict: Output dict from key to a list of artifacts.
    exec_properties: A dict of execution properties.
  """
  train_stats_uri = io_utils.get_only_uri_in_dir(
      artifact_utils.get_split_uri(input_dict[STATISTICS_KEY], 'train'))
  stats = tfdv.load_statistics(train_stats_uri)

  if len(stats.datasets) != 1:
    raise ValueError(
        'DatasetFeatureStatisticsList proto contains multiple datasets. Only '
        'one dataset is currently supported.')
  stats = stats.datasets[0]

  num_float_features = 0
  num_int_features = 0
  num_categorical_features = 0
  for feature in stats.features:
    name = feature.name
    # For structured fields, name is set by path and is not in the name
    # attribute.
    if not name:
      name = feature.path.step[0]
    logging.info('Feature Name: %s', name)

    if statistics_pb2.FeatureNameStatistics.FLOAT == feature.type:
      num_float_features += 1
    elif statistics_pb2.FeatureNameStatistics.INT == feature.type:
      num_int_features += 1
    else:
      num_categorical_features += 1

  metafeature_dict = {
      'num_examples': stats.num_examples,
      'num_int_features': num_int_features,
      'num_float_features': num_float_features,
      'num_categorical_features': num_categorical_features,
  }
  metafeature_dict['metafeature'] = [
      stats.num_examples, num_int_features, num_float_features,
      num_categorical_features
  ]
  metafeature_path = os.path.join(
      artifact_utils.get_single_uri(output_dict[METAFEATURES_KEY]),
      artifacts.MetaFeatures.DEFAULT_FILE_NAME)
  io_utils.write_string_file(metafeature_path, json.dumps(metafeature_dict))
  logging.info('MetaFeature saved at %s', metafeature_path)
def display(self, artifact: types.Artifact):
  stats_path = os.path.join(artifact.uri, 'stats_tfrecord')
  stats = tfdv.load_statistics(stats_path)
  tfdv.visualize_statistics(stats)
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """TensorFlow ExampleValidator executor entrypoint.

  This validates statistics against the schema.

  Args:
    input_dict: Input dict from input key to a list of artifacts, including:
      - statistics: A list of type `standard_artifacts.ExampleStatistics`
        generated by StatisticsGen.
      - schema: A list of type `standard_artifacts.Schema` which should
        contain a single schema artifact.
    output_dict: Output dict from key to a list of artifacts, including:
      - output: A list of 'standard_artifacts.ExampleAnomalies' of size one.
        It will include a single binary proto file which contains all
        anomalies found.
    exec_properties: A dict of execution properties.
      - exclude_splits: JSON-serialized list of names of splits that the
        example validator should not validate.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # Load and deserialize exclude splits from execution properties.
  exclude_splits = json_utils.loads(
      exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                          'null')) or []
  if not isinstance(exclude_splits, list):
    raise ValueError('exclude_splits in execution properties needs to be a '
                     'list. Got %s instead.' % type(exclude_splits))

  # Setup output splits.
  stats_artifact = artifact_utils.get_single_instance(
      input_dict[standard_component_specs.STATISTICS_KEY])
  stats_split_names = artifact_utils.decode_split_names(
      stats_artifact.split_names)
  split_names = [
      split for split in stats_split_names if split not in exclude_splits
  ]
  anomalies_artifact = artifact_utils.get_single_instance(
      output_dict[standard_component_specs.ANOMALIES_KEY])
  anomalies_artifact.split_names = artifact_utils.encode_split_names(
      split_names)

  schema = io_utils.SchemaReader().read(
      io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(
              input_dict[standard_component_specs.SCHEMA_KEY])))

  for split in artifact_utils.decode_split_names(stats_artifact.split_names):
    if split in exclude_splits:
      continue

    logging.info(
        'Validating schema against the computed statistics for split %s.',
        split)
    stats_uri = io_utils.get_only_uri_in_dir(
        artifact_utils.get_split_uri([stats_artifact], split))
    if artifact_utils.is_artifact_version_older_than(
        stats_artifact, artifact_utils._ARTIFACT_VERSION_FOR_STATS_UPDATE):  # pylint: disable=protected-access
      stats = tfdv.load_statistics(stats_uri)
    else:
      stats = tfdv.load_stats_binary(stats_uri)
    label_inputs = {
        standard_component_specs.STATISTICS_KEY: stats,
        standard_component_specs.SCHEMA_KEY: schema
    }
    output_uri = artifact_utils.get_split_uri(
        output_dict[standard_component_specs.ANOMALIES_KEY], split)
    label_outputs = {labels.SCHEMA_DIFF_PATH: output_uri}
    self._Validate(label_inputs, label_outputs)
    logging.info(
        'Validation complete for split %s. Anomalies written to %s.', split,
        output_uri)
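# A hedged follow-up sketch (modeled on the TFDV tutorial pattern, not part of
# the executor above) of what typically happens after anomalies are reported:
# the schema is revised and validation is re-run. The feature names ('company',
# 'payment_type'), the domain value, and the paths are assumptions.
import tensorflow_data_validation as tfdv

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')            # hypothetical path
eval_stats = tfdv.load_statistics('/tmp/eval_stats.tfrecord')  # hypothetical path

# Relax the minimum fraction of values that must come from the domain of the
# assumed 'company' feature.
tfdv.get_feature(schema, 'company').distribution_constraints.min_domain_mass = 0.9

# Add a newly observed value to the domain of the assumed 'payment_type' feature.
tfdv.get_domain(schema, 'payment_type').value.append('prcard')

updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)  # renders in a notebook environment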