def generate_statistics_from_tfrecord(
    data_location: Text,
    output_path: Optional[bytes] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and return the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to create
  their own Beam pipelines need to use the 'GenerateStatistics' PTransform API
  directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to output data statistics result to. If None, we
      use a temporary directory. It will be a TFRecord file containing a single
      data statistics proto, and can be read with the 'load_statistics' API.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params
      for more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  batch_size = stats_options.desired_batch_size
  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # Auto detect tfrecord file compression format based on input data
    # path suffix.
    _ = (
        p
        | 'ReadData' >> (tf_example_record.TFExampleRecord(
            file_pattern=data_location,
            schema=None,
            telemetry_descriptors=[
                'tfdv', 'generate_statistics_from_tfrecord'
            ]).BeamSource(batch_size))
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        | 'WriteStatsOutput' >>
        (stats_api.WriteStatisticsToTFRecord(output_path)))
  return stats_util.load_statistics(output_path)
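# Usage sketch (not part of the original source): the paths below are
# placeholders, and the example assumes this function is exposed through the
# public tensorflow_data_validation package (commonly imported as `tfdv`).
import tensorflow_data_validation as tfdv

train_stats = tfdv.generate_statistics_from_tfrecord(
    data_location='/tmp/data/train*.tfrecord',      # placeholder input path
    output_path='/tmp/stats/train_stats.tfrecord',  # placeholder output path
    stats_options=tfdv.StatsOptions())
print(len(train_stats.datasets))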
def test_write_stats_to_tfrecord(self):
  stats = text_format.Parse(
      """
      datasets {
        name: 'x'
        num_examples: 100
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  output_path = os.path.join(self._get_temp_dir(), 'stats')
  with beam.Pipeline() as p:
    _ = (p
         | beam.Create([stats])
         | stats_api.WriteStatisticsToTFRecord(output_path))
  stats_from_file = stats_util.load_statistics(output_path)
  self.assertLen(stats_from_file.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, stats_from_file.datasets[0], stats.datasets[0])
def generate_statistics_from_csv(
    data_location: Text,
    column_names: Optional[List[types.FeatureName]] = None,
    delimiter: Text = ',',
    output_path: Optional[bytes] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
    compression_type: Text = CompressionTypes.AUTO,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Compute data statistics from CSV files.

  Runs a Beam pipeline to compute the data statistics and return the result
  data statistics proto.

  This is a convenience method for users with data in CSV format.
  Users with data in unsupported file/data formats, or users who wish to create
  their own Beam pipelines need to use the 'GenerateStatistics' PTransform API
  directly instead.

  Args:
    data_location: The location of the input data files.
    column_names: A list of column names to be treated as the CSV header. Order
      must match the order in the input CSV files. If this argument is not
      specified, we assume the first line in the input CSV files as the header.
      Note that this option is valid only for 'csv' input file format.
    delimiter: A one-character string used to separate fields in a CSV file.
    output_path: The file path to output data statistics result to. If None, we
      use a temporary directory. It will be a TFRecord file containing a single
      data statistics proto, and can be read with the 'load_statistics' API.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params
      for more details.
    compression_type: Used to handle compressed input files. Default value is
      CompressionTypes.AUTO, in which case the file_path's extension will be
      used to detect the compression.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  batch_size = (
      stats_options.desired_batch_size if stats_options.desired_batch_size
      and stats_options.desired_batch_size > 0 else
      constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # If a header is not provided, assume the first line in a file
    # to be the header.
    skip_header_lines = 1 if column_names is None else 0
    if column_names is None:
      column_names = get_csv_header(data_location, delimiter)
    _ = (
        p
        | 'ReadData' >> beam.io.textio.ReadFromText(
            file_pattern=data_location,
            skip_header_lines=skip_header_lines,
            compression_type=compression_type)
        | 'DecodeData' >> csv_decoder.DecodeCSV(
            column_names=column_names,
            delimiter=delimiter,
            schema=stats_options.schema
            if stats_options.infer_type_from_schema else None,
            desired_batch_size=batch_size)
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        | 'WriteStatsOutput' >>
        stats_api.WriteStatisticsToTFRecord(output_path))
  return stats_util.load_statistics(output_path)
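# Usage sketch (not part of the original source): paths and column names are
# placeholders. If `column_names` is omitted, the first line of each CSV file
# is treated as the header, as described in the docstring above.
import tensorflow_data_validation as tfdv

csv_stats = tfdv.generate_statistics_from_csv(
    data_location='/tmp/data/*.csv',          # placeholder input pattern
    column_names=['age', 'income', 'label'],  # placeholder header
    delimiter=',',
    output_path='/tmp/stats/csv_stats.tfrecord')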
def validate_examples_in_tfrecord(
    data_location: Text,
    stats_options: options.StatsOptions,
    output_path: Optional[Text] = None,
    # TODO(b/131719250): Add option to output a sample of anomalous examples
    # for each anomaly reason.
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Validates TFExamples in TFRecord files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics
  regarding the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to create
  their own Beam pipelines need to use the 'IdentifyAnomalousExamples'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This
      must contain a schema.
    output_path: The file path to output data statistics result to. If None,
      the function uses a temporary directory. The output will be a TFRecord
      file containing a single data statistics list proto, and can be read
      with the 'load_statistics' function. If you run this function on Google
      Cloud, you must specify an output_path. Specifying None may cause an
      error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params
      for more details.

  Returns:
    A DatasetFeatureStatisticsList proto in which each dataset consists of the
    set of examples that exhibit a particular anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
  if stats_options.schema is None:
    raise ValueError('The specified stats_options must include a schema.')
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'anomaly_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)
  with beam.Pipeline(options=pipeline_options) as p:
    _ = (
        p
        | 'ReadData' >> (tf_example_record.TFExampleRecord(
            file_pattern=data_location,
            schema=None,
            telemetry_descriptors=[
                'tfdv', 'validate_examples_in_tfrecord'
            ]).BeamSource(batch_size=1))
        | 'DetectAnomalies' >>
        validation_api.IdentifyAnomalousExamples(stats_options)
        | 'GenerateSummaryStatistics' >>
        stats_impl.GenerateSlicedStatisticsImpl(
            stats_options, is_slicing_enabled=True)
        | 'WriteStatsOutput' >>
        stats_api.WriteStatisticsToTFRecord(output_path))
  return stats_util.load_statistics(output_path)
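# Usage sketch (not part of the original source): a schema is required in
# stats_options. The schema path is a placeholder, and loading it via
# tfdv.load_schema_text assumes the standard TFDV schema utilities.
import tensorflow_data_validation as tfdv

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')  # placeholder path
anomaly_stats = validate_examples_in_tfrecord(
    data_location='/tmp/data/eval*.tfrecord',        # placeholder path
    stats_options=tfdv.StatsOptions(schema=schema))
# Each dataset in the result summarizes the examples exhibiting one anomaly.
for dataset in anomaly_stats.datasets:
  print(dataset.name, dataset.num_examples)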
def validate_examples_in_csv(
    data_location: Text,
    stats_options: options.StatsOptions,
    column_names: Optional[List[types.FeatureName]] = None,
    delimiter: Text = ',',
    output_path: Optional[Text] = None,
    # TODO(b/131719250): Add option to output a sample of anomalous examples
    # for each anomaly reason.
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Validates examples in csv files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics
  regarding the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in CSV format.
  Users with data in unsupported file/data formats, or users who wish to create
  their own Beam pipelines need to use the 'IdentifyAnomalousExamples'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This
      must contain a schema.
    column_names: A list of column names to be treated as the CSV header. Order
      must match the order in the input CSV files. If this argument is not
      specified, we assume the first line in the input CSV files as the header.
      Note that this option is valid only for 'csv' input file format.
    delimiter: A one-character string used to separate fields in a CSV file.
    output_path: The file path to output data statistics result to. If None,
      the function uses a temporary directory. The output will be a TFRecord
      file containing a single data statistics list proto, and can be read
      with the 'load_statistics' function. If you run this function on Google
      Cloud, you must specify an output_path. Specifying None may cause an
      error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params
      for more details.

  Returns:
    A DatasetFeatureStatisticsList proto in which each dataset consists of the
    set of examples that exhibit a particular anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
  if stats_options.schema is None:
    raise ValueError('The specified stats_options must include a schema.')
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'anomaly_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  # If a header is not provided, assume the first line in a file
  # to be the header.
  skip_header_lines = 1 if column_names is None else 0
  if column_names is None:
    column_names = stats_gen_lib.get_csv_header(data_location, delimiter)

  with beam.Pipeline(options=pipeline_options) as p:
    _ = (
        p
        | 'ReadData' >> beam.io.textio.ReadFromText(
            file_pattern=data_location, skip_header_lines=skip_header_lines)
        | 'DecodeData' >> csv_decoder.DecodeCSV(
            column_names=column_names,
            delimiter=delimiter,
            schema=stats_options.schema
            if stats_options.infer_type_from_schema else None,
            desired_batch_size=1)
        | 'DetectAnomalies' >>
        validation_api.IdentifyAnomalousExamples(stats_options)
        | 'GenerateSummaryStatistics' >>
        stats_impl.GenerateSlicedStatisticsImpl(
            stats_options, is_slicing_enabled=True)
        | 'WriteStatsOutput' >>
        stats_api.WriteStatisticsToTFRecord(output_path))
  return stats_util.load_statistics(output_path)
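# Usage sketch (not part of the original source): mirrors the TFRecord variant
# above but for CSV input; the paths, header, and schema are placeholders.
import tensorflow_data_validation as tfdv

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')  # placeholder path
csv_anomaly_stats = validate_examples_in_csv(
    data_location='/tmp/data/eval*.csv',             # placeholder pattern
    stats_options=tfdv.StatsOptions(schema=schema),
    column_names=['age', 'income', 'label'],         # placeholder header
    delimiter=',')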
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of StatsOptions.
        When a schema is provided as an input, the StatsOptions value should
        not also contain a schema.
      - exclude_splits: JSON-serialized list of names of splits where
        statistics and sample should not be generated.

  Raises:
    ValueError when a schema is provided both as an input and as part of the
    StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  stats_options = options.StatsOptions()
  stats_options_json = exec_properties.get(STATS_OPTIONS_JSON_KEY)
  if stats_options_json:
    # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
    # json_utils
    stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
      stats_options.schema = schema

  # Load and deserialize exclude splits from execution properties.
  exclude_splits = json_utils.loads(
      exec_properties.get(EXCLUDE_SPLITS_KEY, 'null')) or []
  if not isinstance(exclude_splits, list):
    raise ValueError('exclude_splits in execution properties needs to be a '
                     'list. Got %s instead.' % type(exclude_splits))

  split_and_tfxio = []
  examples = artifact_utils.get_single_instance(input_dict[EXAMPLES_KEY])
  tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
      examples=[examples], telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
  for split in artifact_utils.decode_split_names(examples.split_names):
    if split in exclude_splits:
      continue
    uri = os.path.join(examples.uri, split)
    split_and_tfxio.append(
        (split, tfxio_factory(io_utils.all_files_pattern(uri))))

  with self._make_beam_pipeline() as p:
    for split, tfxio in split_and_tfxio:
      logging.info('Generating statistics for split %s.', split)
      output_uri = artifact_utils.get_split_uri(output_dict[STATISTICS_KEY],
                                                split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      data = p | 'TFXIORead[%s]' % split >> tfxio.BeamSource()
      _ = (
          data
          | 'GenerateStatistics[%s]' % split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput[%s]' % split >>
          stats_api.WriteStatisticsToTFRecord(output_path))
      logging.info('Statistics for split %s written to %s.', split,
                   output_uri)
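# Hedged sketch (not from the original source) of the execution properties
# this executor consumes: stats_options serialized with
# options.StatsOptions.to_json(), and exclude_splits as a JSON-serialized
# list. The key constants mirror those referenced above; the values are
# illustrative only.
example_exec_properties = {
    STATS_OPTIONS_JSON_KEY: options.StatsOptions(num_top_values=20).to_json(),
    EXCLUDE_SPLITS_KEY: json_utils.dumps(['eval']),
}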
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This should
        contain both 'train' and 'eval' split.
      - schema: Optionally, a list of type `standard_artifacts.Schema`. When
        the stats_options exec_property also contains a schema, this input
        should not be provided.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties.
      - stats_options_json: Optionally, a JSON representation of StatsOptions.
        When a schema is provided as an input, the StatsOptions value should
        not also contain a schema.

  Raises:
    ValueError when a schema is provided both as an input and as part of the
    StatsOptions exec_property.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  stats_options = options.StatsOptions()
  if STATS_OPTIONS_JSON_KEY in exec_properties:
    stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
  if input_dict.get(SCHEMA_KEY):
    if stats_options.schema:
      raise ValueError('A schema was provided as an input and the '
                       'stats_options exec_property also contains a schema '
                       'value. At most one of these may be set.')
    else:
      schema = io_utils.SchemaReader().read(
          io_utils.get_only_uri_in_dir(
              artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
      stats_options.schema = schema

  split_uris = []
  for artifact in input_dict[EXAMPLES_KEY]:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))

  with self._make_beam_pipeline() as p:
    for split, uri in split_uris:
      absl.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(uri)
      input_tfxio = tf_example_record.TFExampleRecord(
          file_pattern=input_uri,
          telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
      output_uri = artifact_utils.get_split_uri(output_dict[STATISTICS_KEY],
                                                split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      data = p | 'TFXIORead[{}]'.format(split) >> input_tfxio.BeamSource()
      _ = (
          data
          | 'GenerateStatistics[{}]'.format(split) >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput[{}]'.format(split) >>
          stats_api.WriteStatisticsToTFRecord(output_path))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
def generate_statistics_from_tfrecord(
    data_location: Text,
    output_path: Optional[bytes] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
    compression_type: Text = CompressionTypes.AUTO,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and return the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to create
  their own Beam pipelines need to use the 'GenerateStatistics' PTransform API
  directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to output data statistics result to. If None, we
      use a temporary directory. It will be a TFRecord file containing a single
      data statistics proto, and can be read with the 'load_statistics' API.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params
      for more details.
    compression_type: Used to handle compressed input files. Default value is
      CompressionTypes.AUTO, in which case the file_path's extension will be
      used to detect the compression.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  batch_size = (
      stats_options.desired_batch_size if stats_options.desired_batch_size
      and stats_options.desired_batch_size > 0 else
      constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # Auto detect tfrecord file compression format based on input data
    # path suffix.
    _ = (
        p
        | 'ReadData' >> beam.io.ReadFromTFRecord(
            file_pattern=data_location, compression_type=compression_type)
        | 'DecodeData' >>
        tf_example_decoder.DecodeTFExample(desired_batch_size=batch_size)
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        # TODO(b/112014711) Implement a custom sink to write the stats proto.
        | 'WriteStatsOutput' >>
        stats_api.WriteStatisticsToTFRecord(output_path))
  return stats_util.load_statistics(output_path)
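# Usage sketch (not part of the original source): reading gzip-compressed
# TFRecord files by overriding compression_type rather than relying on AUTO
# detection from the file extension. The path is a placeholder.
from apache_beam.io.filesystem import CompressionTypes

gzip_stats = generate_statistics_from_tfrecord(
    data_location='/tmp/data/train.tfrecord.gz',  # placeholder path
    compression_type=CompressionTypes.GZIP)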
def validate_examples_in_tfrecord(
    data_location: Text,
    stats_options: options.StatsOptions,
    output_path: Optional[Text] = None,
    pipeline_options: Optional[PipelineOptions] = None,
    num_sampled_examples=0,
) -> Union[statistics_pb2.DatasetFeatureStatisticsList,
           Tuple[statistics_pb2.DatasetFeatureStatisticsList,
                 Mapping[str, List[tf.train.Example]]]]:
  """Validates TFExamples in TFRecord files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics
  regarding the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to create
  their own Beam pipelines need to use the 'IdentifyAnomalousExamples'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This
      must contain a schema.
    output_path: The file path to output data statistics result to. If None,
      the function uses a temporary directory. The output will be a TFRecord
      file containing a single data statistics list proto, and can be read
      with the 'load_statistics' function. If you run this function on Google
      Cloud, you must specify an output_path. Specifying None may cause an
      error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params
      for more details.
    num_sampled_examples: If set, returns up to this many examples of each
      anomaly type as a map from anomaly reason string to a list of
      tf.Examples.

  Returns:
    If num_sampled_examples is zero, returns a single
    DatasetFeatureStatisticsList proto in which each dataset consists of the
    set of examples that exhibit a particular anomaly. If num_sampled_examples
    is nonzero, returns the same statistics proto as well as a mapping from
    anomaly to a list of tf.Examples that exhibited that anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
  if stats_options.schema is None:
    raise ValueError('The specified stats_options must include a schema.')
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'anomaly_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)
  with io_util.Materializer(output_dir_path) as sample_materializer:
    with beam.Pipeline(options=pipeline_options) as p:
      anomalous_examples = (
          p
          | 'ReadData' >> (tf_example_record.TFExampleRecord(
              file_pattern=data_location,
              schema=None,
              telemetry_descriptors=[
                  'tfdv', 'validate_examples_in_tfrecord'
              ]).BeamSource(batch_size=1))
          | 'DetectAnomalies' >>
          validation_api.IdentifyAnomalousExamples(stats_options))
      _ = (
          anomalous_examples
          | 'GenerateSummaryStatistics' >>
          stats_impl.GenerateSlicedStatisticsImpl(
              stats_options, is_slicing_enabled=True)
          | 'WriteStatsOutput' >>
          stats_api.WriteStatisticsToTFRecord(output_path))
      if num_sampled_examples:
        # TODO(b/68154497): Relint
        # pylint: disable=no-value-for-parameter
        _ = (
            anomalous_examples
            | 'Sample' >>
            beam.combiners.Sample.FixedSizePerKey(num_sampled_examples)
            | 'ToExample' >> _record_batch_to_example_fn(
                example_coder.RecordBatchToExamplesEncoder(
                    stats_options.schema))
            | 'WriteSamples' >> sample_materializer.writer())
        # pylint: enable=no-value-for-parameter
    if num_sampled_examples:
      samples_per_reason = collections.defaultdict(list)
      for reason, serialized_example in sample_materializer.reader():
        samples_per_reason[reason].append(
            tf.train.Example.FromString(serialized_example))
      return stats_util.load_statistics(output_path), samples_per_reason
  return stats_util.load_statistics(output_path)