def test_stats_pipeline_with_sample_count(self):
        # Input with three examples.
        examples = [{
            'c': np.linspace(1, 3000, 3000, dtype=np.int32)
        }, {
            'c': np.linspace(1, 3000, 3000, dtype=np.int32)
        }, {
            'c': np.linspace(1, 3000, 3000, dtype=np.int32)
        }]

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                sample_count=1,
                num_top_values=2,
                num_rank_histogram_buckets=2,
                num_values_histogram_buckets=2,
                num_histogram_buckets=2,
                num_quantiles_histogram_buckets=2,
                epsilon=0.001)
            result = (p | beam.Create(examples)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, self._sampling_test_expected_result))
Example #2
 def test_stats_options_with_slice_fns_to_json(self):
     slice_functions = [slicing_util.get_feature_value_slicer({'b': None})]
     options = stats_options.StatsOptions(
         experimental_slice_functions=slice_functions)
     with self.assertRaisesRegex(ValueError,
                                 'StatsOptions cannot be converted'):
         options.to_json()
Example #3
 def test_stats_options_from_json(self):
     options_json = """{
   "_generators": null,
   "_feature_whitelist": null,
   "_schema": null,
   "weight_feature": null,
   "label_feature": null,
   "_slice_functions": null,
   "_sample_rate": null,
   "num_top_values": 20,
   "frequency_threshold": 1,
   "weighted_frequency_threshold": 1.0,
   "num_rank_histogram_buckets": 1000,
   "_num_values_histogram_buckets": 10,
   "_num_histogram_buckets": 10,
   "_num_quantiles_histogram_buckets": 10,
   "epsilon": 0.01,
   "infer_type_from_schema": false,
   "_desired_batch_size": null,
   "enable_semantic_domain_stats": false,
   "_semantic_domain_stats_sample_rate": null
 }"""
     actual_options = stats_options.StatsOptions.from_json(options_json)
     expected_options_dict = stats_options.StatsOptions().__dict__
     self.assertEqual(expected_options_dict, actual_options.__dict__)
 def test_validate_instance_stats_options_without_schema(self):
     instance = {'feature': np.array(['A'])}
     # This instance of StatsOptions has no schema.
     options = stats_options.StatsOptions()
     with self.assertRaisesRegexp(ValueError,
                                  'options must include a schema.'):
         _ = validation_api.validate_instance(instance, options)
    def test_validate_instance_invalid_environment(self):
        instance = {'feature': np.array(['A'])}
        schema = text_format.Parse(
            """
        default_environment: "TRAINING"
        default_environment: "SERVING"
        feature {
          name: "label"
          not_in_environment: "SERVING"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        feature {
          name: "feature"
          value_count { min: 1 max: 1 }
          presence { min_count: 1 }
          type: BYTES
        }
        """, schema_pb2.Schema())
        options = stats_options.StatsOptions(schema=schema)

        with self.assertRaisesRegexp(ValueError,
                                     'Environment.*not found in the schema.*'):
            _ = validation_api.validate_instance(instance,
                                                 options,
                                                 environment='INVALID')
 def setUp(self):
     self._default_stats_options = stats_options.StatsOptions(
         num_top_values=2,
         num_rank_histogram_buckets=2,
         num_values_histogram_buckets=2,
         num_histogram_buckets=2,
         num_quantiles_histogram_buckets=2)
Example #7
 def test_stats_options_from_json(self):
     options_json = """{
   "_generators": null,
   "_feature_allowlist": null,
   "_schema": null,
   "_vocab_paths": null,
   "weight_feature": null,
   "label_feature": null,
   "_slice_functions": null,
   "_sample_rate": null,
   "num_top_values": 20,
   "frequency_threshold": 1,
   "weighted_frequency_threshold": 1.0,
   "num_rank_histogram_buckets": 1000,
   "_num_values_histogram_buckets": 10,
   "_num_histogram_buckets": 10,
   "_num_quantiles_histogram_buckets": 10,
   "epsilon": 0.01,
   "infer_type_from_schema": false,
   "_desired_batch_size": null,
   "enable_semantic_domain_stats": false,
   "_semantic_domain_stats_sample_rate": null,
   "_per_feature_weight_override": null,
   "_add_default_generators": true,
   "_use_sketch_based_topk_uniques": false,
   "_slice_sqls": null,
   "_experimental_result_partitions": 1,
   "_experimental_num_feature_partitions": 1
 }"""
     actual_options = stats_options.StatsOptions.from_json(options_json)
     expected_options_dict = stats_options.StatsOptions().__dict__
     self.assertEqual(expected_options_dict, actual_options.__dict__)
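
A minimal round-trip sketch of the JSON serialization exercised above (a hypothetical usage, assuming options with no schema, generators, or slice functions set, since those make to_json raise):

import tensorflow_data_validation as tfdv

# Round-trip sketch: serializable only because no schema, generators, or
# slice functions are set on the options.
options = tfdv.StatsOptions(num_top_values=20)
options_json = options.to_json()
restored = tfdv.StatsOptions.from_json(options_json)
assert restored.num_top_values == options.num_top_values
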
    def test_stats_pipeline_with_sample_count(self):
        record_batches = [
            pa.RecordBatch.from_arrays(
                [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])],
                ['c']),
            pa.RecordBatch.from_arrays(
                [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])],
                ['c']),
            pa.RecordBatch.from_arrays(
                [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])],
                ['c']),
        ]

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                sample_count=3000,
                num_top_values=2,
                num_rank_histogram_buckets=2,
                num_values_histogram_buckets=2,
                num_histogram_buckets=2,
                num_quantiles_histogram_buckets=2,
                epsilon=0.001,
                desired_batch_size=3000)
            result = (p | beam.Create(record_batches)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, self._sampling_test_expected_result))
 def test_invalid_sample_rate_negative(self):
     examples = [{}]
     with self.assertRaises(ValueError):
         with beam.Pipeline() as p:
             options = stats_options.StatsOptions(sample_rate=-1)
             _ = (p | beam.Create(examples)
                  | stats_api.GenerateStatistics(options))
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Generates statistics for an in-memory list of examples.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    stats_generators = _get_generators(options, in_memory=True)

    batch = batch_util.merge_single_batch(examples)

    # If whitelist features are provided, keep only those features.
    if options.feature_whitelist:
        batch = {
            feature_name: batch[feature_name]
            for feature_name in options.feature_whitelist
        }

    outputs = [
        generator.extract_output(
            generator.add_input(generator.create_accumulator(), batch))
        # The type checker raises a false positive here because the type hint for
        # the return value of _get_generators (which created the list of
        # stats_generators) is StatsGenerator, but add_input, create_accumulator,
        # and extract_output can be called only on CombinerStatsGenerators.
        for generator in stats_generators  # pytype: disable=attribute-error
    ]

    return _make_dataset_feature_statistics_list_proto(
        [_merge_dataset_feature_stats_protos(outputs)])
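
A usage sketch for the in-memory helper above; the feature names and values are hypothetical, and the dict-of-numpy-arrays example format follows the tests in this file:

import numpy as np

# Hypothetical two-example dataset; each example maps feature name -> values.
examples = [
    {'age': np.array([39]), 'workclass': np.array(['Private'])},
    {'age': np.array([50]), 'workclass': np.array(['Self-emp'])},
]
stats = generate_statistics_in_memory(
    examples, options=stats_options.StatsOptions(num_top_values=10))
# stats is a DatasetFeatureStatisticsList proto with a single dataset.
print(stats.datasets[0].num_examples)
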
 def test_invalid_feature_whitelist(self):
     examples = [{'a': np.array([1.0, 2.0])}]
     with self.assertRaises(TypeError):
         with beam.Pipeline() as p:
             options = stats_options.StatsOptions(feature_whitelist={})
             _ = (p | beam.Create(examples)
                  | stats_api.GenerateStatistics(options))
 def test_identify_anomalous_examples(self, examples, schema_text,
                                      expected_result):
     schema = text_format.Parse(schema_text, schema_pb2.Schema())
     options = stats_options.StatsOptions(schema=schema)
     with beam.Pipeline() as p:
         result = (p | beam.Create(examples)
                   | validation_api.IdentifyAnomalousExamples(options))
         util.assert_that(result, util.equal_to(expected_result))
 def test_identify_anomalous_examples_options_without_schema(self):
     examples = [{'annotated_enum': np.array(['D'])}]
     options = stats_options.StatsOptions()
     with self.assertRaisesRegexp(ValueError,
                                  'options must include a schema'):
         with beam.Pipeline() as p:
             _ = (p | beam.Create(examples)
                  | validation_api.IdentifyAnomalousExamples(options))
 def test_invalid_both_sample_count_and_sample_rate(self):
     examples = [{}]
     with self.assertRaises(ValueError):
         with beam.Pipeline() as p:
             options = stats_options.StatsOptions(sample_count=100,
                                                  sample_rate=0.5)
             _ = (p | beam.Create(examples)
                  | stats_api.GenerateStatistics(options))
 def test_validate_instance_global_only_anomaly_type(self):
     instance = {'annotated_enum': np.array(['D'])}
     # This schema has a presence.min_count > 1, which will generate an anomaly
     # of type FEATURE_TYPE_LOW_NUMBER_PRESENT when any single example is
     # validated using this schema. This test checks that this anomaly type
     # (which is not meaningful in per-example validation) is not included in the
     # Anomalies proto that validate_instance returns.
     schema = text_format.Parse(
         """
     string_domain {
       name: "MyAloneEnum"
       value: "A"
       value: "B"
       value: "C"
     }
     feature {
       name: "annotated_enum"
       value_count {
         min:1
         max:1
       }
       presence {
         min_count: 5
       }
       type: BYTES
       domain: "MyAloneEnum"
     }
     feature {
       name: "ignore_this"
       lifecycle_stage: DEPRECATED
       value_count {
         min:1
       }
       presence {
         min_count: 1
       }
       type: BYTES
     }
     """, schema_pb2.Schema())
     expected_anomalies = {
         'annotated_enum':
         text_format.Parse(
             """
   description: "Examples contain values missing from the schema: D "
     "(~100%). "
   severity: ERROR
   short_description: "Unexpected string values"
   reason {
     type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
     short_description: "Unexpected string values"
     description: "Examples contain values missing from the schema: D "
       "(~100%). "
   }
         """, anomalies_pb2.AnomalyInfo())
     }
     options = stats_options.StatsOptions(schema=schema)
     anomalies = validation_api.validate_instance(instance, options)
     self._assert_equal_anomalies(anomalies, expected_anomalies)
def generate_statistics_from_tfrecord(
        data_location,
        output_path=None,
        stats_options=options.StatsOptions(),
        pipeline_options=None,
):
    """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and return the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to
  create their own Beam pipelines, need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to which to write the data statistics result. If None, we
      use a temporary directory. It will be a TFRecord file containing a single
      data statistics proto, and can be read with the 'load_statistics' API.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    if not tf.gfile.Exists(output_dir_path):
        tf.gfile.MakeDirs(output_dir_path)

    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter
    with beam.Pipeline(options=pipeline_options) as p:
        # Auto detect tfrecord file compression format based on input data
        # path suffix.
        _ = (
            p
            |
            'ReadData' >> beam.io.ReadFromTFRecord(file_pattern=data_location)
            | 'DecodeData' >> tf_example_decoder.DecodeTFExample()
            |
            'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
            # TODO(b/112014711) Implement a custom sink to write the stats proto.
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))
    return load_statistics(output_path)
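
A sketch of a typical call to the helper above; the paths are placeholders, and sampling via sample_rate is just one illustrative option:

# Placeholder paths; any glob of TFRecord files of serialized tf.Examples works.
train_stats = generate_statistics_from_tfrecord(
    data_location='/tmp/train_data/*.tfrecord',
    output_path='/tmp/train_stats.tfrecord',
    stats_options=options.StatsOptions(sample_rate=0.1))
print(train_stats.datasets[0].num_examples)
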
 def test_validate_examples_in_tfrecord_no_schema(self):
   temp_dir_path = self.create_tempdir().full_path
   input_data_path = os.path.join(temp_dir_path, 'input_data.tfrecord')
   # By default, StatsOptions does not include a schema.
   options = stats_options.StatsOptions()
   with self.assertRaisesRegexp(
       ValueError, 'The specified stats_options must include a schema.'):
     validation_lib.validate_examples_in_tfrecord(
         data_location=input_data_path, stats_options=options)
def generate_statistics_from_tfrecord(
    data_location: Text,
    output_path: Optional[bytes] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and return the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to
  create their own Beam pipelines, need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to which to write the data statistics result. If None, we
      use a temporary directory. It will be a TFRecord file containing a single
      data statistics proto, and can be read with the 'load_statistics' API.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  batch_size = stats_options.desired_batch_size
  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # Auto detect tfrecord file compression format based on input data
    # path suffix.
    _ = (
        p
        | 'ReadData' >> (tf_example_record.TFExampleRecord(
            file_pattern=data_location,
            schema=None,
            telemetry_descriptors=['tfdv', 'generate_statistics_from_tfrecord'])
                         .BeamSource(batch_size))
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        | 'WriteStatsOutput' >>
        (stats_api.WriteStatisticsToTFRecord(output_path)))
  return stats_util.load_statistics(output_path)
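
As the docstring notes, pipeline_options selects the Beam runner; a sketch of a Dataflow invocation, with placeholder project, region, and bucket values:

from apache_beam.options.pipeline_options import PipelineOptions

# Placeholder GCP settings; any standard Beam pipeline options can be passed.
pipeline_options = PipelineOptions(
    runner='DataflowRunner',
    project='my-gcp-project',
    region='us-central1',
    temp_location='gs://my-bucket/tmp')
stats = generate_statistics_from_tfrecord(
    data_location='gs://my-bucket/data/*.tfrecord.gz',
    output_path='gs://my-bucket/stats/train_stats.tfrecord',
    stats_options=options.StatsOptions(),
    pipeline_options=pipeline_options)
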
 def test_identify_anomalous_examples_invalid_max_examples_type(self):
     examples = [{'annotated_enum': np.array(['D'])}]
     options = stats_options.StatsOptions(schema=schema_pb2.Schema())
     max_examples_per_anomaly = 1.5
     with self.assertRaisesRegexp(
             TypeError, 'max_examples_per_anomaly must be an integer.'):
         with beam.Pipeline() as p:
             _ = (p | beam.Create(examples)
                  | validation_api.IdentifyAnomalousExamples(
                      options, max_examples_per_anomaly))
Example #20
 def test_stats_options_with_generators_to_json(self):
     generators = [
         lift_stats_generator.LiftStatsGenerator(
             schema=None,
             y_path=types.FeaturePath(['label']),
             x_paths=[types.FeaturePath(['feature'])])
     ]
     options = stats_options.StatsOptions(generators=generators)
     with self.assertRaisesRegex(ValueError,
                                 'StatsOptions cannot be converted'):
         options.to_json()
Example #21
  def test_example_weight_map(self):
    options = stats_options.StatsOptions()
    self.assertIsNone(options.example_weight_map.get(types.FeaturePath(['f'])))
    self.assertEqual(frozenset([]),
                     options.example_weight_map.all_weight_features())

    options = stats_options.StatsOptions(weight_feature='w')
    self.assertEqual('w',
                     options.example_weight_map.get(types.FeaturePath(['f'])))
    self.assertEqual(
        frozenset(['w']),
        options.example_weight_map.all_weight_features())

    options = stats_options.StatsOptions(
        per_feature_weight_override={types.FeaturePath(['x']): 'w'})
    self.assertIsNone(options.example_weight_map.get(types.FeaturePath(['f'])))
    self.assertEqual('w',
                     options.example_weight_map.get(types.FeaturePath(['x'])))
    self.assertEqual(frozenset(['w']),
                     options.example_weight_map.all_weight_features())
 def test_validate_instance(self):
     instance = {'annotated_enum': np.array(['D'])}
     schema = text_format.Parse(
         """
     string_domain {
       name: "MyAloneEnum"
       value: "A"
       value: "B"
       value: "C"
     }
     feature {
       name: "annotated_enum"
       value_count {
         min:1
         max:1
       }
       presence {
         min_count: 1
       }
       type: BYTES
       domain: "MyAloneEnum"
     }
     feature {
       name: "ignore_this"
       lifecycle_stage: DEPRECATED
       value_count {
         min:1
       }
       presence {
         min_count: 1
       }
       type: BYTES
     }
     """, schema_pb2.Schema())
     expected_anomalies = {
         'annotated_enum':
         text_format.Parse(
             """
   description: "Examples contain values missing from the schema: D "
     "(~100%). "
   severity: ERROR
   short_description: "Unexpected string values"
   reason {
     type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
     short_description: "Unexpected string values"
     description: "Examples contain values missing from the schema: D "
       "(~100%). "
   }
         """, anomalies_pb2.AnomalyInfo())
     }
     options = stats_options.StatsOptions(schema=schema)
     anomalies = validation_api.validate_instance(instance, options)
     self._assert_equal_anomalies(anomalies, expected_anomalies)
Example #23
    def __init__(self, options=stats_options.StatsOptions()):
        """Initializes the transform.

    Args:
      options: Options for generating data statistics.

    Raises:
      TypeError: If any of the input options is not of the expected type.
      ValueError: If any of the input options is invalid.
    """

        self._check_options(options)
        self._options = options
Example #24
    def __init__(self, options=stats_options.StatsOptions()):
        """Initializes the transform.

    Args:
      options: `tfdv.StatsOptions` for generating data statistics.

    Raises:
      TypeError: If options is not of the expected type.
    """
        if not isinstance(options, stats_options.StatsOptions):
            raise TypeError(
                'options is of type %s, should be a StatsOptions.' %
                type(options).__name__)
        self._options = options
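
A minimal sketch of applying the transform above directly in a Beam pipeline; the in-memory input follows the dict-of-numpy-arrays format used by the older tests in this file (newer versions consume Arrow RecordBatches), and the output path is a placeholder:

import apache_beam as beam
import numpy as np
from tensorflow_metadata.proto.v0 import statistics_pb2

with beam.Pipeline() as p:
    _ = (p
         | 'CreateExamples' >> beam.Create([{'a': np.array([1.0, 2.0])}])
         | 'GenerateStatistics' >> stats_api.GenerateStatistics(
             stats_options.StatsOptions(num_top_values=5))
         | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
             '/tmp/stats.tfrecord',
             shard_name_template='',
             coder=beam.coders.ProtoCoder(
                 statistics_pb2.DatasetFeatureStatisticsList)))
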
Example #25
def generate_statistics_from_dataframe(
        dataframe: DataFrame,
        stats_options: options.StatsOptions = options.StatsOptions(),
        n_jobs: int = 1) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Compute data statistics for the input pandas DataFrame.

  This is a utility method for users with in-memory data represented
  as a pandas DataFrame.

  Args:
    dataframe: Input pandas DataFrame.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    n_jobs: Number of processes to run (defaults to 1). If -1 is provided,
      uses the same number of processes as the number of CPU cores.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    if not isinstance(dataframe, DataFrame):
        raise TypeError('dataframe argument is of type {}. Must be a '
                        'pandas DataFrame.'.format(type(dataframe).__name__))

    stats_generators = cast(
        List[stats_generator.CombinerStatsGenerator],
        stats_impl.get_generators(stats_options, in_memory=True))
    if n_jobs < -1 or n_jobs == 0:
        raise ValueError('Invalid n_jobs parameter {}. Should be either '
                         '-1 or >= 1.'.format(n_jobs))

    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()
    n_jobs = max(min(n_jobs, multiprocessing.cpu_count()), 1)

    if n_jobs == 1:
        merged_partial_stats = _generate_partial_statistics_from_df(
            dataframe, stats_options, stats_generators)
    else:
        # TODO(pachristopher): Investigate why we don't observe linear speedup after
        # a certain number of processes.
        splits = np.array_split(dataframe, n_jobs)
        partial_stats = Parallel(n_jobs=n_jobs)(
            delayed(_generate_partial_statistics_from_df)(
                splits[i], stats_options, stats_generators)
            for i in range(n_jobs))
        merged_partial_stats = [
            gen.merge_accumulators(stats)
            for gen, stats in zip(stats_generators, zip(*partial_stats))
        ]
    return stats_impl.extract_statistics_output(merged_partial_stats,
                                                stats_generators)
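
A usage sketch for the DataFrame helper above; column names and values are placeholders:

import pandas as pd

# Placeholder frame; each column becomes a feature in the statistics.
df = pd.DataFrame({
    'age': [39, 50, 38],
    'workclass': ['Private', 'Self-emp', 'Private'],
})
stats = generate_statistics_from_dataframe(
    dataframe=df,
    stats_options=options.StatsOptions(num_top_values=10),
    n_jobs=1)
print(stats.datasets[0].num_examples)  # expected: 3
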
Example #26
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both the 'train' and 'eval' splits.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        split_uris = []
        for artifact in input_dict['input_data']:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        with self._make_beam_pipeline() as p:
            # TODO(b/126263006): Support more stats_options through config.
            stats_options = options.StatsOptions()
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                output_uri = artifact_utils.get_split_uri(
                    output_dict['output'], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
    def test_generate_statistics_in_memory_invalid_custom_generator(self):

        # Dummy PTransform that does nothing.
        class CustomPTransform(beam.PTransform):
            def expand(self, pcoll):
                pass

        examples = [{'a': np.array([1.0])}]
        custom_generator = stats_generator.TransformStatsGenerator(
            name='CustomStatsGenerator', ptransform=CustomPTransform())
        options = stats_options.StatsOptions(generators=[custom_generator])
        with self.assertRaisesRegexp(
                TypeError, 'Statistics generator.* found object of type '
                'TransformStatsGenerator.'):
            stats_impl.generate_statistics_in_memory(examples, options)
 def test_empty_input(self):
     examples = []
     expected_result = text_format.Parse(
         """
 datasets {
   num_examples: 0
 }
 """, statistics_pb2.DatasetFeatureStatisticsList())
     with beam.Pipeline() as p:
         result = p | beam.Create(examples) | stats_api.GenerateStatistics(
             stats_options.StatsOptions())
         util.assert_that(
             result,
             test_util.make_dataset_feature_stats_list_proto_equal_fn(
                 self, expected_result))
Example #29
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Generates statistics for an in-memory list of examples.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    stats_generators = get_generators(options, in_memory=True)
    partial_stats = generate_partial_statistics_in_memory(
        examples, options, stats_generators)
    return extract_statistics_output(partial_stats, stats_generators)
Example #30
 def test_stats_options_invalid_slicing_sql_query(self):
     schema = schema_pb2.Schema(feature=[
         schema_pb2.Feature(name='feat1', type=schema_pb2.BYTES),
         schema_pb2.Feature(name='feat3', type=schema_pb2.INT)
     ], )
     experimental_slice_sqls = [
         """
     SELECT
       STRUCT(feat1, feat2)
     FROM
       example.feat1, example.feat2
     """
     ]
     with self.assertRaisesRegex(ValueError, 'One of the slice SQL query'):
         stats_options.StatsOptions(
             experimental_slice_sqls=experimental_slice_sqls, schema=schema)