def generate_partial_statistics_in_memory(examples, options, stats_generators):
    """Builds one partial-statistics accumulator per generator.

    The examples are merged into a single batch with
    `batch_util.merge_single_batch`. Every generator then gets a freshly
    created accumulator that is updated with that batch.

    Args:
      examples: A list of input examples.
      options: Options for generating data statistics. Only
        `feature_whitelist` is read by this function.
      stats_generators: A list of statistics generators. `create_accumulator`
        and `add_input` are called on every entry.

    Returns:
      A list of accumulators containing partial statistics, one per entry of
      `stats_generators` and in the same order.
    """
    merged = batch_util.merge_single_batch(examples)

    # A non-empty whitelist narrows the batch down to exactly those features.
    whitelist = options.feature_whitelist
    if whitelist:
        merged = {name: merged[name] for name in whitelist}

    accumulators = []
    for gen in stats_generators:
        accumulators.append(
            gen.add_input(gen.create_accumulator(), merged)  # pytype: disable=attribute-error
        )
    return accumulators
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Computes statistics for a list of examples held in memory.

    The generators come from `_get_generators(options, in_memory=True)`. The
    examples are merged into one batch, each generator processes that batch
    through a fresh accumulator, and the per-generator outputs are merged
    into the returned proto.

    Args:
      examples: A list of input examples.
      options: Options for generating data statistics.

    Returns:
      A DatasetFeatureStatisticsList proto.
    """
    combiners = _get_generators(options, in_memory=True)
    merged = batch_util.merge_single_batch(examples)

    # A non-empty whitelist narrows the batch down to exactly those features.
    whitelist = options.feature_whitelist
    if whitelist:
        merged = {name: merged[name] for name in whitelist}

    # pytype reports a false positive on the calls below: _get_generators is
    # annotated as returning StatsGenerator, whereas create_accumulator,
    # add_input and extract_output exist only on CombinerStatsGenerator.
    per_generator_stats = []
    for gen in combiners:
        per_generator_stats.append(
            gen.extract_output(  # pytype: disable=attribute-error
                gen.add_input(gen.create_accumulator(), merged)  # pytype: disable=attribute-error
            )
        )

    merged_stats = _merge_dataset_feature_stats_protos(per_generator_stats)
    return _make_dataset_feature_statistics_list_proto([merged_stats])
def generate_partial_statistics_in_memory(examples, options, stats_generators):
    """Builds one partial-statistics accumulator per combiner generator.

    The merged batch is built lazily, when the first generator is processed,
    and is then reused for all remaining generators.

    Args:
      examples: A list of input examples.
      options: Options for generating data statistics. Only
        `feature_whitelist` is read by this function.
      stats_generators: A list of statistics generators.

    Returns:
      A list of accumulators containing partial statistics, in the order of
      `stats_generators`. Empty when no generators are given.

    Raises:
      TypeError: If a generator is not a
        `stats_generator.CombinerStatsGenerator`. Generators that precede the
        offending one have already been processed by then.
    """
    accumulators = []
    merged = None  # Not built until a generator actually needs it.
    for gen in stats_generators:
        if not isinstance(gen, stats_generator.CombinerStatsGenerator):
            raise TypeError('Only stats_generator.CombinerStatsGenerator is '
                            'expected for now')

        if merged is None:
            merged = batch_util.merge_single_batch(examples)
            # A non-empty whitelist narrows the batch to those features.
            whitelist = options.feature_whitelist
            if whitelist:
                merged = {name: merged[name] for name in whitelist}

        accumulators.append(gen.add_input(gen.create_accumulator(), merged))
    return accumulators
def _maybe_do_batch(self, accumulator, force=False):
    """Processes the buffered examples once a batch is due.

    A batch is due when the number of buffered examples has reached
    `self._desired_batch_size`, or when `force` is set and at least one
    example is buffered. Otherwise the accumulator is left untouched.

    Args:
      accumulator: Accumulator. Updated in place: `partial_accumulators` is
        replaced by the result of `self._for_each_generator` and
        `input_examples` is emptied.
      force: Process the buffered examples even if there are fewer of them
        than the desired batch size.
    """
    num_examples = len(accumulator.input_examples)
    batch_is_due = (
        (force and num_examples > 0)
        or num_examples >= self._desired_batch_size
    )
    if not batch_is_due:
        return

    self._combine_add_input_batch_size.update(num_examples)

    # The merged batch is only built when the instance flag asks for it;
    # otherwise the generators receive None.
    merged = (
        batch_util.merge_single_batch(accumulator.input_examples)
        if self._has_example_batch_combiner_generator
        else None
    )

    def _add_batch(gen, gen_accumulator):
        # Only combiner generators can consume the merged batch.
        if not isinstance(gen, stats_generator.CombinerStatsGenerator):
            raise TypeError('Only stats_generator.CombinerStatsGenerator is '
                            'expected for now')
        return gen.add_input(gen_accumulator, merged)

    accumulator.partial_accumulators = self._for_each_generator(
        _add_batch, accumulator.partial_accumulators)
    # Empty the buffer in place so the same list object is kept.
    del accumulator.input_examples[:]
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Computes statistics for a list of examples held in memory.

    Starts from the generators returned by
    `_get_default_generators(options, in_memory=True)` and appends the
    generators listed in `options.generators`, if any. The examples are
    merged into one batch, each generator processes it through a fresh
    accumulator, and the per-generator outputs are merged into the result.

    Args:
      examples: A list of input examples.
      options: Options for generating data statistics.

    Returns:
      A DatasetFeatureStatisticsList proto.

    Raises:
      TypeError: If an entry of `options.generators` is not a
        `stats_generator.CombinerStatsGenerator`.
    """
    combiners = _get_default_generators(options, in_memory=True)

    if options.generators is not None:
        for custom in options.generators:
            if not isinstance(custom, stats_generator.CombinerStatsGenerator):
                raise TypeError('Statistics generator used in '
                                'generate_statistics_in_memory must '
                                'extend CombinerStatsGenerator, found object of type '
                                '%s.' % custom.__class__.__name__)
            combiners.append(custom)

    merged = batch_util.merge_single_batch(examples)
    # A non-empty whitelist narrows the batch down to exactly those features.
    whitelist = options.feature_whitelist
    if whitelist:
        merged = {name: merged[name] for name in whitelist}

    # pytype reports a false positive on the calls below:
    # _get_default_generators is annotated as returning StatsGenerator,
    # whereas create_accumulator, add_input and extract_output exist only on
    # CombinerStatsGenerator.
    per_generator_stats = []
    for gen in combiners:
        per_generator_stats.append(
            gen.extract_output(  # pytype: disable=attribute-error
                gen.add_input(gen.create_accumulator(), merged)  # pytype: disable=attribute-error
            )
        )

    merged_stats = _merge_dataset_feature_stats_protos(per_generator_stats)
    return _make_dataset_feature_statistics_list_proto(merged_stats)
def test_merge_single_batch(self):
    """Checks the per-feature batch built by `batch_util.merge_single_batch`.

    Five examples with differing feature sets are merged. The expected batch
    has one entry per feature name seen in any example; each entry holds one
    slot per example, with None where that example lacks the feature.
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
    examples = [
        {
            'a': np.array([1.0, 2.0]),
            'b': np.array(['a', 'b', 'c', 'e']),
        },
        {
            'a': np.array([3.0, 4.0, np.nan, 5.0]),
        },
        {
            'b': np.array(['d', 'e', 'f']),
            'd': np.array([10, 20, 30]),
        },
        {
            'b': np.array(['a', 'b', 'c']),
        },
        {
            'c': np.array(['d', 'e', 'f']),
        },
    ]
    expected_batch = {
        'a': [np.array([1.0, 2.0]), np.array([3.0, 4.0, np.nan, 5.0]),
              None, None, None],
        'b': [np.array(['a', 'b', 'c', 'e']), None,
              np.array(['d', 'e', 'f']), np.array(['a', 'b', 'c']), None],
        'c': [None, None, None, None, np.array(['d', 'e', 'f'])],
        'd': [None, None, np.array([10, 20, 30]), None, None],
    }

    actual_batch = batch_util.merge_single_batch(examples)

    # Check the number of features.
    self.assertLen(actual_batch, len(expected_batch))
    for feature_name, expected_values in expected_batch.items():
        actual_values = actual_batch[feature_name]
        # Check the batch size; this also makes the zip below exhaustive.
        self.assertLen(actual_values, len(expected_values))
        for actual_value, expected_value in zip(actual_values,
                                                expected_values):
            if expected_value is None:
                # An identity check avoids an elementwise `array == None`
                # comparison if an array is returned unexpectedly.
                self.assertIsNone(actual_value)
            else:
                # Check the dtype, then the contents (NaN positions compare
                # equal under assert_array_equal).
                self.assertEqual(actual_value.dtype, expected_value.dtype)
                np.testing.assert_array_equal(actual_value, expected_value)
def _maybe_do_batch(self, accumulator, force=False):
    """Processes the buffered examples once a batch is due.

    A batch is due when the number of buffered examples has reached
    `self._desired_batch_size`, or when `force` is set and at least one
    example is buffered. Otherwise the accumulator is left untouched.

    Args:
      accumulator: Accumulator. Updated in place: `partial_accumulator` is
        replaced by the result of `self._generator.add_input` and
        `input_examples` is emptied.
      force: Process the buffered examples even if there are fewer of them
        than the desired batch size.
    """
    num_examples = len(accumulator.input_examples)
    batch_is_due = (
        (force and num_examples > 0)
        or num_examples >= self._desired_batch_size
    )
    if not batch_is_due:
        return

    self._combine_add_input_batch_size.update(num_examples)

    merged_batch = batch_util.merge_single_batch(accumulator.input_examples)
    accumulator.partial_accumulator = self._generator.add_input(
        accumulator.partial_accumulator, merged_batch)

    # Clear processed examples in place so the same list object is kept.
    del accumulator.input_examples[:]
def _process_partition(partition, stats_fn):
    """Computes statistics over the examples of a single partition.

    Args:
      partition: A pair `(key, examples)` where `key` is itself a pair whose
        first item is the slice key; the second item of `key` is ignored.
      stats_fn: Object whose `compute` method is applied to the merged batch
        of examples.

    Returns:
      A tuple `(slice_key, stats_fn.compute(merged_batch))`.
    """
    partition_key, examples = partition
    slice_key, _ = partition_key
    merged_batch = batch_util.merge_single_batch(examples)
    return slice_key, stats_fn.compute(merged_batch)
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
    """Computes statistics for a list of examples held in memory.

    Four built-in generators (common, numeric, string and top-k/uniques) are
    configured from `options`, followed by the generators listed in
    `options.generators`, if any. The examples are merged into one batch,
    each generator processes it through a fresh accumulator, and the
    per-generator outputs are merged into the result.

    Args:
      examples: A list of input examples.
      options: Options for generating data statistics.

    Returns:
      A DatasetFeatureStatisticsList proto.

    Raises:
      TypeError: If an entry of `options.generators` is not a
        `stats_generator.CombinerStatsGenerator`.
    """
    common_stats = common_stats_generator.CommonStatsGenerator(
        schema=options.schema,
        weight_feature=options.weight_feature,
        num_values_histogram_buckets=options.num_values_histogram_buckets,
        epsilon=options.epsilon,
    )
    numeric_stats = numeric_stats_generator.NumericStatsGenerator(
        schema=options.schema,
        weight_feature=options.weight_feature,
        num_histogram_buckets=options.num_histogram_buckets,
        num_quantiles_histogram_buckets=(
            options.num_quantiles_histogram_buckets),
        epsilon=options.epsilon,
    )
    string_stats = string_stats_generator.StringStatsGenerator(
        schema=options.schema)
    top_k_uniques_stats = (
        top_k_uniques_combiner_stats_generator
        .TopKUniquesCombinerStatsGenerator(
            schema=options.schema,
            weight_feature=options.weight_feature,
            num_top_values=options.num_top_values,
            num_rank_histogram_buckets=options.num_rank_histogram_buckets,
        )
    )
    combiners = [common_stats, numeric_stats, string_stats,
                 top_k_uniques_stats]

    if options.generators is not None:
        for custom in options.generators:
            if not isinstance(custom, stats_generator.CombinerStatsGenerator):
                raise TypeError(
                    'Statistics generator used in '
                    'generate_statistics_in_memory must '
                    'extend CombinerStatsGenerator, found object of type '
                    '%s.' % custom.__class__.__name__)
            combiners.append(custom)

    merged = batch_util.merge_single_batch(examples)
    # A non-empty whitelist narrows the batch down to exactly those features.
    whitelist = options.feature_whitelist
    if whitelist:
        merged = {name: merged[name] for name in whitelist}

    per_generator_stats = []
    for gen in combiners:
        per_generator_stats.append(
            gen.extract_output(
                gen.add_input(gen.create_accumulator(), merged)))

    merged_stats = _merge_dataset_feature_stats_protos(per_generator_stats)
    return _make_dataset_feature_statistics_list_proto(merged_stats)