def test_stats_options_with_slice_fns_to_json(self):
  """Options carrying slice callables must refuse JSON serialization."""
  slicer = slicing_util.get_feature_value_slicer({'b': None})
  options = stats_options.StatsOptions(
      experimental_slice_functions=[slicer])
  # Callables cannot be represented in JSON, so to_json() is expected to
  # raise rather than silently drop them.
  with self.assertRaisesRegex(ValueError, 'StatsOptions cannot be converted'):
    options.to_json()
def test_get_feature_value_slicer_non_utf8_slice_key(self):
  """A bytes slice value that is not valid UTF-8 raises ValueError."""
  features = {'a': None}
  input_table = pa.Table.from_arrays([
      pa.array([[b'\xF0'], ['cat']]),  # b'\xF0' is an invalid UTF-8 sequence
  ], ['a'])
  # Fix: assertRaisesRegexp is a deprecated alias (removed in newer
  # Python versions); use assertRaisesRegex instead.
  with self.assertRaisesRegex(ValueError, 'must be valid UTF-8'):
    _ = list(slicing_util.get_feature_value_slicer(features)(input_table))
def test_get_feature_value_slicer_no_slice(self):
  """Requested value 3 never occurs in feature 'a': no slices come back."""
  slicer = slicing_util.get_feature_value_slicer({'a': [3]})
  input_table = pa.Table.from_arrays(
      [pa.array([[1], [2, 1]]),
       pa.array([['dog'], ['cat']])],
      ['a', 'b'])
  # Expect an empty result since no row contains the value 3 for 'a'.
  self.assertCountEqual([], list(slicer(input_table)))
def test_get_feature_value_slicer_no_slice(self):
  """Requested value 3 never occurs in feature 'a': no slices come back."""
  slicer = slicing_util.get_feature_value_slicer({'a': [3]})
  batch = pa.RecordBatch.from_arrays(
      [pa.array([[1], [2, 1]]),
       pa.array([['dog'], ['cat']])],
      ['a', 'b'])
  # Expect an empty result since no row contains the value 3 for 'a'.
  self._check_results(slicer(batch), [])
def test_get_feature_value_slicer_bytes_feature_valid_utf8(self):
  """Valid-UTF-8 bytes values are decoded into the slice key strings."""
  slicer = slicing_util.get_feature_value_slicer({'b': None})
  batch = pa.RecordBatch.from_arrays(
      [pa.array([[1], [2, 1]]),
       pa.array([[b'dog'], [b'cat']])],
      ['a', 'b'])
  # Each distinct bytes value of 'b' yields one slice holding its rows.
  expected = [
      (u'b_dog',
       pa.RecordBatch.from_arrays(
           [pa.array([[1]]), pa.array([[b'dog']])], ['a', 'b'])),
      (u'b_cat',
       pa.RecordBatch.from_arrays(
           [pa.array([[2, 1]]), pa.array([[b'cat']])], ['a', 'b'])),
  ]
  self._check_results(slicer(batch), expected)
def test_get_feature_value_slicer_one_feature_not_in_batch(self):
  """A slice feature absent from the batch is ignored; 'a' still slices."""
  slicer = slicing_util.get_feature_value_slicer(
      {'not_an_actual_feature': None, 'a': None})
  batch = pa.RecordBatch.from_arrays(
      [pa.array([[1], [2, 1]]),
       pa.array([['dog'], ['cat']])],
      ['a', 'b'])
  # Only values of 'a' contribute slice keys; rows keep both columns.
  expected = [
      (u'a_1',
       pa.RecordBatch.from_arrays(
           [pa.array([[1], [2, 1]]), pa.array([['dog'], ['cat']])],
           ['a', 'b'])),
      (u'a_2',
       pa.RecordBatch.from_arrays(
           [pa.array([[2, 1]]), pa.array([['cat']])], ['a', 'b'])),
  ]
  self._check_results(slicer(batch), expected)
def test_get_feature_value_slicer_bytes_feature_valid_utf8(self):
  """Valid-UTF-8 bytes values are decoded into the slice key strings."""
  slicer = slicing_util.get_feature_value_slicer({'b': None})
  input_table = pa.Table.from_arrays(
      [pa.array([[1], [2, 1]]),
       pa.array([[b'dog'], [b'cat']])],
      ['a', 'b'])
  # Each distinct bytes value of 'b' yields one slice holding its rows.
  expected = [
      (u'b_dog',
       pa.Table.from_arrays(
           [pa.array([[1]]), pa.array([[b'dog']])], ['a', 'b'])),
      (u'b_cat',
       pa.Table.from_arrays(
           [pa.array([[2, 1]]), pa.array([[b'cat']])], ['a', 'b'])),
  ]
  self.assertCountEqual(expected, list(slicer(input_table)))
def test_get_feature_value_slicer(self):
  """Slicing on both features yields one table per (a, b) value pair."""
  slicer = slicing_util.get_feature_value_slicer({'a': None, 'b': None})
  input_table = pa.Table.from_arrays([
      pa.array([[1], [2, 1], [3], [2, 1, 1], [3]]),
      pa.array([['dog'], ['cat'], ['wolf'], ['dog', 'wolf'], ['wolf']]),
  ], ['a', 'b'])

  def expected_table(a_values, b_values):
    # Build the expected per-slice table with columns 'a' and 'b'.
    return pa.Table.from_arrays(
        [pa.array(a_values), pa.array(b_values)], ['a', 'b'])

  # A row belongs to slice 'a_X_b_Y' when X appears in its 'a' values and
  # Y appears in its 'b' values.
  expected_result = [
      (u'a_1_b_dog',
       expected_table([[1], [2, 1, 1]], [['dog'], ['dog', 'wolf']])),
      (u'a_1_b_cat', expected_table([[2, 1]], [['cat']])),
      (u'a_2_b_cat', expected_table([[2, 1]], [['cat']])),
      (u'a_2_b_dog', expected_table([[2, 1, 1]], [['dog', 'wolf']])),
      (u'a_1_b_wolf', expected_table([[2, 1, 1]], [['dog', 'wolf']])),
      (u'a_2_b_wolf', expected_table([[2, 1, 1]], [['dog', 'wolf']])),
      (u'a_3_b_wolf', expected_table([[3], [3]], [['wolf'], ['wolf']])),
  ]
  self.assertCountEqual(expected_result, list(slicer(input_table)))
def test_get_feature_value_slicer(self):
  """Slicing on both features yields one batch per (a, b) value pair."""
  slicer = slicing_util.get_feature_value_slicer({'a': None, 'b': None})
  input_record_batch = pa.RecordBatch.from_arrays([
      pa.array([[1], [2, 1], [3], [2, 1, 1], [3]]),
      pa.array([['dog'], ['cat'], ['wolf'], ['dog', 'wolf'], ['wolf']]),
  ], ['a', 'b'])

  def expected_batch(a_values, b_values):
    # Build the expected per-slice record batch with columns 'a' and 'b'.
    return pa.RecordBatch.from_arrays(
        [pa.array(a_values), pa.array(b_values)], ['a', 'b'])

  # A row belongs to slice 'a_X_b_Y' when X appears in its 'a' values and
  # Y appears in its 'b' values.
  expected_result = [
      (u'a_1_b_dog',
       expected_batch([[1], [2, 1, 1]], [['dog'], ['dog', 'wolf']])),
      (u'a_1_b_cat', expected_batch([[2, 1]], [['cat']])),
      (u'a_2_b_cat', expected_batch([[2, 1]], [['cat']])),
      (u'a_2_b_dog', expected_batch([[2, 1, 1]], [['dog', 'wolf']])),
      (u'a_1_b_wolf', expected_batch([[2, 1, 1]], [['dog', 'wolf']])),
      (u'a_2_b_wolf', expected_batch([[2, 1, 1]], [['dog', 'wolf']])),
      (u'a_3_b_wolf', expected_batch([[3], [3]], [['wolf'], ['wolf']])),
  ]
  self._check_results(slicer(input_record_batch), expected_result)
def test_stats_options_json_round_trip(self):
  """StatsOptions survives a to_json()/from_json() round trip.

  Every serializable field must come back equal; the assertions below
  verify that non-serializable callables (generators, slice functions)
  round-trip as None.
  """
  generators = [
      lift_stats_generator.LiftStatsGenerator(
          schema=None,
          y_path=types.FeaturePath(['label']),
          x_paths=[types.FeaturePath(['feature'])])
  ]
  feature_whitelist = ['a']
  schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])
  label_feature = 'label'
  weight_feature = 'weight'
  slice_functions = [slicing_util.get_feature_value_slicer({'b': None})]
  sample_rate = 0.01
  num_top_values = 21
  frequency_threshold = 2
  weighted_frequency_threshold = 2.0
  num_rank_histogram_buckets = 1001
  num_values_histogram_buckets = 11
  num_histogram_buckets = 11
  num_quantiles_histogram_buckets = 11
  epsilon = 0.02
  infer_type_from_schema = True
  desired_batch_size = 100
  enable_semantic_domain_stats = True
  semantic_domain_stats_sample_rate = 0.1
  # Populate every option explicitly so the round trip exercises each
  # serializable field.
  options = stats_options.StatsOptions(
      generators=generators,
      feature_whitelist=feature_whitelist,
      schema=schema,
      label_feature=label_feature,
      weight_feature=weight_feature,
      slice_functions=slice_functions,
      sample_rate=sample_rate,
      num_top_values=num_top_values,
      frequency_threshold=frequency_threshold,
      weighted_frequency_threshold=weighted_frequency_threshold,
      num_rank_histogram_buckets=num_rank_histogram_buckets,
      num_values_histogram_buckets=num_values_histogram_buckets,
      num_histogram_buckets=num_histogram_buckets,
      num_quantiles_histogram_buckets=num_quantiles_histogram_buckets,
      epsilon=epsilon,
      infer_type_from_schema=infer_type_from_schema,
      desired_batch_size=desired_batch_size,
      enable_semantic_domain_stats=enable_semantic_domain_stats,
      semantic_domain_stats_sample_rate=semantic_domain_stats_sample_rate
  )
  # Serialize and rebuild; 'options' is rebound to the deserialized copy.
  options_json = options.to_json()
  options = stats_options.StatsOptions.from_json(options_json)
  # Callables are asserted to come back as None after the round trip.
  self.assertIsNone(options.generators)
  self.assertEqual(feature_whitelist, options.feature_whitelist)
  compare.assertProtoEqual(self, schema, options.schema)
  self.assertEqual(label_feature, options.label_feature)
  self.assertEqual(weight_feature, options.weight_feature)
  self.assertIsNone(options.slice_functions)
  self.assertEqual(sample_rate, options.sample_rate)
  self.assertEqual(num_top_values, options.num_top_values)
  self.assertEqual(frequency_threshold, options.frequency_threshold)
  self.assertEqual(weighted_frequency_threshold,
                   options.weighted_frequency_threshold)
  self.assertEqual(num_rank_histogram_buckets,
                   options.num_rank_histogram_buckets)
  self.assertEqual(num_values_histogram_buckets,
                   options.num_values_histogram_buckets)
  self.assertEqual(num_histogram_buckets, options.num_histogram_buckets)
  self.assertEqual(num_quantiles_histogram_buckets,
                   options.num_quantiles_histogram_buckets)
  self.assertEqual(epsilon, options.epsilon)
  self.assertEqual(infer_type_from_schema, options.infer_type_from_schema)
  self.assertEqual(desired_batch_size, options.desired_batch_size)
  self.assertEqual(enable_semantic_domain_stats,
                   options.enable_semantic_domain_stats)
  self.assertEqual(semantic_domain_stats_sample_rate,
                   options.semantic_domain_stats_sample_rate)
"""Generate TFDV statistics from a CSV file, sliced by 'time_window'."""
import tensorflow_data_validation as tfdv
import pandas as pd
import datetime
from tensorflow_data_validation.utils import slicing_util

# Input CSV and destination path for the serialized statistics output.
data_location = '/home/jarekk/workspace/test.csv'
output_location = '/home/jarekk/workspace/stats.pb'

# Produce one slice per distinct value of the 'time_window' feature.
slice_fn = slicing_util.get_feature_value_slicer(
    features={'time_window': None})
stats_options = tfdv.StatsOptions(slice_functions=[slice_fn])

stats = tfdv.generate_statistics_from_csv(
    data_location,
    stats_options=stats_options,
    output_path=output_location)
def test_stats_options_json_round_trip(self):
  """StatsOptions survives a to_json()/from_json() round trip.

  Every serializable field must come back equal; the assertions below
  verify that non-serializable callables (generators, slice functions)
  round-trip as None.
  """
  generators = [
      lift_stats_generator.LiftStatsGenerator(
          schema=None,
          y_path=types.FeaturePath(['label']),
          x_paths=[types.FeaturePath(['feature'])])
  ]
  feature_allowlist = ['a']
  schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])
  vocab_paths = {'a': '/path/to/a'}
  label_feature = 'label'
  weight_feature = 'weight'
  slice_functions = [slicing_util.get_feature_value_slicer({'b': None})]
  sample_rate = 0.01
  num_top_values = 21
  frequency_threshold = 2
  weighted_frequency_threshold = 2.0
  num_rank_histogram_buckets = 1001
  num_values_histogram_buckets = 11
  num_histogram_buckets = 11
  num_quantiles_histogram_buckets = 11
  epsilon = 0.02
  infer_type_from_schema = True
  desired_batch_size = 100
  enable_semantic_domain_stats = True
  semantic_domain_stats_sample_rate = 0.1
  per_feature_weight_override = {types.FeaturePath(['a']): 'w'}
  add_default_generators = True
  use_sketch_based_topk_uniques = True
  # Populate every option explicitly so the round trip exercises each
  # serializable field.
  options = stats_options.StatsOptions(
      generators=generators,
      feature_allowlist=feature_allowlist,
      schema=schema,
      vocab_paths=vocab_paths,
      label_feature=label_feature,
      weight_feature=weight_feature,
      experimental_slice_functions=slice_functions,
      sample_rate=sample_rate,
      num_top_values=num_top_values,
      frequency_threshold=frequency_threshold,
      weighted_frequency_threshold=weighted_frequency_threshold,
      num_rank_histogram_buckets=num_rank_histogram_buckets,
      num_values_histogram_buckets=num_values_histogram_buckets,
      num_histogram_buckets=num_histogram_buckets,
      num_quantiles_histogram_buckets=num_quantiles_histogram_buckets,
      epsilon=epsilon,
      infer_type_from_schema=infer_type_from_schema,
      desired_batch_size=desired_batch_size,
      enable_semantic_domain_stats=enable_semantic_domain_stats,
      semantic_domain_stats_sample_rate=semantic_domain_stats_sample_rate,
      per_feature_weight_override=per_feature_weight_override,
      add_default_generators=add_default_generators,
      experimental_use_sketch_based_topk_uniques=use_sketch_based_topk_uniques
  )
  # Serialize and rebuild; 'options' is rebound to the deserialized copy.
  options_json = options.to_json()
  options = stats_options.StatsOptions.from_json(options_json)
  # Callables are asserted to come back as None after the round trip.
  self.assertIsNone(options.generators)
  self.assertEqual(feature_allowlist, options.feature_allowlist)
  compare.assertProtoEqual(self, schema, options.schema)
  self.assertEqual(vocab_paths, options.vocab_paths)
  self.assertEqual(label_feature, options.label_feature)
  self.assertEqual(weight_feature, options.weight_feature)
  self.assertIsNone(options.experimental_slice_functions)
  self.assertEqual(sample_rate, options.sample_rate)
  self.assertEqual(num_top_values, options.num_top_values)
  self.assertEqual(frequency_threshold, options.frequency_threshold)
  self.assertEqual(weighted_frequency_threshold,
                   options.weighted_frequency_threshold)
  self.assertEqual(num_rank_histogram_buckets,
                   options.num_rank_histogram_buckets)
  self.assertEqual(num_values_histogram_buckets,
                   options.num_values_histogram_buckets)
  self.assertEqual(num_histogram_buckets, options.num_histogram_buckets)
  self.assertEqual(num_quantiles_histogram_buckets,
                   options.num_quantiles_histogram_buckets)
  self.assertEqual(epsilon, options.epsilon)
  self.assertEqual(infer_type_from_schema, options.infer_type_from_schema)
  self.assertEqual(desired_batch_size, options.desired_batch_size)
  self.assertEqual(enable_semantic_domain_stats,
                   options.enable_semantic_domain_stats)
  self.assertEqual(semantic_domain_stats_sample_rate,
                   options.semantic_domain_stats_sample_rate)
  # NOTE(review): reaches into the private attribute; presumably there is
  # no public accessor for the weight override map — confirm.
  self.assertEqual(per_feature_weight_override,
                   options._per_feature_weight_override)
  self.assertEqual(add_default_generators, options.add_default_generators)
  self.assertEqual(use_sketch_based_topk_uniques,
                   options.experimental_use_sketch_based_topk_uniques)
def test_get_feature_value_slicer_for_multivalent_features(
    self, features, example, expected_slice_keys):
  """Parameterized: the slicer built from `features` emits the expected keys."""
  # Build the slicer, apply it to the example, and compare keys
  # order-insensitively.
  actual = slicing_util.get_feature_value_slicer(features)(example)
  self.assertCountEqual(expected_slice_keys, actual)
def test_get_feature_value_slicer_with_values_not_in_iterable(self):
  """A scalar (non-iterable) slice value is rejected with TypeError."""
  # Fix: assertRaisesRegexp is a deprecated alias (removed in newer
  # Python versions); use assertRaisesRegex instead.
  with self.assertRaisesRegex(
      TypeError, 'Feature values must be specified '
      'in an iterable.'):
    slicing_util.get_feature_value_slicer({'feature_name': 1})
def test_get_feature_value_slicer_with_float_feature_value(self):
  """Float slice values are unsupported and raise NotImplementedError."""
  # Fix: assertRaisesRegexp is a deprecated alias (removed in newer
  # Python versions); use assertRaisesRegex instead.
  with self.assertRaisesRegex(
      NotImplementedError, 'Only string and int.*as the slice value.'):
    slicing_util.get_feature_value_slicer({'feature_name': [1.1]})