Esempio n. 1
0
 def test_stats_options_with_slice_fns_to_json(self):
     """to_json() must raise ValueError when slice functions are configured."""
     slicer = slicing_util.get_feature_value_slicer({'b': None})
     options_with_slicers = stats_options.StatsOptions(
         experimental_slice_functions=[slicer])
     expected_error = 'StatsOptions cannot be converted'
     with self.assertRaisesRegex(ValueError, expected_error):
         options_with_slicers.to_json()
 def test_get_feature_value_slicer_non_utf8_slice_key(self):
   """Slicing on a bytes value that is not valid UTF-8 raises ValueError."""
   features = {'a': None}
   input_table = pa.Table.from_arrays([
       pa.array([[b'\xF0'], ['cat']]),
   ], ['a'])
   # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
   # assertRaisesRegex is the supported spelling with identical semantics.
   with self.assertRaisesRegex(ValueError, 'must be valid UTF-8'):
     # The slicer output is consumed with list() inside the context manager,
     # so the expected error must surface by the time it is fully iterated.
     _ = list(slicing_util.get_feature_value_slicer(features)(input_table))
 def test_get_feature_value_slicer_no_slice(self):
   """Requesting a value that never occurs in the feature yields no slices."""
   column_a = pa.array([[1], [2, 1]])
   column_b = pa.array([['dog'], ['cat']])
   table = pa.Table.from_arrays([column_a, column_b], ['a', 'b'])
   # The value 3 does not appear anywhere in column 'a'.
   slicer = slicing_util.get_feature_value_slicer({'a': [3]})
   self.assertCountEqual([], list(slicer(table)))
Esempio n. 4
0
 def test_get_feature_value_slicer_no_slice(self):
     """Requesting a value that never occurs in the feature yields no slices."""
     column_a = pa.array([[1], [2, 1]])
     column_b = pa.array([['dog'], ['cat']])
     batch = pa.RecordBatch.from_arrays([column_a, column_b], ['a', 'b'])
     # The value 3 does not appear anywhere in column 'a'.
     slicer = slicing_util.get_feature_value_slicer({'a': [3]})
     self._check_results(slicer(batch), [])
Esempio n. 5
0
 def test_get_feature_value_slicer_bytes_feature_valid_utf8(self):
     """Bytes feature values that are valid UTF-8 produce text slice keys."""
     def make_batch(a_values, b_values):
         # Builds a two-column record batch with columns 'a' and 'b'.
         return pa.RecordBatch.from_arrays(
             [pa.array(a_values), pa.array(b_values)], ['a', 'b'])

     batch = make_batch([[1], [2, 1]], [[b'dog'], [b'cat']])
     # One slice per distinct value of 'b', each holding the matching row.
     expected_slices = [
         (u'b_dog', make_batch([[1]], [[b'dog']])),
         (u'b_cat', make_batch([[2, 1]], [[b'cat']])),
     ]
     slicer = slicing_util.get_feature_value_slicer({'b': None})
     self._check_results(slicer(batch), expected_slices)
Esempio n. 6
0
 def test_get_feature_value_slicer_one_feature_not_in_batch(self):
   """Checks slicing when one requested feature is absent from the batch.

   Only slices keyed on the present feature 'a' are expected.
   """
   def make_batch(a_values, b_values):
     # Builds a two-column record batch with columns 'a' and 'b'.
     return pa.RecordBatch.from_arrays(
         [pa.array(a_values), pa.array(b_values)], ['a', 'b'])

   requested_features = {'not_an_actual_feature': None, 'a': None}
   batch = make_batch([[1], [2, 1]], [['dog'], ['cat']])
   expected_slices = [
       # Both rows contain the value 1 in 'a'.
       (u'a_1', make_batch([[1], [2, 1]], [['dog'], ['cat']])),
       # Only the second row contains the value 2 in 'a'.
       (u'a_2', make_batch([[2, 1]], [['cat']])),
   ]
   slicer = slicing_util.get_feature_value_slicer(requested_features)
   self._check_results(slicer(batch), expected_slices)
 def test_get_feature_value_slicer_bytes_feature_valid_utf8(self):
   """Bytes feature values that are valid UTF-8 produce text slice keys."""
   def make_table(a_values, b_values):
     # Builds a two-column table with columns 'a' and 'b'.
     return pa.Table.from_arrays(
         [pa.array(a_values), pa.array(b_values)], ['a', 'b'])

   table = make_table([[1], [2, 1]], [[b'dog'], [b'cat']])
   # One slice per distinct value of 'b', each holding the matching row.
   expected_slices = [
       (u'b_dog', make_table([[1]], [[b'dog']])),
       (u'b_cat', make_table([[2, 1]], [[b'cat']])),
   ]
   slicer = slicing_util.get_feature_value_slicer({'b': None})
   self.assertCountEqual(expected_slices, list(slicer(table)))
 def test_get_feature_value_slicer(self):
   """Slicing on two features yields one slice per co-occurring value pair."""
   def make_table(a_values, b_values):
     # Builds a two-column table with columns 'a' and 'b'.
     return pa.Table.from_arrays(
         [pa.array(a_values), pa.array(b_values)], ['a', 'b'])

   table = make_table(
       [[1], [2, 1], [3], [2, 1, 1], [3]],
       [['dog'], ['cat'], ['wolf'], ['dog', 'wolf'], ['wolf']])

   # Rows reused across several expected slices.
   cat_row = ([[2, 1]], [['cat']])
   dog_wolf_row = ([[2, 1, 1]], [['dog', 'wolf']])

   expected_slices = [
       (u'a_1_b_dog',
        make_table([[1], [2, 1, 1]], [['dog'], ['dog', 'wolf']])),
       (u'a_1_b_cat', make_table(*cat_row)),
       (u'a_2_b_cat', make_table(*cat_row)),
       (u'a_2_b_dog', make_table(*dog_wolf_row)),
       (u'a_1_b_wolf', make_table(*dog_wolf_row)),
       (u'a_2_b_wolf', make_table(*dog_wolf_row)),
       (u'a_3_b_wolf', make_table([[3], [3]], [['wolf'], ['wolf']])),
   ]
   slicer = slicing_util.get_feature_value_slicer({'a': None, 'b': None})
   self.assertCountEqual(expected_slices, list(slicer(table)))
Esempio n. 9
0
 def test_get_feature_value_slicer(self):
   """Slicing on two features yields one slice per co-occurring value pair."""
   def make_batch(a_values, b_values):
     # Builds a two-column record batch with columns 'a' and 'b'.
     return pa.RecordBatch.from_arrays(
         [pa.array(a_values), pa.array(b_values)], ['a', 'b'])

   batch = make_batch(
       [[1], [2, 1], [3], [2, 1, 1], [3]],
       [['dog'], ['cat'], ['wolf'], ['dog', 'wolf'], ['wolf']])

   # Rows reused across several expected slices.
   cat_row = ([[2, 1]], [['cat']])
   dog_wolf_row = ([[2, 1, 1]], [['dog', 'wolf']])

   expected_slices = [
       (u'a_1_b_dog',
        make_batch([[1], [2, 1, 1]], [['dog'], ['dog', 'wolf']])),
       (u'a_1_b_cat', make_batch(*cat_row)),
       (u'a_2_b_cat', make_batch(*cat_row)),
       (u'a_2_b_dog', make_batch(*dog_wolf_row)),
       (u'a_1_b_wolf', make_batch(*dog_wolf_row)),
       (u'a_2_b_wolf', make_batch(*dog_wolf_row)),
       (u'a_3_b_wolf', make_batch([[3], [3]], [['wolf'], ['wolf']])),
   ]
   slicer = slicing_util.get_feature_value_slicer({'a': None, 'b': None})
   self._check_results(slicer(batch), expected_slices)
Esempio n. 10
0
    def test_stats_options_json_round_trip(self):
        """Checks StatsOptions after a to_json() / from_json() round trip.

        Generators and slice functions are asserted to come back as None;
        the schema is compared as a proto; every other option is asserted
        to equal the value that was passed in.
        """
        schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])

        # Options that are compared with plain equality after the round trip.
        plain_options = {
            'feature_whitelist': ['a'],
            'label_feature': 'label',
            'weight_feature': 'weight',
            'sample_rate': 0.01,
            'num_top_values': 21,
            'frequency_threshold': 2,
            'weighted_frequency_threshold': 2.0,
            'num_rank_histogram_buckets': 1001,
            'num_values_histogram_buckets': 11,
            'num_histogram_buckets': 11,
            'num_quantiles_histogram_buckets': 11,
            'epsilon': 0.02,
            'infer_type_from_schema': True,
            'desired_batch_size': 100,
            'enable_semantic_domain_stats': True,
            'semantic_domain_stats_sample_rate': 0.1,
        }

        original = stats_options.StatsOptions(
            generators=[
                lift_stats_generator.LiftStatsGenerator(
                    schema=None,
                    y_path=types.FeaturePath(['label']),
                    x_paths=[types.FeaturePath(['feature'])])
            ],
            schema=schema,
            slice_functions=[
                slicing_util.get_feature_value_slicer({'b': None})
            ],
            **plain_options)

        restored = stats_options.StatsOptions.from_json(original.to_json())

        # These two are expected to be absent after deserialization.
        self.assertIsNone(restored.generators)
        self.assertIsNone(restored.slice_functions)
        compare.assertProtoEqual(self, schema, restored.schema)
        for option_name, expected_value in plain_options.items():
            self.assertEqual(expected_value, getattr(restored, option_name))
Esempio n. 11
0
import tensorflow_data_validation as tfdv
import pandas as pd
import datetime

from tensorflow_data_validation.utils import slicing_util

# Input CSV to analyse and destination for the generated statistics proto.
data_location = '/home/jarekk/workspace/test.csv'
output_location = '/home/jarekk/workspace/stats.pb'

# Slice the data on every value of the 'time_window' feature.
slice_fn = slicing_util.get_feature_value_slicer({'time_window': None})

stats_options = tfdv.StatsOptions(slice_functions=[slice_fn])

stats = tfdv.generate_statistics_from_csv(
    data_location,
    output_path=output_location,
    stats_options=stats_options,
)
Esempio n. 12
0
  def test_stats_options_json_round_trip(self):
    """Checks StatsOptions after a to_json() / from_json() round trip.

    Generators and slice functions are asserted to come back as None; the
    schema is compared as a proto; every other option is asserted to equal
    the value that was passed in.
    """
    schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])
    per_feature_weight_override = {types.FeaturePath(['a']): 'w'}

    # Options that are compared with plain equality after the round trip and
    # whose constructor keyword matches the attribute name.
    plain_options = {
        'feature_allowlist': ['a'],
        'vocab_paths': {'a': '/path/to/a'},
        'label_feature': 'label',
        'weight_feature': 'weight',
        'sample_rate': 0.01,
        'num_top_values': 21,
        'frequency_threshold': 2,
        'weighted_frequency_threshold': 2.0,
        'num_rank_histogram_buckets': 1001,
        'num_values_histogram_buckets': 11,
        'num_histogram_buckets': 11,
        'num_quantiles_histogram_buckets': 11,
        'epsilon': 0.02,
        'infer_type_from_schema': True,
        'desired_batch_size': 100,
        'enable_semantic_domain_stats': True,
        'semantic_domain_stats_sample_rate': 0.1,
        'add_default_generators': True,
        'experimental_use_sketch_based_topk_uniques': True,
    }

    original = stats_options.StatsOptions(
        generators=[
            lift_stats_generator.LiftStatsGenerator(
                schema=None,
                y_path=types.FeaturePath(['label']),
                x_paths=[types.FeaturePath(['feature'])])
        ],
        schema=schema,
        experimental_slice_functions=[
            slicing_util.get_feature_value_slicer({'b': None})
        ],
        per_feature_weight_override=per_feature_weight_override,
        **plain_options)

    restored = stats_options.StatsOptions.from_json(original.to_json())

    # These two are expected to be absent after deserialization.
    self.assertIsNone(restored.generators)
    self.assertIsNone(restored.experimental_slice_functions)
    compare.assertProtoEqual(self, schema, restored.schema)
    for option_name, expected_value in plain_options.items():
      self.assertEqual(expected_value, getattr(restored, option_name))
    # The override is read back through the private attribute.
    self.assertEqual(per_feature_weight_override,
                     restored._per_feature_weight_override)
Esempio n. 13
0
 def test_get_feature_value_slicer_for_multivalent_features(
         self, features, example, expected_slice_keys):
     """Checks the slice keys produced for `example` against the expected ones.

     The comparison ignores ordering (assertCountEqual).
     """
     self.assertCountEqual(
         expected_slice_keys,
         slicing_util.get_feature_value_slicer(features)(example))
Esempio n. 14
0
 def test_get_feature_value_slicer_with_values_not_in_iterable(self):
     """A feature value spec that is not an iterable raises TypeError."""
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(
             TypeError, 'Feature values must be specified '
             'in an iterable.'):
         slicing_util.get_feature_value_slicer({'feature_name': 1})
Esempio n. 15
0
 def test_get_feature_value_slicer_with_float_feature_value(self):
     """A float slice value raises NotImplementedError."""
     # assertRaisesRegexp is a deprecated alias that was removed in
     # Python 3.12; assertRaisesRegex is the supported spelling.
     with self.assertRaisesRegex(
             NotImplementedError,
             'Only string and int.*as the slice value.'):
         slicing_util.get_feature_value_slicer({'feature_name': [1.1]})