def test_lift_no_categorical_features(self):
  """Tests that no lift is computed when there is no categorical x-feature.

  The only x-candidate feature is FLOAT-typed with no categorical domain,
  so the generator should produce an empty result.
  """
  examples = [
      pa.Table.from_arrays([
          pa.array([[1.0], [2.0], [3.0], [4.0]]),
          pa.array([[1], [0], [1], [0]]),
          # Fixed column name (was misspelled 'continous_x') so the input
          # column matches the schema feature name below.
      ], ['continuous_x', 'int_y']),
  ]
  schema = text_format.Parse(
      """
      feature {
        name: 'continuous_x'
        type: FLOAT
      }
      feature {
        name: 'int_y'
        type: INT
        int_domain { is_categorical: true }
      }
      """, schema_pb2.Schema())
  # No categorical x-features exist, so no cross statistics are expected.
  expected_result = []
  generator = lift_stats_generator.LiftStatsGenerator(
      schema=schema, y_path=types.FeaturePath(['int_y']))
  self.assertSlicingAwareTransformOutputEqual(
      examples,
      generator,
      expected_result,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_lift_min_x_count_filters_all(self):
  """A min_x_count above every x-value's count yields an empty result."""
  schema = text_format.Parse(
      """
      feature {
        name: 'categorical_x'
        type: BYTES
      }
      feature {
        name: 'string_y'
        type: BYTES
      }
      """, schema_pb2.Schema())
  input_tables = [
      pa.Table.from_arrays([
          pa.array([['a'], ['a'], ['b'], ['a']]),
          pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
      ], ['categorical_x', 'string_y']),
  ]
  # 'a' occurs 3 times and 'b' once, both below min_x_count=4, so every
  # x value is filtered out of the lift computation.
  generator = lift_stats_generator.LiftStatsGenerator(
      schema=schema,
      y_path=types.FeaturePath(['string_y']),
      min_x_count=4)
  expected_result = []
  self.assertSlicingAwareTransformOutputEqual(
      input_tables,
      generator,
      expected_result,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_stats_options_with_generators_to_json(self):
  """StatsOptions holding custom generators must refuse JSON serialization."""
  lift_gen = lift_stats_generator.LiftStatsGenerator(
      schema=None,
      y_path=types.FeaturePath(['label']),
      x_paths=[types.FeaturePath(['feature'])])
  options = stats_options.StatsOptions(generators=[lift_gen])
  with self.assertRaisesRegex(ValueError,
                              'StatsOptions cannot be converted'):
    options.to_json()
def test_lift_int_y_with_no_boundaries(self):
  """A non-categorical INT y_path without y_boundaries is rejected."""
  schema = text_format.Parse(
      """
      feature {
        name: 'categorical_x'
        type: BYTES
      }
      feature {
        name: 'int_y'
        type: INT
      }
      """, schema_pb2.Schema())
  expected_message = (r'Boundaries must be provided with a non-'
                      r'categorical y_path.*')
  with self.assertRaisesRegex(ValueError, expected_message):
    lift_stats_generator.LiftStatsGenerator(
        schema=schema, y_path=types.FeaturePath(['int_y']))
def test_lift_string_y_with_boundaries(self):
  """Supplying y_boundaries for a categorical (BYTES) y_path is rejected."""
  schema = text_format.Parse(
      """
      feature {
        name: 'categorical_x'
        type: BYTES
      }
      feature {
        name: 'string_y'
        type: BYTES
      }
      """, schema_pb2.Schema())
  expected_message = (r'Boundaries cannot be applied to a '
                      r'categorical y_path.*')
  with self.assertRaisesRegex(ValueError, expected_message):
    lift_stats_generator.LiftStatsGenerator(
        schema,
        y_path=types.FeaturePath(['string_y']),
        y_boundaries=[1, 2, 3])
def get_generators(options: stats_options.StatsOptions,
                   in_memory: bool = False
                  ) -> List[stats_generator.StatsGenerator]:
  """Initializes the list of stats generators, including custom generators.

  Starts from the default generators, then appends (in order): any
  user-supplied custom generators, a sampled wrapper around the semantic
  domain generators, and schema-dependent generators (sparse features,
  weighted features, and lift). Finally, all CombinerFeatureStatsGenerators
  are collapsed into a single wrapper generator.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.

  Raises:
    TypeError: If in_memory is True and any resulting generator is not a
      CombinerStatsGenerator.
  """
  generators = _get_default_generators(options, in_memory)
  if options.generators:
    # Add custom stats generators.
    generators.extend(options.generators)
  if options.enable_semantic_domain_stats:
    semantic_domain_feature_stats_generators = [
        image_stats_generator.ImageStatsGenerator(),
        natural_language_stats_generator.NLStatsGenerator(),
        time_stats_generator.TimeStatsGenerator(),
    ]
    # Wrap semantic domain feature stats generators as a separate combiner
    # stats generator, so that we can apply sampling only for those and other
    # feature stats generators are not affected by it.
    generators.append(
        CombinerFeatureStatsWrapperGenerator(
            semantic_domain_feature_stats_generators,
            weight_feature=options.weight_feature,
            sample_rate=options.semantic_domain_stats_sample_rate))
  if options.schema is not None:
    if _schema_has_sparse_features(options.schema):
      generators.append(
          sparse_feature_stats_generator.SparseFeatureStatsGenerator(
              options.schema))
    if options.schema.weighted_feature:
      generators.append(
          weighted_feature_stats_generator.WeightedFeatureStatsGenerator(
              options.schema))
    if options.label_feature and not in_memory:
      # The LiftStatsGenerator is not a CombinerStatsGenerator and therefore
      # cannot currently be used for in_memory executions.
      generators.append(
          lift_stats_generator.LiftStatsGenerator(
              y_path=types.FeaturePath([options.label_feature]),
              schema=options.schema,
              weight_column_name=options.weight_feature,
              output_custom_stats=True))
  # Replace all CombinerFeatureStatsGenerator with a single
  # CombinerFeatureStatsWrapperGenerator.
  feature_generators = [
      x for x in generators
      if isinstance(x, stats_generator.CombinerFeatureStatsGenerator)
  ]
  if feature_generators:
    generators = [
        x for x in generators
        if not isinstance(x, stats_generator.CombinerFeatureStatsGenerator)
    ] + [
        CombinerFeatureStatsWrapperGenerator(
            feature_generators, weight_feature=options.weight_feature)
    ]
  if in_memory:
    # In-memory execution can only drive combiner-style generators; fail
    # loudly rather than silently skipping an incompatible generator.
    for generator in generators:
      if not isinstance(generator, stats_generator.CombinerStatsGenerator):
        raise TypeError('Statistics generator used in '
                        'generate_statistics_in_memory must '
                        'extend CombinerStatsGenerator, found object of '
                        'type %s.' % generator.__class__.__name__)
  return generators
def test_stats_options_json_round_trip(self):
  """Tests that StatsOptions survives a to_json/from_json round trip.

  Serializable fields must come back equal; non-serializable members
  (generators and slice_functions) are expected to be dropped and read
  back as None.
  """
  generators = [
      lift_stats_generator.LiftStatsGenerator(
          schema=None,
          y_path=types.FeaturePath(['label']),
          x_paths=[types.FeaturePath(['feature'])])
  ]
  feature_whitelist = ['a']
  schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])
  label_feature = 'label'
  weight_feature = 'weight'
  slice_functions = [slicing_util.get_feature_value_slicer({'b': None})]
  sample_rate = 0.01
  num_top_values = 21
  frequency_threshold = 2
  weighted_frequency_threshold = 2.0
  num_rank_histogram_buckets = 1001
  num_values_histogram_buckets = 11
  num_histogram_buckets = 11
  num_quantiles_histogram_buckets = 11
  epsilon = 0.02
  infer_type_from_schema = True
  desired_batch_size = 100
  enable_semantic_domain_stats = True
  semantic_domain_stats_sample_rate = 0.1
  options = stats_options.StatsOptions(
      generators=generators,
      feature_whitelist=feature_whitelist,
      schema=schema,
      label_feature=label_feature,
      weight_feature=weight_feature,
      slice_functions=slice_functions,
      sample_rate=sample_rate,
      num_top_values=num_top_values,
      frequency_threshold=frequency_threshold,
      weighted_frequency_threshold=weighted_frequency_threshold,
      num_rank_histogram_buckets=num_rank_histogram_buckets,
      num_values_histogram_buckets=num_values_histogram_buckets,
      num_histogram_buckets=num_histogram_buckets,
      num_quantiles_histogram_buckets=num_quantiles_histogram_buckets,
      epsilon=epsilon,
      infer_type_from_schema=infer_type_from_schema,
      desired_batch_size=desired_batch_size,
      enable_semantic_domain_stats=enable_semantic_domain_stats,
      semantic_domain_stats_sample_rate=semantic_domain_stats_sample_rate
  )
  options_json = options.to_json()
  options = stats_options.StatsOptions.from_json(options_json)
  # Non-serializable members do not survive the round trip.
  self.assertIsNone(options.generators)
  self.assertEqual(feature_whitelist, options.feature_whitelist)
  compare.assertProtoEqual(self, schema, options.schema)
  self.assertEqual(label_feature, options.label_feature)
  self.assertEqual(weight_feature, options.weight_feature)
  self.assertIsNone(options.slice_functions)
  self.assertEqual(sample_rate, options.sample_rate)
  self.assertEqual(num_top_values, options.num_top_values)
  self.assertEqual(frequency_threshold, options.frequency_threshold)
  self.assertEqual(weighted_frequency_threshold,
                   options.weighted_frequency_threshold)
  self.assertEqual(num_rank_histogram_buckets,
                   options.num_rank_histogram_buckets)
  self.assertEqual(num_values_histogram_buckets,
                   options.num_values_histogram_buckets)
  self.assertEqual(num_histogram_buckets, options.num_histogram_buckets)
  self.assertEqual(num_quantiles_histogram_buckets,
                   options.num_quantiles_histogram_buckets)
  self.assertEqual(epsilon, options.epsilon)
  self.assertEqual(infer_type_from_schema, options.infer_type_from_schema)
  self.assertEqual(desired_batch_size, options.desired_batch_size)
  self.assertEqual(enable_semantic_domain_stats,
                   options.enable_semantic_domain_stats)
  self.assertEqual(semantic_domain_stats_sample_rate,
                   options.semantic_domain_stats_sample_rate)
def test_lift_flattened_x_leaf(self):
  """Tests lift for a multivalent x feature.

  Note that x_count reflects the number of examples containing each value
  ('a' has x_count 3 despite 5 total occurrences), so duplicate values
  within a single example are not double counted.
  """
  examples = [
      pa.Table.from_arrays([
          pa.array([['a', 'a'], ['a'], ['b', 'b'], ['a', 'a']]),
          pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
      ], ['categorical_x', 'string_y']),
  ]
  schema = text_format.Parse(
      """
      feature {
        name: 'categorical_x'
        type: BYTES
      }
      feature {
        name: 'string_y'
        type: BYTES
      }
      """, schema_pb2.Schema())
  expected_result = [
      text_format.Parse(
          """
          cross_features {
            path_x { step: "categorical_x" }
            path_y { step: "string_y" }
            categorical_cross_stats {
              lift_series {
                y_string: "cat"
                y_count: 2
                lift_values {
                  x_string: "b"
                  lift: 2.0
                  x_count: 1
                  x_and_y_count: 1
                }
                lift_values {
                  x_string: "a"
                  lift: 0.66666698
                  x_count: 3
                  x_and_y_count: 1
                }
              }
              lift_series {
                y_string: "dog"
                y_count: 2
                lift_values {
                  x_string: "a"
                  lift: 1.33333301544
                  x_count: 3
                  x_and_y_count: 2
                }
                lift_values {
                  x_string: "b"
                  lift: 0.0
                  x_count: 1
                  x_and_y_count: 0
                }
              }
            }
          }""", statistics_pb2.DatasetFeatureStatistics()),
  ]
  generator = lift_stats_generator.LiftStatsGenerator(
      schema=schema, y_path=types.FeaturePath(['string_y']))
  self.assertSlicingAwareTransformOutputEqual(
      examples,
      generator,
      expected_result,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_lift_flattened_x(self):
  """Tests lift for an x leaf nested inside a STRUCT feature.

  The x path includes the struct step ('doc_set', 'docs'), and values are
  gathered across all struct elements within an example (the first example
  contains three 'docs' lists).
  """
  examples = [
      pa.Table.from_arrays([
          pa.array([[{
              'docs': ['a', 'b']
          }, {
              'docs': ['a']
          }, {
              'docs': ['c']
          }], [{
              'docs': ['a', 'b']
          }]]),
          pa.array([['pos'], ['neg']]),
      ], ['doc_set', 'string_y']),
  ]
  schema = text_format.Parse(
      """
      feature {
        name: 'doc_set'
        struct_domain {
          feature {
            name: 'docs'
            type: BYTES
          }
        }
        type: STRUCT
      }
      feature {
        name: 'string_y'
        type: BYTES
      }
      """, schema_pb2.Schema())
  expected_result = [
      text_format.Parse(
          """
          cross_features {
            path_x {
              step: 'doc_set'
              step: 'docs'
            }
            path_y { step: "string_y" }
            categorical_cross_stats {
              lift_series {
                y_string: "neg"
                y_count: 1
                lift_values {
                  x_string: "a"
                  lift: 1.0
                  x_count: 2
                  x_and_y_count: 1
                }
                lift_values {
                  x_string: "b"
                  lift: 1.0
                  x_count: 2
                  x_and_y_count: 1
                }
                lift_values {
                  x_string: "c"
                  lift: 0.0
                  x_count: 1
                  x_and_y_count: 0
                }
              }
              lift_series {
                y_string: "pos"
                y_count: 1
                lift_values {
                  x_string: "c"
                  lift: 2.0
                  x_count: 1
                  x_and_y_count: 1
                }
                lift_values {
                  x_string: "a"
                  lift: 1.0
                  x_count: 2
                  x_and_y_count: 1
                }
                lift_values {
                  x_string: "b"
                  lift: 1.0
                  x_count: 2
                  x_and_y_count: 1
                }
              }
            }
          }""", statistics_pb2.DatasetFeatureStatistics()),
  ]
  generator = lift_stats_generator.LiftStatsGenerator(
      schema=schema, y_path=types.FeaturePath(['string_y']))
  self.assertSlicingAwareTransformOutputEqual(
      examples,
      generator,
      expected_result,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_lift_y_is_none(self):
  """Tests bucketed lift when one example has a null y value.

  The first example's float_y is None: it still contributes to x_count
  ('a' has x_count 3) and to the total example count of 4 used in the
  lift denominators, but it falls into neither y bucket (bucket y_counts
  sum to 3).
  """
  examples = [
      pa.Table.from_arrays([
          pa.array([['a'], ['a'], ['b'], ['a']]),
          pa.array([None, [.7], [.4], [.6]]),
      ], ['categorical_x', 'float_y']),
  ]
  schema = text_format.Parse(
      """
      feature {
        name: 'categorical_x'
        type: BYTES
      }
      feature {
        name: 'float_y'
        type: FLOAT
      }
      """, schema_pb2.Schema())
  expected_result = [
      text_format.Parse(
          """
          cross_features {
            path_x { step: "categorical_x" }
            path_y { step: "float_y" }
            categorical_cross_stats {
              lift_series {
                y_bucket {
                  low_value: -inf
                  high_value: 0.5
                }
                y_count: 1
                lift_values {
                  x_string: "b"
                  lift: 4.0
                  x_count: 1
                  x_and_y_count: 1
                }
                lift_values {
                  x_string: "a"
                  lift: 0.0
                  x_count: 3
                  x_and_y_count: 0
                }
              }
              lift_series {
                y_bucket {
                  low_value: 0.5
                  high_value: inf
                }
                y_count: 2
                lift_values {
                  x_string: "a"
                  lift: 1.33333301544
                  x_count: 3
                  x_and_y_count: 2
                }
                lift_values {
                  x_string: "b"
                  lift: 0.0
                  x_count: 1
                  x_and_y_count: 0
                }
              }
            }
          }""", statistics_pb2.DatasetFeatureStatistics()),
  ]
  generator = lift_stats_generator.LiftStatsGenerator(
      schema=schema,
      y_path=types.FeaturePath(['float_y']),
      y_boundaries=[0.5])
  self.assertSlicingAwareTransformOutputEqual(
      examples,
      generator,
      expected_result,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_lift_int_y(self):
  """Tests lift with integer x values and a categorical INT y feature.

  int_y carries int_domain.is_categorical, so its raw values (0, 1) are
  used directly as y categories (no boundaries needed), and x values are
  reported via x_int rather than x_string.
  """
  examples = [
      pa.Table.from_arrays([
          pa.array([[11], [11], [22], [11]]),
          pa.array([[1], [0], [1], [0]]),
      ], ['categorical_x', 'int_y']),
  ]
  schema = text_format.Parse(
      """
      feature {
        name: 'categorical_x'
        type: BYTES
      }
      feature {
        name: 'int_y'
        type: INT
        int_domain { is_categorical: true }
      }
      """, schema_pb2.Schema())
  expected_result = [
      text_format.Parse(
          """
          cross_features {
            path_x { step: "categorical_x" }
            path_y { step: "int_y" }
            categorical_cross_stats {
              lift_series {
                y_int: 0
                y_count: 2
                lift_values {
                  x_int: 11
                  lift: 1.333333
                  x_count: 3
                  x_and_y_count: 2
                }
                lift_values {
                  x_int: 22
                  lift: 0.0
                  x_count: 1
                  x_and_y_count: 0
                }
              }
              lift_series {
                y_int: 1
                y_count: 2
                lift_values {
                  x_int: 22
                  lift: 2.0
                  x_count: 1
                  x_and_y_count: 1
                }
                lift_values {
                  x_int: 11
                  lift: 0.66666698
                  x_count: 3
                  x_and_y_count: 1
                }
              }
            }
          }""", statistics_pb2.DatasetFeatureStatistics()),
  ]
  generator = lift_stats_generator.LiftStatsGenerator(
      schema=schema, y_path=types.FeaturePath(['int_y']))
  self.assertSlicingAwareTransformOutputEqual(
      examples,
      generator,
      expected_result,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_lift_slice_aware(self):
  """Tests that lift is computed and aggregated per slice key.

  Two tables share the key 'slice1' and two share 'slice2'; counts are
  accumulated within each key. The second 'slice2' table has an all-null
  categorical_x column (pa.null() type): its examples contribute to the
  slice's y_counts (cat=3, dog=4) but add no x values.
  """
  examples = [
      ('slice1',
       pa.Table.from_arrays([
           pa.array([['a'], ['a'], ['b'], ['a']]),
           pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
       ], ['categorical_x', 'string_y'])),
      ('slice2',
       pa.Table.from_arrays([
           pa.array([['a'], ['a'], ['a']]),
           pa.array([['cat'], ['dog'], ['dog']]),
       ], ['categorical_x', 'string_y'])),
      ('slice1',
       pa.Table.from_arrays([
           pa.array([['a'], ['a'], ['b'], ['a']]),
           pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
       ], ['categorical_x', 'string_y'])),
      ('slice2',
       pa.Table.from_arrays([
           pa.array([None, None, None, None], type=pa.null()),
           pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
       ], ['categorical_x', 'string_y'])),
  ]
  schema = text_format.Parse(
      """
      feature {
        name: 'categorical_x'
        type: BYTES
      }
      feature {
        name: 'string_y'
        type: BYTES
      }
      """, schema_pb2.Schema())
  expected_result = [
      ('slice1',
       text_format.Parse(
           """
           cross_features {
             path_x { step: "categorical_x" }
             path_y { step: "string_y" }
             categorical_cross_stats {
               lift_series {
                 y_string: "cat"
                 y_count: 4
                 lift_values {
                   x_string: "b"
                   lift: 2.0
                   x_count: 2
                   x_and_y_count: 2
                 }
                 lift_values {
                   x_string: "a"
                   lift: 0.666666984558
                   x_count: 6
                   x_and_y_count: 2
                 }
               }
               lift_series {
                 y_string: "dog"
                 y_count: 4
                 lift_values {
                   x_string: "a"
                   lift: 1.33333301544
                   x_count: 6
                   x_and_y_count: 4
                 }
                 lift_values {
                   x_string: "b"
                   lift: 0.0
                   x_count: 2
                   x_and_y_count: 0
                 }
               }
             }
           }""", statistics_pb2.DatasetFeatureStatistics())),
      ('slice2',
       text_format.Parse(
           """
           cross_features {
             path_x { step: "categorical_x" }
             path_y { step: "string_y" }
             categorical_cross_stats {
               lift_series {
                 y_string: "cat"
                 y_count: 3
                 lift_values {
                   x_string: "a"
                   lift: 0.777778029441
                   x_count: 3
                   x_and_y_count: 1
                 }
               }
               lift_series {
                 y_string: "dog"
                 y_count: 4
                 lift_values {
                   x_string: "a"
                   lift: 1.16666698455
                   x_count: 3
                   x_and_y_count: 2
                 }
               }
             }
           }""", statistics_pb2.DatasetFeatureStatistics())),
  ]
  generator = lift_stats_generator.LiftStatsGenerator(
      schema=schema, y_path=types.FeaturePath(['string_y']))
  self.assertSlicingAwareTransformOutputEqual(examples, generator,
                                              expected_result)
def test_lift_with_no_schema_or_x_path(self):
  """Constructing the generator with neither a schema nor x_paths fails."""
  with self.assertRaisesRegex(
      ValueError, r'Either a schema or x_paths must be provided'):
    lift_stats_generator.LiftStatsGenerator(
        schema=None, y_path=types.FeaturePath(['int_y']))
def test_lift_provided_x_no_schema(self):
  """Tests lift with explicitly provided x_paths and no schema.

  Only 'categorical_x1' is listed in x_paths, so 'categorical_x2' must be
  absent from the output even though it is present in the input.
  """
  examples = [
      pa.Table.from_arrays([
          pa.array([['a'], ['a'], ['b'], ['a']]),
          pa.array([['x'], ['x'], ['y'], ['x']]),
          pa.array([['cat'], ['dog'], ['cat'], ['dog']]),
      ], ['categorical_x1', 'categorical_x2', 'string_y']),
  ]
  expected_result = [
      text_format.Parse("""
          cross_features {
            path_x { step: "categorical_x1" }
            path_y { step: "string_y" }
            categorical_cross_stats {
              lift {
                lift_series {
                  y_string: "cat"
                  y_count: 2
                  lift_values {
                    x_string: "b"
                    lift: 2.0
                    x_count: 1
                    x_and_y_count: 1
                  }
                  lift_values {
                    x_string: "a"
                    lift: 0.6666667
                    x_count: 3
                    x_and_y_count: 1
                  }
                }
                lift_series {
                  y_string: "dog"
                  y_count: 2
                  lift_values {
                    x_string: "a"
                    lift: 1.3333333
                    x_count: 3
                    x_and_y_count: 2
                  }
                  lift_values {
                    x_string: "b"
                    lift: 0.0
                    x_count: 1
                    x_and_y_count: 0
                  }
                }
              }
            }
          }""", statistics_pb2.DatasetFeatureStatistics()),
  ]
  generator = lift_stats_generator.LiftStatsGenerator(
      schema=None,
      y_path=types.FeaturePath(['string_y']),
      x_paths=[types.FeaturePath(['categorical_x1'])])
  self.assertSlicingAwareTransformOutputEqual(
      examples,
      generator,
      expected_result,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_stats_options_json_round_trip(self):
  """Tests that StatsOptions survives a to_json/from_json round trip.

  Serializable fields must come back equal; non-serializable members
  (generators and experimental_slice_functions) are expected to be
  dropped and read back as None.
  """
  generators = [
      lift_stats_generator.LiftStatsGenerator(
          schema=None,
          y_path=types.FeaturePath(['label']),
          x_paths=[types.FeaturePath(['feature'])])
  ]
  feature_allowlist = ['a']
  schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])
  vocab_paths = {'a': '/path/to/a'}
  label_feature = 'label'
  weight_feature = 'weight'
  slice_functions = [slicing_util.get_feature_value_slicer({'b': None})]
  sample_rate = 0.01
  num_top_values = 21
  frequency_threshold = 2
  weighted_frequency_threshold = 2.0
  num_rank_histogram_buckets = 1001
  num_values_histogram_buckets = 11
  num_histogram_buckets = 11
  num_quantiles_histogram_buckets = 11
  epsilon = 0.02
  infer_type_from_schema = True
  desired_batch_size = 100
  enable_semantic_domain_stats = True
  semantic_domain_stats_sample_rate = 0.1
  per_feature_weight_override = {types.FeaturePath(['a']): 'w'}
  add_default_generators = True
  use_sketch_based_topk_uniques = True
  options = stats_options.StatsOptions(
      generators=generators,
      feature_allowlist=feature_allowlist,
      schema=schema,
      vocab_paths=vocab_paths,
      label_feature=label_feature,
      weight_feature=weight_feature,
      experimental_slice_functions=slice_functions,
      sample_rate=sample_rate,
      num_top_values=num_top_values,
      frequency_threshold=frequency_threshold,
      weighted_frequency_threshold=weighted_frequency_threshold,
      num_rank_histogram_buckets=num_rank_histogram_buckets,
      num_values_histogram_buckets=num_values_histogram_buckets,
      num_histogram_buckets=num_histogram_buckets,
      num_quantiles_histogram_buckets=num_quantiles_histogram_buckets,
      epsilon=epsilon,
      infer_type_from_schema=infer_type_from_schema,
      desired_batch_size=desired_batch_size,
      enable_semantic_domain_stats=enable_semantic_domain_stats,
      semantic_domain_stats_sample_rate=semantic_domain_stats_sample_rate,
      per_feature_weight_override=per_feature_weight_override,
      add_default_generators=add_default_generators,
      experimental_use_sketch_based_topk_uniques=use_sketch_based_topk_uniques
  )
  options_json = options.to_json()
  options = stats_options.StatsOptions.from_json(options_json)
  # Non-serializable members do not survive the round trip.
  self.assertIsNone(options.generators)
  self.assertEqual(feature_allowlist, options.feature_allowlist)
  compare.assertProtoEqual(self, schema, options.schema)
  self.assertEqual(vocab_paths, options.vocab_paths)
  self.assertEqual(label_feature, options.label_feature)
  self.assertEqual(weight_feature, options.weight_feature)
  self.assertIsNone(options.experimental_slice_functions)
  self.assertEqual(sample_rate, options.sample_rate)
  self.assertEqual(num_top_values, options.num_top_values)
  self.assertEqual(frequency_threshold, options.frequency_threshold)
  self.assertEqual(weighted_frequency_threshold,
                   options.weighted_frequency_threshold)
  self.assertEqual(num_rank_histogram_buckets,
                   options.num_rank_histogram_buckets)
  self.assertEqual(num_values_histogram_buckets,
                   options.num_values_histogram_buckets)
  self.assertEqual(num_histogram_buckets, options.num_histogram_buckets)
  self.assertEqual(num_quantiles_histogram_buckets,
                   options.num_quantiles_histogram_buckets)
  self.assertEqual(epsilon, options.epsilon)
  self.assertEqual(infer_type_from_schema, options.infer_type_from_schema)
  self.assertEqual(desired_batch_size, options.desired_batch_size)
  self.assertEqual(enable_semantic_domain_stats,
                   options.enable_semantic_domain_stats)
  self.assertEqual(semantic_domain_stats_sample_rate,
                   options.semantic_domain_stats_sample_rate)
  # Note: this checks the private attribute; presumably no public accessor
  # exists for the weight override map — verify against StatsOptions.
  self.assertEqual(per_feature_weight_override,
                   options._per_feature_weight_override)
  self.assertEqual(add_default_generators, options.add_default_generators)
  self.assertEqual(use_sketch_based_topk_uniques,
                   options.experimental_use_sketch_based_topk_uniques)