def test_lift_no_categorical_features(self):
     """Expects no lift output when the only non-y feature is continuous.

     The schema declares 'continuous_x' as FLOAT and 'int_y' as a categorical
     INT. 'int_y' is used as the y_path and the asserted output is empty.
     """
     # The column name must match the schema's 'continuous_x' feature so the
     # float column in the data is the one the schema actually describes.
     examples = [
         pa.Table.from_arrays([
             pa.array([[1.0], [2.0], [3.0], [4.0]]),
             pa.array([[1], [0], [1], [0]]),
         ], ['continuous_x', 'int_y']),
     ]
     schema = text_format.Parse(
         """
     feature {
       name: 'continuous_x'
       type: FLOAT
     }
     feature {
       name: 'int_y'
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     expected_result = []
     generator = lift_stats_generator.LiftStatsGenerator(
         schema=schema, y_path=types.FeaturePath(['int_y']))
     self.assertSlicingAwareTransformOutputEqual(
         examples,
         generator,
         expected_result,
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
 def test_lift_min_x_count_filters_all(self):
     """Expects empty output for a four-row table when min_x_count=4.

     The x column holds 'a' three times and 'b' once; the generator is
     created with min_x_count=4 and the asserted result is an empty list.
     """
     schema = text_format.Parse(
         """
     feature {
       name: 'categorical_x'
       type: BYTES
     }
     feature {
       name: 'string_y'
       type: BYTES
     }
     """, schema_pb2.Schema())
     x_column = pa.array([['a'], ['a'], ['b'], ['a']])
     y_column = pa.array([['cat'], ['dog'], ['cat'], ['dog']])
     input_tables = [
         pa.Table.from_arrays([x_column, y_column],
                              ['categorical_x', 'string_y']),
     ]
     generator = lift_stats_generator.LiftStatsGenerator(
         schema=schema,
         y_path=types.FeaturePath(['string_y']),
         min_x_count=4)
     no_output = []
     self.assertSlicingAwareTransformOutputEqual(
         input_tables,
         generator,
         no_output,
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
Example #3
0
 def test_stats_options_with_generators_to_json(self):
     """Expects to_json() to raise ValueError when generators are set."""
     lift_generator = lift_stats_generator.LiftStatsGenerator(
         schema=None,
         y_path=types.FeaturePath(['label']),
         x_paths=[types.FeaturePath(['feature'])])
     options = stats_options.StatsOptions(generators=[lift_generator])
     expected_message = 'StatsOptions cannot be converted'
     with self.assertRaisesRegex(ValueError, expected_message):
         options.to_json()
 def test_lift_int_y_with_no_boundaries(self):
   """Expects a ValueError for an INT y_path given without y_boundaries.

   'int_y' is declared INT with no int_domain, and the generator is
   constructed without y_boundaries.
   """
   expected_message = (r'Boundaries must be provided with a non-'
                       'categorical y_path.*')
   schema = text_format.Parse(
       """
       feature {
         name: 'categorical_x'
         type: BYTES
       }
       feature {
         name: 'int_y'
         type: INT
       }
       """, schema_pb2.Schema())
   label_path = types.FeaturePath(['int_y'])
   with self.assertRaisesRegex(ValueError, expected_message):
     lift_stats_generator.LiftStatsGenerator(schema=schema, y_path=label_path)
 def test_lift_string_y_with_boundaries(self):
     """Expects a ValueError when y_boundaries accompany a BYTES y_path."""
     expected_message = (r'Boundaries cannot be applied to a '
                         'categorical y_path.*')
     schema = text_format.Parse(
         """
     feature {
       name: 'categorical_x'
       type: BYTES
     }
     feature {
       name: 'string_y'
       type: BYTES
     }
     """, schema_pb2.Schema())
     label_path = types.FeaturePath(['string_y'])
     with self.assertRaisesRegex(ValueError, expected_message):
         # The schema is intentionally passed positionally, as in the
         # constructor call this test has always exercised.
         lift_stats_generator.LiftStatsGenerator(
             schema, y_path=label_path, y_boundaries=[1, 2, 3])
Example #6
0
def get_generators(options: stats_options.StatsOptions,
                   in_memory: bool = False
                  ) -> List[stats_generator.StatsGenerator]:
  """Builds the complete list of stats generators for the given options.

  Starts from the default generators and then adds, in order: user-supplied
  custom generators, a sampled wrapper around the semantic domain generators,
  and the schema-dependent generators (sparse feature, weighted feature and
  lift). All CombinerFeatureStatsGenerator instances are finally folded into
  one CombinerFeatureStatsWrapperGenerator placed at the end of the list.

  Args:
    options: A StatsOptions object.
    in_memory: Whether the generators will be used to generate statistics in
      memory (True) or using Beam (False).

  Returns:
    A list of stats generator objects.

  Raises:
    TypeError: If in_memory is True and any resulting generator does not
      extend CombinerStatsGenerator.
  """
  generators = _get_default_generators(options, in_memory)

  # User-supplied custom generators come right after the defaults.
  if options.generators:
    generators.extend(options.generators)

  if options.enable_semantic_domain_stats:
    # The semantic domain generators get their own wrapper so that sampling
    # applies to them only and leaves other feature stats generators alone.
    semantic_wrapper = CombinerFeatureStatsWrapperGenerator(
        [
            image_stats_generator.ImageStatsGenerator(),
            natural_language_stats_generator.NLStatsGenerator(),
            time_stats_generator.TimeStatsGenerator(),
        ],
        weight_feature=options.weight_feature,
        sample_rate=options.semantic_domain_stats_sample_rate)
    generators.append(semantic_wrapper)

  if options.schema is not None:
    if _schema_has_sparse_features(options.schema):
      sparse_generator = (
          sparse_feature_stats_generator.SparseFeatureStatsGenerator(
              options.schema))
      generators.append(sparse_generator)
    if options.schema.weighted_feature:
      weighted_generator = (
          weighted_feature_stats_generator.WeightedFeatureStatsGenerator(
              options.schema))
      generators.append(weighted_generator)
    if options.label_feature and not in_memory:
      # The LiftStatsGenerator is not a CombinerStatsGenerator and therefore
      # cannot currently be used for in_memory executions.
      lift_generator = lift_stats_generator.LiftStatsGenerator(
          y_path=types.FeaturePath([options.label_feature]),
          schema=options.schema,
          weight_column_name=options.weight_feature,
          output_custom_stats=True)
      generators.append(lift_generator)

  # Split off every CombinerFeatureStatsGenerator so that they can all be
  # served by a single CombinerFeatureStatsWrapperGenerator.
  feature_generators = []
  remaining_generators = []
  for candidate in generators:
    if isinstance(candidate, stats_generator.CombinerFeatureStatsGenerator):
      feature_generators.append(candidate)
    else:
      remaining_generators.append(candidate)
  if feature_generators:
    remaining_generators.append(
        CombinerFeatureStatsWrapperGenerator(
            feature_generators, weight_feature=options.weight_feature))
    generators = remaining_generators

  if not in_memory:
    return generators

  # In-memory execution only supports combiner-based generators.
  for candidate in generators:
    if isinstance(candidate, stats_generator.CombinerStatsGenerator):
      continue
    raise TypeError('Statistics generator used in '
                    'generate_statistics_in_memory must '
                    'extend CombinerStatsGenerator, found object of '
                    'type %s.' % candidate.__class__.__name__)
  return generators
Example #7
0
    def test_stats_options_json_round_trip(self):
        """Round-trips a StatsOptions through to_json() and from_json().

        Plain-valued options are expected to come back equal to what was
        passed in, the schema is compared as a proto, and `generators` and
        `slice_functions` are expected to be None on the restored object.
        """
        lift_generator = lift_stats_generator.LiftStatsGenerator(
            schema=None,
            y_path=types.FeaturePath(['label']),
            x_paths=[types.FeaturePath(['feature'])])
        schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])
        value_slicer = slicing_util.get_feature_value_slicer({'b': None})

        # Each key is both the constructor keyword and the attribute that is
        # read back after the round trip.
        plain_options = {
            'feature_whitelist': ['a'],
            'label_feature': 'label',
            'weight_feature': 'weight',
            'sample_rate': 0.01,
            'num_top_values': 21,
            'frequency_threshold': 2,
            'weighted_frequency_threshold': 2.0,
            'num_rank_histogram_buckets': 1001,
            'num_values_histogram_buckets': 11,
            'num_histogram_buckets': 11,
            'num_quantiles_histogram_buckets': 11,
            'epsilon': 0.02,
            'infer_type_from_schema': True,
            'desired_batch_size': 100,
            'enable_semantic_domain_stats': True,
            'semantic_domain_stats_sample_rate': 0.1,
        }

        original = stats_options.StatsOptions(
            generators=[lift_generator],
            schema=schema,
            slice_functions=[value_slicer],
            **plain_options)
        restored = stats_options.StatsOptions.from_json(original.to_json())

        self.assertIsNone(restored.generators)
        self.assertIsNone(restored.slice_functions)
        compare.assertProtoEqual(self, schema, restored.schema)
        for option_name, expected_value in plain_options.items():
            self.assertEqual(expected_value, getattr(restored, option_name))
 def test_lift_flattened_x_leaf(self):
     """Checks lift output when some x rows hold repeated values.

     Three of the four 'categorical_x' rows contain the same value twice;
     the expected proto lists x_count 3 for 'a' and 1 for 'b'.
     """
     schema = text_format.Parse(
         """
     feature {
       name: 'categorical_x'
       type: BYTES
     }
     feature {
       name: 'string_y'
       type: BYTES
     }
     """, schema_pb2.Schema())
     x_column = pa.array([['a', 'a'], ['a'], ['b', 'b'], ['a', 'a']])
     y_column = pa.array([['cat'], ['dog'], ['cat'], ['dog']])
     input_tables = [
         pa.Table.from_arrays([x_column, y_column],
                              ['categorical_x', 'string_y']),
     ]
     expected_stats = text_format.Parse(
         """
         cross_features {
           path_x {
             step: "categorical_x"
           }
           path_y {
             step: "string_y"
           }
           categorical_cross_stats {
             lift_series {
               y_string: "cat"
               y_count: 2
               lift_values {
                 x_string: "b"
                 lift: 2.0
                 x_count: 1
                 x_and_y_count: 1
               }
               lift_values {
                 x_string: "a"
                 lift: 0.66666698
                 x_count: 3
                 x_and_y_count: 1
               }
             }
             lift_series {
               y_string: "dog"
               y_count: 2
               lift_values {
                 x_string: "a"
                 lift: 1.33333301544
                 x_count: 3
                 x_and_y_count: 2
               }
               lift_values {
                 x_string: "b"
                 lift: 0.0
                 x_count: 1
                 x_and_y_count: 0
               }
             }
          }
         }""", statistics_pb2.DatasetFeatureStatistics())
     generator = lift_stats_generator.LiftStatsGenerator(
         schema=schema, y_path=types.FeaturePath(['string_y']))
     self.assertSlicingAwareTransformOutputEqual(
         input_tables,
         generator,
         [expected_stats],
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
 def test_lift_flattened_x(self):
     """Checks lift output for an x feature nested inside a STRUCT feature.

     The x values live under 'doc_set' -> 'docs'; the expected proto uses
     that two-step path as path_x.
     """
     schema = text_format.Parse(
         """
     feature {
       name: 'doc_set'
       struct_domain {
         feature {
           name: 'docs'
           type: BYTES
         }
       }
       type: STRUCT
     }
     feature {
       name: 'string_y'
       type: BYTES
     }
     """, schema_pb2.Schema())
     doc_sets = pa.array([
         [{'docs': ['a', 'b']}, {'docs': ['a']}, {'docs': ['c']}],
         [{'docs': ['a', 'b']}],
     ])
     labels = pa.array([['pos'], ['neg']])
     input_tables = [
         pa.Table.from_arrays([doc_sets, labels], ['doc_set', 'string_y']),
     ]
     expected_stats = text_format.Parse(
         """
         cross_features {
           path_x {
             step: 'doc_set'
             step: 'docs'
           }
           path_y {
             step: "string_y"
           }
           categorical_cross_stats {
             lift_series {
               y_string: "neg"
               y_count: 1
               lift_values {
                 x_string: "a"
                 lift: 1.0
                 x_count: 2
                 x_and_y_count: 1
               }
               lift_values {
                 x_string: "b"
                 lift: 1.0
                 x_count: 2
                 x_and_y_count: 1
               }
               lift_values {
                 x_string: "c"
                 lift: 0.0
                 x_count: 1
                 x_and_y_count: 0
               }
             }
             lift_series {
               y_string: "pos"
               y_count: 1
               lift_values {
                 x_string: "c"
                 lift: 2.0
                 x_count: 1
                 x_and_y_count: 1
               }
               lift_values {
                 x_string: "a"
                 lift: 1.0
                 x_count: 2
                 x_and_y_count: 1
               }
               lift_values {
                 x_string: "b"
                 lift: 1.0
                 x_count: 2
                 x_and_y_count: 1
               }
             }
           }
         }""", statistics_pb2.DatasetFeatureStatistics())
     generator = lift_stats_generator.LiftStatsGenerator(
         schema=schema, y_path=types.FeaturePath(['string_y']))
     self.assertSlicingAwareTransformOutputEqual(
         input_tables,
         generator,
         [expected_stats],
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
 def test_lift_y_is_none(self):
     """Checks lift output when one row has a missing (None) y value.

     'float_y' is bucketed with a single boundary at 0.5; the expected proto
     has one series for (-inf, 0.5) and one for (0.5, inf).
     """
     schema = text_format.Parse(
         """
     feature {
       name: 'categorical_x'
       type: BYTES
     }
     feature {
       name: 'float_y'
       type: FLOAT
     }
     """, schema_pb2.Schema())
     x_column = pa.array([['a'], ['a'], ['b'], ['a']])
     # The first row deliberately has no y value.
     y_column = pa.array([None, [.7], [.4], [.6]])
     input_tables = [
         pa.Table.from_arrays([x_column, y_column],
                              ['categorical_x', 'float_y']),
     ]
     expected_stats = text_format.Parse(
         """
         cross_features {
           path_x {
             step: "categorical_x"
           }
           path_y {
             step: "float_y"
           }
           categorical_cross_stats {
             lift_series {
               y_bucket {
                 low_value: -inf
                 high_value: 0.5
               }
               y_count: 1
               lift_values {
                 x_string: "b"
                 lift: 4.0
                 x_count: 1
                 x_and_y_count: 1
               }
               lift_values {
                 x_string: "a"
                 lift: 0.0
                 x_count: 3
                 x_and_y_count: 0
               }
             }
             lift_series {
               y_bucket {
                 low_value: 0.5
                 high_value: inf
               }
               y_count: 2
               lift_values {
                 x_string: "a"
                 lift: 1.33333301544
                 x_count: 3
                 x_and_y_count: 2
               }
               lift_values {
                 x_string: "b"
                 lift: 0.0
                 x_count: 1
                 x_and_y_count: 0
               }
             }
           }
         }""", statistics_pb2.DatasetFeatureStatistics())
     generator = lift_stats_generator.LiftStatsGenerator(
         schema=schema,
         y_path=types.FeaturePath(['float_y']),
         y_boundaries=[0.5])
     self.assertSlicingAwareTransformOutputEqual(
         input_tables,
         generator,
         [expected_stats],
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
 def test_lift_int_y(self):
     """Checks lift output for a categorical INT y_path with integer x values.

     The expected proto reports y values as y_int and x values as x_int.
     """
     schema = text_format.Parse(
         """
     feature {
       name: 'categorical_x'
       type: BYTES
     }
     feature {
       name: 'int_y'
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     x_column = pa.array([[11], [11], [22], [11]])
     y_column = pa.array([[1], [0], [1], [0]])
     input_tables = [
         pa.Table.from_arrays([x_column, y_column],
                              ['categorical_x', 'int_y']),
     ]
     expected_stats = text_format.Parse(
         """
         cross_features {
           path_x {
             step: "categorical_x"
           }
           path_y {
             step: "int_y"
           }
           categorical_cross_stats {
             lift_series {
               y_int: 0
               y_count: 2
               lift_values {
                 x_int: 11
                 lift: 1.333333
                 x_count: 3
                 x_and_y_count: 2
               }
               lift_values {
                 x_int: 22
                 lift: 0.0
                 x_count: 1
                 x_and_y_count: 0
               }
             }
             lift_series {
               y_int: 1
               y_count: 2
               lift_values {
                 x_int: 22
                 lift: 2.0
                 x_count: 1
                 x_and_y_count: 1
               }
               lift_values {
                 x_int: 11
                 lift: 0.66666698
                 x_count: 3
                 x_and_y_count: 1
               }
             }
          }
         }""", statistics_pb2.DatasetFeatureStatistics())
     generator = lift_stats_generator.LiftStatsGenerator(
         schema=schema, y_path=types.FeaturePath(['int_y']))
     self.assertSlicingAwareTransformOutputEqual(
         input_tables,
         generator,
         [expected_stats],
         add_default_slice_key_to_input=True,
         add_default_slice_key_to_output=True)
 def test_lift_slice_aware(self):
     """Checks lift output when the input arrives already keyed by slice.

     Four (slice_key, table) pairs are fed in, two per slice; one 'slice2'
     table has an all-null x column. One expected proto is given per slice.
     """

     def make_table(x_column, y_column):
         # Every input table uses the same two column names.
         return pa.Table.from_arrays([x_column, y_column],
                                     ['categorical_x', 'string_y'])

     schema = text_format.Parse(
         """
     feature {
       name: 'categorical_x'
       type: BYTES
     }
     feature {
       name: 'string_y'
       type: BYTES
     }
     """, schema_pb2.Schema())
     sliced_tables = [
         ('slice1',
          make_table(
              pa.array([['a'], ['a'], ['b'], ['a']]),
              pa.array([['cat'], ['dog'], ['cat'], ['dog']]))),
         ('slice2',
          make_table(
              pa.array([['a'], ['a'], ['a']]),
              pa.array([['cat'], ['dog'], ['dog']]))),
         ('slice1',
          make_table(
              pa.array([['a'], ['a'], ['b'], ['a']]),
              pa.array([['cat'], ['dog'], ['cat'], ['dog']]))),
         ('slice2',
          make_table(
              pa.array([None, None, None, None], type=pa.null()),
              pa.array([['cat'], ['dog'], ['cat'], ['dog']]))),
     ]
     slice1_stats = text_format.Parse(
         """
         cross_features {
           path_x {
             step: "categorical_x"
           }
           path_y {
             step: "string_y"
           }
           categorical_cross_stats {
             lift_series {
               y_string: "cat"
               y_count: 4
               lift_values {
                 x_string: "b"
                 lift: 2.0
                 x_count: 2
                 x_and_y_count: 2
               }
               lift_values {
                 x_string: "a"
                 lift: 0.666666984558
                 x_count: 6
                 x_and_y_count: 2
               }
             }
             lift_series {
               y_string: "dog"
               y_count: 4
               lift_values {
                 x_string: "a"
                 lift: 1.33333301544
                 x_count: 6
                 x_and_y_count: 4
               }
               lift_values {
                 x_string: "b"
                 lift: 0.0
                 x_count: 2
                 x_and_y_count: 0
               }
             }
          }
         }""", statistics_pb2.DatasetFeatureStatistics())
     slice2_stats = text_format.Parse(
         """
         cross_features {
           path_x {
             step: "categorical_x"
           }
           path_y {
             step: "string_y"
           }
           categorical_cross_stats {
             lift_series {
               y_string: "cat"
               y_count: 3
               lift_values {
                 x_string: "a"
                 lift: 0.777778029441
                 x_count: 3
                 x_and_y_count: 1
               }
             }
             lift_series {
               y_string: "dog"
               y_count: 4
               lift_values {
                 x_string: "a"
                 lift: 1.16666698455
                 x_count: 3
                 x_and_y_count: 2
               }
             }
          }
         }""", statistics_pb2.DatasetFeatureStatistics())
     expected_by_slice = [
         ('slice1', slice1_stats),
         ('slice2', slice2_stats),
     ]
     generator = lift_stats_generator.LiftStatsGenerator(
         schema=schema, y_path=types.FeaturePath(['string_y']))
     self.assertSlicingAwareTransformOutputEqual(sliced_tables, generator,
                                                 expected_by_slice)
 def test_lift_with_no_schema_or_x_path(self):
   """Expects a ValueError when neither a schema nor x_paths is given."""
   expected_message = r'Either a schema or x_paths must be provided'
   label_path = types.FeaturePath(['int_y'])
   with self.assertRaisesRegex(ValueError, expected_message):
     lift_stats_generator.LiftStatsGenerator(schema=None, y_path=label_path)
 def test_lift_provided_x_no_schema(self):
   """Checks lift output when x_paths is given explicitly and schema is None.

   The table has two candidate x columns but only 'categorical_x1' is listed
   in x_paths; the expected proto holds a single cross feature for it.
   """
   x1_column = pa.array([['a'], ['a'], ['b'], ['a']])
   x2_column = pa.array([['x'], ['x'], ['y'], ['x']])
   y_column = pa.array([['cat'], ['dog'], ['cat'], ['dog']])
   input_tables = [
       pa.Table.from_arrays([x1_column, x2_column, y_column],
                            ['categorical_x1', 'categorical_x2', 'string_y']),
   ]
   expected_stats = text_format.Parse("""
           cross_features {
             path_x {
               step: "categorical_x1"
             }
             path_y {
               step: "string_y"
             }
             categorical_cross_stats {
               lift {
                 lift_series {
                   y_string: "cat"
                   y_count: 2
                   lift_values {
                     x_string: "b"
                     lift: 2.0
                     x_count: 1
                     x_and_y_count: 1
                   }
                   lift_values {
                     x_string: "a"
                     lift: 0.6666667
                     x_count: 3
                     x_and_y_count: 1
                   }
                 }
                 lift_series {
                   y_string: "dog"
                   y_count: 2
                   lift_values {
                     x_string: "a"
                     lift: 1.3333333
                     x_count: 3
                     x_and_y_count: 2
                   }
                   lift_values {
                     x_string: "b"
                     lift: 0.0
                     x_count: 1
                     x_and_y_count: 0
                   }
                 }
               }
             }
           }""", statistics_pb2.DatasetFeatureStatistics())
   generator = lift_stats_generator.LiftStatsGenerator(
       schema=None,
       y_path=types.FeaturePath(['string_y']),
       x_paths=[types.FeaturePath(['categorical_x1'])])
   self.assertSlicingAwareTransformOutputEqual(
       input_tables,
       generator,
       [expected_stats],
       add_default_slice_key_to_input=True,
       add_default_slice_key_to_output=True)
Example #15
0
  def test_stats_options_json_round_trip(self):
    """Round-trips a StatsOptions through to_json() and from_json().

    Plain-valued options are expected to come back equal to what was passed
    in, the schema is compared as a proto, and `generators` and
    `experimental_slice_functions` are expected to be None on the restored
    object. The per-feature weight override is read back through the
    `_per_feature_weight_override` attribute.
    """
    lift_generator = lift_stats_generator.LiftStatsGenerator(
        schema=None,
        y_path=types.FeaturePath(['label']),
        x_paths=[types.FeaturePath(['feature'])])
    schema = schema_pb2.Schema(feature=[schema_pb2.Feature(name='f')])
    value_slicer = slicing_util.get_feature_value_slicer({'b': None})
    weight_override = {types.FeaturePath(['a']): 'w'}

    # Each key is both the constructor keyword and the attribute that is read
    # back after the round trip.
    plain_options = {
        'feature_allowlist': ['a'],
        'vocab_paths': {'a': '/path/to/a'},
        'label_feature': 'label',
        'weight_feature': 'weight',
        'sample_rate': 0.01,
        'num_top_values': 21,
        'frequency_threshold': 2,
        'weighted_frequency_threshold': 2.0,
        'num_rank_histogram_buckets': 1001,
        'num_values_histogram_buckets': 11,
        'num_histogram_buckets': 11,
        'num_quantiles_histogram_buckets': 11,
        'epsilon': 0.02,
        'infer_type_from_schema': True,
        'desired_batch_size': 100,
        'enable_semantic_domain_stats': True,
        'semantic_domain_stats_sample_rate': 0.1,
        'add_default_generators': True,
        'experimental_use_sketch_based_topk_uniques': True,
    }

    original = stats_options.StatsOptions(
        generators=[lift_generator],
        schema=schema,
        experimental_slice_functions=[value_slicer],
        per_feature_weight_override=weight_override,
        **plain_options)
    restored = stats_options.StatsOptions.from_json(original.to_json())

    self.assertIsNone(restored.generators)
    self.assertIsNone(restored.experimental_slice_functions)
    compare.assertProtoEqual(self, schema, restored.schema)
    self.assertEqual(weight_override, restored._per_feature_weight_override)
    for option_name, expected_value in plain_options.items():
      self.assertEqual(expected_value, getattr(restored, option_name))