def __init__(
    self,  # pylint: disable=useless-super-delegation
    name='TopKUniquesCombinerStatsGenerator',
    schema=None,
    weight_feature=None,
    num_top_values=2,
    num_rank_histogram_buckets=1000):
  """Sets up a combiner generator for top-k and uniques statistics.

  Args:
    name: Optional unique name identifying this statistics generator.
    schema: Optional schema of the dataset. It is forwarded to the base class
      and, when given, queried for the categorical numeric features.
    weight_feature: Name of the feature whose numeric value is the weight of
      an example, or None when there is no weight feature.
    num_top_values: How many of the most frequent feature values are kept
      for string features.
    num_rank_histogram_buckets: How many buckets the rank histogram for
      string features has.
  """
  super(TopKUniquesCombinerStatsGenerator, self).__init__(name, schema)

  # An absent schema yields an empty set of categorical features.
  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = set(categorical)

  self._num_rank_histogram_buckets = num_rank_histogram_buckets
  self._num_top_values = num_top_values
  self._weight_feature = weight_feature
def __init__(self, schema, weight_feature, num_top_values,
             frequency_threshold, weighted_frequency_threshold,
             num_rank_histogram_buckets):
  """Stores the configuration of _ComputeTopKUniquesStats.

  Args:
    schema: A schema for the dataset, or None if no schema is available.
    weight_feature: Name of the feature whose numeric value is the weight of
      an example, or None when there is no weight feature.
    num_top_values: How many of the most frequent feature values are kept
      for string features.
    frequency_threshold: Minimum number of examples the most frequent values
      must be present in.
    weighted_frequency_threshold: Minimum weighted number of examples the
      most frequent weighted values must be present in.
    num_rank_histogram_buckets: How many buckets the rank histogram for
      string features has.
  """
  # An absent schema yields an empty set of categorical features.
  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = set(categorical)

  # Thresholds applied to the (weighted) top values.
  self._frequency_threshold = frequency_threshold
  self._weighted_frequency_threshold = weighted_frequency_threshold

  self._num_top_values = num_top_values
  self._num_rank_histogram_buckets = num_rank_histogram_buckets
  self._weight_feature = weight_feature
def __init__(self,
             schema: schema_pb2.Schema,
             example_weight_map: ExampleWeightMap,
             num_top_values: int,
             frequency_threshold: int,
             weighted_frequency_threshold: float,
             num_rank_histogram_buckets: int):
  """Stores the configuration of _ComputeTopKUniquesStats.

  Args:
    schema: A schema for the dataset, or None if no schema is available.
    example_weight_map: An ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    num_top_values: How many of the most frequent feature values are kept
      for string features.
    frequency_threshold: Minimum number of examples the most frequent values
      must be present in.
    weighted_frequency_threshold: Minimum weighted number of examples the
      most frequent weighted values must be present in.
    num_rank_histogram_buckets: How many buckets the rank histogram for
      string features has.
  """
  # Both feature sets are empty when no schema is supplied.
  if schema:
    bytes_features = schema_util.get_bytes_features(schema)
  else:
    bytes_features = []
  self._bytes_features = frozenset(bytes_features)

  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = frozenset(categorical)

  # Thresholds applied to the (weighted) top values.
  self._frequency_threshold = frequency_threshold
  self._weighted_frequency_threshold = weighted_frequency_threshold

  self._num_top_values = num_top_values
  self._num_rank_histogram_buckets = num_rank_histogram_buckets
  self._example_weight_map = example_weight_map
def test_get_categorical_numeric_features(self):
  """Checks the features reported by get_categorical_numeric_features."""
  # NOTE(review): the name "fc" is declared twice below (once as FLOAT, once
  # as INT with a bool_domain) -- confirm whether the second declaration was
  # meant to carry its own name.
  schema_text = (
      ' feature { name: "fa" type: INT int_domain { is_categorical: true } }'
      ' feature { name: "fb" type: BYTES }'
      ' feature { name: "fc" type: FLOAT }'
      ' feature { name: "fc" type: INT bool_domain{ name: "fc_bool_domain" } }'
      ' ')
  schema = text_format.Parse(schema_text, schema_pb2.Schema())

  actual = schema_util.get_categorical_numeric_features(schema)
  expected = ['fa', 'fc']
  self.assertEqual(actual, expected)
def __init__(
    self,  # pylint: disable=useless-super-delegation
    name='CommonStatsGenerator',
    schema=None,
    weight_feature=None,
    num_values_histogram_buckets=10,
    epsilon=0.01):
  """Sets up a common statistics generator.

  Args:
    name: Optional unique name identifying this statistics generator.
    schema: Optional schema of the dataset. It is forwarded to the base class
      and, when given, queried for the categorical numeric features.
    weight_feature: Optional name of the feature whose numeric value is the
      weight of an example.
    num_values_histogram_buckets: Optional number of buckets in the
      quantiles histogram for the number of values per Feature, which is
      stored in CommonStatistics.num_values_histogram.
    epsilon: Optional error tolerance for the computation of quantiles,
      typically a small fraction close to zero (e.g. 0.01). Larger values
      make the quantile approximation coarser (more unequal buckets) but
      could improve performance and resource consumption.
  """
  super(CommonStatsGenerator, self).__init__(name, schema)

  # An absent schema yields an empty set of categorical features.
  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = set(categorical)

  self._weight_feature = weight_feature
  self._num_values_histogram_buckets = num_values_histogram_buckets

  # Quantiles combiner sized to the requested number of histogram buckets.
  self._quantiles_combiner = quantiles_util.QuantilesCombiner(
      num_values_histogram_buckets, epsilon)
def __init__(
    self,  # pylint: disable=useless-super-delegation
    name='NumericStatsGenerator',
    schema=None,
    num_histogram_buckets=10,
    num_quantiles_histogram_buckets=10,
    epsilon=0.01):
  """Initializes a numeric statistics generator.

  Args:
    name: An optional unique name associated with the statistics generator.
    schema: An optional schema for the dataset. It is forwarded to the base
      class and, when given, queried for the categorical numeric features.
    num_histogram_buckets: An optional number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: An optional number of buckets in a
      quantiles NumericStatistics.histogram.
    epsilon: An optional error tolerance for the computation of quantiles,
      typically a small fraction close to zero (e.g. 0.01). Higher values of
      epsilon make the quantile approximation coarser, and hence result in
      more unequal buckets, but could improve performance and resource
      consumption.
  """
  # Forward the schema (not only the name) to the base initializer so the
  # base class records the schema this generator was constructed with.
  super(NumericStatsGenerator, self).__init__(name, schema)

  # An absent schema yields an empty set of categorical features.
  self._categorical_features = set(
      schema_util.get_categorical_numeric_features(schema) if schema else [])

  self._num_histogram_buckets = num_histogram_buckets
  self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets

  # A single quantiles combiner is created with the larger of the two bucket
  # requirements: the quantiles histogram size, or the standard histogram
  # size scaled by _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM.
  num_buckets = max(
      self._num_quantiles_histogram_buckets,
      _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * self._num_histogram_buckets)
  self._quantiles_combiner = quantiles_util.QuantilesCombiner(
      num_buckets, epsilon)
def __init__(
    self,  # pylint: disable=useless-super-delegation
    name: Text = 'TopKUniquesCombinerStatsGenerator',
    schema: Optional[schema_pb2.Schema] = None,
    weight_feature: Optional[types.FeatureName] = None,
    num_top_values: int = 2,
    frequency_threshold: int = 1,
    weighted_frequency_threshold: float = 1.0,
    num_rank_histogram_buckets: int = 1000) -> None:
  """Sets up a combiner generator for top-k and uniques statistics.

  Args:
    name: Optional unique name identifying this statistics generator.
    schema: Optional schema of the dataset. It is forwarded to the base class
      and, when given, queried for the categorical numeric features.
    weight_feature: Name of the feature whose numeric value is the weight of
      an example, or None when there is no weight feature.
    num_top_values: How many of the most frequent feature values are kept
      for string features.
    frequency_threshold: Optional minimum number of examples the most
      frequent values must be present in (defaults to 1).
    weighted_frequency_threshold: Optional minimum weighted number of
      examples the most frequent weighted values must be present in
      (defaults to 1.0).
    num_rank_histogram_buckets: How many buckets the rank histogram for
      string features has.
  """
  super(TopKUniquesCombinerStatsGenerator, self).__init__(name, schema)

  # An absent schema yields an empty set of categorical features.
  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = set(categorical)

  # Thresholds applied to the (weighted) top values.
  self._frequency_threshold = frequency_threshold
  self._weighted_frequency_threshold = weighted_frequency_threshold

  self._num_top_values = num_top_values
  self._num_rank_histogram_buckets = num_rank_histogram_buckets
  self._weight_feature = weight_feature
def __init__(self, schema):
  """Sets up the unique stats generator ptransform.

  Args:
    schema: A schema for the dataset, or None if no schema is available.
  """
  # An absent schema yields an empty set of categorical features.
  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = set(categorical)
def __init__(self,
             name: Text = "TopKUniquesSketchStatsGenerator",
             schema: Optional[schema_pb2.Schema] = None,
             example_weight_map: ExampleWeightMap = ExampleWeightMap(),
             num_top_values: int = 2,
             num_rank_histogram_buckets: int = 128,
             frequency_threshold: int = 1,
             weighted_frequency_threshold: float = 1.0,
             num_misragries_buckets: int = 128,
             num_kmv_buckets: int = 128,
             store_output_in_custom_stats: bool = False):
  """Sets up a sketch-based combiner generator for top-k and uniques.

  Args:
    name: Optional unique name identifying this statistics generator.
    schema: Optional schema of the dataset. It is forwarded to the base class
      and, when given, queried for the categorical numeric and bytes
      features.
    example_weight_map: An ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    num_top_values: How many of the most frequent feature values are kept
      for string features.
    num_rank_histogram_buckets: How many buckets the rank histogram for
      string features has.
    frequency_threshold: Optional minimum number of examples the most
      frequent values must be present in (defaults to 1).
    weighted_frequency_threshold: Optional minimum weighted number of
      examples the most frequent weighted values must be present in
      (defaults to 1.0).
    num_misragries_buckets: Number of buckets to use for the MisraGries
      sketch.
    num_kmv_buckets: Number of buckets to use for the KMV sketch.
    store_output_in_custom_stats: Whether the output stats need to be stored
      in custom stats. If False, the output is stored in the `uniques` and
      `rank_histogram` fields.
  """
  super(
      TopKUniquesSketchStatsGenerator,
      self,
  ).__init__(name, schema)

  # Sketch sizes.
  self._num_misragries_buckets = num_misragries_buckets
  self._num_kmv_buckets = num_kmv_buckets

  # Output shaping and filtering.
  self._num_top_values = num_top_values
  self._num_rank_histogram_buckets = num_rank_histogram_buckets
  self._frequency_threshold = frequency_threshold
  self._weighted_frequency_threshold = weighted_frequency_threshold
  self._store_output_in_custom_stats = store_output_in_custom_stats

  # NOTE(review): the default ExampleWeightMap() is evaluated once, when the
  # function is defined, and shared by every caller that omits the argument
  # -- harmless only if it is never mutated; confirm.
  self._example_weight_map = example_weight_map

  # Both feature sets are empty when no schema is supplied.
  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = set(categorical)

  if schema:
    bytes_features = schema_util.get_bytes_features(schema)
  else:
    bytes_features = []
  self._bytes_features = frozenset(bytes_features)

  # Beam gauges, one per configuration value named in the metric string.
  make_gauge = beam.metrics.Metrics.gauge
  namespace = constants.METRICS_NAMESPACE
  self._num_top_values_gauge = make_gauge(namespace, "num_top_values")
  self._num_rank_histogram_buckets_gauge = make_gauge(
      namespace, "num_rank_histogram_buckets")
  self._num_mg_buckets_gauge = make_gauge(namespace, "num_mg_buckets")
  self._num_kmv_buckets_gauge = make_gauge(namespace, "num_kmv_buckets")
def __init__(
    self,  # pylint: disable=useless-super-delegation
    name: Text = 'BasicStatsGenerator',
    schema: Optional[schema_pb2.Schema] = None,
    weight_feature: Optional[types.FeatureName] = None,
    num_values_histogram_buckets: Optional[int] = 10,
    num_histogram_buckets: Optional[int] = 10,
    num_quantiles_histogram_buckets: Optional[int] = 10,
    epsilon: Optional[float] = 0.01) -> None:
  """Sets up the basic statistics generator.

  Args:
    name: Optional unique name identifying this statistics generator.
    schema: Optional schema of the dataset. It is forwarded to the base class
      and, when given, queried for the bytes and categorical numeric
      features.
    weight_feature: Optional name of the feature whose numeric value is the
      weight of an example.
    num_values_histogram_buckets: Optional number of buckets in the
      quantiles histogram for the number of values per Feature, which is
      stored in CommonStatistics.num_values_histogram.
    num_histogram_buckets: Optional number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: Optional number of buckets in a
      quantiles NumericStatistics.histogram.
    epsilon: Optional error tolerance for the computation of quantiles,
      typically a small fraction close to zero (e.g. 0.01). Larger values
      make the quantile approximation coarser (more unequal buckets) but
      could improve performance and resource consumption.
  """
  super(BasicStatsGenerator, self).__init__(name, schema)

  # Both feature sets are empty when no schema is supplied.
  if schema:
    bytes_features = schema_util.get_bytes_features(schema)
  else:
    bytes_features = []
  self._bytes_features = set(bytes_features)

  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = set(categorical)

  self._weight_feature = weight_feature

  # Quantiles combiner for the histogram over the number of values.
  self._num_values_histogram_buckets = num_values_histogram_buckets
  self._num_values_quantiles_combiner = quantiles_util.QuantilesCombiner(
      num_values_histogram_buckets, epsilon)

  self._num_histogram_buckets = num_histogram_buckets
  self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets

  # The value combiner gets the larger of the two bucket requirements: the
  # quantiles histogram size, or the standard histogram size scaled by
  # _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM.
  scaled_std_buckets = (
      _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM * num_histogram_buckets)
  num_buckets = max(num_quantiles_histogram_buckets, scaled_std_buckets)
  # NOTE(review): presumably needed so the computed quantiles can be grouped
  # evenly into the quantiles histogram buckets -- confirm.
  assert num_buckets % num_quantiles_histogram_buckets == 0

  # Quantiles combiner for the histogram over feature values.
  self._values_quantiles_combiner = quantiles_util.QuantilesCombiner(
      num_buckets, epsilon, has_weights=True)
def __init__(
    self,  # pylint: disable=useless-super-delegation
    name='StringStatsGenerator',
    schema=None):
  """Sets up a string statistics generator.

  Args:
    name: Optional unique name identifying this statistics generator.
    schema: Optional schema of the dataset. It is forwarded to the base class
      and, when given, queried for the categorical numeric features.
  """
  super(StringStatsGenerator, self).__init__(name, schema)

  # An absent schema yields an empty set of categorical features.
  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = set(categorical)
def __init__(
    self,  # pylint: disable=useless-super-delegation
    name: Text = 'BasicStatsGenerator',
    schema: Optional[schema_pb2.Schema] = None,
    example_weight_map: ExampleWeightMap = ExampleWeightMap(),
    num_values_histogram_buckets: Optional[int] = 10,
    num_histogram_buckets: Optional[int] = 10,
    num_quantiles_histogram_buckets: Optional[int] = 10,
    epsilon: Optional[float] = 0.01) -> None:
  """Sets up the basic statistics generator.

  Args:
    name: Optional unique name identifying this statistics generator.
    schema: Optional schema of the dataset. It is forwarded to the base class
      and, when given, queried for the bytes and categorical numeric
      features.
    example_weight_map: An ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    num_values_histogram_buckets: Optional number of buckets in the
      quantiles histogram for the number of values per Feature, which is
      stored in CommonStatistics.num_values_histogram.
    num_histogram_buckets: Optional number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: Optional number of buckets in a
      quantiles NumericStatistics.histogram.
    epsilon: Optional error tolerance for the computation of quantiles,
      typically a small fraction close to zero (e.g. 0.01). Larger values
      make the quantile approximation coarser (more unequal buckets) but
      could improve performance and resource consumption.
  """
  super(BasicStatsGenerator, self).__init__(name, schema)

  # Both feature sets are empty when no schema is supplied.
  if schema:
    bytes_features = schema_util.get_bytes_features(schema)
  else:
    bytes_features = []
  self._bytes_features = set(bytes_features)

  if schema:
    categorical = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical = []
  self._categorical_features = set(categorical)

  # NOTE(review): the default ExampleWeightMap() is evaluated once, when the
  # function is defined, and shared by every caller that omits the argument
  # -- harmless only if it is never mutated; confirm.
  self._example_weight_map = example_weight_map

  # Histogram sizes.
  self._num_values_histogram_buckets = num_values_histogram_buckets
  self._num_histogram_buckets = num_histogram_buckets
  self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets

  # Zero-argument factory: each call builds a new QuantilesSketch that uses
  # the epsilon captured from this constructor call.
  self._make_quantiles_sketch_fn = lambda: sketches.QuantilesSketch(  # pylint: disable=g-long-lambda
      eps=epsilon,
      max_num_elements=1 << 32,
      num_streams=1)