Exemple #1
0
    def __init__(
            self,  # pylint: disable=useless-super-delegation
            name='TopKUniquesCombinerStatsGenerator',
            schema=None,
            weight_feature=None,
            num_top_values=2,
            num_rank_histogram_buckets=1000):
        """Sets up a top-k and uniques combiner statistics generator.

        Args:
          name: Optional unique name for this statistics generator.
          schema: Optional schema for the dataset. When it is falsy, the set
            of categorical features is left empty.
          weight_feature: Name of the feature whose numeric value is the
            weight of an example, or None when there is no weight feature.
          num_top_values: How many of the most frequent feature values to
            keep for string features.
          num_rank_histogram_buckets: How many buckets the rank histogram has
            for string features.
        """
        super(TopKUniquesCombinerStatsGenerator, self).__init__(name, schema)

        # The schema helper is only consulted when a schema was supplied.
        if schema:
            categorical = schema_util.get_categorical_numeric_features(schema)
        else:
            categorical = []
        self._categorical_features = set(categorical)

        self._num_rank_histogram_buckets = num_rank_histogram_buckets
        self._num_top_values = num_top_values
        self._weight_feature = weight_feature
Exemple #2
0
    def __init__(self, schema, weight_feature, num_top_values,
                 frequency_threshold, weighted_frequency_threshold,
                 num_rank_histogram_buckets):
        """Stores the configuration for _ComputeTopKUniquesStats.

        Args:
          schema: A schema for the dataset, or None if no schema is
            available. When it is falsy, the set of categorical features is
            left empty.
          weight_feature: Name of the feature whose numeric value is the
            weight of an example, or None when there is no weight feature.
          num_top_values: How many of the most frequent feature values to
            keep for string features.
          frequency_threshold: Minimum number of examples the most frequent
            values must be present in.
          weighted_frequency_threshold: Minimum weighted number of examples
            the most frequent weighted values must be present in.
          num_rank_histogram_buckets: How many buckets the rank histogram has
            for string features.
        """
        # The schema helper is only consulted when a schema was supplied.
        if schema:
            categorical = schema_util.get_categorical_numeric_features(schema)
        else:
            categorical = []
        self._categorical_features = set(categorical)

        # Thresholds and sizes are recorded verbatim; no validation happens
        # here.
        self._frequency_threshold = frequency_threshold
        self._weighted_frequency_threshold = weighted_frequency_threshold
        self._num_top_values = num_top_values
        self._num_rank_histogram_buckets = num_rank_histogram_buckets
        self._weight_feature = weight_feature
  def __init__(self, schema: schema_pb2.Schema,
               example_weight_map: ExampleWeightMap, num_top_values: int,
               frequency_threshold: int, weighted_frequency_threshold: float,
               num_rank_histogram_buckets: int):
    """Stores the configuration for _ComputeTopKUniquesStats.

    Args:
      schema: A schema for the dataset, or None if no schema is available.
          When it is falsy, both derived feature sets are left empty.
      example_weight_map: An ExampleWeightMap that maps a FeaturePath to its
          corresponding weight column.
      num_top_values: How many of the most frequent feature values to keep
          for string features.
      frequency_threshold: Minimum number of examples the most frequent
          values must be present in.
      weighted_frequency_threshold: Minimum weighted number of examples the
          most frequent weighted values must be present in.
      num_rank_histogram_buckets: How many buckets the rank histogram has for
          string features.
    """
    # Both feature sets come from the schema and are stored as immutable
    # frozensets; without a schema they are empty.
    if schema:
      bytes_features = schema_util.get_bytes_features(schema)
      categorical = schema_util.get_categorical_numeric_features(schema)
    else:
      bytes_features = []
      categorical = []
    self._bytes_features = frozenset(bytes_features)
    self._categorical_features = frozenset(categorical)

    self._example_weight_map = example_weight_map
    self._frequency_threshold = frequency_threshold
    self._weighted_frequency_threshold = weighted_frequency_threshold
    self._num_top_values = num_top_values
    self._num_rank_histogram_buckets = num_rank_histogram_buckets
Exemple #4
0
 def test_get_categorical_numeric_features(self):
     """Checks the features reported as categorical numeric for a schema.

     The schema declares an INT feature with a categorical int_domain, a
     BYTES feature, a FLOAT feature and an INT feature with a bool_domain;
     the helper is expected to return ['fa', 'fc'].
     """
     # NOTE(review): two features are both named "fc" (a FLOAT one and an INT
     # one carrying a bool_domain). The expected 'fc' presumably refers to the
     # latter -- confirm the duplicate name is intentional.
     schema_text = """
     feature {
       name: "fa"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     feature {
       name: "fb"
       type: BYTES
     }
     feature {
       name: "fc"
       type: FLOAT
     }
     feature {
       name: "fc"
       type: INT
       bool_domain{
         name: "fc_bool_domain"
       }
     }
     """
     schema = text_format.Parse(schema_text, schema_pb2.Schema())
     expected_features = ['fa', 'fc']
     actual_features = schema_util.get_categorical_numeric_features(schema)
     self.assertEqual(actual_features, expected_features)
Exemple #5
0
    def __init__(
            self,  # pylint: disable=useless-super-delegation
            name='CommonStatsGenerator',
            schema=None,
            weight_feature=None,
            num_values_histogram_buckets=10,
            epsilon=0.01):
        """Sets up a common statistics generator.

        Args:
          name: Optional unique name for this statistics generator.
          schema: Optional schema for the dataset. When it is falsy, the set
            of categorical features is left empty.
          weight_feature: Optional name of the feature whose numeric value is
            the weight of an example.
          num_values_histogram_buckets: Optional number of buckets in the
            quantiles histogram for the number of values per feature, which
            is stored in CommonStatistics.num_values_histogram.
          epsilon: Optional error tolerance for the quantile computation,
            typically a small fraction close to zero (e.g. 0.01). Larger
            values make the quantiles more approximate, and hence the buckets
            more unequal, but could improve performance and resource
            consumption.
        """
        super(CommonStatsGenerator, self).__init__(name, schema)

        # The schema helper is only consulted when a schema was supplied.
        if schema:
            categorical = schema_util.get_categorical_numeric_features(schema)
        else:
            categorical = []
        self._categorical_features = set(categorical)

        self._weight_feature = weight_feature
        self._num_values_histogram_buckets = num_values_histogram_buckets

        # Quantiles combiner sized by the requested number of buckets.
        quantiles_combiner = quantiles_util.QuantilesCombiner(
            self._num_values_histogram_buckets, epsilon)
        self._quantiles_combiner = quantiles_combiner
    def __init__(
            self,  # pylint: disable=useless-super-delegation
            name='NumericStatsGenerator',
            schema=None,
            num_histogram_buckets=10,
            num_quantiles_histogram_buckets=10,
            epsilon=0.01):
        """Sets up a numeric statistics generator.

        Args:
          name: Optional unique name for this statistics generator.
          schema: Optional schema for the dataset. When it is falsy, the set
            of categorical features is left empty.
          num_histogram_buckets: Optional number of buckets in a standard
            NumericStatistics.histogram with equal-width buckets.
          num_quantiles_histogram_buckets: Optional number of buckets in a
            quantiles NumericStatistics.histogram.
          epsilon: Optional error tolerance for the quantile computation,
            typically a small fraction close to zero (e.g. 0.01). Larger
            values make the quantiles more approximate, and hence the buckets
            more unequal, but could improve performance and resource
            consumption.
        """
        # Only `name` is forwarded to the base initializer; the schema is
        # used locally below.
        # NOTE(review): confirm the base class is not meant to receive the
        # schema as well.
        super(NumericStatsGenerator, self).__init__(name)

        if schema:
            categorical = schema_util.get_categorical_numeric_features(schema)
        else:
            categorical = []
        self._categorical_features = set(categorical)

        self._num_histogram_buckets = num_histogram_buckets
        self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets

        # One combiner serves both histograms, so it is sized by the larger
        # of the quantiles bucket count and the scaled standard bucket count.
        scaled_std_buckets = (
            _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM *
            self._num_histogram_buckets)
        combiner_buckets = max(self._num_quantiles_histogram_buckets,
                               scaled_std_buckets)
        self._quantiles_combiner = quantiles_util.QuantilesCombiner(
            combiner_buckets, epsilon)
    def __init__(
            self,  # pylint: disable=useless-super-delegation
            name: Text = 'TopKUniquesCombinerStatsGenerator',
            schema: Optional[schema_pb2.Schema] = None,
            weight_feature: Optional[types.FeatureName] = None,
            num_top_values: int = 2,
            frequency_threshold: int = 1,
            weighted_frequency_threshold: float = 1.0,
            num_rank_histogram_buckets: int = 1000) -> None:
        """Sets up a top-k and uniques combiner statistics generator.

        Args:
          name: Optional unique name for this statistics generator.
          schema: Optional schema for the dataset. When it is falsy, the set
            of categorical features is left empty.
          weight_feature: Name of the feature whose numeric value is the
            weight of an example, or None when there is no weight feature.
          num_top_values: How many of the most frequent feature values to
            keep for string features.
          frequency_threshold: Optional minimum number of examples the most
            frequent values must be present in (defaults to 1).
          weighted_frequency_threshold: Optional minimum weighted number of
            examples the most frequent weighted values must be present in
            (defaults to 1.0).
          num_rank_histogram_buckets: How many buckets the rank histogram has
            for string features.
        """
        super(TopKUniquesCombinerStatsGenerator, self).__init__(name, schema)

        # The schema helper is only consulted when a schema was supplied.
        if schema:
            categorical = schema_util.get_categorical_numeric_features(schema)
        else:
            categorical = []
        self._categorical_features = set(categorical)

        self._weight_feature = weight_feature
        self._frequency_threshold = frequency_threshold
        self._weighted_frequency_threshold = weighted_frequency_threshold
        self._num_top_values = num_top_values
        self._num_rank_histogram_buckets = num_rank_histogram_buckets
  def __init__(self, schema):
    """Sets up the unique stats generator ptransform.

    Args:
      schema: A schema for the dataset, or None if no schema is available.
          When it is falsy, the set of categorical features is left empty.
    """
    # The schema helper is only consulted when a schema was supplied.
    if schema:
      categorical = schema_util.get_categorical_numeric_features(schema)
    else:
      categorical = []
    self._categorical_features = set(categorical)
    def __init__(self,
                 name: Text = "TopKUniquesSketchStatsGenerator",
                 schema: Optional[schema_pb2.Schema] = None,
                 example_weight_map: ExampleWeightMap = ExampleWeightMap(),
                 num_top_values: int = 2,
                 num_rank_histogram_buckets: int = 128,
                 frequency_threshold: int = 1,
                 weighted_frequency_threshold: float = 1.0,
                 num_misragries_buckets: int = 128,
                 num_kmv_buckets: int = 128,
                 store_output_in_custom_stats: bool = False):
        """Sets up a top-k and uniques sketch combiner statistics generator.

        Args:
          name: Optional unique name for this statistics generator.
          schema: Optional schema for the dataset. When it is falsy, both
            derived feature sets are left empty.
          example_weight_map: An ExampleWeightMap that maps a FeaturePath to
            its corresponding weight column.
          num_top_values: How many of the most frequent feature values to
            keep for string features.
          num_rank_histogram_buckets: How many buckets the rank histogram has
            for string features.
          frequency_threshold: Optional minimum number of examples the most
            frequent values must be present in (defaults to 1).
          weighted_frequency_threshold: Optional minimum weighted number of
            examples the most frequent weighted values must be present in
            (defaults to 1.0).
          num_misragries_buckets: Number of buckets to use for the MisraGries
            sketch.
          num_kmv_buckets: Number of buckets to use for the KMV sketch.
          store_output_in_custom_stats: Whether the output stats need to be
            stored in custom stats. If False, the output is stored in the
            `uniques` and `rank_histogram` fields.
        """
        # NOTE(review): the ExampleWeightMap() default is built once, when the
        # function is defined, and shared by every call that omits the
        # argument -- confirm ExampleWeightMap is never mutated.
        super(
            TopKUniquesSketchStatsGenerator,
            self,
        ).__init__(name, schema)

        # Sketch sizes and result-shaping options, recorded verbatim.
        self._num_misragries_buckets = num_misragries_buckets
        self._num_kmv_buckets = num_kmv_buckets
        self._num_top_values = num_top_values
        self._num_rank_histogram_buckets = num_rank_histogram_buckets
        self._example_weight_map = example_weight_map
        self._frequency_threshold = frequency_threshold
        self._weighted_frequency_threshold = weighted_frequency_threshold
        self._store_output_in_custom_stats = store_output_in_custom_stats

        # Feature sets come from the schema; without one they are empty.
        if schema:
            categorical = schema_util.get_categorical_numeric_features(schema)
            bytes_features = schema_util.get_bytes_features(schema)
        else:
            categorical = []
            bytes_features = []
        self._categorical_features = set(categorical)
        self._bytes_features = frozenset(bytes_features)

        # Beam gauge metrics, all registered under the shared namespace.
        namespace = constants.METRICS_NAMESPACE
        make_gauge = beam.metrics.Metrics.gauge
        self._num_top_values_gauge = make_gauge(namespace, "num_top_values")
        self._num_rank_histogram_buckets_gauge = make_gauge(
            namespace, "num_rank_histogram_buckets")
        self._num_mg_buckets_gauge = make_gauge(namespace, "num_mg_buckets")
        self._num_kmv_buckets_gauge = make_gauge(namespace, "num_kmv_buckets")
Exemple #10
0
    def __init__(
            self,  # pylint: disable=useless-super-delegation
            name: Text = 'BasicStatsGenerator',
            schema: Optional[schema_pb2.Schema] = None,
            weight_feature: Optional[types.FeatureName] = None,
            num_values_histogram_buckets: Optional[int] = 10,
            num_histogram_buckets: Optional[int] = 10,
            num_quantiles_histogram_buckets: Optional[int] = 10,
            epsilon: Optional[float] = 0.01) -> None:
        """Sets up the basic statistics generator.

        Args:
          name: Optional unique name for this statistics generator.
          schema: Optional schema for the dataset. When it is falsy, both
            derived feature sets are left empty.
          weight_feature: Optional name of the feature whose numeric value is
            the weight of an example.
          num_values_histogram_buckets: Optional number of buckets in the
            quantiles histogram for the number of values per feature, which
            is stored in CommonStatistics.num_values_histogram.
          num_histogram_buckets: Optional number of buckets in a standard
            NumericStatistics.histogram with equal-width buckets.
          num_quantiles_histogram_buckets: Optional number of buckets in a
            quantiles NumericStatistics.histogram.
          epsilon: Optional error tolerance for the quantile computation,
            typically a small fraction close to zero (e.g. 0.01). Larger
            values make the quantiles more approximate, and hence the buckets
            more unequal, but could improve performance and resource
            consumption.
        """
        super(BasicStatsGenerator, self).__init__(name, schema)

        # Feature sets come from the schema; without one they are empty.
        if schema:
            bytes_features = schema_util.get_bytes_features(schema)
            categorical = schema_util.get_categorical_numeric_features(schema)
        else:
            bytes_features = []
            categorical = []
        self._bytes_features = set(bytes_features)
        self._categorical_features = set(categorical)

        self._weight_feature = weight_feature
        self._num_values_histogram_buckets = num_values_histogram_buckets
        # Quantiles combiner for the histogram over the number of values.
        self._num_values_quantiles_combiner = quantiles_util.QuantilesCombiner(
            self._num_values_histogram_buckets, epsilon)

        self._num_histogram_buckets = num_histogram_buckets
        self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets
        # The value combiner is sized by the larger of the quantiles bucket
        # count and the scaled standard-histogram bucket count.
        scaled_std_buckets = (
            _NUM_QUANTILES_FACTOR_FOR_STD_HISTOGRAM *
            self._num_histogram_buckets)
        combiner_buckets = max(self._num_quantiles_histogram_buckets,
                               scaled_std_buckets)
        # The combiner size must be a whole multiple of the quantiles bucket
        # count.
        # NOTE(review): this check is an assert, so it is skipped when Python
        # runs with -O -- confirm that is acceptable.
        assert combiner_buckets % self._num_quantiles_histogram_buckets == 0
        # Quantiles combiner for the histogram over feature values.
        self._values_quantiles_combiner = quantiles_util.QuantilesCombiner(
            combiner_buckets, epsilon, has_weights=True)
    def __init__(
            self,  # pylint: disable=useless-super-delegation
            name='StringStatsGenerator',
            schema=None):
        """Sets up a string statistics generator.

        Args:
          name: Optional unique name for this statistics generator.
          schema: Optional schema for the dataset. When it is falsy, the set
            of categorical features is left empty.
        """
        super(StringStatsGenerator, self).__init__(name, schema)

        # The schema helper is only consulted when a schema was supplied.
        if schema:
            categorical = schema_util.get_categorical_numeric_features(schema)
        else:
            categorical = []
        self._categorical_features = set(categorical)
  def __init__(
      self,  # pylint: disable=useless-super-delegation
      name: Text = 'BasicStatsGenerator',
      schema: Optional[schema_pb2.Schema] = None,
      example_weight_map: ExampleWeightMap = ExampleWeightMap(),
      num_values_histogram_buckets: Optional[int] = 10,
      num_histogram_buckets: Optional[int] = 10,
      num_quantiles_histogram_buckets: Optional[int] = 10,
      epsilon: Optional[float] = 0.01) -> None:
    """Sets up the basic statistics generator.

    Args:
      name: Optional unique name for this statistics generator.
      schema: Optional schema for the dataset. When it is falsy, both derived
          feature sets are left empty.
      example_weight_map: An ExampleWeightMap that maps a FeaturePath to its
          corresponding weight column.
      num_values_histogram_buckets: Optional number of buckets in the
          quantiles histogram for the number of values per feature, which is
          stored in CommonStatistics.num_values_histogram.
      num_histogram_buckets: Optional number of buckets in a standard
          NumericStatistics.histogram with equal-width buckets.
      num_quantiles_histogram_buckets: Optional number of buckets in a
          quantiles NumericStatistics.histogram.
      epsilon: Optional error tolerance for the quantile computation,
          typically a small fraction close to zero (e.g. 0.01). Larger values
          make the quantiles more approximate, and hence the buckets more
          unequal, but could improve performance and resource consumption.
    """
    # NOTE(review): the ExampleWeightMap() default is built once, when the
    # function is defined, and shared by every call that omits the argument
    # -- confirm ExampleWeightMap is never mutated.
    super(BasicStatsGenerator, self).__init__(name, schema)

    # Feature sets come from the schema; without one they are empty.
    if schema:
      bytes_features = schema_util.get_bytes_features(schema)
      categorical = schema_util.get_categorical_numeric_features(schema)
    else:
      bytes_features = []
      categorical = []
    self._bytes_features = set(bytes_features)
    self._categorical_features = set(categorical)

    self._example_weight_map = example_weight_map
    self._num_histogram_buckets = num_histogram_buckets
    self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets
    self._num_values_histogram_buckets = num_values_histogram_buckets

    def _make_quantiles_sketch():
      # Each call builds a new sketch; `epsilon` is captured from this
      # initializer's arguments.
      return sketches.QuantilesSketch(
          eps=epsilon,
          max_num_elements=1 << 32,
          num_streams=1)

    # A factory is stored here, not a sketch instance.
    self._make_quantiles_sketch_fn = _make_quantiles_sketch