Beispiel #1
0
    def __init__(self, example_weight_map: ExampleWeightMap, **kwargs):
        """Builds the unweighted lift generator and, if needed, a weighted one.

        Args:
          example_weight_map: an ExampleWeightMap that maps a FeaturePath to
            its corresponding weight column.
          **kwargs: arguments forwarded unchanged to each _LiftStatsGenerator
            constructed here.
        """
        # The unweighted generator is always created, with an empty weight map.
        unweighted_map = ExampleWeightMap()
        self._unweighted_generator = _LiftStatsGenerator(
            example_weight_map=unweighted_map, **kwargs)

        weight_features = example_weight_map.all_weight_features()
        self._has_any_weight = bool(weight_features)
        if not self._has_any_weight:
            # No weight features: `_weighted_generator` is not set at all.
            return

        self._weighted_generator = _LiftStatsGenerator(
            example_weight_map=example_weight_map, **kwargs)
Beispiel #2
0
    def __init__(
            self,  # pylint: disable=useless-super-delegation
            name: Text = 'TopKUniquesCombinerStatsGenerator',
            schema: Optional[schema_pb2.Schema] = None,
            example_weight_map: ExampleWeightMap = ExampleWeightMap(),
            num_top_values: int = 2,
            frequency_threshold: int = 1,
            weighted_frequency_threshold: float = 1.0,
            num_rank_histogram_buckets: int = 1000) -> None:
        """Sets up a combiner that computes top-k and uniques statistics.

        Args:
          name: An optional unique name associated with the statistics
            generator.
          schema: An optional schema for the dataset. When given, it is used
            to look up the categorical numeric features.
          example_weight_map: an ExampleWeightMap that maps a FeaturePath to
            its corresponding weight column.
          num_top_values: The number of most frequent feature values to keep
            for string features.
          frequency_threshold: An optional minimum number of examples the most
            frequent values must be present in (defaults to 1).
          weighted_frequency_threshold: An optional minimum weighted number of
            examples the most frequent weighted values must be present in
            (defaults to 1.0).
          num_rank_histogram_buckets: The number of buckets in the rank
            histogram for string features.
        """
        super(TopKUniquesCombinerStatsGenerator, self).__init__(name, schema)

        # Without a schema, no feature is treated as categorical.
        if schema:
            categorical_paths = schema_util.get_categorical_numeric_features(
                schema)
        else:
            categorical_paths = []
        self._categorical_features = set(categorical_paths)

        self._example_weight_map = example_weight_map

        # Top-k / rank-histogram configuration, stored verbatim.
        self._num_top_values = num_top_values
        self._num_rank_histogram_buckets = num_rank_histogram_buckets
        self._frequency_threshold = frequency_threshold
        self._weighted_frequency_threshold = weighted_frequency_threshold
Beispiel #3
0
 def __init__(self,
              y_path: types.FeaturePath,
              schema: Optional[schema_pb2.Schema] = None,
              x_paths: Optional[Iterable[types.FeaturePath]] = None,
              y_boundaries: Optional[Sequence[float]] = None,
              min_x_count: int = 0,
              top_k_per_y: Optional[int] = None,
              bottom_k_per_y: Optional[int] = None,
              example_weight_map: ExampleWeightMap = ExampleWeightMap(),
              output_custom_stats: Optional[bool] = False,
              name: Text = 'LiftStatsGenerator') -> None:
     """Initializes a lift statistics generator.

     All arguments are forwarded as keyword arguments to
     _UnweightedAndWeightedLiftStatsGenerator, which becomes this generator's
     ptransform. `name` and `schema` are additionally passed to the parent
     initializer.

     Args:
       y_path: The path forwarded as `y_path` to the ptransform.
       schema: An optional schema for the dataset.
       x_paths: An optional iterable of paths forwarded as `x_paths`.
       y_boundaries: An optional sequence of floats forwarded as
         `y_boundaries`.
       min_x_count: Forwarded as `min_x_count` (defaults to 0).
       top_k_per_y: Forwarded as `top_k_per_y` (defaults to None).
       bottom_k_per_y: Forwarded as `bottom_k_per_y` (defaults to None).
       example_weight_map: an ExampleWeightMap forwarded as
         `example_weight_map`.
       output_custom_stats: Forwarded as `output_custom_stats` (defaults to
         False).
       name: An optional unique name associated with the statistics generator.
     """
     lift_ptransform = _UnweightedAndWeightedLiftStatsGenerator(
         example_weight_map=example_weight_map,
         schema=schema,
         y_path=y_path,
         x_paths=x_paths,
         y_boundaries=y_boundaries,
         min_x_count=min_x_count,
         top_k_per_y=top_k_per_y,
         bottom_k_per_y=bottom_k_per_y,
         output_custom_stats=output_custom_stats,
         name=name)
     super(LiftStatsGenerator, self).__init__(
         name, ptransform=lift_ptransform, schema=schema)
  def __init__(self,
               name: Text = 'TopKUniquesStatsGenerator',
               schema: Optional[schema_pb2.Schema] = None,
               example_weight_map: ExampleWeightMap = ExampleWeightMap(),
               num_top_values: int = 2,
               frequency_threshold: int = 1,
               weighted_frequency_threshold: float = 1.0,
               num_rank_histogram_buckets: int = 1000) -> None:
    """Initializes the top-k and uniques statistics generator.

    Every argument other than `name` is forwarded to _ComputeTopKUniquesStats,
    which is installed as this generator's ptransform.

    Args:
      name: An optional unique name associated with the statistics generator.
      schema: An optional schema for the dataset.
      example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
          corresponding weight column.
      num_top_values: An optional number of most frequent feature values to
          keep for string features (defaults to 2).
      frequency_threshold: An optional minimum number of examples the most
          frequent values must be present in (defaults to 1).
      weighted_frequency_threshold: An optional minimum weighted number of
          examples the most frequent weighted values must be present in
          (defaults to 1.0).
      num_rank_histogram_buckets: An optional number of buckets in the rank
          histogram for string features (defaults to 1000).
    """
    topk_uniques_ptransform = _ComputeTopKUniquesStats(
        schema=schema,
        example_weight_map=example_weight_map,
        num_top_values=num_top_values,
        frequency_threshold=frequency_threshold,
        weighted_frequency_threshold=weighted_frequency_threshold,
        num_rank_histogram_buckets=num_rank_histogram_buckets)
    super(TopKUniquesStatsGenerator, self).__init__(
        name, schema=schema, ptransform=topk_uniques_ptransform)
    def __init__(self,
                 name: Text = "TopKUniquesSketchStatsGenerator",
                 schema: Optional[schema_pb2.Schema] = None,
                 example_weight_map: ExampleWeightMap = ExampleWeightMap(),
                 num_top_values: int = 2,
                 num_rank_histogram_buckets: int = 128,
                 frequency_threshold: int = 1,
                 weighted_frequency_threshold: float = 1.0,
                 num_misragries_buckets: int = 128,
                 num_kmv_buckets: int = 128,
                 store_output_in_custom_stats: bool = False):
        """Sets up a sketch-based top-k and uniques combiner stats generator.

        Args:
          name: An optional unique name associated with the statistics
            generator.
          schema: An optional schema for the dataset. When given, it is used
            to look up the categorical numeric features and the bytes
            features.
          example_weight_map: an ExampleWeightMap that maps a FeaturePath to
            its corresponding weight column.
          num_top_values: The number of most frequent feature values to keep
            for string features.
          num_rank_histogram_buckets: The number of buckets in the rank
            histogram for string features.
          frequency_threshold: An optional minimum number of examples the most
            frequent values must be present in (defaults to 1).
          weighted_frequency_threshold: An optional minimum weighted number of
            examples the most frequent weighted values must be present in
            (defaults to 1.0).
          num_misragries_buckets: Number of buckets to use for MisraGries
            sketch.
          num_kmv_buckets: Number of buckets to use for KMV sketch.
          store_output_in_custom_stats: Boolean to indicate if the output
            stats need to be stored in custom stats. If False, the output is
            stored in `uniques` and `rank_histogram` fields.
        """
        super(TopKUniquesSketchStatsGenerator, self).__init__(name, schema)

        # Schema-derived feature sets; both stay empty without a schema.
        if schema:
            categorical_paths = schema_util.get_categorical_numeric_features(
                schema)
            bytes_paths = schema_util.get_bytes_features(schema)
        else:
            categorical_paths, bytes_paths = [], []
        self._categorical_features = set(categorical_paths)
        self._bytes_features = frozenset(bytes_paths)

        # Sketch sizing.
        self._num_misragries_buckets = num_misragries_buckets
        self._num_kmv_buckets = num_kmv_buckets

        # Output shaping and filtering.
        self._num_top_values = num_top_values
        self._num_rank_histogram_buckets = num_rank_histogram_buckets
        self._frequency_threshold = frequency_threshold
        self._weighted_frequency_threshold = weighted_frequency_threshold
        self._store_output_in_custom_stats = store_output_in_custom_stats
        self._example_weight_map = example_weight_map

        # Beam gauges, all registered under the shared metrics namespace.
        new_gauge = beam.metrics.Metrics.gauge
        namespace = constants.METRICS_NAMESPACE
        self._num_top_values_gauge = new_gauge(namespace, "num_top_values")
        self._num_rank_histogram_buckets_gauge = new_gauge(
            namespace, "num_rank_histogram_buckets")
        self._num_mg_buckets_gauge = new_gauge(namespace, "num_mg_buckets")
        self._num_kmv_buckets_gauge = new_gauge(namespace, "num_kmv_buckets")
Beispiel #6
0
def _to_partial_x_counts(
    sliced_record_batch: types.SlicedRecordBatch,
    x_paths: Iterable[types.FeaturePath], example_weight_map: ExampleWeightMap
) -> Iterator[Tuple[_SlicedXKey, _CountType]]:
    """Yields (_SlicedXKey(slice, x_path steps, x), count) for each x_path.

    For every path in `x_paths`, the partial counts come from
    `_to_partial_counts`, called without boundaries and with the weight column
    that `example_weight_map` associates with that path.
    """
    for x_path in x_paths:
        weight_column = example_weight_map.get(x_path)
        partial_counts = _to_partial_counts(
            sliced_record_batch,
            x_path,
            boundaries=None,
            weight_column_name=weight_column)
        for key, x_count in partial_counts:
            slice_key, x = key
            sliced_x_key = _SlicedXKey(slice_key, x_path.steps(), x)
            yield sliced_x_key, x_count
 def testEnumerateArraysStringWeight(self):
     """enumerate_arrays must reject a weight column holding strings."""
     # The arrow type of a string changes between py2 and py3 so we accept either
     expected_error = (
         r'Weight column "w" must be of numeric type. Found (string|binary).*')
     with self.assertRaisesRegex(ValueError, expected_error):
         # Fixtures are built inside the context, as in the original layout.
         value_column = pa.array([[1], [2, 3]])
         string_weight_column = pa.array([["a"], ["b"]])
         record_batch = pa.RecordBatch.from_arrays(
             [value_column, string_weight_column], ["v", "w"])
         weight_map = ExampleWeightMap(
             weight_feature="w", per_feature_override=None)
         enumerated = arrow_util.enumerate_arrays(
             record_batch,
             example_weight_map=weight_map,
             enumerate_leaves_only=True)
         # Drain the iterable so the error surfaces even if it is raised lazily.
         for _ in enumerated:
             pass
  def __init__(
      self,  # pylint: disable=useless-super-delegation
      name: Text = 'BasicStatsGenerator',
      schema: Optional[schema_pb2.Schema] = None,
      example_weight_map: ExampleWeightMap = ExampleWeightMap(),
      num_values_histogram_buckets: Optional[int] = 10,
      num_histogram_buckets: Optional[int] = 10,
      num_quantiles_histogram_buckets: Optional[int] = 10,
      epsilon: Optional[float] = 0.01) -> None:
    """Sets up the basic statistics generator.

    Args:
      name: An optional unique name associated with the statistics generator.
      schema: An optional schema for the dataset. When given, it is used to
          look up the bytes features and the categorical numeric features.
      example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
          corresponding weight column.
      num_values_histogram_buckets: An optional number of buckets in a
          quantiles histogram for the number of values per Feature, which is
          stored in CommonStatistics.num_values_histogram.
      num_histogram_buckets: An optional number of buckets in a standard
          NumericStatistics.histogram with equal-width buckets.
      num_quantiles_histogram_buckets: An optional number of buckets in a
          quantiles NumericStatistics.histogram.
      epsilon: An optional error tolerance for the computation of quantiles,
          typically a small fraction close to zero (e.g. 0.01). Larger values
          make the quantile approximation coarser, and hence result in more
          unequal buckets, but could improve performance and resource
          consumption.
    """
    super(BasicStatsGenerator, self).__init__(name, schema)

    # Schema-derived feature sets; both stay empty without a schema.
    if schema:
      bytes_paths = schema_util.get_bytes_features(schema)
      categorical_paths = schema_util.get_categorical_numeric_features(schema)
    else:
      bytes_paths, categorical_paths = [], []
    self._bytes_features = set(bytes_paths)
    self._categorical_features = set(categorical_paths)

    self._example_weight_map = example_weight_map

    # Histogram bucket counts, stored verbatim.
    self._num_values_histogram_buckets = num_values_histogram_buckets
    self._num_histogram_buckets = num_histogram_buckets
    self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets

    # Factory for quantiles sketches; `epsilon` is captured by the closure and
    # each call constructs a new sketch.
    self._make_quantiles_sketch_fn = lambda: sketches.QuantilesSketch(  # pylint: disable=g-long-lambda
        eps=epsilon, max_num_elements=1 << 32, num_streams=1)
Beispiel #9
0
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    example_weight_map: ExampleWeightMap,
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
        int, Tuple[int, Union[int, float]]]]]:
    """Yields the keyed counts from which top-k and uniques are computed.

    Only leaf arrays are visited. A feature contributes when it is a STRING
    feature, or an INT feature listed in `categorical_features`; features in
    `bytes_features` are always skipped.

    Args:
      sliced_record_batch: a (slice_key, record_batch) tuple.
      bytes_features: feature paths to skip.
      categorical_features: INT feature paths to treat like string features.
      example_weight_map: an ExampleWeightMap handed to
        arrow_util.enumerate_arrays to obtain each feature's weights.

    Yields:
      ((slice_key, feature_path_steps, value), count) when the weight map has
      no weight features at all. Otherwise the payload is a (count, weight)
      pair; on the unweighted path the weight component is the constant 1.
    """
    slice_key, record_batch = sliced_record_batch
    emit_weight_pairs = bool(example_weight_map.all_weight_features())

    leaf_arrays = arrow_util.enumerate_arrays(
        record_batch,
        example_weight_map=example_weight_map,
        enumerate_leaves_only=True)
    for feature_path, feature_array, weights in leaf_arrays:
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        if feature_path in bytes_features:
            continue

        is_categorical_int = (
            feature_type == statistics_pb2.FeatureNameStatistics.INT
            and feature_path in categorical_features)
        if not (is_categorical_int or feature_type
                == statistics_pb2.FeatureNameStatistics.STRING):
            continue

        is_weighted = weights is not None
        flattened_values, parent_indices = arrow_util.flatten_nested(
            feature_array, is_weighted)

        if is_weighted and flattened_values:
            # Slow path: weighted uniques.
            values_np = np.asarray(flattened_values)
            per_value_weights = weights[parent_indices]
            for value, count, weight in _weighted_unique(
                    values_np, per_value_weights):
                key = (slice_key, feature_path.steps(), value)
                yield key, (count, weight)
            continue

        # Fast path: let Arrow count the distinct values.
        value_counts = flattened_values.value_counts()
        distinct_values = value_counts.field('values').to_pylist()
        occurrences = value_counts.field('counts').to_pylist()
        for value, count in zip(distinct_values, occurrences):
            key = (slice_key, feature_path.steps(), value)
            if emit_weight_pairs:
                yield key, (count, 1)
            else:
                yield key, count
    def test_topk_struct_leaves(self):
        """Checks weighted and unweighted top-k/uniques on struct leaf features.

        Column 'c' is a list of structs with leaves 'f1' (strings) and 'f2'
        (ints, declared categorical in the schema); column 'w' is the weight.
        """
        # Example weights are 1.0 and 2.0 (first batch) and 3.0 (second batch).
        # c.f1 counts:   a=1, b=2, c=1, d=2   weighted: a=1, b=2, c=2, d=5
        # c.f2 counts:   1=1, 2=2, 3=2, 4=1   weighted: 1=1, 2=3, 3=4, 4=3
        batches = [
            pa.RecordBatch.from_arrays([
                pa.array([[1.0], [2.0]]),
                pa.array([[{
                    'f1': ['a', 'b'],
                    'f2': [1, 2]
                }, {
                    'f1': ['b'],
                }], [{
                    'f1': ['c', 'd'],
                    'f2': [2, 3]
                }, {
                    'f2': [3]
                }]]),
            ], ['w', 'c']),
            pa.RecordBatch.from_arrays([
                pa.array([[3.0]]),
                pa.array([[{
                    'f1': ['d'],
                    'f2': [4]
                }]]),
            ], ['w', 'c']),
        ]
        # Only c.f2 is declared in the schema, as a categorical INT.
        schema = text_format.Parse(
            """
        feature {
          name: "c"
          type: STRUCT
          struct_domain {
            feature {
              name: "f2"
              type: INT
              int_domain {
                is_categorical: true
              }
            }
          }
        }
        """, schema_pb2.Schema())
        # Each feature has 4 distinct values, but only the top 3 are expected
        # in top_values and in the rank histogram.
        expected_result = {
            types.FeaturePath(['c', 'f1']):
            text_format.Parse(
                """
              string_stats {
                unique: 4
                top_values {
                  value: "d"
                  frequency: 2.0
                }
                top_values {
                  value: "b"
                  frequency: 2.0
                }
                top_values {
                  value: "c"
                  frequency: 1.0
                }
                rank_histogram {
                  buckets {
                    label: "d"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "b"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "c"
                    sample_count: 1.0
                  }
                }
                weighted_string_stats {
                  top_values {
                    value: "d"
                    frequency: 5.0
                  }
                  top_values {
                    value: "c"
                    frequency: 2.0
                  }
                  top_values {
                    value: "b"
                    frequency: 2.0
                  }
                  rank_histogram {
                    buckets {
                      label: "d"
                      sample_count: 5.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "c"
                      sample_count: 2.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "b"
                      sample_count: 2.0
                    }
                  }
                }
              }
              path {
                step: "c"
                step: "f1"
              }""", statistics_pb2.FeatureNameStatistics()),
            types.FeaturePath(['c', 'f2']):
            text_format.Parse(
                """
              string_stats {
                unique: 4
                top_values {
                  value: "3"
                  frequency: 2.0
                }
                top_values {
                  value: "2"
                  frequency: 2.0
                }
                top_values {
                  value: "4"
                  frequency: 1.0
                }
                rank_histogram {
                  buckets {
                    label: "3"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "2"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "4"
                    sample_count: 1.0
                  }
                }
                weighted_string_stats {
                  top_values {
                    value: "3"
                    frequency: 4.0
                  }
                  top_values {
                    value: "4"
                    frequency: 3.0
                  }
                  top_values {
                    value: "2"
                    frequency: 3.0
                  }
                  rank_histogram {
                    buckets {
                      label: "3"
                      sample_count: 4.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "4"
                      sample_count: 3.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "2"
                      sample_count: 3.0
                    }
                  }
                }
              }
              path {
                step: "c"
                step: "f2"
              }""", statistics_pb2.FeatureNameStatistics()),
        }
        generator = sketch_generator.TopKUniquesSketchStatsGenerator(
            schema=schema,
            example_weight_map=ExampleWeightMap(weight_feature='w'),
            num_top_values=3,
            num_rank_histogram_buckets=3)

        self.assertCombinerOutputEqual(batches, generator, expected_result)
Beispiel #11
0
def _to_partial_copresence_counts(
    sliced_record_batch: types.SlicedRecordBatch,
    y_path: types.FeaturePath,
    x_paths: Iterable[types.FeaturePath],
    y_boundaries: Optional[np.ndarray],
    example_weight_map: ExampleWeightMap,
    num_xy_pairs_batch_copresent: Optional[
        beam.metrics.metric.Metrics.DelegatingDistribution] = None
) -> Iterator[Tuple[_SlicedXYKey, _CountType]]:
    """Yields per-(slice, path_x, x, y) counts of examples with x and y.

    This method generates the number of times a given pair of y- and x-values
    appear in the same record, for a slice_key and x_path. Records in which
    either x or y is absent will be skipped.

    Args:
      sliced_record_batch: A tuple of (slice_key, record_batch) representing a
        slice of examples.
      y_path: The path to use as Y in the lift expression: lift = P(Y=y|X=x) /
        P(Y=y).
      x_paths: A set of x_paths for which to compute lift.
      y_boundaries: Optionally, a set of bin boundaries to use for binning
        y_path values.
      example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
        corresponding weight column.
      num_xy_pairs_batch_copresent: An optional distribution metric that is
        updated, once per x_path, with the number of different xy pairs that
        are copresent within this batch. If the same pair of xy values is
        copresent in more than one batch, it is counted once for each batch
        in which it is copresent.

    Yields:
      Tuples of the form (_SlicedXYKey(slice_key, x_path, x, y), count) for
      each combination of x_path, x, and y in the input record batch.
    """
    slice_key, record_batch = sliced_record_batch
    # Y presence is always looked up without a weight column; weights are
    # taken from the x side only.
    y_presence = _get_example_value_presence(record_batch,
                                             y_path,
                                             y_boundaries,
                                             weight_column_name=None)
    if y_presence is None:
        return

    # Group the y values by the index of the example they occur in.
    ys_by_example = collections.defaultdict(list)
    for example_index, y in zip(y_presence.example_indices, y_presence.values):
        ys_by_example[example_index].append(y)

    for x_path in x_paths:
        weight_column_name = example_weight_map.get(x_path)
        x_presence = _get_example_value_presence(
            record_batch,
            x_path,
            boundaries=None,
            weight_column_name=weight_column_name)
        if x_presence is None:
            continue

        # Weighted counts accumulate as floats, unweighted ones as ints.
        if weight_column_name is not None:
            copresence_counts = collections.defaultdict(float)
        else:
            copresence_counts = collections.defaultdict(int)

        for example_index, x, weight in zip(x_presence.example_indices,
                                            x_presence.values,
                                            x_presence.weights):
            # Use .get() rather than [] so that examples without any y value
            # do not insert empty lists into the defaultdict while reading.
            for y in ys_by_example.get(example_index, ()):
                copresence_counts[(x, y)] += weight

        # Compare against None explicitly: the metric is optional, and its
        # truthiness is not what decides whether it should be updated.
        if num_xy_pairs_batch_copresent is not None:
            num_xy_pairs_batch_copresent.update(len(copresence_counts))
        for (x, y), count in copresence_counts.items():
            yield (_SlicedXYKey(slice_key=slice_key,
                                x_path=x_path.steps(),
                                x=x,
                                y=y), count)
 def test_topk_uniques_sketch_with_weights_custom_stats(self):
     """Checks that results land in custom_stats when that option is enabled.

     With store_output_in_custom_stats=True the expected proto carries the
     unweighted rank histogram, the weighted rank histogram and the number of
     uniques as three named custom_stats entries.
     """
     # non-weighted ordering
     # 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
     # weighted ordering
     # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
     # Example weights: 5.0 and 5.0 in the first batch, 15.0 in the second.
     batches = [
         pa.RecordBatch.from_arrays([
             pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']]),
             pa.array([[5.0], [5.0]]),
         ], ['fa', 'w']),
         pa.RecordBatch.from_arrays([
             pa.array([['d', 'e']]),
             pa.array([[15.0]]),
         ], ['fa', 'w']),
     ]
     # Both histograms are limited to 3 buckets; 'fa' has 5 distinct values.
     expected_result = {
         types.FeaturePath(['fa']):
         text_format.Parse(
             """
             path {
               step: 'fa'
             }
             custom_stats {
               name: 'topk_sketch_rank_histogram'
               rank_histogram {
                 buckets {
                   low_rank: 0
                   high_rank: 0
                   label: "a"
                   sample_count: 3.0
                 }
                 buckets {
                   low_rank: 1
                   high_rank: 1
                   label: "e"
                   sample_count: 2.0
                 }
                 buckets {
                   low_rank: 2
                   high_rank: 2
                   label: "d"
                   sample_count: 2.0
                 }
               }
             }
             custom_stats {
               name: 'weighted_topk_sketch_rank_histogram'
                 rank_histogram {
                   buckets {
                     low_rank: 0
                     high_rank: 0
                     label: "e"
                     sample_count: 20.0
                   }
                   buckets {
                     low_rank: 1
                     high_rank: 1
                     label: "d"
                     sample_count: 20.0
                   }
                   buckets {
                     low_rank: 2
                     high_rank: 2
                     label: "a"
                     sample_count: 15.0
                   }
                 }
             }
             custom_stats {
               name: 'uniques_sketch_num_uniques'
               num: 5
             }""", statistics_pb2.FeatureNameStatistics())
     }
     generator = sketch_generator.TopKUniquesSketchStatsGenerator(
         example_weight_map=ExampleWeightMap(weight_feature='w'),
         num_top_values=4,
         num_rank_histogram_buckets=3,
         store_output_in_custom_stats=True)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
    def test_topk_with_frequency_threshold(self):
        """Checks that both frequency thresholds filter the reported values.

        The generator is built with frequency_threshold=2 and
        weighted_frequency_threshold=15, so only values reaching those counts
        are expected in the unweighted and weighted results respectively.
        """
        # Example weights: 5.0 (first batch) and 15.0 (second batch).
        # Unweighted counts: a=3, b=2, x=1, y=1, z=1  -> a and b reach 2.
        # Weighted counts: a=35, x=15, z=15, b=10, y=5 -> a, z and x reach 15.
        batches = [
            pa.RecordBatch.from_arrays([
                pa.array([['a', 'b', 'y', 'b']]),
                pa.array([[5.0]]),
            ], ['fa', 'w']),
            pa.RecordBatch.from_arrays([
                pa.array([['a', 'x', 'a', 'z']]),
                pa.array([[15.0]]),
            ], ['fa', 'w'])
        ]
        # `unique` still reports all 5 distinct values.
        expected_result = {
            types.FeaturePath(['fa']):
            text_format.Parse(
                """
        path {
          step: 'fa'
        }
        string_stats {
          unique: 5
          top_values {
            value: 'a'
            frequency: 3
          }
          top_values {
            value: 'b'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "b"
              sample_count: 2.0
            }
          }
          weighted_string_stats {
            top_values {
              value: 'a'
              frequency: 35.0
            }
            top_values {
              value: 'z'
              frequency: 15.0
            }
            top_values {
              value: 'x'
              frequency: 15.0
            }
            rank_histogram {
              buckets {
                low_rank: 0
                high_rank: 0
                label: "a"
                sample_count: 35.0
              }
              buckets {
                low_rank: 1
                high_rank: 1
                label: "z"
                sample_count: 15.0
              }
              buckets {
                low_rank: 2
                high_rank: 2
                label: "x"
                sample_count: 15.0
              }
            }
          }
        }""", statistics_pb2.FeatureNameStatistics())
        }

        generator = sketch_generator.TopKUniquesSketchStatsGenerator(
            example_weight_map=ExampleWeightMap(weight_feature='w'),
            num_top_values=5,
            frequency_threshold=2,
            weighted_frequency_threshold=15,
            num_rank_histogram_buckets=3)
        self.assertCombinerOutputEqual(batches, generator, expected_result)
 def test_topk_uniques_combiner_with_weights(self):
     # non-weighted ordering
     # fa: 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
     # fb: 1 'v', 1 'w', 1 'x', 1 'y', 1 'z'
     # weighted ordering
     # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
     # fb: 6 'z', 4 'x', 4 'y', 4 'w', 2 'v'
     batches = [
         pa.RecordBatch.from_arrays([
             pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']]),
             pa.array([['v'], ['w', 'x', 'y']]),
             pa.array([[5.0], [5.0]]),
             pa.array([[2.0], [4.0]]),
         ], ['fa', 'fb', 'w', 'w_b']),
         pa.RecordBatch.from_arrays([
             pa.array([['d', 'e']]),
             pa.array([['z']]),
             pa.array([[15.0]]),
             pa.array([[6.0]]),
         ], ['fa', 'fb', 'w', 'w_b']),
     ]
     expected_result = {
         types.FeaturePath(['fa']):
         text_format.Parse(
             """
             path {
               step: 'fa'
             }
             string_stats {
               unique: 5
               top_values {
                 value: 'a'
                 frequency: 3.0
               }
               top_values {
                 value: 'e'
                 frequency: 2.0
               }
               top_values {
                 value: 'd'
                 frequency: 2.0
               }
               top_values {
                 value: 'c'
                 frequency: 2.0
               }
               rank_histogram {
                 buckets {
                   low_rank: 0
                   high_rank: 0
                   label: "a"
                   sample_count: 3.0
                 }
                 buckets {
                   low_rank: 1
                   high_rank: 1
                   label: "e"
                   sample_count: 2.0
                 }
                 buckets {
                   low_rank: 2
                   high_rank: 2
                   label: "d"
                   sample_count: 2.0
                 }
               }
               weighted_string_stats {
                 top_values {
                   value: 'e'
                   frequency: 20.0
                 }
                 top_values {
                   value: 'd'
                   frequency: 20.0
                 }
                 top_values {
                   value: 'a'
                   frequency: 15.0
                 }
                 top_values {
                   value: 'c'
                   frequency: 10.0
                 }
                 rank_histogram {
                   buckets {
                     low_rank: 0
                     high_rank: 0
                     label: "e"
                     sample_count: 20.0
                   }
                   buckets {
                     low_rank: 1
                     high_rank: 1
                     label: "d"
                     sample_count: 20.0
                   }
                   buckets {
                     low_rank: 2
                     high_rank: 2
                     label: "a"
                     sample_count: 15.0
                   }
                 }
               }
           }""", statistics_pb2.FeatureNameStatistics()),
         types.FeaturePath(['fb']):
         text_format.Parse(
             """
               string_stats {
                 unique: 5
                 top_values {
                   value: "z"
                   frequency: 1.0
                 }
                 top_values {
                   value: "y"
                   frequency: 1.0
                 }
                 top_values {
                   value: "x"
                   frequency: 1.0
                 }
                 top_values {
                   value: "w"
                   frequency: 1.0
                 }
                 rank_histogram {
                   buckets {
                     label: "z"
                     sample_count: 1.0
                   }
                   buckets {
                     low_rank: 1
                     high_rank: 1
                     label: "y"
                     sample_count: 1.0
                   }
                   buckets {
                     low_rank: 2
                     high_rank: 2
                     label: "x"
                     sample_count: 1.0
                   }
                 }
                 weighted_string_stats {
                   top_values {
                     value: "z"
                     frequency: 6.0
                   }
                   top_values {
                     value: "y"
                     frequency: 4.0
                   }
                   top_values {
                     value: "x"
                     frequency: 4.0
                   }
                   top_values {
                     value: "w"
                     frequency: 4.0
                   }
                   rank_histogram {
                     buckets {
                       label: "z"
                       sample_count: 6.0
                     }
                     buckets {
                       low_rank: 1
                       high_rank: 1
                       label: "y"
                       sample_count: 4.0
                     }
                     buckets {
                       low_rank: 2
                       high_rank: 2
                       label: "x"
                       sample_count: 4.0
                     }
                   }
                 }
               }
               path {
                 step: "fb"
               }""", statistics_pb2.FeatureNameStatistics()),
     }
     generator = sketch_generator.TopKUniquesSketchStatsGenerator(
         example_weight_map=ExampleWeightMap(
             weight_feature='w',
             per_feature_override={types.FeaturePath(['fb']): 'w_b'}),
         num_top_values=4,
         num_rank_histogram_buckets=3)
     self.assertCombinerOutputEqual(batches, generator, expected_result)
 def test_topk_uniques_sketch_with_int_weights(self):
     """Checks sketch top-k/uniques stats when the weight column is int32."""
     # Tallies for 'fa' over both batches (unweighted / weighted):
     #   'a': 3 / 15, 'e': 2 / 20, 'd': 2 / 20, 'c': 2 / 10, 'b': 1 / 5
     # Only the top 4 values and the top 3 histogram buckets are expected.
     stats_generator = sketch_generator.TopKUniquesSketchStatsGenerator(
         example_weight_map=ExampleWeightMap(weight_feature='w'),
         num_top_values=4,
         num_rank_histogram_buckets=3)

     column_names = ['fa', 'w']
     first_batch = pa.RecordBatch.from_arrays([
         pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']],
                  type=pa.list_(pa.binary())),
         pa.array([[5], [5]], type=pa.list_(pa.int32())),
     ], column_names)
     second_batch = pa.RecordBatch.from_arrays([
         pa.array([['d', 'e']], type=pa.list_(pa.binary())),
         pa.array([[15]], type=pa.list_(pa.int32())),
     ], column_names)

     # Golden FeatureNameStatistics for 'fa', in text format.
     expected_fa_text = """
             path {
               step: 'fa'
             }
             string_stats {
               unique: 5
               top_values {
                 value: 'a'
                 frequency: 3.0
               }
               top_values {
                 value: 'e'
                 frequency: 2.0
               }
               top_values {
                 value: 'd'
                 frequency: 2.0
               }
               top_values {
                 value: 'c'
                 frequency: 2.0
               }
               rank_histogram {
                 buckets {
                   low_rank: 0
                   high_rank: 0
                   label: "a"
                   sample_count: 3.0
                 }
                 buckets {
                   low_rank: 1
                   high_rank: 1
                   label: "e"
                   sample_count: 2.0
                 }
                 buckets {
                   low_rank: 2
                   high_rank: 2
                   label: "d"
                   sample_count: 2.0
                 }
               }
               weighted_string_stats {
                 top_values {
                   value: 'e'
                   frequency: 20.0
                 }
                 top_values {
                   value: 'd'
                   frequency: 20.0
                 }
                 top_values {
                   value: 'a'
                   frequency: 15.0
                 }
                 top_values {
                   value: 'c'
                   frequency: 10.0
                 }
                 rank_histogram {
                   buckets {
                     low_rank: 0
                     high_rank: 0
                     label: "e"
                     sample_count: 20.0
                   }
                   buckets {
                     low_rank: 1
                     high_rank: 1
                     label: "d"
                     sample_count: 20.0
                   }
                   buckets {
                     low_rank: 2
                     high_rank: 2
                     label: "a"
                     sample_count: 15.0
                   }
                 }
               }
           }"""
     expected_fa_stats = text_format.Parse(
         expected_fa_text, statistics_pb2.FeatureNameStatistics())

     self.assertCombinerOutputEqual(
         [first_batch, second_batch],
         stats_generator,
         {types.FeaturePath(['fa']): expected_fa_stats})
Beispiel #16
0
def enumerate_arrays(
    record_batch: pa.RecordBatch,
    example_weight_map: Optional[ExampleWeightMap],
    enumerate_leaves_only: bool,
    wrap_flat_struct_in_list: bool = True,
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
  """Yields (path, array, weights) for the arrays found in a RecordBatch.

  Terminology:
    primitive: a primitive arrow array (e.g. Int64Array).
    nested_list := list<nested_list> | list<primitive> | null
    # A null array is treated like a list<primitive> that holds only nulls
    # and whose primitive type is unknown. Examples:
    #   null,
    #   list<null>,  # like list<list<unknown_type>> with only null values.
    #   list<list<int64>>,
    struct := struct<{field: nested_list | struct}> | list<struct>
    # Examples:
    #   struct<{"foo": list<int64>},
    #   list<struct<{"foo": list<int64>}>>,
    #   struct<{"foo": struct<{"bar": list<list<int64>>}>}>

  Every column of `record_batch` is expected to be a nested_list or a struct.
  Columns are visited in schema order. When a column is a struct, the lists
  wrapping it (if any) are flattened and each field of the struct is visited
  recursively (whether the struct itself is yielded depends on
  `enumerate_leaves_only`).

  Weights are re-indexed alongside every flattening step, so for each yielded
  tuple `weights[i]` is the weight of `array[i]`.

  The yielded FeaturePath only addresses the array within `record_batch` and
  its struct arrays; it carries no information about whether or how a struct
  array is nested.

  Args:
    record_batch: The RecordBatch whose arrays are enumerated.
    example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column. None means no weight features.
    enumerate_leaves_only: If True, yield only leaf arrays, i.e. arrays whose
      type has no struct nested in it. If False, the struct arrays that
      contain the leaves are yielded as well.
    wrap_flat_struct_in_list: If True, a struct<[Ts]> array is wrapped into a
      list<struct<[Ts]>> of one-element sub-lists before being yielded, so
      callers may assume every enumerated array is a list<inner_type>.

  Yields:
    Tuples of (feature path, feature array, weight array for that feature
    array), where weights[i] is the weight for array[i].

  Raises:
    ValueError: When the weight column is not a list array whose elements are
      1-element lists.
  """
  if example_weight_map is None:
    example_weight_map = ExampleWeightMap(
        weight_feature=None, per_feature_override=None)

  def _weights_for(
      path: types.FeaturePath,
      weights_by_column: Dict[types.FeatureName, np.ndarray],
  ) -> Optional[np.ndarray]:
    """Looks up the weights of the weight column mapped to `path`, if any."""
    return weights_by_column.get(example_weight_map.get(path))

  def _visit(
      path: types.FeaturePath, values: pa.Array,
      weights_by_column: Dict[types.FeatureName, np.ndarray],
  ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Yields `values` and/or the arrays of its struct fields."""
    # Leaf case: nothing to descend into.
    if not pa.types.is_struct(get_innermost_nested_type(values.type)):
      yield (path, values, _weights_for(path, weights_by_column))
      return

    if not enumerate_leaves_only:
      struct_weights = _weights_for(path, weights_by_column)
      # A flat struct array is wrapped into singleton lists so downstream
      # code can keep assuming the enumerated arrays are list<*>.
      if pa.types.is_struct(values.type) and wrap_flat_struct_in_list:
        emitted = array_util.ToSingletonListArray(values)
      else:
        emitted = values
      yield (path, emitted, struct_weights)

    # Parent indices are only requested when there are weights to re-index.
    flattened, parent_indices = flatten_nested(
        values, bool(weights_by_column))
    # Potential optimization:
    # Only flatten weights that we know will be used in the recursion.
    child_weights = {}
    for weight_name, column_weights in weights_by_column.items():
      child_weights[weight_name] = column_weights[parent_indices]

    for struct_field in flattened.type:
      child_name = struct_field.name
      yield from _visit(
          path.child(child_name), flattened.field(child_name), child_weights)

  top_level_weights = {}
  for weight_column in example_weight_map.all_weight_features():
    top_level_weights[weight_column] = get_weight_feature(
        record_batch, weight_column)

  named_columns = zip(record_batch.schema.names, record_batch.columns)
  for column_name, column in named_columns:
    yield from _visit(
        types.FeaturePath([column_name]), column, top_level_weights)
  def test_topk_uniques_with_weights(self):
    """Checks top-k/uniques stats with a default and an overridden weight."""
    # Tallies, listed in the order the expected protos use (ties appear in
    # descending label order):
    #   unweighted fa: 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
    #   unweighted fb: 1 'z', 1 'y', 1 'x', 1 'w', 1 'v'
    #   weighted fa (column 'w'):   20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
    #   weighted fb (column 'w_b'): 6 'z', 4 'y', 4 'x', 4 'w', 2 'v'
    def _parse(text):
      return text_format.Parse(
          text, statistics_pb2.DatasetFeatureStatistics())

    stats_generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
        example_weight_map=ExampleWeightMap(
            weight_feature='w',
            per_feature_override={types.FeaturePath(['fb']): 'w_b'}),
        num_top_values=4, num_rank_histogram_buckets=3)

    fa_values = pa.array([
        ['a', 'b', 'c', 'e'],
        ['a', 'c', 'd', 'a'],
        ['d', 'e'],
    ])
    fa_weights = pa.array([[5.0], [5.0], [15.0]])
    fb_values = pa.array([['v'], ['w', 'x', 'y'], ['z']])
    fb_weights = pa.array([[2], [4], [6]])
    record_batches = [
        pa.RecordBatch.from_arrays(
            [fa_values, fa_weights, fb_values, fb_weights],
            ['fa', 'w', 'fb', 'w_b'])
    ]

    # Unweighted top-k for 'fa'.
    fa_topk_text = """
            features {
              path {
                step: 'fa'
              }
              type: STRING
              string_stats {
                top_values {
                  value: 'a'
                  frequency: 3.0
                }
                top_values {
                  value: 'e'
                  frequency: 2.0
                }
                top_values {
                  value: 'd'
                  frequency: 2.0
                }
                top_values {
                  value: 'c'
                  frequency: 2.0
                }
                rank_histogram {
                  buckets {
                    low_rank: 0
                    high_rank: 0
                    label: "a"
                    sample_count: 3.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "e"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "d"
                    sample_count: 2.0
                  }
                }
              }
            }"""
    # Unweighted top-k for 'fb'.
    fb_topk_text = """
            features {
              type: STRING
              string_stats {
                top_values {
                  value: "z"
                  frequency: 1.0
                }
                top_values {
                  value: "y"
                  frequency: 1.0
                }
                top_values {
                  value: "x"
                  frequency: 1.0
                }
                top_values {
                  value: "w"
                  frequency: 1.0
                }
                rank_histogram {
                  buckets {
                    label: "z"
                    sample_count: 1.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "y"
                    sample_count: 1.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "x"
                    sample_count: 1.0
                  }
                }
              }
              path {
                step: "fb"
              }
            }"""
    # Weighted top-k for 'fa'.
    fa_weighted_topk_text = """
            features {
              path {
                step: 'fa'
              }
              type: STRING
              string_stats {
                weighted_string_stats {
                  top_values {
                    value: 'e'
                    frequency: 20.0
                  }
                  top_values {
                    value: 'd'
                    frequency: 20.0
                  }
                  top_values {
                    value: 'a'
                    frequency: 15.0
                  }
                  top_values {
                    value: 'c'
                    frequency: 10.0
                  }
                  rank_histogram {
                    buckets {
                      low_rank: 0
                      high_rank: 0
                      label: "e"
                      sample_count: 20.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "d"
                      sample_count: 20.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "a"
                      sample_count: 15.0
                    }
                  }
                }
              }
            }"""
    # Weighted top-k for 'fb'.
    fb_weighted_topk_text = """
            features {
              type: STRING
              string_stats {
                weighted_string_stats {
                  top_values {
                    value: "z"
                    frequency: 6.0
                  }
                  top_values {
                    value: "y"
                    frequency: 4.0
                  }
                  top_values {
                    value: "x"
                    frequency: 4.0
                  }
                  top_values {
                    value: "w"
                    frequency: 4.0
                  }
                  rank_histogram {
                    buckets {
                      label: "z"
                      sample_count: 6.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "y"
                      sample_count: 4.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "x"
                      sample_count: 4.0
                    }
                  }
                }
              }
              path {
                step: "fb"
              }
            }"""
    # Unique counts for 'fa' and 'fb'.
    fa_uniques_text = """
            features {
              path {
                step: 'fa'
              }
              type: STRING
              string_stats {
                unique: 5
              }
            }"""
    fb_uniques_text = """
            features {
              type: STRING
              string_stats {
                unique: 5
              }
              path {
                step: "fb"
              }
            }"""

    expected_protos = [
        _parse(golden_text) for golden_text in (
            fa_topk_text,
            fb_topk_text,
            fa_weighted_topk_text,
            fb_weighted_topk_text,
            fa_uniques_text,
            fb_uniques_text,
        )
    ]

    self.assertSlicingAwareTransformOutputEqual(
        record_batches,
        stats_generator,
        expected_protos,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
  def test_topk_uniques_with_struct_leaves(self):
    """Checks top-k/uniques stats for leaf features nested in a struct."""
    # Column 'c' is a list of structs with leaves 'f1' (strings) and 'f2'
    # (ints, marked categorical by the schema); column 'w' is the weight.
    # Tallies (unweighted / weighted):
    #   c.f1: 'a' 1 / 1, 'b' 2 / 2, 'c' 1 / 2, 'd' 2 / 5
    #   c.f2:  1  1 / 1,  2  2 / 3,  3  2 / 4,  4  1 / 3
    def _parse(text):
      return text_format.Parse(
          text, statistics_pb2.DatasetFeatureStatistics())

    column_names = ['w', 'c']
    first_batch = pa.RecordBatch.from_arrays([
        pa.array([[1.0], [2.0]]),
        pa.array([
            [{'f1': ['a', 'b'], 'f2': [1, 2]}, {'f1': ['b']}],
            [{'f1': ['c', 'd'], 'f2': [2, 3]}, {'f2': [3]}],
        ]),
    ], column_names)
    second_batch = pa.RecordBatch.from_arrays([
        pa.array([[3.0]]),
        pa.array([[{'f1': ['d'], 'f2': [4]}]]),
    ], column_names)

    # Unweighted top-k for c.f1.
    f1_topk_text = """
            features{
              type: STRING
              string_stats {
                top_values {
                  value: "d"
                  frequency: 2.0
                }
                top_values {
                  value: "b"
                  frequency: 2.0
                }
                top_values {
                  value: "c"
                  frequency: 1.0
                }
                rank_histogram {
                  buckets {
                    label: "d"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "b"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "c"
                    sample_count: 1.0
                  }
                }
              }
              path {
                step: "c"
                step: "f1"
              }
            }"""
    # Unweighted top-k for c.f2.
    f2_topk_text = """
            features {
              string_stats {
                top_values {
                  value: "3"
                  frequency: 2.0
                }
                top_values {
                  value: "2"
                  frequency: 2.0
                }
                top_values {
                  value: "4"
                  frequency: 1.0
                }
                rank_histogram {
                  buckets {
                    label: "3"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 1
                    high_rank: 1
                    label: "2"
                    sample_count: 2.0
                  }
                  buckets {
                    low_rank: 2
                    high_rank: 2
                    label: "4"
                    sample_count: 1.0
                  }
                }
              }
              path {
                step: "c"
                step: "f2"
              }
            }"""
    # Unique counts for c.f1 and c.f2.
    f1_uniques_text = """
            features {
              type: STRING
              string_stats {
                unique: 4
              }
              path {
                step: "c"
                step: "f1"
              }
            }"""
    f2_uniques_text = """
            features {
              type: INT
              string_stats {
                unique: 4
              }
              path {
                step: "c"
                step: "f2"
              }
            }"""
    # Weighted top-k for c.f1.
    f1_weighted_topk_text = """
            features {
              type: STRING
              string_stats {
                weighted_string_stats {
                  top_values {
                    value: "d"
                    frequency: 5.0
                  }
                  top_values {
                    value: "c"
                    frequency: 2.0
                  }
                  top_values {
                    value: "b"
                    frequency: 2.0
                  }
                  rank_histogram {
                    buckets {
                      label: "d"
                      sample_count: 5.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "c"
                      sample_count: 2.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "b"
                      sample_count: 2.0
                    }
                  }
                }
              }
              path {
                step: "c"
                step: "f1"
              }
            }"""
    # Weighted top-k for c.f2.
    f2_weighted_topk_text = """
            features {
              string_stats {
                weighted_string_stats {
                  top_values {
                    value: "3"
                    frequency: 4.0
                  }
                  top_values {
                    value: "4"
                    frequency: 3.0
                  }
                  top_values {
                    value: "2"
                    frequency: 3.0
                  }
                  rank_histogram {
                    buckets {
                      label: "3"
                      sample_count: 4.0
                    }
                    buckets {
                      low_rank: 1
                      high_rank: 1
                      label: "4"
                      sample_count: 3.0
                    }
                    buckets {
                      low_rank: 2
                      high_rank: 2
                      label: "2"
                      sample_count: 3.0
                    }
                  }
                }
              }
              path {
                step: "c"
                step: "f2"
              }
            }"""

    expected_protos = [
        _parse(golden_text) for golden_text in (
            f1_topk_text,
            f2_topk_text,
            f1_uniques_text,
            f2_uniques_text,
            f1_weighted_topk_text,
            f2_weighted_topk_text,
        )
    ]

    # Schema declaring c.f2 as a categorical int inside the struct 'c'.
    struct_schema = text_format.Parse(
        """
        feature {
          name: "c"
          type: STRUCT
          struct_domain {
            feature {
              name: "f2"
              type: INT
              int_domain {
                is_categorical: true
              }
            }
          }
        }
        """, schema_pb2.Schema())
    stats_generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
        schema=struct_schema,
        example_weight_map=ExampleWeightMap(weight_feature='w'),
        num_top_values=3, num_rank_histogram_buckets=3)

    self.assertSlicingAwareTransformOutputEqual(
        [first_batch, second_batch],
        stats_generator,
        expected_protos,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
    pa.array([
        {
            "sf1": [[1, 2], [3]],
            "sf2": [None],
        },
        None,
    ]),
    pa.array([[1], [2]]),
    pa.array([[2], [4]]),
    pa.array([[6], [8]]),
], ["f1", "f2", "f3", "w", "w_override1", "w_override2"])

# Weight-column assignment shared by this module: "w" is the weight feature,
# and each path listed in per_feature_override names its own weight column.
# Note that a nested path may name a different column than its parent path
# (e.g. ["f2", "sf2"] vs. ["f2", "sf2", "ssf1"]).
_EXAMPLE_WEIGHT_MAP = ExampleWeightMap(
    weight_feature="w",
    per_feature_override={
        types.FeaturePath(["f2"]): "w_override1",
        types.FeaturePath(["f2", "sf1"]): "w_override2",
        types.FeaturePath(["f2", "sf2"]): "w_override2",
        types.FeaturePath(["f2", "sf2", "ssf1"]): "w_override1",
    })

# Record type with the fields `array`, `parent_indices` and `weights`.
# NOTE(review): presumably the expected enumeration result for one feature
# path -- confirm against the code that consumes it.
ExpectedArray = tfx_namedtuple.namedtuple(
    "ExpectedArray", ["array", "parent_indices", "weights"])
_FEATURES_TO_ARRAYS = {
    types.FeaturePath(["f1"]):
    ExpectedArray(pa.array([[1], [2, 3]]), [0, 1], [1, 2]),
    types.FeaturePath(["w"]):
    ExpectedArray(pa.array([[1], [2]]), [0, 1], [1, 2]),
    types.FeaturePath(["w_override1"]):
    ExpectedArray(pa.array([[2], [4]]), [0, 1], [1, 2]),
    types.FeaturePath(["w_override2"]):
    ExpectedArray(pa.array([[6], [8]]), [0, 1], [1, 2]),
  def test_topk_uniques_with_frequency_threshold(self):
    """Checks that values below the frequency thresholds are dropped."""
    # Tallies for 'fa' (unweighted / weighted by 'w'):
    #   'a': 3 / 35, 'b': 2 / 10, 'x': 1 / 15, 'z': 1 / 15, 'y': 1 / 5
    # With frequency_threshold=2 only 'a' and 'b' are expected unweighted;
    # with weighted_frequency_threshold=15 only 'a', 'z' and 'x' are expected
    # weighted.
    def _parse(text):
      return text_format.Parse(
          text, statistics_pb2.DatasetFeatureStatistics())

    stats_generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
        example_weight_map=ExampleWeightMap(weight_feature='w'),
        num_top_values=5,
        frequency_threshold=2,
        weighted_frequency_threshold=15,
        num_rank_histogram_buckets=3)

    fa_values = pa.array([['a', 'b', 'y', 'b'], ['a', 'x', 'a', 'z']])
    fa_weights = pa.array([[5.0], [15.0]])
    record_batches = [
        pa.RecordBatch.from_arrays([fa_values, fa_weights], ['fa', 'w'])
    ]

    # Unweighted top-k after thresholding.
    topk_text = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          top_values {
            value: 'a'
            frequency: 3
          }
          top_values {
            value: 'b'
            frequency: 2
          }
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 3.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "b"
              sample_count: 2.0
            }
          }
        }
    }"""
    # Weighted top-k after thresholding.
    weighted_topk_text = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          weighted_string_stats {
            top_values {
              value: 'a'
              frequency: 35.0
            }
            top_values {
              value: 'z'
              frequency: 15.0
            }
            top_values {
              value: 'x'
              frequency: 15.0
            }
            rank_histogram {
              buckets {
                low_rank: 0
                high_rank: 0
                label: "a"
                sample_count: 35.0
              }
              buckets {
                low_rank: 1
                high_rank: 1
                label: "z"
                sample_count: 15.0
              }
              buckets {
                low_rank: 2
                high_rank: 2
                label: "x"
                sample_count: 15.0
              }
            }
          }
        }
    }"""
    # The expected unique count still covers all five distinct values.
    uniques_text = """
      features {
        path {
          step: 'fa'
        }
        type: STRING
        string_stats {
          unique: 5
        }
      }"""

    expected_protos = [
        _parse(topk_text),
        _parse(weighted_topk_text),
        _parse(uniques_text),
    ]

    self.assertSlicingAwareTransformOutputEqual(
        record_batches,
        stats_generator,
        expected_protos,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)