def __init__(self, example_weight_map: ExampleWeightMap, **kwargs):
  """Builds the unweighted lift generator and, if needed, the weighted one.

  Args:
    example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    **kwargs: Arguments forwarded unchanged to each _LiftStatsGenerator
      constructed here.
  """
  # The unweighted generator always runs with an empty weight map.
  self._unweighted_generator = _LiftStatsGenerator(
      example_weight_map=ExampleWeightMap(), **kwargs)
  weight_features = example_weight_map.all_weight_features()
  self._has_any_weight = bool(weight_features)
  if not self._has_any_weight:
    # No weight column configured: `_weighted_generator` is left unset.
    return
  self._weighted_generator = _LiftStatsGenerator(
      example_weight_map=example_weight_map, **kwargs)
def __init__(
    self,  # pylint: disable=useless-super-delegation
    name: Text = 'TopKUniquesCombinerStatsGenerator',
    schema: Optional[schema_pb2.Schema] = None,
    example_weight_map: ExampleWeightMap = ExampleWeightMap(),
    num_top_values: int = 2,
    frequency_threshold: int = 1,
    weighted_frequency_threshold: float = 1.0,
    num_rank_histogram_buckets: int = 1000) -> None:
  """Sets up a combiner-based top-k and uniques statistics generator.

  Args:
    name: An optional unique name associated with the statistics generator.
    schema: An optional schema for the dataset.
    example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    num_top_values: The number of most frequent feature values to keep for
      string features.
    frequency_threshold: An optional minimum number of examples the most
      frequent values must be present in (defaults to 1).
    weighted_frequency_threshold: An optional minimum weighted number of
      examples the most frequent weighted values must be present in
      (defaults to 1.0).
    num_rank_histogram_buckets: The number of buckets in the rank histogram
      for string features.
  """
  super(TopKUniquesCombinerStatsGenerator, self).__init__(name, schema)
  # Without a schema there are no categorical numeric features to track.
  if schema:
    categorical_paths = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical_paths = []
  self._categorical_features = set(categorical_paths)
  self._example_weight_map = example_weight_map
  self._num_rank_histogram_buckets = num_rank_histogram_buckets
  self._num_top_values = num_top_values
  self._weighted_frequency_threshold = weighted_frequency_threshold
  self._frequency_threshold = frequency_threshold
def __init__(self,
             y_path: types.FeaturePath,
             schema: Optional[schema_pb2.Schema] = None,
             x_paths: Optional[Iterable[types.FeaturePath]] = None,
             y_boundaries: Optional[Sequence[float]] = None,
             min_x_count: int = 0,
             top_k_per_y: Optional[int] = None,
             bottom_k_per_y: Optional[int] = None,
             example_weight_map: ExampleWeightMap = ExampleWeightMap(),
             output_custom_stats: Optional[bool] = False,
             name: Text = 'LiftStatsGenerator') -> None:
  """Initializes a lift statistics generator.

  Every argument is forwarded unchanged to a
  `_UnweightedAndWeightedLiftStatsGenerator`, which becomes this generator's
  PTransform. `name` and `schema` are additionally passed to the parent
  constructor.

  Args:
    y_path: Path of the feature used as Y (forwarded to the transform).
    schema: An optional schema for the dataset.
    x_paths: Optional paths of the features used as X (forwarded).
    y_boundaries: Optional boundaries for y values (forwarded); presumably
      used for binning numeric y values -- confirm against the transform.
    min_x_count: Forwarded to the transform.
    top_k_per_y: Forwarded to the transform.
    bottom_k_per_y: Forwarded to the transform.
    example_weight_map: An ExampleWeightMap forwarded to the transform.
    output_custom_stats: Forwarded to the transform.
    name: Name given to both the transform and this generator.
  """
  lift_ptransform = _UnweightedAndWeightedLiftStatsGenerator(
      example_weight_map=example_weight_map,
      schema=schema,
      y_path=y_path,
      x_paths=x_paths,
      y_boundaries=y_boundaries,
      min_x_count=min_x_count,
      top_k_per_y=top_k_per_y,
      bottom_k_per_y=bottom_k_per_y,
      output_custom_stats=output_custom_stats,
      name=name)
  super(LiftStatsGenerator, self).__init__(
      name, ptransform=lift_ptransform, schema=schema)
def __init__(self,
             name: Text = 'TopKUniquesStatsGenerator',
             schema: Optional[schema_pb2.Schema] = None,
             example_weight_map: ExampleWeightMap = ExampleWeightMap(),
             num_top_values: int = 2,
             frequency_threshold: int = 1,
             weighted_frequency_threshold: float = 1.0,
             num_rank_histogram_buckets: int = 1000) -> None:
  """Sets up the top-k and uniques stats generator.

  All configuration is handed to a `_ComputeTopKUniquesStats` PTransform,
  which is registered with the parent class together with `name` and
  `schema`.

  Args:
    name: An optional unique name associated with the statistics generator.
    schema: An optional schema for the dataset.
    example_weight_map: An ExampleWeightMap (defaults to an empty one) that
      is forwarded to the underlying transform.
    num_top_values: An optional number of most frequent feature values to
      keep for string features (defaults to 2).
    frequency_threshold: An optional minimum number of examples the most
      frequent values must be present in (defaults to 1).
    weighted_frequency_threshold: An optional minimum weighted number of
      examples the most frequent weighted values must be present in
      (defaults to 1.0).
    num_rank_histogram_buckets: An optional number of buckets in the rank
      histogram for string features (defaults to 1000).
  """
  topk_uniques_ptransform = _ComputeTopKUniquesStats(
      schema=schema,
      example_weight_map=example_weight_map,
      num_top_values=num_top_values,
      frequency_threshold=frequency_threshold,
      weighted_frequency_threshold=weighted_frequency_threshold,
      num_rank_histogram_buckets=num_rank_histogram_buckets)
  super(TopKUniquesStatsGenerator, self).__init__(
      name, schema=schema, ptransform=topk_uniques_ptransform)
def __init__(self,
             name: Text = "TopKUniquesSketchStatsGenerator",
             schema: Optional[schema_pb2.Schema] = None,
             example_weight_map: ExampleWeightMap = ExampleWeightMap(),
             num_top_values: int = 2,
             num_rank_histogram_buckets: int = 128,
             frequency_threshold: int = 1,
             weighted_frequency_threshold: float = 1.0,
             num_misragries_buckets: int = 128,
             num_kmv_buckets: int = 128,
             store_output_in_custom_stats: bool = False):
  """Sets up a sketch-based top-k and uniques combiner stats generator.

  Args:
    name: An optional unique name associated with the statistics generator.
    schema: An optional schema for the dataset.
    example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    num_top_values: The number of most frequent feature values to keep for
      string features.
    num_rank_histogram_buckets: The number of buckets in the rank histogram
      for string features.
    frequency_threshold: An optional minimum number of examples the most
      frequent values must be present in (defaults to 1).
    weighted_frequency_threshold: An optional minimum weighted number of
      examples the most frequent weighted values must be present in
      (defaults to 1.0).
    num_misragries_buckets: Number of buckets to use for MisraGries sketch.
    num_kmv_buckets: Number of buckets to use for KMV sketch.
    store_output_in_custom_stats: Boolean to indicate if the output stats
      need to be stored in custom stats. If False, the output is stored in
      `uniques` and `rank_histogram` fields.
  """
  super(TopKUniquesSketchStatsGenerator, self).__init__(name, schema)

  # Plain configuration values.
  self._example_weight_map = example_weight_map
  self._num_top_values = num_top_values
  self._num_rank_histogram_buckets = num_rank_histogram_buckets
  self._num_misragries_buckets = num_misragries_buckets
  self._num_kmv_buckets = num_kmv_buckets
  self._frequency_threshold = frequency_threshold
  self._weighted_frequency_threshold = weighted_frequency_threshold
  self._store_output_in_custom_stats = store_output_in_custom_stats

  # Schema-derived feature sets; both are empty when no schema is given.
  if schema:
    categorical_paths = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical_paths = []
  self._categorical_features = set(categorical_paths)
  if schema:
    bytes_paths = schema_util.get_bytes_features(schema)
  else:
    bytes_paths = []
  self._bytes_features = frozenset(bytes_paths)

  # Gauges registered under the shared metrics namespace.
  metrics_namespace = constants.METRICS_NAMESPACE
  self._num_top_values_gauge = beam.metrics.Metrics.gauge(
      metrics_namespace, "num_top_values")
  self._num_rank_histogram_buckets_gauge = beam.metrics.Metrics.gauge(
      metrics_namespace, "num_rank_histogram_buckets")
  self._num_mg_buckets_gauge = beam.metrics.Metrics.gauge(
      metrics_namespace, "num_mg_buckets")
  self._num_kmv_buckets_gauge = beam.metrics.Metrics.gauge(
      metrics_namespace, "num_kmv_buckets")
def _to_partial_x_counts(
    sliced_record_batch: types.SlicedRecordBatch,
    x_paths: Iterable[types.FeaturePath],
    example_weight_map: ExampleWeightMap
) -> Iterator[Tuple[_SlicedXKey, _CountType]]:
  """Yields per-(slice, x_path, x) counts of the examples with x in x_path.

  Args:
    sliced_record_batch: The sliced record batch handed to
      `_to_partial_counts`.
    x_paths: The feature paths to produce counts for.
    example_weight_map: Looked up per x_path to find the weight column name
      passed to `_to_partial_counts`.

  Yields:
    (_SlicedXKey(slice_key, x_path.steps(), x), count) pairs.
  """
  for x_path in x_paths:
    weight_column = example_weight_map.get(x_path)
    partial_counts = _to_partial_counts(
        sliced_record_batch,
        x_path,
        boundaries=None,
        weight_column_name=weight_column)
    for sliced_value, x_count in partial_counts:
      slice_key, x = sliced_value
      yield _SlicedXKey(slice_key, x_path.steps(), x), x_count
def testEnumerateArraysStringWeight(self):
  # A string-typed weight column must be rejected once enumeration starts.
  # The arrow type of a string changes between py2 and py3 so we accept either
  expected_message = (
      r'Weight column "w" must be of numeric type. Found (string|binary).*')
  with self.assertRaisesRegex(ValueError, expected_message):
    record_batch = pa.RecordBatch.from_arrays(
        [pa.array([[1], [2, 3]]), pa.array([["a"], ["b"]])], ["v", "w"])
    weight_map = ExampleWeightMap(
        weight_feature="w", per_feature_override=None)
    enumerated = arrow_util.enumerate_arrays(
        record_batch,
        example_weight_map=weight_map,
        enumerate_leaves_only=True)
    # Drain the iterable so that the validation error surfaces.
    for _ in enumerated:
      pass
def __init__(
    self,  # pylint: disable=useless-super-delegation
    name: Text = 'BasicStatsGenerator',
    schema: Optional[schema_pb2.Schema] = None,
    example_weight_map: ExampleWeightMap = ExampleWeightMap(),
    num_values_histogram_buckets: Optional[int] = 10,
    num_histogram_buckets: Optional[int] = 10,
    num_quantiles_histogram_buckets: Optional[int] = 10,
    epsilon: Optional[float] = 0.01) -> None:
  """Sets up the basic statistics generator.

  Args:
    name: An optional unique name associated with the statistics generator.
    schema: An optional schema for the dataset.
    example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    num_values_histogram_buckets: An optional number of buckets in a
      quantiles histogram for the number of values per Feature, which is
      stored in CommonStatistics.num_values_histogram.
    num_histogram_buckets: An optional number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: An optional number of buckets in a
      quantiles NumericStatistics.histogram.
    epsilon: An optional error tolerance for the computation of quantiles,
      typically a small fraction close to zero (e.g. 0.01). Higher values of
      epsilon increase the quantile approximation, and hence result in more
      unequal buckets, but could improve performance, and resource
      consumption.
  """
  super(BasicStatsGenerator, self).__init__(name, schema)

  # Schema-derived feature sets; both are empty when no schema is given.
  if schema:
    bytes_paths = schema_util.get_bytes_features(schema)
  else:
    bytes_paths = []
  self._bytes_features = set(bytes_paths)
  if schema:
    categorical_paths = schema_util.get_categorical_numeric_features(schema)
  else:
    categorical_paths = []
  self._categorical_features = set(categorical_paths)

  self._example_weight_map = example_weight_map
  self._num_quantiles_histogram_buckets = num_quantiles_histogram_buckets
  self._num_histogram_buckets = num_histogram_buckets
  self._num_values_histogram_buckets = num_values_histogram_buckets
  # Factory (not an instance): each call builds a new sketch using `epsilon`.
  self._make_quantiles_sketch_fn = lambda: sketches.QuantilesSketch(  # pylint: disable=g-long-lambda
      eps=epsilon, max_num_elements=1 << 32, num_streams=1)
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    example_weight_map: ExampleWeightMap,
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
    int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input.

  Only STRING features and INT features listed in `categorical_features` are
  processed; paths in `bytes_features` are skipped.

  Args:
    sliced_record_batch: A (slice_key, record_batch) pair.
    bytes_features: Feature paths to skip.
    categorical_features: INT feature paths to treat as categorical.
    example_weight_map: Supplies the weight column for each feature.

  Yields:
    ((slice_key, feature_path_steps, value), payload) where payload is
    `(count, weight)` if any weight feature is configured and a bare `count`
    otherwise.
  """
  slice_key, record_batch = sliced_record_batch
  has_any_weight = bool(example_weight_map.all_weight_features())
  int_type = statistics_pb2.FeatureNameStatistics.INT
  string_type = statistics_pb2.FeatureNameStatistics.STRING

  enumerated = arrow_util.enumerate_arrays(
      record_batch,
      example_weight_map=example_weight_map,
      enumerate_leaves_only=True)
  for feature_path, feature_array, weights in enumerated:
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    if feature_path in bytes_features:
      continue
    is_categorical_int = (
        feature_type == int_type and feature_path in categorical_features)
    if not (is_categorical_int or feature_type == string_type):
      continue

    flattened_values, parent_indices = arrow_util.flatten_nested(
        feature_array, weights is not None)
    if weights is not None and flattened_values:
      # Slow path: weighted uniques.
      values_ndarray = np.asarray(flattened_values)
      per_value_weights = weights[parent_indices]
      for value, count, weight in _weighted_unique(values_ndarray,
                                                   per_value_weights):
        yield (slice_key, feature_path.steps(), value), (count, weight)
      continue

    # Fast path: no per-value weights for this feature (or no values).
    value_counts = flattened_values.value_counts()
    values = value_counts.field('values').to_pylist()
    counts = value_counts.field('counts').to_pylist()
    for value, count in zip(values, counts):
      key = (slice_key, feature_path.steps(), value)
      if has_any_weight:
        yield (key, (count, 1))
      else:
        yield (key, count)
def test_topk_struct_leaves(self):
  # Weighted top-k/uniques over leaves ('f1', 'f2') of a struct column 'c'.

  def _parse_feature_stats(text_proto):
    return text_format.Parse(text_proto,
                             statistics_pb2.FeatureNameStatistics())

  column_names = ['w', 'c']
  first_batch = pa.RecordBatch.from_arrays([
      pa.array([[1.0], [2.0]]),
      pa.array([[{
          'f1': ['a', 'b'],
          'f2': [1, 2]
      }, {
          'f1': ['b'],
      }], [{
          'f1': ['c', 'd'],
          'f2': [2, 3]
      }, {
          'f2': [3]
      }]]),
  ], column_names)
  second_batch = pa.RecordBatch.from_arrays([
      pa.array([[3.0]]),
      pa.array([[{
          'f1': ['d'],
          'f2': [4]
      }]]),
  ], column_names)
  record_batches = [first_batch, second_batch]

  # 'f2' is declared categorical so that the INT leaf gets string stats.
  struct_schema = text_format.Parse(
      """ feature { name: "c" type: STRUCT struct_domain { feature { name: "f2" type: INT int_domain { is_categorical: true } } } } """,
      schema_pb2.Schema())

  expected_by_path = {
      types.FeaturePath(['c', 'f1']):
          _parse_feature_stats(
              """ string_stats { unique: 4 top_values { value: "d" frequency: 2.0 } top_values { value: "b" frequency: 2.0 } top_values { value: "c" frequency: 1.0 } rank_histogram { buckets { label: "d" sample_count: 2.0 } buckets { low_rank: 1 high_rank: 1 label: "b" sample_count: 2.0 } buckets { low_rank: 2 high_rank: 2 label: "c" sample_count: 1.0 } } weighted_string_stats { top_values { value: "d" frequency: 5.0 } top_values { value: "c" frequency: 2.0 } top_values { value: "b" frequency: 2.0 } rank_histogram { buckets { label: "d" sample_count: 5.0 } buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 2.0 } buckets { low_rank: 2 high_rank: 2 label: "b" sample_count: 2.0 } } } } path { step: "c" step: "f1" }"""),
      types.FeaturePath(['c', 'f2']):
          _parse_feature_stats(
              """ string_stats { unique: 4 top_values { value: "3" frequency: 2.0 } top_values { value: "2" frequency: 2.0 } top_values { value: "4" frequency: 1.0 } rank_histogram { buckets { label: "3" sample_count: 2.0 } buckets { low_rank: 1 high_rank: 1 label: "2" sample_count: 2.0 } buckets { low_rank: 2 high_rank: 2 label: "4" sample_count: 1.0 } } weighted_string_stats { top_values { value: "3" frequency: 4.0 } top_values { value: "4" frequency: 3.0 } top_values { value: "2" frequency: 3.0 } rank_histogram { buckets { label: "3" sample_count: 4.0 } buckets { low_rank: 1 high_rank: 1 
label: "4" sample_count: 3.0 } buckets { low_rank: 2 high_rank: 2 label: "2" sample_count: 3.0 } } } } path { step: "c" step: "f2" }"""),
  }

  weight_map = ExampleWeightMap(weight_feature='w')
  stats_generator = sketch_generator.TopKUniquesSketchStatsGenerator(
      schema=struct_schema,
      example_weight_map=weight_map,
      num_top_values=3,
      num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(record_batches, stats_generator,
                                 expected_by_path)
def _to_partial_copresence_counts(
    sliced_record_batch: types.SlicedRecordBatch,
    y_path: types.FeaturePath,
    x_paths: Iterable[types.FeaturePath],
    y_boundaries: Optional[np.ndarray],
    example_weight_map: ExampleWeightMap,
    num_xy_pairs_batch_copresent: Optional[
        beam.metrics.metric.Metrics.DelegatingDistribution] = None
) -> Iterator[Tuple[_SlicedXYKey, _CountType]]:
  """Yields per-(slice, path_x, x, y) counts of examples with x and y.

  This method generates the number of times a given pair of y- and x-values
  appear in the same record, for a slice_key and x_path. Records in which
  either x or y is absent will be skipped.

  Args:
    sliced_record_batch: A tuple of (slice_key, record_batch) representing a
      slice of examples
    y_path: The path to use as Y in the lift expression: lift = P(Y=y|X=x) /
      P(Y=y).
    x_paths: A set of x_paths for which to compute lift.
    y_boundaries: Optionally, a set of bin boundaries to use for binning
      y_path values.
    example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column.
    num_xy_pairs_batch_copresent: A counter tracking the number of different
      xy pairs that are copresent within each batch. If the same pair of xy
      values are copresent in more than one batch, this counter will be
      incremented once for each batch in which they are copresent.

  Yields:
    Tuples of the form (_SlicedXYKey(slice_key, x_path, x, y), count) for
    each combination of x_path, x, and y in the input record batch.
  """
  slice_key, record_batch = sliced_record_batch
  # Presence of y is looked up without weights; the weight applied to each
  # (x, y) pair comes from the x side below.
  y_presence = _get_example_value_presence(
      record_batch, y_path, y_boundaries, weight_column_name=None)
  if y_presence is None:
    return

  ys_by_example = collections.defaultdict(list)
  for example_index, y in zip(y_presence.example_indices, y_presence.values):
    ys_by_example[example_index].append(y)

  for x_path in x_paths:
    weight_column_name = example_weight_map.get(x_path)
    x_presence = _get_example_value_presence(
        record_batch,
        x_path,
        boundaries=None,
        weight_column_name=weight_column_name)
    if x_presence is None:
      continue
    # Weighted counts accumulate as floats, unweighted ones as ints.
    if weight_column_name is not None:
      copresence_counts = collections.defaultdict(float)
    else:
      copresence_counts = collections.defaultdict(int)

    for example_index, x, weight in zip(x_presence.example_indices,
                                        x_presence.values,
                                        x_presence.weights):
      # Use .get() rather than indexing: indexing a defaultdict inserts an
      # empty list for every example that has x but no y, growing the map
      # for no benefit.
      for y in ys_by_example.get(example_index, ()):
        copresence_counts[(x, y)] += weight

    # Compare against None explicitly so that the metric is updated whenever
    # one was supplied, independent of the object's truthiness.
    if num_xy_pairs_batch_copresent is not None:
      num_xy_pairs_batch_copresent.update(len(copresence_counts))
    for (x, y), count in copresence_counts.items():
      yield (_SlicedXYKey(
          slice_key=slice_key, x_path=x_path.steps(), x=x, y=y), count)
def test_topk_uniques_sketch_with_weights_custom_stats(self):
  # non-weighted ordering
  # 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
  # weighted ordering
  # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
  column_names = ['fa', 'w']
  first_batch = pa.RecordBatch.from_arrays([
      pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']]),
      pa.array([[5.0], [5.0]]),
  ], column_names)
  second_batch = pa.RecordBatch.from_arrays([
      pa.array([['d', 'e']]),
      pa.array([[15.0]]),
  ], column_names)
  record_batches = [first_batch, second_batch]

  # With store_output_in_custom_stats=True the results are expected in
  # custom_stats entries.
  expected_fa_stats = text_format.Parse(
      """ path { step: 'fa' } custom_stats { name: 'topk_sketch_rank_histogram' rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 } buckets { low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0 } buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 } } } custom_stats { name: 'weighted_topk_sketch_rank_histogram' rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "e" sample_count: 20.0 } buckets { low_rank: 1 high_rank: 1 label: "d" sample_count: 20.0 } buckets { low_rank: 2 high_rank: 2 label: "a" sample_count: 15.0 } } } custom_stats { name: 'uniques_sketch_num_uniques' num: 5 }""",
      statistics_pb2.FeatureNameStatistics())
  expected_by_path = {types.FeaturePath(['fa']): expected_fa_stats}

  weight_map = ExampleWeightMap(weight_feature='w')
  stats_generator = sketch_generator.TopKUniquesSketchStatsGenerator(
      example_weight_map=weight_map,
      num_top_values=4,
      num_rank_histogram_buckets=3,
      store_output_in_custom_stats=True)
  self.assertCombinerOutputEqual(record_batches, stats_generator,
                                 expected_by_path)
def test_topk_with_frequency_threshold(self):
  # Values below frequency_threshold=2 (unweighted) or
  # weighted_frequency_threshold=15 (weighted) are absent from the
  # expected top values.
  column_names = ['fa', 'w']
  first_batch = pa.RecordBatch.from_arrays([
      pa.array([['a', 'b', 'y', 'b']]),
      pa.array([[5.0]]),
  ], column_names)
  second_batch = pa.RecordBatch.from_arrays([
      pa.array([['a', 'x', 'a', 'z']]),
      pa.array([[15.0]]),
  ], column_names)
  record_batches = [first_batch, second_batch]

  expected_fa_stats = text_format.Parse(
      """ path { step: 'fa' } string_stats { unique: 5 top_values { value: 'a' frequency: 3 } top_values { value: 'b' frequency: 2 } rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 } buckets { low_rank: 1 high_rank: 1 label: "b" sample_count: 2.0 } } weighted_string_stats { top_values { value: 'a' frequency: 35.0 } top_values { value: 'z' frequency: 15.0 } top_values { value: 'x' frequency: 15.0 } rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 35.0 } buckets { low_rank: 1 high_rank: 1 label: "z" sample_count: 15.0 } buckets { low_rank: 2 high_rank: 2 label: "x" sample_count: 15.0 } } } }""",
      statistics_pb2.FeatureNameStatistics())
  expected_by_path = {types.FeaturePath(['fa']): expected_fa_stats}

  weight_map = ExampleWeightMap(weight_feature='w')
  stats_generator = sketch_generator.TopKUniquesSketchStatsGenerator(
      example_weight_map=weight_map,
      num_top_values=5,
      frequency_threshold=2,
      weighted_frequency_threshold=15,
      num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(record_batches, stats_generator,
                                 expected_by_path)
def test_topk_uniques_combiner_with_weights(self):
  # non-weighted ordering
  # fa: 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
  # fb: 1 'v', 1 'w', 1 'x', 1 'y', 1 'z'
  # weighted ordering
  # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
  # fb: 6 'z', 4 'x', 4 'y', 4 'w', 2 'v'

  def _parse_feature_stats(text_proto):
    return text_format.Parse(text_proto,
                             statistics_pb2.FeatureNameStatistics())

  column_names = ['fa', 'fb', 'w', 'w_b']
  first_batch = pa.RecordBatch.from_arrays([
      pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']]),
      pa.array([['v'], ['w', 'x', 'y']]),
      pa.array([[5.0], [5.0]]),
      pa.array([[2.0], [4.0]]),
  ], column_names)
  second_batch = pa.RecordBatch.from_arrays([
      pa.array([['d', 'e']]),
      pa.array([['z']]),
      pa.array([[15.0]]),
      pa.array([[6.0]]),
  ], column_names)
  record_batches = [first_batch, second_batch]

  expected_by_path = {
      types.FeaturePath(['fa']):
          _parse_feature_stats(
              """ path { step: 'fa' } string_stats { unique: 5 top_values { value: 'a' frequency: 3.0 } top_values { value: 'e' frequency: 2.0 } top_values { value: 'd' frequency: 2.0 } top_values { value: 'c' frequency: 2.0 } rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 } buckets { low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0 } buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 } } weighted_string_stats { top_values { value: 'e' frequency: 20.0 } top_values { value: 'd' frequency: 20.0 } top_values { value: 'a' frequency: 15.0 } top_values { value: 'c' frequency: 10.0 } rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "e" sample_count: 20.0 } buckets { low_rank: 1 high_rank: 1 label: "d" sample_count: 20.0 } buckets { low_rank: 2 high_rank: 2 label: "a" sample_count: 15.0 } } } }"""),
      types.FeaturePath(['fb']):
          _parse_feature_stats(
              """ string_stats { unique: 5 top_values { value: "z" frequency: 1.0 } top_values { value: "y" frequency: 1.0 } top_values { value: "x" frequency: 1.0 } top_values { value: "w" frequency: 1.0 } rank_histogram { buckets { label: "z" sample_count: 1.0 } buckets { low_rank: 1 high_rank: 1 label: "y" sample_count: 1.0 } buckets { low_rank: 2 high_rank: 2 label: "x" sample_count: 
1.0 } } weighted_string_stats { top_values { value: "z" frequency: 6.0 } top_values { value: "y" frequency: 4.0 } top_values { value: "x" frequency: 4.0 } top_values { value: "w" frequency: 4.0 } rank_histogram { buckets { label: "z" sample_count: 6.0 } buckets { low_rank: 1 high_rank: 1 label: "y" sample_count: 4.0 } buckets { low_rank: 2 high_rank: 2 label: "x" sample_count: 4.0 } } } } path { step: "fb" }"""),
  }

  # 'fb' overrides the default weight column 'w' with 'w_b'.
  weight_map = ExampleWeightMap(
      weight_feature='w',
      per_feature_override={types.FeaturePath(['fb']): 'w_b'})
  stats_generator = sketch_generator.TopKUniquesSketchStatsGenerator(
      example_weight_map=weight_map,
      num_top_values=4,
      num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(record_batches, stats_generator,
                                 expected_by_path)
def test_topk_uniques_sketch_with_int_weights(self):
  # non-weighted ordering
  # 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
  # weighted ordering
  # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
  column_names = ['fa', 'w']
  # Weights are supplied as int32 rather than float.
  first_batch = pa.RecordBatch.from_arrays([
      pa.array([['a', 'b', 'c', 'e'], ['a', 'c', 'd', 'a']],
               type=pa.list_(pa.binary())),
      pa.array([[5], [5]], type=pa.list_(pa.int32())),
  ], column_names)
  second_batch = pa.RecordBatch.from_arrays([
      pa.array([['d', 'e']], type=pa.list_(pa.binary())),
      pa.array([[15]], type=pa.list_(pa.int32())),
  ], column_names)
  record_batches = [first_batch, second_batch]

  expected_fa_stats = text_format.Parse(
      """ path { step: 'fa' } string_stats { unique: 5 top_values { value: 'a' frequency: 3.0 } top_values { value: 'e' frequency: 2.0 } top_values { value: 'd' frequency: 2.0 } top_values { value: 'c' frequency: 2.0 } rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 } buckets { low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0 } buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 } } weighted_string_stats { top_values { value: 'e' frequency: 20.0 } top_values { value: 'd' frequency: 20.0 } top_values { value: 'a' frequency: 15.0 } top_values { value: 'c' frequency: 10.0 } rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "e" sample_count: 20.0 } buckets { low_rank: 1 high_rank: 1 label: "d" sample_count: 20.0 } buckets { low_rank: 2 high_rank: 2 label: "a" sample_count: 15.0 } } } }""",
      statistics_pb2.FeatureNameStatistics())
  expected_by_path = {types.FeaturePath(['fa']): expected_fa_stats}

  weight_map = ExampleWeightMap(weight_feature='w')
  stats_generator = sketch_generator.TopKUniquesSketchStatsGenerator(
      example_weight_map=weight_map,
      num_top_values=4,
      num_rank_histogram_buckets=3)
  self.assertCombinerOutputEqual(record_batches, stats_generator,
                                 expected_by_path)
def enumerate_arrays(
    record_batch: pa.RecordBatch,
    example_weight_map: Optional[ExampleWeightMap],
    enumerate_leaves_only: bool,
    wrap_flat_struct_in_list: bool = True,
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
  """Walks the columns of a RecordBatch, descending into struct columns.

  Terminology:
    primitive: primitive arrow arrays (e.g. Int64Array).
    nested_list := list<nested_list> | list<primitive> | null
    # note: a null array can be seen as a list<primitive>, which contains only
    #   nulls and the type of the primitive is unknown.
    # example:
    #   null,
    #   list<null>,  # like list<list<unknown_type>> with only null values.
    #   list<list<int64>>,
    struct := struct<{field: nested_list | struct}> | list<struct>
    # example:
    #   struct<{"foo": list<int64>},
    #   list<struct<{"foo": list<int64>}>>,
    #   struct<{"foo": struct<{"bar": list<list<int64>>}>}>

  `record_batch` is assumed to contain only nested_list and struct columns.
  Each column is visited in turn. A struct column has any outer lists
  wrapping it flattened, after which the array of every struct field is
  visited recursively (see `enumerate_leaves_only`).

  Weights are re-aligned at every flattening step, so for each yielded tuple
  weights[i] is the weight of enumerated_array[i].

  The yielded FeaturePath only addresses the array inside `record_batch` and
  its struct arrays; it says nothing about whether / how a struct array is
  nested.

  Args:
    record_batch: The RecordBatch whose arrays to be visited.
    example_weight_map: an ExampleWeightMap that maps a FeaturePath to its
      corresponding weight column. None is treated as a map without any
      weight feature.
    enumerate_leaves_only: If True, only enumerate leaf arrays. A leaf array
      is an array whose type does not have any struct nested in. Otherwise,
      also enumerate the struct arrays where the leaf arrays are contained.
    wrap_flat_struct_in_list: if True, and if a struct<[Ts]> array is
      encountered, it will be wrapped in a list array, so it becomes a
      list<struct<[Ts]>>, in which each sub-list contains one element. A
      caller can make use of this option to assume all the arrays enumerated
      here are list<inner_type>.

  Yields:
    A tuple. The first term is the path of the feature; the second term is
    the feature array and the third term is the weight array for the feature
    array (i.e. weights[i] is the weight for array[i]).

  Raises:
    ValueError: When the weight column is not a list array whose elements are
      1-element lists.
  """
  if example_weight_map is None:
    example_weight_map = ExampleWeightMap(
        weight_feature=None, per_feature_override=None)

  def _visit(
      path: types.FeaturePath,
      column: pa.Array,
      weights_by_name: Dict[types.FeatureName, np.ndarray],
  ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Yields `column` and/or the leaves nested below it."""
    column_type = column.type
    if not pa.types.is_struct(get_innermost_nested_type(column_type)):
      # Leaf array: emit it with the weights configured for its path.
      yield (path, column, weights_by_name.get(example_weight_map.get(path)))
      return

    if not enumerate_leaves_only:
      struct_weights = weights_by_name.get(example_weight_map.get(path))
      # special handing for a flat struct array -- wrap it in a ListArray
      # whose elements are singleton lists. This way downstream can keep
      # assuming the enumerated arrays are list<*>.
      emitted = column
      if pa.types.is_struct(column_type) and wrap_flat_struct_in_list:
        emitted = array_util.ToSingletonListArray(column)
      yield (path, emitted, struct_weights)

    flattened_struct, parent_indices = flatten_nested(
        column, bool(weights_by_name))
    # Potential optimization:
    # Only flatten weights that we know will be used in the recursion.
    realigned_weights = {}
    for weight_name, weight_values in weights_by_name.items():
      realigned_weights[weight_name] = weight_values[parent_indices]
    for field in flattened_struct.type:
      child_name = field.name
      yield from _visit(
          path.child(child_name), flattened_struct.field(child_name),
          realigned_weights)

  top_level_weights = {}
  for weight_column in example_weight_map.all_weight_features():
    top_level_weights[weight_column] = get_weight_feature(
        record_batch, weight_column)

  for column_name, column in zip(record_batch.schema.names,
                                 record_batch.columns):
    yield from _visit(
        types.FeaturePath([column_name]), column, top_level_weights)
def test_topk_uniques_with_weights(self):
  # non-weighted ordering
  # fa: 3 'a', 2 'e', 2 'd', 2 'c', 1 'b'
  # fb: 1 'v', 1 'w', 1 'x', 1 'y', 1 'z'
  # weighted ordering
  # fa: 20 'e', 20 'd', 15 'a', 10 'c', 5 'b'
  # fb: 6 'z', 4 'x', 4 'y', 4 'w', 2 'v'

  def _parse_dataset_stats(text_proto):
    return text_format.Parse(text_proto,
                             statistics_pb2.DatasetFeatureStatistics())

  input_batch = pa.RecordBatch.from_arrays([
      pa.array([
          ['a', 'b', 'c', 'e'],
          ['a', 'c', 'd', 'a'],
          ['d', 'e'],
      ]),
      pa.array([[5.0], [5.0], [15.0]]),
      pa.array([['v'], ['w', 'x', 'y'], ['z']]),
      pa.array([[2], [4], [6]]),
  ], ['fa', 'w', 'fb', 'w_b'])
  record_batches = [input_batch]

  # Unweighted top-k for 'fa' and 'fb'.
  expected_outputs = [
      _parse_dataset_stats(
          """ features { path { step: 'fa' } type: STRING string_stats { top_values { value: 'a' frequency: 3.0 } top_values { value: 'e' frequency: 2.0 } top_values { value: 'd' frequency: 2.0 } top_values { value: 'c' frequency: 2.0 } rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 } buckets { low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0 } buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 } } } }"""),
      _parse_dataset_stats(
          """ features { type: STRING string_stats { top_values { value: "z" frequency: 1.0 } top_values { value: "y" frequency: 1.0 } top_values { value: "x" frequency: 1.0 } top_values { value: "w" frequency: 1.0 } rank_histogram { buckets { label: "z" sample_count: 1.0 } buckets { low_rank: 1 high_rank: 1 label: "y" sample_count: 1.0 } buckets { low_rank: 2 high_rank: 2 label: "x" sample_count: 1.0 } } } path { step: "fb" } }"""),
  ]
  # Weighted top-k for 'fa' and 'fb'.
  expected_outputs.append(
      _parse_dataset_stats(
          """ features { path { step: 'fa' } type: STRING string_stats { weighted_string_stats { top_values { value: 'e' frequency: 20.0 } top_values { value: 'd' frequency: 20.0 } top_values { value: 'a' frequency: 15.0 } top_values { value: 'c' frequency: 10.0 } rank_histogram { buckets { low_rank: 0 high_rank: 0 label: "e" sample_count: 20.0 } buckets { low_rank: 1 high_rank: 1 label: "d" sample_count: 20.0 } buckets { low_rank: 2 high_rank: 2 label: "a" 
sample_count: 15.0 } } } } }"""))
  expected_outputs.append(
      _parse_dataset_stats(
          """ features { type: STRING string_stats { weighted_string_stats { top_values { value: "z" frequency: 6.0 } top_values { value: "y" frequency: 4.0 } top_values { value: "x" frequency: 4.0 } top_values { value: "w" frequency: 4.0 } rank_histogram { buckets { label: "z" sample_count: 6.0 } buckets { low_rank: 1 high_rank: 1 label: "y" sample_count: 4.0 } buckets { low_rank: 2 high_rank: 2 label: "x" sample_count: 4.0 } } } } path { step: "fb" } }"""))
  # Unique counts for 'fa' and 'fb'.
  expected_outputs.append(
      _parse_dataset_stats(
          """ features { path { step: 'fa' } type: STRING string_stats { unique: 5 } }"""))
  expected_outputs.append(
      _parse_dataset_stats(
          """ features { type: STRING string_stats { unique: 5 } path { step: "fb" } }"""))

  # 'fb' overrides the default weight column 'w' with 'w_b'.
  weight_map = ExampleWeightMap(
      weight_feature='w',
      per_feature_override={types.FeaturePath(['fb']): 'w_b'})
  stats_generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
      example_weight_map=weight_map,
      num_top_values=4,
      num_rank_histogram_buckets=3)
  self.assertSlicingAwareTransformOutputEqual(
      record_batches,
      stats_generator,
      expected_outputs,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
def test_topk_uniques_with_struct_leaves(self):
    """Covers top-k/uniques for leaves nested in a struct column.

    Column 'c' holds lists of structs with optional leaves 'f1' (strings)
    and 'f2' (ints); column 'w' supplies the example weights.
    """
    # Occurrence counts per value (unweighted), both leaves have 4 uniques:
    #   c.f1: 'd' 2, 'b' 2, 'c' 1, 'a' 1
    #   c.f2: 3 -> 2, 2 -> 2, 4 -> 1, 1 -> 1
    # Weighted sums per value (row weights are 1, 2 and 3):
    #   c.f1: 'd' 5, 'c' 2, 'b' 2, 'a' 1
    #   c.f2: 3 -> 4, 4 -> 3, 2 -> 3, 1 -> 1
    first_batch = pa.RecordBatch.from_arrays([
        pa.array([[1.0], [2.0]]),
        pa.array([
            [{'f1': ['a', 'b'], 'f2': [1, 2]}, {'f1': ['b']}],
            [{'f1': ['c', 'd'], 'f2': [2, 3]}, {'f2': [3]}],
        ]),
    ], ['w', 'c'])
    second_batch = pa.RecordBatch.from_arrays([
        pa.array([[3.0]]),
        pa.array([[{'f1': ['d'], 'f2': [4]}]]),
    ], ['w', 'c'])
    record_batches = [first_batch, second_batch]

    def as_feature_stats(proto_text):
        # Every expected entry is parsed into its own fresh message.
        return text_format.Parse(
            proto_text, statistics_pb2.DatasetFeatureStatistics())

    f1_top_k = """
        features{
          type: STRING
          string_stats {
            top_values { value: "d" frequency: 2.0 }
            top_values { value: "b" frequency: 2.0 }
            top_values { value: "c" frequency: 1.0 }
            rank_histogram {
              buckets { label: "d" sample_count: 2.0 }
              buckets { low_rank: 1 high_rank: 1 label: "b" sample_count: 2.0 }
              buckets { low_rank: 2 high_rank: 2 label: "c" sample_count: 1.0 }
            }
          }
          path { step: "c" step: "f1" }
        }"""
    f2_top_k = """
        features {
          string_stats {
            top_values { value: "3" frequency: 2.0 }
            top_values { value: "2" frequency: 2.0 }
            top_values { value: "4" frequency: 1.0 }
            rank_histogram {
              buckets { label: "3" sample_count: 2.0 }
              buckets { low_rank: 1 high_rank: 1 label: "2" sample_count: 2.0 }
              buckets { low_rank: 2 high_rank: 2 label: "4" sample_count: 1.0 }
            }
          }
          path { step: "c" step: "f2" }
        }"""
    f1_uniques = """
        features {
          type: STRING
          string_stats { unique: 4 }
          path { step: "c" step: "f1" }
        }"""
    f2_uniques = """
        features {
          type: INT
          string_stats { unique: 4 }
          path { step: "c" step: "f2" }
        }"""
    f1_weighted_top_k = """
        features {
          type: STRING
          string_stats {
            weighted_string_stats {
              top_values { value: "d" frequency: 5.0 }
              top_values { value: "c" frequency: 2.0 }
              top_values { value: "b" frequency: 2.0 }
              rank_histogram {
                buckets { label: "d" sample_count: 5.0 }
                buckets { low_rank: 1 high_rank: 1 label: "c" sample_count: 2.0 }
                buckets { low_rank: 2 high_rank: 2 label: "b" sample_count: 2.0 }
              }
            }
          }
          path { step: "c" step: "f1" }
        }"""
    f2_weighted_top_k = """
        features {
          string_stats {
            weighted_string_stats {
              top_values { value: "3" frequency: 4.0 }
              top_values { value: "4" frequency: 3.0 }
              top_values { value: "2" frequency: 3.0 }
              rank_histogram {
                buckets { label: "3" sample_count: 4.0 }
                buckets { low_rank: 1 high_rank: 1 label: "4" sample_count: 3.0 }
                buckets { low_rank: 2 high_rank: 2 label: "2" sample_count: 3.0 }
              }
            }
          }
          path { step: "c" step: "f2" }
        }"""
    expected_stats = [
        as_feature_stats(proto_text)
        for proto_text in (
            f1_top_k,
            f2_top_k,
            f1_uniques,
            f2_uniques,
            f1_weighted_top_k,
            f2_weighted_top_k,
        )
    ]

    # The schema declares c.f2 as a categorical INT leaf; presumably that is
    # what makes the generator emit string-style stats for it.
    struct_schema = text_format.Parse(
        """
        feature {
          name: "c"
          type: STRUCT
          struct_domain {
            feature {
              name: "f2"
              type: INT
              int_domain { is_categorical: true }
            }
          }
        }
        """, schema_pb2.Schema())
    stats_generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
        schema=struct_schema,
        example_weight_map=ExampleWeightMap(weight_feature='w'),
        num_top_values=3,
        num_rank_histogram_buckets=3)
    self.assertSlicingAwareTransformOutputEqual(
        record_batches,
        stats_generator,
        expected_stats,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)
pa.array([ { "sf1": [[1, 2], [3]], "sf2": [None], }, None, ]), pa.array([[1], [2]]), pa.array([[2], [4]]), pa.array([[6], [8]]), ], ["f1", "f2", "f3", "w", "w_override1", "w_override2"]) _EXAMPLE_WEIGHT_MAP = ExampleWeightMap( weight_feature="w", per_feature_override={ types.FeaturePath(["f2"]): "w_override1", types.FeaturePath(["f2", "sf1"]): "w_override2", types.FeaturePath(["f2", "sf2"]): "w_override2", types.FeaturePath(["f2", "sf2", "ssf1"]): "w_override1", }) ExpectedArray = tfx_namedtuple.namedtuple( "ExpectedArray", ["array", "parent_indices", "weights"]) _FEATURES_TO_ARRAYS = { types.FeaturePath(["f1"]): ExpectedArray(pa.array([[1], [2, 3]]), [0, 1], [1, 2]), types.FeaturePath(["w"]): ExpectedArray(pa.array([[1], [2]]), [0, 1], [1, 2]), types.FeaturePath(["w_override1"]): ExpectedArray(pa.array([[2], [4]]), [0, 1], [1, 2]), types.FeaturePath(["w_override2"]): ExpectedArray(pa.array([[6], [8]]), [0, 1], [1, 2]),
def test_topk_uniques_with_frequency_threshold(self):
    """Covers top-k/uniques when frequency thresholds prune the top values.

    The generator is configured with frequency_threshold=2 and
    weighted_frequency_threshold=15 over a single string feature 'fa'
    weighted by column 'w'.
    """
    # Occurrence counts: 'a' 3, 'b' 2, 'z' 1, 'y' 1, 'x' 1 (5 uniques).
    # Weighted sums:     'a' 35, 'z' 15, 'x' 15, 'b' 10, 'y' 5.
    # Only values meeting the respective threshold are expected below.
    fa_values = pa.array([['a', 'b', 'y', 'b'], ['a', 'x', 'a', 'z']])
    weights = pa.array([[5.0], [15.0]])
    record_batches = [
        pa.RecordBatch.from_arrays([fa_values, weights], ['fa', 'w']),
    ]

    def as_feature_stats(proto_text):
        # Every expected entry is parsed into its own fresh message.
        return text_format.Parse(
            proto_text, statistics_pb2.DatasetFeatureStatistics())

    fa_top_k = """
        features {
          path { step: 'fa' }
          type: STRING
          string_stats {
            top_values { value: 'a' frequency: 3 }
            top_values { value: 'b' frequency: 2 }
            rank_histogram {
              buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 }
              buckets { low_rank: 1 high_rank: 1 label: "b" sample_count: 2.0 }
            }
          }
        }"""
    fa_weighted_top_k = """
        features {
          path { step: 'fa' }
          type: STRING
          string_stats {
            weighted_string_stats {
              top_values { value: 'a' frequency: 35.0 }
              top_values { value: 'z' frequency: 15.0 }
              top_values { value: 'x' frequency: 15.0 }
              rank_histogram {
                buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 35.0 }
                buckets { low_rank: 1 high_rank: 1 label: "z" sample_count: 15.0 }
                buckets { low_rank: 2 high_rank: 2 label: "x" sample_count: 15.0 }
              }
            }
          }
        }"""
    fa_uniques = """
        features {
          path { step: 'fa' }
          type: STRING
          string_stats { unique: 5 }
        }"""
    expected_stats = [
        as_feature_stats(proto_text)
        for proto_text in (fa_top_k, fa_weighted_top_k, fa_uniques)
    ]

    stats_generator = top_k_uniques_stats_generator.TopKUniquesStatsGenerator(
        example_weight_map=ExampleWeightMap(weight_feature='w'),
        num_top_values=5,
        frequency_threshold=2,
        weighted_frequency_threshold=15,
        num_rank_histogram_buckets=3)
    self.assertSlicingAwareTransformOutputEqual(
        record_batches,
        stats_generator,
        expected_stats,
        add_default_slice_key_to_input=True,
        add_default_slice_key_to_output=True)