def test_value_counts_binary(self):
  """ValueCounts over a binary array reports one count per distinct value."""
  values = [b"abc", b"ghi", b"def", b"ghi", b"ghi", b"def"]
  expected = {b"abc": 1, b"ghi": 3, b"def": 2}
  result = self._value_counts_struct_array_to_dict(
      array_util.ValueCounts(pa.array(values)))
  self.assertDictEqual(result, expected)
def _LargeBinaryCanBeValueCounted() -> bool:
  """Returns True if a large binary array can be value counted.

  Probes the underlying Arrow kernel with an empty large_binary array; older
  Arrow releases raise (typically a not-implemented error) for this type, so
  callers use this to decide whether large-binary code paths are usable.
  """
  try:
    array_util.ValueCounts(pa.array([], type=pa.large_binary()))
  except Exception:  # pylint:disable=broad-except
    # Any runtime failure means "unsupported". Unlike the bare `except:` this
    # does not swallow SystemExit/KeyboardInterrupt.
    return False
  return True
def add_input(
    self, accumulator: Dict[types.FeaturePath, _ValueCounts],
    input_record_batch: pa.RecordBatch
) -> Dict[types.FeaturePath, _ValueCounts]:
  """Accumulates per-feature value counts from one input record batch.

  Only categorical features and STRING features contribute to top-k stats;
  all other features are skipped.

  Args:
    accumulator: Mapping from feature path to its running _ValueCounts;
      mutated in place and also returned.
    input_record_batch: The Arrow RecordBatch to accumulate.

  Returns:
    The updated accumulator.
  """
  for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
      input_record_batch,
      weight_column=self._weight_feature,
      enumerate_leaves_only=True):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, leaf_array.type)
    # Skip arrays whose type cannot be mapped to a feature type (e.g. null).
    if feature_type is None:
      continue
    # if it's not a categorical feature nor a string feature, we don't bother
    # with topk stats.
    if (feature_path in self._categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values = leaf_array.flatten()
      unweighted_counts = collections.Counter()
      # Compute unweighted counts.
      value_counts = array_util.ValueCounts(flattened_values)
      values = value_counts.field('values').to_pylist()
      counts = value_counts.field('counts').to_pylist()
      for value, count in six.moves.zip(values, counts):
        unweighted_counts[value] = count
      # Compute weighted counts if a weight feature is specified.
      weighted_counts = _WeightedCounter()
      if weights is not None:
        flattened_values_np = np.asarray(flattened_values)
        # Map each flattened value back to its parent row to pick its weight.
        parent_indices = array_util.GetFlattenedArrayParentIndices(
            leaf_array)
        weighted_counts.weighted_update(
            flattened_values_np, weights[np.asarray(parent_indices)])
      # First sighting of this feature creates the entry; later batches merge
      # into the existing counters.
      if feature_path not in accumulator:
        accumulator[feature_path] = _ValueCounts(
            unweighted_counts=unweighted_counts,
            weighted_counts=weighted_counts)
      else:
        accumulator[feature_path].unweighted_counts.update(
            unweighted_counts)
        accumulator[feature_path].weighted_counts.update(
            weighted_counts)
  return accumulator
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    example_weight_map: ExampleWeightMap,
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
    int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input."""
  slice_key, record_batch = sliced_record_batch
  has_any_weight = bool(example_weight_map.all_weight_features())
  for path, leaves, weights in arrow_util.enumerate_arrays(
      record_batch,
      example_weight_map=example_weight_map,
      enumerate_leaves_only=True):
    leaf_type = stats_util.get_feature_type_from_arrow_type(path, leaves.type)
    # Bytes features never contribute to top-k / uniques.
    if path in bytes_features:
      continue
    is_categorical_int = (
        leaf_type == statistics_pb2.FeatureNameStatistics.INT and
        path in categorical_features)
    is_string = leaf_type == statistics_pb2.FeatureNameStatistics.STRING
    if not (is_categorical_int or is_string):
      continue
    flat_values, parent_indices = arrow_util.flatten_nested(
        leaves, weights is not None)
    if weights is not None and flat_values:
      # Slow path: weighted uniques.
      for value, count, weight in _weighted_unique(
          np.asarray(flat_values), weights[parent_indices]):
        yield (slice_key, path.steps(), value), (count, weight)
    else:
      value_counts = array_util.ValueCounts(flat_values)
      pairs = zip(value_counts.field('values').to_pylist(),
                  value_counts.field('counts').to_pylist())
      if has_any_weight:
        # Emit a unit weight so downstream combiners see a uniform shape.
        for value, count in pairs:
          yield ((slice_key, path.steps(), value), (count, 1))
      else:
        for value, count in pairs:
          yield ((slice_key, path.steps(), value), count)
def _to_topk_tuples(
    sliced_table: Tuple[types.SliceKey, pa.Table],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[
    Tuple[Tuple[types.SliceKey, FeaturePathTuple, Any],
          Union[int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from input tables."""
  slice_key, table = sliced_table
  for path, leaves, weights in arrow_util.enumerate_arrays(
      table, weight_column=weight_feature, enumerate_leaves_only=True):
    leaf_type = leaves.type
    # Null-typed and bytes features never contribute to top-k / uniques.
    if pa.types.is_null(leaf_type):
      continue
    if path in bytes_features:
      continue
    wants_topk = (
        path in categorical_features or
        stats_util.get_feature_type_from_arrow_type(path, leaf_type) ==
        statistics_pb2.FeatureNameStatistics.STRING)
    if not wants_topk:
      continue
    flat_values = leaves.flatten()
    if weights is not None and flat_values:
      # Slow path: weighted uniques.
      parent_indices = np.asarray(
          array_util.GetFlattenedArrayParentIndices(leaves))
      for value, count, weight in _weighted_unique(
          np.asarray(flat_values), weights[parent_indices]):
        yield (slice_key, path.steps(), value), (count, weight)
    else:
      value_counts = array_util.ValueCounts(flat_values)
      for value, count in six.moves.zip(
          value_counts.field('values').to_pylist(),
          value_counts.field('counts').to_pylist()):
        yield ((slice_key, path.steps(), value), count)
def test_value_counts_empty(self):
  """ValueCounts over an empty array yields an empty mapping."""
  result = self._value_counts_struct_array_to_dict(
      array_util.ValueCounts(pa.array([])))
  self.assertDictEqual(result, {})
def test_value_counts_integer(self):
  """ValueCounts over an integer array reports per-value frequencies."""
  values = [1, 4, 1, 3, 1, 4]
  expected = {1: 3, 4: 2, 3: 1}
  result = self._value_counts_struct_array_to_dict(
      array_util.ValueCounts(pa.array(values)))
  self.assertDictEqual(result, expected)