def test_value_counts_binary(self):
    """ValueCounts over a binary array maps each value to its occurrence count."""
    values = pa.array([b"abc", b"ghi", b"def", b"ghi", b"ghi", b"def"])
    result = self._value_counts_struct_array_to_dict(
        arrow_util.ValueCounts(values))
    self.assertDictEqual(result, {b"abc": 1, b"ghi": 3, b"def": 2})
def _to_topk_tuples(
    sliced_table: Tuple[Text, pa.Table],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[Text, FeaturePathTuple, Any], Union[int, Tuple[
    int, Union[int, float]]]]]:
    """Generates tuples for computing top-k and uniques from input tables."""
    slice_key, table = sliced_table
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
            table, weight_column=weight_feature, enumerate_leaves_only=True):
        # Only categorical and string-valued leaves take part in top-k/uniques.
        is_categorical = feature_path in categorical_features
        if not (is_categorical or
                stats_util.get_feature_type_from_arrow_type(
                    feature_path, feature_array.type) ==
                statistics_pb2.FeatureNameStatistics.STRING):
            continue
        flattened = feature_array.flatten()
        if weights is not None and flattened:
            # Slow path: weighted uniques.
            values_np = arrow_util.primitive_array_to_numpy(flattened)
            row_indices = arrow_util.primitive_array_to_numpy(
                arrow_util.GetFlattenedArrayParentIndices(feature_array))
            for value, count, weight in _weighted_unique(
                    values_np, weights[row_indices]):
                yield (slice_key, feature_path.steps(), value), (count, weight)
        else:
            # Unweighted path: pull the child arrays out of the ValueCounts
            # struct array wholesale.
            struct_array = arrow_util.ValueCounts(flattened)
            for value, count in six.moves.zip(
                    struct_array.field('values').to_pylist(),
                    struct_array.field('counts').to_pylist()):
                yield (slice_key, feature_path.steps(), value), count
def add_input(self, accumulator, input_table):
    """Accumulates per-feature value counts from one batch of input data.

    Args:
      accumulator: Dict mapping types.FeaturePath to _ValueCounts; updated in
        place with the counts observed in `input_table`.
      input_table: An Arrow Table with one list-valued column per feature.

    Returns:
      The updated accumulator.
    """
    weight_column = (input_table.column(self._weight_feature)
                     if self._weight_feature else None)
    # NOTE(review): assumes each column holds a single chunk — confirm that
    # upstream batching guarantees this.
    weight_array = weight_column.data.chunk(0) if weight_column else []
    if weight_array:
        flattened_weights = arrow_util.FlattenListArray(
            weight_array).to_numpy()
    for column in input_table.columns:
        feature_name = column.name
        # Skip the weight feature.
        if feature_name == self._weight_feature:
            continue
        feature_path = types.FeaturePath([feature_name])
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, column.type)
        # if it's not a categorical feature nor a string feature, we don't
        # bother with topk stats.
        if not (feature_path in self._categorical_features or
                feature_type == statistics_pb2.FeatureNameStatistics.STRING):
            continue
        value_array = column.data.chunk(0)
        flattened_values = arrow_util.FlattenListArray(value_array)
        # Compute unweighted counts. Extracting the 'values'/'counts' child
        # arrays wholesale avoids a per-element .as_py() conversion and keeps
        # this consistent with the sibling add_input implementation.
        unweighted_counts = collections.Counter()
        value_counts = arrow_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
            unweighted_counts[value] = count
        # Compute weighted counts if a weight feature is specified.
        weighted_counts = _WeightedCounter()
        if weight_array:
            if (pa.types.is_binary(flattened_values.type) or
                    pa.types.is_string(flattened_values.type)):
                # no free conversion.
                flattened_values_np = flattened_values.to_pandas()
            else:
                flattened_values_np = flattened_values.to_numpy()
            indices = arrow_util.GetFlattenedArrayParentIndices(value_array)
            weighted_counts.weighted_update(
                flattened_values_np, flattened_weights[indices.to_numpy()])
        if feature_path not in accumulator:
            accumulator[feature_path] = _ValueCounts(
                unweighted_counts=unweighted_counts,
                weighted_counts=weighted_counts)
        else:
            accumulator[feature_path].unweighted_counts.update(
                unweighted_counts)
            accumulator[feature_path].weighted_counts.update(weighted_counts)
    return accumulator
def add_input(
        self, accumulator: Dict[types.FeaturePath, _ValueCounts],
        input_table: pa.Table) -> Dict[types.FeaturePath, _ValueCounts]:
    """Accumulates value counts for every top-k-eligible feature in the batch.

    Args:
      accumulator: Dict mapping types.FeaturePath to _ValueCounts; updated in
        place.
      input_table: An Arrow Table with one column per feature.

    Returns:
      The updated accumulator.
    """
    for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
            input_table, weight_column=self._weight_feature,
            enumerate_leaves_only=True):
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, leaf_array.type)
        # Top-k stats are only kept for categorical and string features.
        if not (feature_path in self._categorical_features or
                feature_type == statistics_pb2.FeatureNameStatistics.STRING):
            continue
        flattened = leaf_array.flatten()
        # Unweighted counts come straight from the ValueCounts struct array.
        struct_array = arrow_util.ValueCounts(flattened)
        unweighted = collections.Counter(
            dict(six.moves.zip(struct_array.field('values').to_pylist(),
                               struct_array.field('counts').to_pylist())))
        # Weighted counts are only computed when a weight feature is present.
        weighted = _WeightedCounter()
        if weights is not None:
            values_np = arrow_util.primitive_array_to_numpy(flattened)
            parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                leaf_array)
            weighted.weighted_update(
                values_np,
                weights[arrow_util.primitive_array_to_numpy(parent_indices)])
        existing = accumulator.get(feature_path)
        if existing is None:
            accumulator[feature_path] = _ValueCounts(
                unweighted_counts=unweighted, weighted_counts=weighted)
        else:
            existing.unweighted_counts.update(unweighted)
            existing.weighted_counts.update(weighted)
    return accumulator
def _to_topk_tuples(sliced_table, categorical_features, weight_feature=None):
    """Generates tuples for computing top-k and uniques from input tables."""
    slice_key, table = sliced_table
    weight_column = table.column(weight_feature) if weight_feature else None
    weight_array = weight_column.data.chunk(0) if weight_column else []
    if weight_array:
        flattened_weights = arrow_util.FlattenListArray(
            weight_array).to_numpy()
    for feature_column in table.columns:
        feature_name = feature_column.name
        # Skip the weight feature.
        if feature_name == weight_feature:
            continue
        feature_path = types.FeaturePath([feature_name])
        column_type = feature_column.type
        # Only categorical features and list-of-binary/string columns
        # participate in top-k/uniques.
        is_topk_eligible = (
            feature_path in categorical_features or
            column_type.equals(pa.list_(pa.binary())) or
            column_type.equals(pa.list_(pa.string())))
        if not is_topk_eligible:
            continue
        value_array = feature_column.data.chunk(0)
        flattened_values = arrow_util.FlattenListArray(value_array)
        if weight_array and flattened_values:
            if (pa.types.is_binary(flattened_values.type) or
                    pa.types.is_string(flattened_values.type)):
                # no free conversion.
                values_np = flattened_values.to_pandas()
            else:
                values_np = flattened_values.to_numpy()
            parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                value_array)
            paired_weights = flattened_weights[parent_indices.to_numpy()]
            for value, count, weight in _weighted_unique(
                    values_np, paired_weights):
                yield (slice_key, feature_path.steps(), value), (count, weight)
        else:
            struct_array = arrow_util.ValueCounts(flattened_values)
            values = struct_array.field('values').to_pylist()
            counts = struct_array.field('counts').to_pylist()
            for value, count in six.moves.zip(values, counts):
                yield (slice_key, feature_path.steps(), value), count
def test_value_counts_empty(self):
    """ValueCounts of an empty array is an empty mapping."""
    result = self._value_counts_struct_array_to_dict(
        arrow_util.ValueCounts(pa.array([])))
    self.assertDictEqual(result, {})
def test_value_counts_integer(self):
    """ValueCounts over an integer array maps each value to its occurrence count."""
    result = self._value_counts_struct_array_to_dict(
        arrow_util.ValueCounts(pa.array([1, 4, 1, 3, 1, 4])))
    self.assertDictEqual(result, {1: 3, 4: 2, 3: 1})