def update(self, feature_column, feature_type, num_values_quantiles_combiner, weight_column=None):
  """Updates the partial common statistics from one feature column.

  Args:
    feature_column: An arrow Column holding the feature's values, one list
      array per chunk.
    feature_type: The statistics type of the feature, or None if it could
      not be deduced from this batch.
    num_values_quantiles_combiner: A quantiles combiner used to summarize
      the distribution of the number of values per example.
    weight_column: Optional arrow Column of per-example weights. Must have
      the same number of chunks as feature_column, and each weight chunk
      must have one weight per example in the corresponding feature chunk.

  Raises:
    TypeError: If the feature's type differs across batches.
    ValueError: If the weight column's chunking or per-chunk length does
      not match the feature column's.
  """
  # All the values in this column is null and we cannot deduce the type of
  # the feature. This is not an error as this feature might have some values
  # in other batches.
  if feature_type is None:
    return
  if self.type is None:
    self.type = feature_type
  elif self.type != feature_type:
    raise TypeError('Cannot determine the type of feature %s. '
                    'Found values of types %s and %s.' %
                    (feature_column.name, self.type, feature_type))

  # There is nothing to collect from an empty column.
  if not feature_column:
    return
  if weight_column and (feature_column.data.num_chunks !=
                        weight_column.data.num_chunks):
    raise ValueError(
        'Expected the feature column {} and weight column {} to have the '
        'same number of chunks.'.format(feature_column.name,
                                        weight_column.name))
  weight_chunks = weight_column.data.iterchunks() if weight_column else []
  for feature_array, weight_array in six.moves.zip_longest(
      feature_column.data.iterchunks(), weight_chunks, fillvalue=None):
    num_values = arrow_util.ListLengthsFromListArray(
        feature_array).to_numpy()
    none_mask = arrow_util.GetArrayNullBitmapAsByteArray(
        feature_array).to_numpy().view(np.bool)

    num_values_not_none = num_values[~none_mask]
    self.num_non_missing += len(feature_array) - feature_array.null_count
    # BUG FIX: np.max / np.min cannot handle empty arrays, which occurs
    # when a chunk consists entirely of null values. Skip the per-value
    # statistics for such a chunk instead of crashing.
    if num_values_not_none.size:
      self.max_num_values = max(np.max(num_values_not_none),
                                self.max_num_values)
      self.min_num_values = min(np.min(num_values_not_none),
                                self.min_num_values)
      self.total_num_values += np.sum(num_values_not_none)
      self.num_values_summary = num_values_quantiles_combiner.add_input(
          self.num_values_summary, [num_values_not_none])
    # BUG FIX: compare against None rather than relying on the array's
    # truthiness, so that an empty weight chunk is still validated against
    # the feature chunk below instead of being silently skipped.
    if weight_array is not None:
      weights = (arrow_util.FlattenListArray(
          weight_array).to_numpy().astype(np.float32, copy=False))
      if weights.size != num_values.size:
        raise ValueError('Weight feature must not be missing.')
      self.weighted_total_num_values += np.sum(num_values * weights)
      self.weighted_num_non_missing += np.sum(weights[~none_mask])
def test_get_array_null_bitmap_as_byte_array(self):
  """Checks the null bitmap for empty, mixed, all-valid and all-null arrays."""
  cases = [
      (pa.array([], type=pa.int32()), []),
      (pa.array([1, 2, None, 3, None], type=pa.int32()), [0, 0, 1, 0, 1]),
      (pa.array([1, 2, 3]), [0, 0, 0]),
      (pa.array([None, None, None], type=pa.int32()), [1, 1, 1]),
  ]
  for input_array, expected_mask in cases:
    null_masks = arrow_util.GetArrayNullBitmapAsByteArray(input_array)
    self.assertTrue(
        null_masks.equals(pa.array(expected_mask, type=pa.uint8())))

  # Demonstrate that the returned array can be converted to a numpy boolean
  # array w/o copying. null_masks still holds the result of the last
  # (all-null) case above.
  np.testing.assert_equal(
      np.array([True, True, True]), null_masks.to_numpy().view(np.bool))
def update(self, feature_path: types.FeaturePath,
           feature_array: pa.Array,
           feature_type: types.FeatureNameStatisticsType,
           num_values_quantiles_combiner: Any,
           weights: Optional[np.ndarray] = None) -> None:
  """Update the partial common statistics using the input value."""
  # A batch in which every value is null tells us nothing about the
  # feature's type. That is not an error: other batches may still carry
  # values for this feature.
  if feature_type is None:
    return
  if self.type is None:
    self.type = feature_type
  elif self.type != feature_type:
    raise TypeError('Cannot determine the type of feature %s. '
                    'Found values of types %s and %s.' %
                    (feature_path, self.type, feature_type))

  # An empty array contributes nothing, and np.max / np.min further down
  # cannot handle empty inputs.
  if not feature_array:
    return

  value_lengths = arrow_util.primitive_array_to_numpy(
      arrow_util.ListLengthsFromListArray(feature_array))
  null_mask = arrow_util.primitive_array_to_numpy(
      arrow_util.GetArrayNullBitmapAsByteArray(feature_array)).view(np.bool)
  present_mask = ~null_mask
  self.num_non_missing += len(feature_array) - feature_array.null_count
  present_lengths = value_lengths[present_mask]
  # Nothing but nulls in this array: bail out before np.min/np.max, which
  # would fail on an empty array.
  if present_lengths.size == 0:
    return
  # Use np.maximum.reduce(present_lengths, initial=self.max_num_values)
  # once we upgrade to numpy 1.16
  self.max_num_values = max(np.max(present_lengths), self.max_num_values)
  self.min_num_values = min(np.min(present_lengths), self.min_num_values)
  self.total_num_values += np.sum(present_lengths)
  self.num_values_summary = num_values_quantiles_combiner.add_input(
      self.num_values_summary, [present_lengths])
  if weights is None:
    return
  if weights.size != value_lengths.size:
    raise ValueError('Weight feature must not be missing.')
  self.weighted_total_num_values += np.sum(value_lengths * weights)
  self.weighted_num_non_missing += np.sum(weights[present_mask])