def test_get_binary_array_total_byte_size(self, binary_like_type):
    """Check GetBinaryArrayTotalByteSize on full, sliced and empty arrays.

    The array is built with `binary_like_type` so the same expectations are
    verified for every type this test is invoked with.
    """
    values = pa.array(
        [b"abc", None, b"def", b"", b"ghi"], type=binary_like_type)

    # (expected total bytes, array under test). The null entry and the empty
    # string contribute zero bytes to the expected totals.
    cases = [
        (9, values),               # "abc" + "def" + "ghi"
        (3, values.slice(1, 2)),   # [None, "def"]
        (6, values.slice(2)),      # ["def", "", "ghi"]
        (0, pa.array([], type=binary_like_type)),
    ]
    for expected_size, candidate in cases:
        self.assertEqual(
            expected_size, array_util.GetBinaryArrayTotalByteSize(candidate))
def test_get_binary_array_total_byte_size(self):
    """Check GetBinaryArrayTotalByteSize on binary, unicode and empty input."""
    total_byte_size = array_util.GetBinaryArrayTotalByteSize

    # Binary array containing a null and an empty string, neither of which
    # adds to the expected totals below.
    source = pa.array([b"abc", None, b"def", b"", b"ghi"])
    self.assertEqual(9, total_byte_size(source))

    # Slice with an explicit length: [None, "def"].
    middle = source.slice(1, 2)
    self.assertEqual(3, total_byte_size(middle))

    # Slice running to the end of the array: ["def", "", "ghi"].
    tail = source.slice(2)
    self.assertEqual(6, total_byte_size(tail))

    # Array built from a unicode literal.
    text = pa.array([u"abc"])
    self.assertEqual(3, total_byte_size(text))

    # An empty binary array has no bytes at all.
    nothing = pa.array([], type=pa.binary())
    self.assertEqual(0, total_byte_size(nothing))
def update(self, feature_array: pa.Array) -> None:
    """Add the byte length of the values in `feature_array` to the running total.

    Arrays of Arrow null type are ignored. The input is flattened first; for
    binary-like values the total byte size is added, and for any other
    non-empty values the summed length of `str(value)` is added.

    Args:
      feature_array: Arrow array holding the values of one feature.
    """
    if pa.types.is_null(feature_array.type):
        return

    # Only the flattened values are needed; the second result is discarded.
    values, _ = arrow_util.flatten_nested(feature_array)

    if arrow_util.is_binary_like(values.type):
        # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
        # with Python3). Converting to int first keeps the arithmetic on the
        # cheaper integer type in Python2.
        byte_size = array_util.GetBinaryArrayTotalByteSize(values)
        self.total_bytes_length += int(byte_size)
        return

    # The NumPy conversion below is only attempted for a non-empty array.
    if not values:
        return

    # This could be computed faster by taking log10 of the integer.
    def _str_len(value):
        return len(str(value))

    measure = np.vectorize(_str_len, otypes=[np.int32])
    lengths = measure(np.asarray(values))
    self.total_bytes_length += np.sum(lengths)