Example #1
0
  def test_get_binary_array_total_byte_size(self, binary_like_type):
    """Checks the byte total for a full, a sliced and an empty array."""
    total_byte_size = array_util.GetBinaryArrayTotalByteSize
    values = [b"abc", None, b"def", b"", b"ghi"]

    # Three 3-byte entries; the None and the empty entry add nothing.
    full = pa.array(values, type=binary_like_type)
    self.assertEqual(9, total_byte_size(full))

    # Slices must be measured on their own contents, not the parent's.
    middle = full.slice(1, 2)
    self.assertEqual(3, total_byte_size(middle))
    tail = full.slice(2)
    self.assertEqual(6, total_byte_size(tail))

    # An array with no elements has no bytes.
    no_values = pa.array([], type=binary_like_type)
    self.assertEqual(0, total_byte_size(no_values))
Example #2
0
    def test_get_binary_array_total_byte_size(self):
        """Checks the byte total for binary, sliced, unicode and empty arrays."""
        total_byte_size = array_util.GetBinaryArrayTotalByteSize

        # Three 3-byte entries; the None and the empty entry add nothing.
        full = pa.array([b"abc", None, b"def", b"", b"ghi"])
        self.assertEqual(9, total_byte_size(full))

        # Slices must be measured on their own contents, not the parent's.
        middle = full.slice(1, 2)
        self.assertEqual(3, total_byte_size(middle))
        tail = full.slice(2)
        self.assertEqual(6, total_byte_size(tail))

        # A unicode (string) array is accepted as well.
        text = pa.array([u"abc"])
        self.assertEqual(3, total_byte_size(text))

        # An explicitly typed array with no elements has no bytes.
        no_values = pa.array([], type=pa.binary())
        self.assertEqual(0, total_byte_size(no_values))
 def update(self, feature_array: pa.Array) -> None:
   """Add the length of the values in feature_array to total_bytes_length.

   Args:
     feature_array: Arrow array holding the feature values. It is flattened
       with arrow_util.flatten_nested before being measured. An array of
       null type is ignored.
   """
   if pa.types.is_null(feature_array.type):
     return
   flattened_values_array, _ = arrow_util.flatten_nested(feature_array)
   if arrow_util.is_binary_like(flattened_values_array.type):
     # Binary-like values are measured directly in bytes. int() keeps the
     # running total a plain Python integer.
     self.total_bytes_length += int(
         array_util.GetBinaryArrayTotalByteSize(flattened_values_array))
   elif flattened_values_array:
     # Non-binary values are measured by the length of their str() form.
     # The conversion to NumPy is only attempted for a non-empty array.
     # This could be computed faster by taking log10 of the integer.
     def _len_after_conv(s):
       return len(str(s))
     str_lengths = np.vectorize(_len_after_conv, otypes=[np.int32])(
         np.asarray(flattened_values_array))
     # Sum in 64 bits so the int32 lengths cannot wrap where the default
     # integer is 32-bit, and convert to a Python int so the running total
     # has the same type whichever branch ran.
     self.total_bytes_length += int(np.sum(str_lengths, dtype=np.int64))