def _get_data_buffer(
    self,
) -> tuple[PandasBuffer, Any]:  # Any is for self.dtype tuple
    """
    Build the data buffer of this column and the dtype describing it.

    The column kind (``self.dtype[0]``) selects how the buffer is produced:

    - INT, UINT, FLOAT, BOOL, DATETIME: the column's NumPy array is wrapped
      directly and the column's own dtype tuple is returned.
    - CATEGORICAL: the integer codes are wrapped; the dtype is derived from
      the dtype of the codes.
    - STRING: every ``str`` element is UTF-8 encoded and the bytes are
      concatenated into one ``uint8`` buffer. Elements that are not ``str``
      contribute no bytes.

    Returns
    -------
    tuple
        ``(buffer, dtype)``.

    Raises
    ------
    NotImplementedError
        If the column kind is none of the above.
    """
    kind = self.dtype[0]
    fixed_width_kinds = (
        DtypeKind.INT,
        DtypeKind.UINT,
        DtypeKind.FLOAT,
        DtypeKind.BOOL,
        DtypeKind.DATETIME,
    )

    if kind in fixed_width_kinds:
        array = self._col.to_numpy()
        wrapped = PandasBuffer(array, allow_copy=self._allow_copy)
        return wrapped, self.dtype

    if kind == DtypeKind.CATEGORICAL:
        category_codes = self._col.values.codes
        wrapped = PandasBuffer(category_codes, allow_copy=self._allow_copy)
        return wrapped, self._dtype_from_pandasdtype(category_codes.dtype)

    if kind == DtypeKind.STRING:
        # Flatten the object array of strings into one contiguous byte run.
        # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
        encoded = bytearray()
        for item in self._col.to_numpy():
            if not isinstance(item, str):
                # Non-string entries add nothing to the data buffer.
                continue
            encoded.extend(item.encode(encoding="utf-8"))

        # A NumPy view over the byte array serves as the backing store.
        wrapped = PandasBuffer(np.frombuffer(encoded, dtype="uint8"))

        # note: currently only support native endianness
        string_dtype = (
            DtypeKind.STRING,
            8,
            ArrowCTypes.STRING,
            Endianness.NATIVE,
        )
        return wrapped, string_dtype

    raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
    """
    Return the buffer containing the offset values for variable-size binary
    data (e.g., variable-length strings) and the buffer's associated dtype.

    For a string column the offsets array has ``len(column) + 1`` int64
    entries. Entry ``0`` is ``0`` and entry ``i + 1`` is the cumulative
    UTF-8 byte length of the elements up to and including element ``i``.
    Elements that are not ``str`` (e.g. ``np.nan`` for missing values)
    do not advance the offset.

    Returns
    -------
    tuple
        ``(buffer, dtype)`` where ``dtype`` describes native-endian
        64-bit integers.

    Raises
    ------
    NoBufferPresent
        If the column is not a string column and therefore has no
        offsets buffer.
    """
    if self.dtype[0] != DtypeKind.STRING:
        raise NoBufferPresent(
            "This column has a fixed-length dtype so "
            "it does not have an offsets buffer"
        )

    # For each string, we need to manually determine the next offset
    values = self._col.to_numpy()
    ptr = 0
    offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
    for i, v in enumerate(values):
        # For missing values (in this case, `np.nan` values)
        # we don't increment the pointer
        if isinstance(v, str):
            ptr += len(v.encode(encoding="utf-8"))

        offsets[i + 1] = ptr

    # Convert the offsets to a Pandas "buffer" using
    # the NumPy array as the backing store
    buffer = PandasBuffer(offsets)

    # Assemble the buffer dtype info
    dtype = (
        DtypeKind.INT,
        64,
        ArrowCTypes.INT64,
        Endianness.NATIVE,
    )  # note: currently only support native endianness

    return buffer, dtype
def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
    """
    Return the buffer containing the mask values indicating missing data and
    the buffer's associated dtype.

    Only string columns get a mask here: a byte mask with one boolean per
    element. The second item of ``self.describe_null`` is the mask value
    that marks a missing entry. When it is ``0``, ``str`` elements are
    stored as ``True`` and everything else as ``False``; otherwise the
    encoding is reversed.

    Returns
    -------
    tuple
        ``(buffer, dtype)`` for string columns.

    Raises
    ------
    NoBufferPresent
        If the column's null representation is listed in
        ``_NO_VALIDITY_BUFFER``, i.e. it has no separate mask.
    NotImplementedError
        If the null representation is not listed in
        ``_NO_VALIDITY_BUFFER`` and the column is not a string column.
    """
    null, invalid = self.describe_null

    if self.dtype[0] == DtypeKind.STRING:
        # For now, use byte array as the mask.
        # TODO: maybe store as bit array to save space?..
        buf = self._col.to_numpy()

        # Determine the encoding for valid values
        valid = invalid == 0
        invalid = not valid

        # `np.bool_` rather than the `np.bool8` alias, which NumPy
        # deprecated in 1.24 and removed in 2.0.
        mask = np.zeros(shape=(len(buf),), dtype=np.bool_)
        for i, obj in enumerate(buf):
            mask[i] = valid if isinstance(obj, str) else invalid

        # Convert the mask array to a Pandas "buffer" using
        # a NumPy array as the backing store
        buffer = PandasBuffer(mask)

        # Define the dtype of the returned buffer
        dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
        return buffer, dtype

    try:
        msg = _NO_VALIDITY_BUFFER[null] + " so does not have a separate mask"
    except KeyError as err:
        # TODO: implement for other bit/byte masks?
        # Chain the lookup failure so the unsupported null kind stays
        # visible in the traceback.
        raise NotImplementedError("See self.describe_null") from err

    raise NoBufferPresent(msg)