def __from_arrow__(
    self, array: Union[pyarrow.Array, pyarrow.ChunkedArray]
) -> BaseMaskedArray:
    """
    Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
        Arrow data to convert. If its type differs from the pyarrow type
        derived from ``self.type`` it is cast to that type first.

    Returns
    -------
    BaseMaskedArray
        An instance of ``self.construct_array_type()``. A ChunkedArray
        without any chunks yields an empty array instead of failing.
    """
    import numpy as np
    import pyarrow

    from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

    array_class = self.construct_array_type()

    pyarrow_type = pyarrow.from_numpy_dtype(self.type)
    if not array.type.equals(pyarrow_type):
        array = array.cast(pyarrow_type)

    if isinstance(array, pyarrow.Array):
        chunks = [array]
    else:
        # pyarrow.ChunkedArray
        chunks = array.chunks

    results = []
    for arr in chunks:
        data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
        # The helper's mask is inverted before being handed to the masked
        # array (presumably validity bits -> "is missing" flags).
        num_arr = array_class(data.copy(), ~mask, copy=False)
        results.append(num_arr)

    if not results:
        # A ChunkedArray may hold zero chunks; concatenating an empty list
        # would fail, so build the empty result directly.
        return array_class(
            np.array([], dtype=self.type), np.array([], dtype=np.bool_)
        )
    if len(results) == 1:
        # avoid additional copy in _concat_same_type
        return results[0]
    return array_class._concat_same_type(results)
def __from_arrow__(
    self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
) -> "IntegerArray":
    """
    Construct IntegerArray from pyarrow Array/ChunkedArray.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
        Arrow data to convert. If its type differs from the pyarrow type
        derived from ``self.type`` it is cast to that type first.

    Returns
    -------
    IntegerArray
        The converted values. A ChunkedArray without any chunks yields an
        empty IntegerArray instead of failing.
    """
    import numpy as np
    import pyarrow  # noqa: F811

    from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

    pyarrow_type = pyarrow.from_numpy_dtype(self.type)
    if not array.type.equals(pyarrow_type):
        array = array.cast(pyarrow_type)

    if isinstance(array, pyarrow.Array):
        chunks = [array]
    else:
        # pyarrow.ChunkedArray
        chunks = array.chunks

    results = []
    for arr in chunks:
        data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
        # The helper's mask is inverted before being handed to IntegerArray
        # (presumably validity bits -> "is missing" flags).
        int_arr = IntegerArray(data.copy(), ~mask, copy=False)
        results.append(int_arr)

    if not results:
        # A ChunkedArray may hold zero chunks; concatenating an empty list
        # would fail, so build the empty result directly.
        return IntegerArray(
            np.array([], dtype=self.type), np.array([], dtype=np.bool_)
        )
    if len(results) == 1:
        # avoid additional copy in _concat_same_type
        return results[0]
    return IntegerArray._concat_same_type(results)
def __from_arrow__(
    self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> PeriodArray:
    """
    Construct PeriodArray from pyarrow Array/ChunkedArray.

    Each chunk is read as int64 values plus a boolean mask; positions where
    the inverted mask is True are set to NaT. The per-chunk arrays are then
    concatenated, or an empty PeriodArray with ``self.freq`` is returned when
    there are no chunks at all.
    """
    import pyarrow

    from pandas.core.arrays import PeriodArray
    from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

    # A plain Array is treated as a single chunk.
    pieces = array.chunks if not isinstance(array, pyarrow.Array) else [array]

    converted = []
    for piece in pieces:
        ordinals, bitmask = pyarrow_array_to_numpy_and_mask(
            piece, dtype=np.dtype(np.int64)
        )
        period_chunk = PeriodArray(ordinals.copy(), freq=self.freq, copy=False)
        period_chunk[~bitmask] = NaT
        converted.append(period_chunk)

    if converted:
        return PeriodArray._concat_same_type(converted)
    # No chunks: nothing to concatenate, return an empty array directly.
    return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False)
def __from_arrow__(
    self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BaseMaskedArray:
    """
    Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.

    Raises
    ------
    TypeError
        If the incoming arrow type differs from the expected one and its
        pandas dtype kind is not one of "i", "u" or "f".
    """
    import pyarrow

    from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

    array_class = self.construct_array_type()

    target_type = pyarrow.from_numpy_dtype(self.type)
    if not array.type.equals(target_type):
        # test_from_arrow_type_error raise for string, but allow
        #  through itemsize conversion GH#31896
        source_kind = pandas_dtype(array.type.to_pandas_dtype()).kind
        if source_kind not in ("i", "u", "f"):
            # Could allow "c" or potentially disallow float<->int conversion,
            #  but at the moment we specifically test that uint<->int works
            raise TypeError(
                f"Expected array of {self} type, got {array.type} instead"
            )
        array = array.cast(target_type)

    # A plain Array is treated as a single chunk; otherwise it is a
    # pyarrow.ChunkedArray.
    pieces = [array] if isinstance(array, pyarrow.Array) else array.chunks

    def _to_masked(piece):
        # Convert one arrow chunk; the helper's mask is inverted before use.
        values, bitmask = pyarrow_array_to_numpy_and_mask(piece, dtype=self.type)
        return array_class(values.copy(), ~bitmask, copy=False)

    converted = [_to_masked(piece) for piece in pieces]

    if len(converted) > 1:
        return array_class._concat_same_type(converted)
    if converted:
        # avoid additional copy in _concat_same_type
        return converted[0]
    return array_class(
        np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
    )
def __from_arrow__(
    self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
) -> "PeriodArray":
    """
    Construct PeriodArray from pyarrow Array/ChunkedArray.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
        Arrow data to convert; each chunk is read as int64 values plus a
        boolean mask.

    Returns
    -------
    PeriodArray
        The converted values with frequency ``self.freq``. A ChunkedArray
        without any chunks yields an empty PeriodArray instead of failing.
    """
    import numpy as np
    import pyarrow  # noqa: F811

    from pandas.core.arrays import PeriodArray
    from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

    if isinstance(array, pyarrow.Array):
        chunks = [array]
    else:
        chunks = array.chunks

    results = []
    for arr in chunks:
        data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64")
        parr = PeriodArray(data.copy(), freq=self.freq, copy=False)
        # Positions flagged by the inverted mask are overwritten with NaT.
        parr[~mask] = NaT
        results.append(parr)

    if not results:
        # A ChunkedArray may hold zero chunks; concatenating an empty list
        # would fail, so build the empty result directly.
        return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False)
    return PeriodArray._concat_same_type(results)
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
    """
    Test conversion from pyarrow array to numpy array.

    Modifies the pyarrow buffer to contain padding and offset, which are
    considered valid buffers by pyarrow.

    Also tests empty pyarrow arrays with non empty buffers.
    See https://github.com/pandas-dev/pandas/issues/40896
    """
    np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays

    def check(arrow_arr, expected_data, expected_mask):
        # Convert and compare the first three values and the full mask.
        data, mask = pyarrow_array_to_numpy_and_mask(arrow_arr, np_dtype)
        tm.assert_numpy_array_equal(data[:3], expected_data)
        tm.assert_numpy_array_equal(mask, expected_mask)

    def rebuild(buffers, length, offset):
        # Assemble an array of the same type from raw buffers and validate it.
        rebuilt = pa.Array.from_buffers(
            type=pa_array.type,
            length=length,
            buffers=buffers,
            offset=offset,
        )
        rebuilt.validate()
        return rebuilt

    check(pa_array, np_expected, mask_expected)

    mask_buf = pa_array.buffers()[0]
    values_buf = pa_array.buffers()[1]
    raw_values = pa_array.buffers()[1].to_pybytes()

    # Add trailing padding to the buffer.
    padded_values = pa.py_buffer(raw_values + b"\x00")
    with_trailing = rebuild(
        [mask_buf, padded_values], len(pa_array), pa_array.offset
    )
    check(with_trailing, np_expected, mask_expected)

    # Add offset to the buffer.
    leading = b"\x00" * (pa_array.type.bit_width // 8)
    shifted_values = pa.py_buffer(leading + raw_values)
    shifted_mask = pa.py_buffer(b"\x0E")
    with_offset = rebuild(
        [shifted_mask, shifted_values], len(pa_array), pa_array.offset + 1
    )
    check(with_offset, np_expected, mask_expected)

    # Empty array
    empty_data = np.array([], dtype=np_dtype)
    empty_mask = np.array([], dtype=np.bool_)
    zero_length = rebuild([mask_buf, values_buf], 0, pa_array.offset)
    check(zero_length, empty_data, empty_mask)