def take(self, indices: Sequence[int], allow_fill: bool = False,
         fill_value: Any = None) -> "TensorArray":
    """
    Take elements along the first axis of the underlying tensor.

    See docstring in :class:`ExtensionArray` class in
    ``pandas/core/arrays/base.py`` for information about this method.

    Parameters
    ----------
    indices : sequence of int
        Positions of the elements to take.
    allow_fill : bool, default False
        * False: `indices` is handed unchanged to ``ndarray.take`` on the
          underlying tensor (``axis=0``).
        * True: negative entries in `indices` mark missing elements, which
          are filled with `fill_value`. `indices` is checked with
          ``validate_indices`` first.
    fill_value : any, optional
        Value used for missing elements when `allow_fill` is True.
        ``None`` is replaced by ``np.nan``.

    Returns
    -------
    TensorArray
    """
    if allow_fill:
        # From API docs: "[If allow_fill == True, then] negative values in
        # `indices` indicate missing values and are set to `fill_value`".
        indices = np.asarray(indices, dtype=np.intp)
        validate_indices(indices, len(self._tensor))

        # Only build a filled array when something is actually missing;
        # otherwise the plain NumPy take below is sufficient.
        present = indices >= 0
        if not present.all():
            if fill_value is None:
                fill_value = np.nan

            # Start from an array populated with the fill value.
            # NOTE: np.full infers the result dtype from `fill_value`.
            values = np.full((len(indices), ) + self._tensor.shape[1:],
                             fill_value)
            # Copy every non-missing element in a single vectorized
            # assignment along the first axis.
            values[present] = self._tensor[indices[present]]
            return TensorArray(values)

    # Delegate take to numpy array
    values = self._tensor.take(indices, axis=0)
    return TensorArray(values)
def take(self, indices: Sequence[int], allow_fill: bool = False,
         fill_value: Any = None) -> "TensorArray":
    """
    Take elements from an array.

    Parameters
    ----------
    indices : sequence of int
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: negative values in `indices` indicate missing values.
          These values are set to `fill_value`. Negative values other
          than ``-1`` raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case ``np.nan`` is used.

        For many ExtensionArrays, there will be two representations of
        `fill_value`: a user-facing "boxed" scalar, and a low-level
        physical NA value. `fill_value` should be the user-facing version,
        and the implementation should handle translating that to the
        physical version for processing the take if necessary.

    Returns
    -------
    TensorArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.
    api.extensions.take : Take elements from an array.

    Notes
    -----
    ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
    ``iloc``, when `indices` is a sequence of values. Additionally,
    it's called by :meth:`Series.reindex`, or any other method
    that causes realignment, with a `fill_value`.

    Examples
    --------
    Here's an example implementation, which relies on casting the
    extension array to object dtype. This uses the helper method
    :func:`pandas.api.extensions.take`.

    .. code-block:: python

       def take(self, indices, allow_fill=False, fill_value=None):
           from pandas.core.algorithms import take

           # If the ExtensionArray is backed by an ndarray, then
           # just pass that here instead of coercing to object.
           data = self.astype(object)

           if allow_fill and fill_value is None:
               fill_value = self.dtype.na_value

           # fill value should always be translated from the scalar
           # type for the array, to the physical storage type for
           # the data, before passing to take.

           result = take(data, indices, fill_value=fill_value,
                         allow_fill=allow_fill)
           return self._from_sequence(result, dtype=self.dtype)
    """
    if allow_fill:
        # With allow_fill being True, negative values in `indices` indicate
        # missing values and should be set to `fill_value`.
        indices = np.asarray(indices, dtype=np.intp)
        validate_indices(indices, len(self._tensor))

        # Check if there are missing indices to fill, otherwise we can
        # delegate to NumPy ndarray .take().
        is_nonneg = indices >= 0
        if not is_nonneg.all():
            if fill_value is None:
                fill_value = np.nan

            # Create an array populated with fill value.
            # NOTE: np.full infers the result dtype from `fill_value`.
            values = np.full((len(indices), ) + self._tensor.shape[1:],
                             fill_value)

            # Copy the requested tensors into their output rows. Masked
            # assignment works along the first axis; np.put must not be
            # used here because it addresses the *flattened* array and
            # would only overwrite single scalars of multi-dimensional
            # elements.
            values[is_nonneg] = self._tensor[indices[is_nonneg]]

            return TensorArray(values)

    # Delegate take to NumPy array.
    values = self._tensor.take(indices, axis=0)
    return TensorArray(values)
def _reindex_indexer(
    self: T,
    new_axis,
    indexer,
    axis: int,
    fill_value=None,
    allow_dups: bool = False,
    copy: bool = True,
    use_na_proxy: bool = False,
) -> T:
    """
    Return a manager whose ``axis`` is replaced by ``new_axis``, with the
    stored arrays rearranged according to ``indexer``.

    Parameters
    ----------
    new_axis : Index
    indexer : ndarray of int64 or None
        pandas-indexer with -1's only. ``None`` means only the axis object
        is swapped and no array is rearranged.
    axis : int
    fill_value : object, default None
    allow_dups : bool, default False
        When False, ``self._axes[axis]._validate_can_reindex`` is called
        on the indexer first.
    copy : bool, default True
        Only consulted when ``indexer`` is None.
    use_na_proxy : bool, default False
        Forwarded to ``self._make_na_array`` for ``-1`` entries when
        ``axis == 1``.
    """
    if indexer is None:
        if new_axis is self._axes[axis] and not copy:
            return self

        relabeled = self.copy(deep=copy)
        axes = list(self._axes)
        axes[axis] = new_axis
        relabeled._axes = axes
        return relabeled

    if not allow_dups:
        # some axes don't allow reindexing with dups
        self._axes[axis]._validate_can_reindex(indexer)

    if axis >= self.ndim:
        raise IndexError("Requested axis not found in manager")

    if axis == 1:
        # Each entry selects a whole stored array; -1 gets a fresh NA array.
        new_arrays = [
            self._make_na_array(fill_value=fill_value, use_na_proxy=use_na_proxy)
            if loc == -1
            else self.arrays[loc]
            for loc in indexer
        ]
    else:
        validate_indices(indexer, len(self._axes[0]))
        indexer = ensure_platform_int(indexer)
        # Filling is only requested when the indexer contains a -1.
        allow_fill = bool((indexer == -1).any())
        # NOTE: fill_value is passed through unchanged; there is no
        # per-array fallback such as ``blk.fill_value`` here.
        new_arrays = [
            take_1d(arr, indexer, allow_fill=allow_fill, fill_value=fill_value)
            for arr in self.arrays
        ]

    new_axes = list(self._axes)
    new_axes[axis] = new_axis

    return type(self)(new_arrays, new_axes, verify_integrity=False)
def take(self, indices: Sequence[int], allow_fill: bool = False,
         fill_value: Any = None):
    """
    Take elements from the array backed by ``self._data``.

    Parameters
    ----------
    indices : sequence of int
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values are positional indices counted from the
          right; they are shifted by the array length before the take.
        * True: negative values mark missing entries. `indices` is checked
          with ``validate_indices`` and the missing positions are masked
          out of the take.

    fill_value : any, optional
        Written into the missing positions when `allow_fill` is True.
        If ``isna(fill_value)`` holds, the missing positions are left as
        produced by the masked take.

    Returns
    -------
    ExtensionArray
        A new instance of ``type(self)``.

    Raises
    ------
    IndexError
        When the array is empty and any index is non-negative, or when the
        largest index is not smaller than the array length.

    See Also
    --------
    numpy.take
    api.extensions.take
    """
    # TODO: Remove once we got rid of the (indices < 0) check
    positions = indices if is_array_like(indices) else np.asanyarray(indices)
    n = len(self._data)

    if n == 0 and (positions >= 0).any():
        raise IndexError("cannot do a non-empty take")
    if positions.size > 0 and positions.max() >= n:
        raise IndexError("out of bounds value in 'indices'.")

    if not allow_fill:
        # TODO(ARROW-9432): Treat negative indices as indices from the right.
        if (positions < 0).any():
            # Don't modify in-place
            positions = np.copy(positions)
            positions[positions < 0] += n
        return type(self)(self._data.take(positions))

    missing = positions < 0
    if not missing.any():
        # Nothing to fill
        return type(self)(self._data.take(indices))

    validate_indices(positions, n)
    # TODO(ARROW-9433): Treat negative indices as NULL
    taken = self._data.take(pa.array(positions, mask=missing))
    out = type(self)(taken)
    if not isna(fill_value):
        # TODO: ArrowNotImplementedError: Function fill_null has no
        # kernel matching input types (array[string], scalar[string]),
        # so pc.fill_null(result, pa.scalar(fill_value)) cannot be used
        # and the fill is done through item assignment instead.
        out[missing] = fill_value
    return out
def test_validate_indices_empty():
    """Non-negative indices are rejected when the target length is 0."""
    nonempty_request = np.array([0, 1])
    with pytest.raises(IndexError, match="indices are out"):
        validate_indices(nonempty_request, 0)
def test_validate_indices_high():
    """An index equal to the target length is out of bounds."""
    target_len = 2
    too_high = np.asarray([0, 1, 2])
    with pytest.raises(IndexError, match="indices are out"):
        validate_indices(too_high, target_len)
def test_validate_indices_low():
    """A negative index below -1 is rejected with a ValueError."""
    target_len = 2
    too_low = np.asarray([0, -2])
    with pytest.raises(ValueError, match="'indices' contains"):
        validate_indices(too_low, target_len)
def test_validate_indices_ok():
    """Valid inputs pass without raising."""
    in_bounds = np.asarray([0, 1])
    accepted = (
        (in_bounds, 2),            # every index within range
        (in_bounds[:0], 0),        # empty indices against empty target
        (np.array([-1, -1]), 0),   # only -1 markers against empty target
    )
    for candidate, target_len in accepted:
        validate_indices(candidate, target_len)
def _reindex_indexer(
    self: T,
    new_axis,
    indexer,
    axis: int,
    fill_value=None,
    allow_dups: bool = False,
    copy: bool = True,
) -> T:
    """
    Return a manager with ``new_axis`` installed on ``axis`` and the stored
    arrays rearranged to follow ``indexer``.

    Parameters
    ----------
    new_axis : Index
    indexer : ndarray of int64 or None
        pandas-indexer with -1's only. With ``None`` the arrays are left
        untouched and only the axis object is replaced.
    axis : int
    fill_value : object, default None
    allow_dups : bool, default False
        Unless True, the current axis validates the indexer through
        ``_validate_can_reindex``.
    copy : bool, default True
        Only consulted when ``indexer`` is None.
    """
    if indexer is None:
        if new_axis is self._axes[axis] and not copy:
            return self

        out = self.copy(deep=copy)
        swapped_axes = list(self._axes)
        swapped_axes[axis] = new_axis
        out._axes = swapped_axes
        return out

    if not allow_dups:
        # some axes don't allow reindexing with dups
        self._axes[axis]._validate_can_reindex(indexer)

    if axis >= self.ndim:
        raise IndexError("Requested axis not found in manager")

    if axis != 1:
        validate_indices(indexer, len(self._axes[0]))

        def _take(arr):
            # error: Value of type variable "ArrayLike" of "take_1d" cannot be
            # "Union[ndarray, ExtensionArray]"  [type-var]
            # NOTE: fill_value is passed through unchanged; there is no
            # per-array fallback such as ``blk.fill_value`` here.
            return take_1d(  # type: ignore[type-var]
                arr, indexer, allow_fill=True, fill_value=fill_value
            )

        new_arrays = [_take(arr) for arr in self.arrays]
    else:
        # Each entry selects a whole stored array; -1 gets a fresh NA array.
        new_arrays = [
            self._make_na_array(fill_value=fill_value)
            if loc == -1
            else self.arrays[loc]
            for loc in indexer
        ]

    new_axes = list(self._axes)
    new_axes[axis] = new_axis

    return type(self)(new_arrays, new_axes, verify_integrity=False)
def take(self, indices, allow_fill=False, fill_value=None):
    # type: (Sequence[int] , bool, Optional[Any]) -> FletcherArray
    """
    Take elements from an array.

    Parameters
    ----------
    indices : sequence of integers
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.

        * True: negative values in `indices` indicate missing values.
          These values are set to `fill_value`. Negative values other
          than ``-1`` raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        A null `fill_value` (per ``pd.isnull``) is treated as ``None``.

        For many FletcherArrays, there will be two representations of
        `fill_value`: a user-facing "boxed" scalar, and a low-level
        physical NA value. `fill_value` should be the user-facing version,
        and the implementation should handle translating that to the
        physical version for processing the take if necessary.

    Returns
    -------
    FletcherArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    FletcherArray.take is called by ``Series.__getitem__``, ``.loc``,
    ``iloc``, when `indices` is a sequence of values. Additionally,
    it's called by :meth:`Series.reindex`, or any other method
    that causes realignment, with a `fill_value`.

    See Also
    --------
    numpy.take
    pandas.api.extensions.take
    """
    # This is the threshold to decide whether or not to concat everything
    # first. Benchmarks were made on string, int32, int64, float32, float64
    # and it turns out that 0.3 is the value where it is best to switch to
    # concatenating everything first, both time-wise and memory-wise.
    threshold_ratio = 0.3

    length = len(self)
    indices = np.asarray(indices, dtype=self._indices_dtype)
    has_negative_indices = np.any(indices < 0)  # type: ignore
    # Filling is only needed when there is at least one negative index.
    allow_fill &= has_negative_indices
    if allow_fill:
        validate_indices(indices, length)
    if (has_negative_indices and not allow_fill) or np.any(
        indices >= length  # type: ignore
    ):
        # Taking from arange(length) maps from-the-right indices to their
        # non-negative position; this will also raise the IndexError
        # expected by pandas in all needed cases.
        indices = np.arange(length, dtype=self._indices_dtype).take(indices)

    # here we guarantee that indices is numpy array of ints
    # and we have checked that all indices are between -1/0 and len(self)

    if not allow_fill:
        if self._has_single_chunk:
            if (
                self.dtype.is_list
                and self.data.chunk(0).flatten().null_count == 0
                and self.data.chunk(0).null_count == 0
                and self.flatten().dtype._is_numeric
            ):
                return FletcherArray(
                    take_indices_on_pyarrow_list(self.data.chunk(0), indices)
                )
            else:
                return FletcherArray(self.data.chunk(0).take(pa.array(indices)))

        # ``np.int`` was only an alias of the builtin ``int`` and no longer
        # exists in current NumPy releases, so use ``int`` directly.
        lengths = np.fromiter(map(len, self.data.iterchunks()), dtype=int)
        cum_lengths = lengths.cumsum()

        bins = self._get_chunk_indexer(indices)

        # Turn the inclusive cumulative sum into per-chunk start offsets.
        cum_lengths -= lengths
        limits_idx = np.concatenate(
            [[0], np.bincount(bins, minlength=self.data.num_chunks).cumsum()]
        )

        # ``Series.is_monotonic`` was a deprecated alias of
        # ``is_monotonic_increasing`` and is gone in pandas 2.x.
        if pd.Series(bins).is_monotonic_increasing:
            del bins
            return self._take_on_chunks(
                indices, limits_idx=limits_idx, cum_lengths=cum_lengths
            )
        elif len(indices) / len(self) > threshold_ratio:
            # check which method is going to take less memory
            return self._take_on_concatenated_chunks(indices)
        else:
            sort_idx = get_group_index_sorter(bins, self.data.num_chunks)
            del bins
            indices = indices.take(sort_idx, out=indices)  # type: ignore
            sort_idx = np.argsort(sort_idx, kind="merge")  # inverse sort indices
            return self._take_on_chunks(
                indices,
                sort_idx=sort_idx,
                limits_idx=limits_idx,
                cum_lengths=cum_lengths,
            )
    else:
        if pd.isnull(fill_value):
            fill_value = None
        # Append the fill value as one extra trailing element; the
        # remaining -1 indices then select it in the plain take below.
        return self._concat_same_type(
            [self, FletcherArray([fill_value], dtype=self.data.type)]
        ).take(indices)