Example #1
0
    def _replace_with_indices(
        cls,
        chunk: pa.Array,
        indices: npt.NDArray[np.intp],
        value: npt.NDArray[Any],
    ) -> pa.Array:
        """
        Replace items selected with a set of positional indices.

        Analogous to pyarrow.compute.replace_with_mask, except that replacement
        positions are identified via indices rather than a mask.

        Parameters
        ----------
        chunk : pa.Array
            Array in which items are replaced.
        indices : npt.NDArray[np.intp]
            Positions within ``chunk`` to replace.
            NOTE(review): the contiguity check only compares the first and
            last entry, so this is presumably expected to be sorted ascending
            and free of duplicates -- confirm against the callers.
        value : npt.NDArray[Any]
            Replacement value(s).
            NOTE(review): the contiguous fast path splices ``value`` in
            as-is, so its length is presumably expected to equal
            ``len(indices)`` -- confirm against the callers.

        Returns
        -------
        pa.Array
            ``chunk`` itself when ``indices`` is empty, otherwise an array
            with the selected positions replaced.
        """
        n = len(indices)

        if n == 0:
            # nothing to replace
            return chunk

        start, stop = indices[[0, -1]]

        if (stop - start) == (n - 1):
            # fast path for a contiguous set of indices: splice the new
            # values between the untouched head and tail of the chunk
            arrays = [
                chunk[:start],
                pa.array(value, type=chunk.type, from_pandas=True),
                chunk[stop + 1:],
            ]
            # drop empty pieces so a single remaining piece can be returned
            # without concatenating
            arrays = [arr for arr in arrays if len(arr)]
            if len(arrays) == 1:
                return arrays[0]
            return pa.concat_arrays(arrays)

        # general case: boolean mask flagging every position to replace
        mask = np.zeros(len(chunk), dtype=np.bool_)
        mask[indices] = True

        if pa_version_under5p0:
            # Older pyarrow: do the replacement through NumPy instead of
            # pyarrow.compute. ``writable=True`` is required because a
            # zero-copy conversion (primitive chunk without nulls) yields a
            # read-only view, and the masked assignment below would raise.
            arr = chunk.to_numpy(zero_copy_only=False, writable=True)
            arr[mask] = value
            return pa.array(arr, type=chunk.type)

        if isna(value).all():
            # every replacement is missing: null out the selected positions
            return pc.if_else(mask, None, chunk)

        return pc.replace_with_mask(chunk, mask, value)
Example #2
0
    def _set_via_chunk_iteration(self, indices: npt.NDArray[np.intp],
                                 value: npt.NDArray[Any]) -> pa.ChunkedArray:
        """
        Write ``value`` at ``indices`` one chunk at a time.

        Every chunk of ``self._data`` yields exactly one chunk in the result,
        so the chunking layout is preserved. Replacement values are consumed
        from the front of ``value`` in chunk order.

        NOTE(review): ``self._within_chunk_indices`` presumably maps
        ``indices`` to per-chunk positions relative to each chunk's start --
        confirm against its definition.
        """
        per_chunk_positions = self._within_chunk_indices(indices)
        remaining = value
        updated_chunks = []

        for chunk_no, chunk in enumerate(self._data.iterchunks()):
            positions = per_chunk_positions[chunk_no]
            count = len(positions)

            # peel off the values that belong to this chunk
            chunk_values = remaining[:count]
            remaining = remaining[count:]

            if count == 0:
                # no position falls into this chunk; keep it as is
                updated_chunks.append(chunk)
                continue

            if count == 1:
                # fast path for a single replacement
                updated_chunks.append(
                    self._set_single_index_in_chunk(
                        chunk, positions[0], chunk_values[0])
                )
                continue

            selector = np.zeros(len(chunk), dtype=np.bool_)
            selector[positions] = True

            if pa_version_under5p0:
                # The pyarrow compute functions were added in
                # version 5.0. For prior versions we implement
                # our own by converting to numpy and back.
                as_numpy = chunk.to_numpy(zero_copy_only=False)
                as_numpy[selector] = chunk_values
                replaced = pa.array(as_numpy, type=pa.string())
            elif chunk_values is None or isna(np.array(chunk_values)).all():
                # only missing values to write: null out the positions
                replaced = pc.if_else(selector, None, chunk)
            else:
                replaced = pc.replace_with_mask(chunk, selector, chunk_values)

            updated_chunks.append(replaced)

        return pa.chunked_array(updated_chunks)