Example #1
File: ops.py Project: tnir/pandas
    def result_ilocs(self) -> npt.NDArray[np.intp]:
        """
        Get the original integer locations of result_index in the input.
        """
        # Original indices are where group_index would go via sorting.
        # But when dropna is true, we need to remove null values while accounting for
        # any gaps that then occur because of them.
        group_index = get_group_index(
            self.codes, self.shape, sort=self._sort, xnull=True
        )
        group_index, _ = compress_group_index(group_index, sort=self._sort)

        if self.has_dropped_na:
            mask = np.where(group_index >= 0)
            # Count how many gaps are caused by previous null values for each position
            null_gaps = np.cumsum(group_index == -1)[mask]
            group_index = group_index[mask]

        result = get_group_index_sorter(group_index, self.ngroups)

        if self.has_dropped_na:
            # Shift by the number of prior null gaps
            result += np.take(null_gaps, result)

        return result
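The null-gap adjustment above is easiest to see on concrete data. Below is a standalone sketch with made-up inputs, using np.argsort as a stand-in for get_group_index_sorter; it is not part of the pandas source shown here.

    import numpy as np

    group_index = np.array([0, -1, 1, 0], dtype=np.intp)  # -1 marks a dropped NA
    mask = np.where(group_index >= 0)
    null_gaps = np.cumsum(group_index == -1)[mask]   # [0, 1, 1]
    group_index = group_index[mask]                  # [0, 1, 0]
    result = np.argsort(group_index, kind="stable")  # stand-in for get_group_index_sorter
    result += np.take(null_gaps, result)             # undo the shift caused by masked NAs
    # result -> [0, 3, 2]: original rows 0 and 3 form group 0, row 2 forms group 1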
Example #2
    def _indexer_and_to_sort(self):
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1:] + [codes[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = get_group_index_sorter(comp_index, ngroups)
        indexer = ensure_platform_int(indexer)
        return indexer, to_sort
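A minimal sketch of what this reordering achieves, on assumed two-level codes (note that pandas.core.sorting is internal API and may change between versions): the level being unstacked is moved to the end of the sort keys, so the indexer groups rows by the remaining levels first and orders them by the unstacked level last.

    import numpy as np
    from pandas.core.sorting import get_compressed_ids, get_group_index_sorter

    # hypothetical two-level codes; unstack level v = 1
    codes = [np.array([0, 0, 1, 1], dtype=np.intp),   # level 0 (kept)
             np.array([1, 0, 1, 0], dtype=np.intp)]   # level 1 (unstacked)
    to_sort = codes[:1] + codes[2:] + [codes[1]]      # unstacked level moved last
    comp_index, obs_ids = get_compressed_ids(to_sort, (2, 2))
    indexer = get_group_index_sorter(comp_index, len(obs_ids))
    # indexer -> [1, 0, 3, 2]: within each level-0 group, rows are ordered
    # by the unstacked level's codes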
Example #3
    def _aggregate_series_fast(self, obj, func):
        # At this point we have already checked that obj.index is not a MultiIndex
        #  and that obj is backed by an ndarray, not ExtensionArray
        func = self._is_builtin_func(func)

        group_index, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        dummy = obj._get_values(slice(None, 0))
        indexer = get_group_index_sorter(group_index, ngroups)
        obj = obj.take(indexer)
        group_index = algorithms.take_nd(group_index, indexer, allow_fill=False)
        grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy)
        result, counts = grouper.get_result()
        return result, counts
Example #4
File: ops.py Project: vdasu/pandas
    def _aggregate_series_fast(self, obj, func):
        func = self._is_builtin_func(func)

        if obj.index._has_complex_internals:
            raise TypeError("Incompatible index for Cython grouper")

        group_index, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        dummy = obj._get_values(slice(None, 0))
        indexer = get_group_index_sorter(group_index, ngroups)
        obj = obj.take(indexer)
        group_index = algorithms.take_nd(group_index, indexer, allow_fill=False)
        grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy)
        result, counts = grouper.get_result()
        return result, counts
Example #5
File: ops.py Project: rth/pandas
    def _aggregate_series_fast(self, obj: Series, func: F) -> npt.NDArray[np.object_]:
        # At this point we have already checked that
        #  - obj.index is not a MultiIndex
        #  - obj is backed by an ndarray, not ExtensionArray
        #  - len(obj) > 0
        func = com.is_builtin_func(func)

        ids, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        indexer = get_group_index_sorter(ids, ngroups)
        obj = obj.take(indexer)
        ids = ids.take(indexer)
        sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
        result, _ = sgrouper.get_result()
        return result
Example #6
    def _aggregate_series_fast(self, obj: Series, func: F):
        # At this point we have already checked that
        #  - obj.index is not a MultiIndex
        #  - obj is backed by an ndarray, not ExtensionArray
        #  - len(obj) > 0
        #  - ngroups != 0
        func = self._is_builtin_func(func)

        group_index, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        indexer = get_group_index_sorter(group_index, ngroups)
        obj = obj.take(indexer)
        group_index = group_index.take(indexer)
        grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups)
        result, counts = grouper.get_result()
        return result, counts
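The _aggregate_series_fast variants above all follow the same pattern: sort once so that each group is contiguous, then reduce each block in a single pass. A plain-NumPy illustration of that idea, with np.argsort standing in for get_group_index_sorter and np.add.reduceat standing in for the Cython SeriesGrouper:

    import numpy as np

    values = np.array([10.0, 20.0, 30.0, 40.0])
    group_index = np.array([1, 0, 1, 0], dtype=np.intp)

    sorter = np.argsort(group_index, kind="stable")
    values = values.take(sorter)                # [20.0, 40.0, 10.0, 30.0]
    group_index = group_index.take(sorter)      # [0, 0, 1, 1]

    # start offset of each contiguous group, then one reduction per group
    starts = np.flatnonzero(np.r_[True, group_index[1:] != group_index[:-1]])
    sums = np.add.reduceat(values, starts)      # [60.0, 40.0]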
Example #7
    def _indexer_and_to_sort(
        self,
    ) -> tuple[
        npt.NDArray[np.intp],
        list[np.ndarray],  # each has _some_ signed integer dtype
    ]:
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1:] + [codes[v]]
        sizes = tuple(len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]])

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = get_group_index_sorter(comp_index, ngroups)
        return indexer, to_sort
Example #8
    def _aggregate_series_fast(self, obj, func):
        func = self._is_builtin_func(func)

        if obj.index._has_complex_internals:
            raise TypeError('Incompatible index for Cython grouper')

        group_index, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        dummy = obj._get_values(slice(None, 0)).to_dense()
        indexer = get_group_index_sorter(group_index, ngroups)
        obj = obj._take(indexer).to_dense()
        group_index = algorithms.take_nd(
            group_index, indexer, allow_fill=False)
        grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups,
                                          dummy)
        result, counts = grouper.get_result()
        return result, counts
Example #9
    def _aggregate_series_fast(self, obj, func):
        func = self._is_builtin_func(func)

        # TODO: pre-empt this, also pre-empt get_result raising TypeError if we pass an EA
        #   for EAs backed by ndarray we may have a performant workaround
        if obj.index._has_complex_internals:
            raise TypeError("Incompatible index for Cython grouper")

        group_index, _, ngroups = self.group_info

        # avoids object / Series creation overhead
        dummy = obj._get_values(slice(None, 0))
        indexer = get_group_index_sorter(group_index, ngroups)
        obj = obj.take(indexer)
        group_index = algorithms.take_nd(group_index, indexer, allow_fill=False)
        grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy)
        result, counts = grouper.get_result()
        return result, counts
Example #10
    def _sort_idx(self) -> npt.NDArray[np.intp]:
        # Counting sort indexer
        return get_group_index_sorter(self.labels, self.ngroups)
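The "counting sort indexer" comment can be checked directly: get_group_index_sorter returns a stable argsort of the labels, computed in O(len(labels) + ngroups) time via counting sort when ngroups is small relative to the input. A quick sketch (pandas.core.sorting is internal API):

    import numpy as np
    from pandas.core.sorting import get_group_index_sorter

    labels = np.array([2, 0, 1, 0, 2], dtype=np.intp)
    sorter = get_group_index_sorter(labels, 3)
    assert np.array_equal(sorter, np.argsort(labels, kind="stable"))  # [1, 3, 2, 0, 4]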
Example #11
    def sort_idx(self):
        # Counting sort indexer
        return get_group_index_sorter(self.labels, self.ngroups)
Example #12
    def sort_idx(self):
        # Counting sort indexer
        return get_group_index_sorter(self.labels, self.ngroups)
Example #13
    def take(self, indices, allow_fill=False, fill_value=None):
        # type: (Sequence[int], bool, Optional[Any]) -> FletcherArray
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of integers
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.
            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.
            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.
        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.
            For many FletcherArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        FletcherArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        Notes
        -----
        FletcherArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.

        See Also
        --------
        numpy.take
        pandas.api.extensions.take
        """
        # Threshold that decides whether to concatenate all chunks first.
        # Benchmarks on string, int32, int64, float32 and float64 data showed
        # that 0.3 is the ratio at which concatenating everything first wins,
        # both time-wise and memory-wise.
        threshold_ratio = 0.3

        length = len(self)
        indices = np.asarray(indices, dtype=self._indices_dtype)
        has_negative_indices = np.any(indices < 0)  # type: ignore
        allow_fill &= has_negative_indices
        if allow_fill:
            validate_indices(indices, length)
        if (has_negative_indices and not allow_fill) or np.any(
            indices >= length  # type: ignore
        ):
            # this will raise IndexError expected by pandas in all needed cases
            indices = np.arange(length,
                                dtype=self._indices_dtype).take(indices)
        # here we guarantee that indices is numpy array of ints
        # and we have checked that all indices are between -1/0 and len(self)

        if not allow_fill:

            if self._has_single_chunk:
                if (self.dtype.is_list
                        and self.data.chunk(0).flatten().null_count == 0
                        and self.data.chunk(0).null_count == 0
                        and self.flatten().dtype._is_numeric):
                    return FletcherArray(
                        take_indices_on_pyarrow_list(self.data.chunk(0),
                                                     indices))
                else:
                    return FletcherArray(
                        self.data.chunk(0).take(pa.array(indices)))

            lengths = np.fromiter(map(len, self.data.iterchunks()),
                                  dtype=np.int64)
            cum_lengths = lengths.cumsum()

            bins = self._get_chunk_indexer(indices)

            cum_lengths -= lengths
            limits_idx = np.concatenate(
                [[0],
                 np.bincount(bins, minlength=self.data.num_chunks).cumsum()])

            if pd.Series(bins).is_monotonic_increasing:
                del bins
                return self._take_on_chunks(indices,
                                            limits_idx=limits_idx,
                                            cum_lengths=cum_lengths)
            elif len(indices) / len(self) > threshold_ratio:
                # check which method is going to take less memory
                return self._take_on_concatenated_chunks(indices)
            else:
                sort_idx = get_group_index_sorter(bins, self.data.num_chunks)
                del bins
                indices = indices.take(sort_idx, out=indices)  # type: ignore
                sort_idx = np.argsort(sort_idx,
                                      kind="merge")  # inverse sort indices
                return self._take_on_chunks(
                    indices,
                    sort_idx=sort_idx,
                    limits_idx=limits_idx,
                    cum_lengths=cum_lengths,
                )

        else:
            if pd.isnull(fill_value):
                fill_value = None
            return self._concat_same_type(
                [self, FletcherArray([fill_value],
                                     dtype=self.data.type)]).take(indices)
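For a quick feel of the allow_fill semantics documented above, the public pandas helper behaves the same way; a usage sketch, not specific to FletcherArray:

    import numpy as np
    from pandas.api.extensions import take

    arr = np.array([10.0, 20.0, 30.0])
    take(arr, [0, -1], allow_fill=False)                    # [10.0, 30.0]: -1 counts from the end
    take(arr, [0, -1], allow_fill=True, fill_value=np.nan)  # [10.0, nan]: -1 means missing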