Example 1
    def get_indexer_non_unique(self, target):
        target = ibase.ensure_index(target)

        if isinstance(target, CategoricalIndex):
            # Indexing on codes is more efficient if categories are the same:
            if target.categories is self.categories:
                target = target.codes
                indexer, missing = self._engine.get_indexer_non_unique(target)
                return ensure_platform_int(indexer), missing
            target = target.values

        codes = self.categories.get_indexer(target)
        indexer, missing = self._engine.get_indexer_non_unique(codes)
        return ensure_platform_int(indexer), missing
Example 2
    def get_indexer(self, target, method=None, limit=None, tolerance=None):

        self._check_method(method)
        target = ensure_index(target)
        target = self._maybe_cast_indexed(target)

        if self.equals(target):
            return np.arange(len(self), dtype='intp')

        if self.is_non_overlapping_monotonic:
            start, stop = self._find_non_overlapping_monotonic_bounds(target)

            start_plus_one = start + 1
            if not ((start_plus_one < stop).any()):
                return np.where(start_plus_one == stop, start, -1)

        if not self.is_unique:
            raise ValueError("cannot handle non-unique indices")

        # IntervalIndex
        if isinstance(target, IntervalIndex):
            indexer = self._get_reindexer(target)

        # non IntervalIndex
        else:
            indexer = np.concatenate([self.get_loc(i) for i in target])

        return ensure_platform_int(indexer)
Example 3
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        from pandas.core.arrays.categorical import _recode_for_categories

        method = missing.clean_reindex_fill_method(method)
        target = ibase.ensure_index(target)

        if self.is_unique and self.equals(target):
            return np.arange(len(self), dtype='intp')

        if method == 'pad' or method == 'backfill':
            raise NotImplementedError("method='pad' and method='backfill' not "
                                      "implemented yet for CategoricalIndex")
        elif method == 'nearest':
            raise NotImplementedError("method='nearest' not implemented yet "
                                      'for CategoricalIndex')

        if (isinstance(target, CategoricalIndex) and
                self.values.is_dtype_equal(target)):
            if self.values.equals(target.values):
                # we have the same codes
                codes = target.codes
            else:
                codes = _recode_for_categories(target.codes,
                                               target.categories,
                                               self.values.categories)
        else:
            if isinstance(target, CategoricalIndex):
                code_indexer = self.categories.get_indexer(target.categories)
                codes = take_1d(code_indexer, target.codes, fill_value=-1)
            else:
                codes = self.categories.get_indexer(target)

        indexer, _ = self._engine.get_indexer_non_unique(codes)
        return ensure_platform_int(indexer)
Example 4
def get_group_index_sorter(group_index, ngroups):
    """
    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
                                   (count * np.log(count))))
    if do_groupsort:
        sorter, _ = algos.groupsort_indexer(ensure_int64(group_index),
                                            ngroups)
        return ensure_platform_int(sorter)
    else:
        return group_index.argsort(kind='mergesort')
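The docstring above motivates a cost-model switch between counting sort and a stable mergesort. A minimal standalone sketch of that decision with made-up data, using plain NumPy and the same alpha/beta parameters (not pandas internals):

import numpy as np

group_index = np.array([2, 0, 1, 0, 2, 1], dtype=np.int64)
ngroups = 3
count = len(group_index)

# Same cost model as above: counting sort is O(ngroups + count),
# mergesort is O(count * log(count)).
alpha, beta = 0.0, 1.0
do_groupsort = count > 0 and (alpha + beta * ngroups) < (count * np.log(count))
print(do_groupsort)  # True: 3 < 6 * log(6) ~= 10.75

# The stable fallback; equal keys keep their original relative order.
print(group_index.argsort(kind="mergesort"))  # [1 3 2 5 0 4]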
Example 5
    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = [len(x) for x in new_levels]

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError('Index contains duplicate entries, '
                             'cannot reshape')

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))
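A toy check of the flat-selector arithmetic above (hypothetical values, not pandas internals): each row lands at position stride * group + column in the flattened (ngroups x stride) grid, and a duplicate (group, column) pair would make mask.sum() fall short of the row count.

import numpy as np

comp_index = np.array([0, 0, 1])    # compressed group id per row
last_labels = np.array([0, 1, 0])   # column position within the unstacked level
ngroups, stride, lift = 2, 2, 0

selector = last_labels + stride * comp_index + lift
mask = np.zeros(ngroups * stride, dtype=bool)
mask.put(selector, True)
print(selector)    # [0 1 2]
print(mask.sum())  # 3 == number of rows, so no duplicate entries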
Example 6
    def take(self, indices, axis=0, allow_fill=True,
             fill_value=None, **kwargs):
        nv.validate_take(tuple(), kwargs)
        indices = ensure_platform_int(indices)
        taken = self._assert_take_fillable(self.codes, indices,
                                           allow_fill=allow_fill,
                                           fill_value=fill_value,
                                           na_value=-1)
        return self._create_from_codes(taken)
Example 7
    def get_group_levels(self):
        if not self.compressed and len(self.groupings) == 1:
            return [self.groupings[0].result_index]

        name_list = []
        for ping, labels in zip(self.groupings, self.recons_labels):
            labels = ensure_platform_int(labels)
            levels = ping.result_index.take(labels)

            name_list.append(levels)

        return name_list
Example 8
    def take(self, indices, axis=0, allow_fill=True,
             fill_value=None, **kwargs):
        """
        Sparse-compatible version of ndarray.take

        Returns
        -------
        taken : ndarray
        """
        nv.validate_take(tuple(), kwargs)

        if axis:
            raise ValueError("axis must be 0, input was {axis}"
                             .format(axis=axis))

        if is_integer(indices):
            # return scalar
            return self[indices]

        indices = ensure_platform_int(indices)
        n = len(self)
        if allow_fill and fill_value is not None:
            # allow -1 to indicate self.fill_value,
            # self.fill_value may not be NaN
            if (indices < -1).any():
                msg = ('When allow_fill=True and fill_value is not None, '
                       'all indices must be >= -1')
                raise ValueError(msg)
            elif (n <= indices).any():
                msg = 'index is out of bounds for size {size}'.format(size=n)
                raise IndexError(msg)
        else:
            if ((indices < -n) | (n <= indices)).any():
                msg = 'index is out of bounds for size {size}'.format(size=n)
                raise IndexError(msg)

        indices = indices.astype(np.int32)
        if not (allow_fill and fill_value is not None):
            indices = indices.copy()
            indices[indices < 0] += n

        locs = self.sp_index.lookup_array(indices)
        indexer = np.arange(len(locs), dtype=np.int32)
        mask = locs != -1
        if mask.any():
            indexer = indexer[mask]
            new_values = self.sp_values.take(locs[mask])
        else:
            indexer = np.empty(shape=(0, ), dtype=np.int32)
            new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

        sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
        return self._simple_new(new_values, sp_index, self.fill_value)
Example 9
File: ops.py Project: glyg/pandas
    def group_info(self):
        ngroups = self.ngroups
        obs_group_ids = np.arange(ngroups)
        rep = np.diff(np.r_[0, self.bins])

        rep = ensure_platform_int(rep)
        if ngroups == len(self.bins):
            comp_ids = np.repeat(np.arange(ngroups), rep)
        else:
            comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep)

        return comp_ids.astype('int64', copy=False), \
            obs_group_ids.astype('int64', copy=False), ngroups
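A small illustration of the bin expansion above (made-up bins): `bins` holds cumulative end positions, so np.diff over np.r_[0, bins] yields per-bin lengths, which np.repeat expands into one group id per row.

import numpy as np

bins = np.array([2, 5, 6])     # rows 0-1 -> group 0, 2-4 -> group 1, 5 -> group 2
ngroups = 3
rep = np.diff(np.r_[0, bins])  # [2 3 1]
print(np.repeat(np.arange(ngroups), rep))  # [0 0 1 1 1 2]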
Example 10
    def get_group_levels(self) -> list[Index]:
        # Note: only called from _insert_inaxis_grouper_inplace, which
        #  is only called for BaseGrouper, never for BinGrouper
        if len(self.groupings) == 1:
            return [self.groupings[0].result_index]

        name_list = []
        for ping, codes in zip(self.groupings, self.reconstructed_codes):
            codes = ensure_platform_int(codes)
            levels = ping.result_index.take(codes)

            name_list.append(levels)

        return name_list
Example 11
    def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray]:
        """
        pointwise implementation for get_indexer and get_indexer_non_unique.
        """
        indexer, missing = [], []
        for i, key in enumerate(target):
            try:
                locs = self.get_loc(key)
                if isinstance(locs, slice):
                    # Only needed for get_indexer_non_unique
                    locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp")
                locs = np.array(locs, ndmin=1)
            except KeyError:
                missing.append(i)
                locs = np.array([-1])
            except InvalidIndexError as err:
                # i.e. non-scalar key
                raise TypeError(key) from err

            indexer.append(locs)

        indexer = np.concatenate(indexer)
        return ensure_platform_int(indexer), ensure_platform_int(missing)
Example 12
    def take(self,
             indices,
             axis=0,
             allow_fill=True,
             fill_value=None,
             **kwargs):
        nv.validate_take(tuple(), kwargs)
        indices = ensure_platform_int(indices)
        taken = self._assert_take_fillable(self.codes,
                                           indices,
                                           allow_fill=allow_fill,
                                           fill_value=fill_value,
                                           na_value=-1)
        return self._create_from_codes(taken)
Example 13
    def get_indexer_non_unique(self,
                               target: Index) -> tuple[np.ndarray, np.ndarray]:
        # both returned ndarrays are np.intp
        target = ensure_index(target)

        if isinstance(target,
                      IntervalIndex) and not self._should_compare(target):
            # different closed or incompatible subtype -> no matches
            return self._get_indexer_non_comparable(target, None, unique=False)

        elif is_object_dtype(target.dtype) or isinstance(
                target, IntervalIndex):
            # target might contain intervals: defer elementwise to get_loc
            return self._get_indexer_pointwise(target)

        else:
            # Note: this case behaves differently from other Index subclasses
            #  because IntervalIndex does partial-int indexing
            target = self._maybe_convert_i8(target)
            indexer, missing = self._engine.get_indexer_non_unique(
                target.values)

        return ensure_platform_int(indexer), ensure_platform_int(missing)
Example 14
    def __init__(
        self,
        data: FrameOrSeries,
        labels: npt.NDArray[np.intp],
        ngroups: int,
        axis: int = 0,
    ):
        self.data = data
        self.labels = ensure_platform_int(
            labels)  # _should_ already be np.intp
        self.ngroups = ngroups

        self.axis = axis
        assert isinstance(axis, int), axis
Example 15
    def size(self):
        """
        Compute group sizes

        """
        ids, _, ngroup = self.group_info
        ids = ensure_platform_int(ids)
        if ngroup:
            out = np.bincount(ids[ids != -1], minlength=ngroup)
        else:
            out = ids
        return Series(out,
                      index=self.result_index,
                      dtype='int64')
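A minimal sketch of the counting step above (hypothetical ids): entries of -1 mark rows outside any group and are dropped before np.bincount, while minlength keeps empty groups in the output.

import numpy as np

ids = np.array([0, 1, -1, 1, 2])
ngroup = 4
print(np.bincount(ids[ids != -1], minlength=ngroup))  # [1 2 1 0]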
Example 16
    def group_info(self):
        ngroups = self.ngroups
        obs_group_ids = np.arange(ngroups)
        rep = np.diff(np.r_[0, self.bins])

        rep = ensure_platform_int(rep)
        if ngroups == len(self.bins):
            comp_ids = np.repeat(np.arange(ngroups), rep)
        else:
            comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep)

        return (comp_ids.astype('int64', copy=False),
                obs_group_ids.astype('int64', copy=False),
                ngroups)
Example 17
def test_transform_fast():

    df = DataFrame({"id": np.arange(100000) / 3, "val": np.random.randn(100000)})

    grp = df.groupby("id")["val"]

    values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values))
    expected = Series(values, index=df.index, name="val")

    result = grp.transform(np.mean)
    tm.assert_series_equal(result, expected)

    result = grp.transform("mean")
    tm.assert_series_equal(result, expected)

    # GH 12737
    df = DataFrame(
        {
            "grouping": [0, 1, 1, 3],
            "f": [1.1, 2.1, 3.1, 4.5],
            "d": pd.date_range("2014-1-1", "2014-1-4"),
            "i": [1, 2, 3, 4],
        },
        columns=["grouping", "f", "i", "d"],
    )
    result = df.groupby("grouping").transform("first")

    dates = [
        Timestamp("2014-1-1"),
        Timestamp("2014-1-2"),
        Timestamp("2014-1-2"),
        Timestamp("2014-1-4"),
    ]
    expected = DataFrame(
        {"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]},
        columns=["f", "i", "d"],
    )
    tm.assert_frame_equal(result, expected)

    # selection
    result = df.groupby("grouping")[["f", "i"]].transform("first")
    expected = expected[["f", "i"]]
    tm.assert_frame_equal(result, expected)

    # dup columns
    df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"])
    result = df.groupby("g").transform("first")
    expected = df.drop("g", axis=1)
    tm.assert_frame_equal(result, expected)
Example 18
    def _reindex_index(self,
                       index,
                       method,
                       copy,
                       level,
                       fill_value=np.nan,
                       limit=None,
                       takeable=False):
        if level is not None:
            raise TypeError("Reindex by level not supported for sparse")

        if self.index.equals(index):
            if copy:
                return self.copy()
            else:
                return self

        if len(self.index) == 0:
            return self._constructor(index=index,
                                     columns=self.columns).__finalize__(self)

        indexer = self.index.get_indexer(index, method, limit=limit)
        indexer = ensure_platform_int(indexer)
        mask = indexer == -1
        need_mask = mask.any()

        new_series = {}
        for col, series in self.items():
            if mask.all():
                continue

            values = series.values
            # .take returns SparseArray
            new = values.take(indexer)
            if need_mask:
                new = new.to_dense()
                # convert integer to float if necessary. need to do a lot
                # more than that, handle boolean etc also
                new, fill_value = maybe_upcast(new, fill_value=fill_value)
                np.putmask(new, mask, fill_value)

            new_series[col] = new

        return self._constructor(
            new_series,
            index=index,
            columns=self.columns,
            default_fill_value=self._default_fill_value,
        ).__finalize__(self)
Example 19
def _take_nd_ndarray(
    arr: np.ndarray,
    indexer: npt.NDArray[np.intp] | None,
    axis: int,
    fill_value,
    allow_fill: bool,
) -> np.ndarray:

    if indexer is None:
        indexer = np.arange(arr.shape[axis], dtype=np.intp)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = ensure_platform_int(indexer)

    dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
        arr, indexer, fill_value, allow_fill)

    flip_order = False
    if arr.ndim == 2 and arr.flags.f_contiguous:
        flip_order = True

    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    out_shape_ = list(arr.shape)
    out_shape_[axis] = len(indexer)
    out_shape = tuple(out_shape_)
    if arr.flags.f_contiguous and axis == arr.ndim - 1:
        # minor tweak that can make an order-of-magnitude difference
        # for dataframes initialized directly from 2-d ndarrays
        # (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its
        # f-contiguous transpose)
        out = np.empty(out_shape, dtype=dtype, order="F")
    else:
        out = np.empty(out_shape, dtype=dtype)

    func = _get_take_nd_function(arr.ndim,
                                 arr.dtype,
                                 out.dtype,
                                 axis=axis,
                                 mask_info=mask_info)
    func(arr, indexer, out, fill_value)

    if flip_order:
        out = out.T
    return out
Example 20
    def get_indexer_non_unique(self, target):
        target = ensure_index(target)

        if isinstance(target, PeriodIndex):
            # Check freq before extracting the i8 ordinals: the plain
            # ndarray returned by .asi8 has no `freq` attribute, so doing
            # the conversion first would silently skip this check.
            if target.freq != self.freq:
                msg = DIFFERENT_FREQ.format(
                    cls=type(self).__name__,
                    own_freq=self.freqstr,
                    other_freq=target.freqstr,
                )
                raise IncompatibleFrequency(msg)
            target = target.asi8

        indexer, missing = self._int64index.get_indexer_non_unique(target)
        return ensure_platform_int(indexer), missing
Example 21
    def _indexer_and_to_sort(self):
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1:] + [codes[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = ensure_platform_int(indexer)

        return indexer, to_sort
Example 22
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        method = missing.clean_reindex_fill_method(method)
        target = ibase.ensure_index(target)

        self._check_indexing_method(method)

        if self.is_unique and self.equals(target):
            return np.arange(len(self), dtype="intp")

        # Note: we use engine.get_indexer_non_unique below because, even if
        #  `target` is unique, any non-category entries in it will be encoded
        #  as -1 by _get_codes_for_get_indexer, so `codes` may not be unique.
        codes = self._get_codes_for_get_indexer(target._values)
        indexer, _ = self._engine.get_indexer_non_unique(codes)
        return ensure_platform_int(indexer)
Example 23
    def _make_sorted_values_labels(self):
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1:] + [codes[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = libalgos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = ensure_platform_int(indexer)

        self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]
Example 24
    def _make_sorted_values_labels(self):
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1:] + [codes[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
        indexer = ensure_platform_int(indexer)

        self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]
Example 25
    def take(self,
             indices,
             axis=0,
             allow_fill=True,
             fill_value=None,
             **kwargs):
        nv.validate_take(tuple(), kwargs)
        indices = ensure_platform_int(indices)
        taken = self._assert_take_fillable(
            self._data,
            indices,
            allow_fill=allow_fill,
            fill_value=fill_value,
            na_value=self._data.dtype.na_value,
        )
        return self._shallow_copy(taken)
Example 26
    def get_indexer(
        self,
        target: AnyArrayLike,
        method: Optional[str] = None,
        limit: Optional[int] = None,
        tolerance: Optional[Any] = None,
    ) -> np.ndarray:

        self._check_indexing_method(method)

        if self.is_overlapping:
            raise InvalidIndexError("cannot handle overlapping indices; "
                                    "use IntervalIndex.get_indexer_non_unique")

        target_as_index = ensure_index(target)

        if isinstance(target_as_index, IntervalIndex):
            # equal indexes -> 1:1 positional match
            if self.equals(target_as_index):
                return np.arange(len(self), dtype="intp")

            if self._is_non_comparable_own_type(target_as_index):
                # different closed or incompatible subtype -> no matches
                return np.repeat(np.intp(-1), len(target_as_index))

            # non-overlapping -> at most one match per interval in target_as_index
            # want exact matches -> need both left/right to match, so defer to
            # left/right get_indexer, compare elementwise, equality -> match
            left_indexer = self.left.get_indexer(target_as_index.left)
            right_indexer = self.right.get_indexer(target_as_index.right)
            indexer = np.where(left_indexer == right_indexer, left_indexer, -1)
        elif is_categorical_dtype(target_as_index.dtype):
            target_as_index = cast("CategoricalIndex", target_as_index)
            # get an indexer for unique categories then propagate to codes via take_1d
            categories_indexer = self.get_indexer(target_as_index.categories)
            indexer = take_1d(categories_indexer,
                              target_as_index.codes,
                              fill_value=-1)
        elif not is_object_dtype(target_as_index):
            # homogeneous scalar index: use IntervalTree
            target_as_index = self._maybe_convert_i8(target_as_index)
            indexer = self._engine.get_indexer(target_as_index.values)
        else:
            # heterogeneous scalar index: defer elementwise to get_loc
            return self._get_indexer_pointwise(target_as_index)[0]

        return ensure_platform_int(indexer)
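A toy version of the left/right matching trick above, with a stand-in exact-match lookup instead of Index.get_indexer (all names here are hypothetical): a target interval matches only when both of its endpoints resolve to the same position.

import numpy as np

self_left, self_right = np.array([0, 5]), np.array([3, 9])      # [0, 3], [5, 9]
tgt_left, tgt_right = np.array([0, 5, 0]), np.array([3, 8, 9])  # [0, 3], [5, 8], [0, 9]

def exact_indexer(values, targets):
    # stand-in for Index.get_indexer: position of each target, -1 if absent
    out = np.full(len(targets), -1)
    for i, t in enumerate(targets):
        hits = np.nonzero(values == t)[0]
        if len(hits):
            out[i] = hits[0]
    return out

left_indexer = exact_indexer(self_left, tgt_left)     # [ 0  1  0]
right_indexer = exact_indexer(self_right, tgt_right)  # [ 0 -1  1]
print(np.where(left_indexer == right_indexer, left_indexer, -1))  # [ 0 -1 -1]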
Example 27
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        method = missing.clean_reindex_fill_method(method)
        target = ibase.ensure_index(target)

        if self.is_unique and self.equals(target):
            return np.arange(len(self), dtype="intp")

        if method == "pad" or method == "backfill":
            raise NotImplementedError("method='pad' and method='backfill' not "
                                      "implemented yet for CategoricalIndex")
        elif method == "nearest":
            raise NotImplementedError(
                "method='nearest' not implemented yet for CategoricalIndex")

        codes = self._values._validate_listlike(target._values)
        indexer, _ = self._engine.get_indexer_non_unique(codes)
        return ensure_platform_int(indexer)
Example 28
def get_group_index_sorter(group_index: np.ndarray,
                           ngroups: int | None = None) -> np.ndarray:
    """
    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')

    Parameters
    ----------
    group_index : np.ndarray
        signed integer dtype
    ngroups : int or None, default None

    Returns
    -------
    np.ndarray[np.intp]
    """
    if ngroups is None:
        # error: Incompatible types in assignment (expression has type "number[Any]",
        # variable has type "Optional[int]")
        ngroups = 1 + group_index.max()  # type: ignore[assignment]
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    # error: Unsupported operand types for * ("float" and "None")
    do_groupsort = count > 0 and (
        (alpha + beta * ngroups) <
        (count * np.log(count))  # type: ignore[operator]
    )
    if do_groupsort:
        sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
        # sorter _should_ already be intp, but mypy is not yet able to verify
    else:
        sorter = group_index.argsort(kind="mergesort")
    return ensure_platform_int(sorter)
Example 29
    def test_engineless_lookup(self):
        # GH 16685
        # Standard lookup on RangeIndex should not require the engine to be
        # created
        idx = RangeIndex(2, 10, 3)

        assert idx.get_loc(5) == 1
        tm.assert_numpy_array_equal(idx.get_indexer([2, 8]),
                                    ensure_platform_int(np.array([0, 2])))
        with pytest.raises(KeyError):
            idx.get_loc(3)

        assert '_engine' not in idx._cache

        # The engine is still required for lookup of a different dtype scalar:
        with pytest.raises(KeyError):
            assert idx.get_loc('a') == -1

        assert '_engine' in idx._cache
Example 30
def test_transform_fast():

    df = DataFrame({'id': np.arange(100000) / 3,
                    'val': np.random.randn(100000)})

    grp = df.groupby('id')['val']

    values = np.repeat(grp.mean().values,
                       ensure_platform_int(grp.count().values))
    expected = pd.Series(values, index=df.index, name='val')

    result = grp.transform(np.mean)
    assert_series_equal(result, expected)

    result = grp.transform('mean')
    assert_series_equal(result, expected)

    # GH 12737
    df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
                       'd': pd.date_range('2014-1-1', '2014-1-4'),
                       'i': [1, 2, 3, 4]},
                      columns=['grouping', 'f', 'i', 'd'])
    result = df.groupby('grouping').transform('first')

    dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
             pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
    expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
                             'd': dates,
                             'i': [1, 2, 2, 4]},
                            columns=['f', 'i', 'd'])
    assert_frame_equal(result, expected)

    # selection
    result = df.groupby('grouping')[['f', 'i']].transform('first')
    expected = expected[['f', 'i']]
    assert_frame_equal(result, expected)

    # dup columns
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
    result = df.groupby('g').transform('first')
    expected = df.drop('g', axis=1)
    assert_frame_equal(result, expected)
Example 31
def take_1d(
    arr: ArrayLike,
    indexer: npt.NDArray[np.intp],
    fill_value=None,
    allow_fill: bool = True,
) -> ArrayLike:
    """
    Specialized version for 1D arrays. Differences compared to `take_nd`:

    - Assumes input array has already been converted to numpy array / EA
    - Assumes indexer is already guaranteed to be intp dtype ndarray
    - Only works for 1D arrays

    To ensure the lowest possible overhead.

    Note: similarly to `take_nd`, this function assumes that the indexer is
    a valid(ated) indexer with no out of bound indices.
    """
    indexer = ensure_platform_int(indexer)

    if not isinstance(arr, np.ndarray):
        # ExtensionArray -> dispatch to their method
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    if not allow_fill:
        return arr.take(indexer)

    dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
        arr, indexer, fill_value, True)

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    out = np.empty(indexer.shape, dtype=dtype)

    func = _get_take_nd_function(arr.ndim,
                                 arr.dtype,
                                 out.dtype,
                                 axis=0,
                                 mask_info=mask_info)
    func(arr, indexer, out, fill_value)

    return out
Example 32
    def _get_indexer_non_unique(self, values: ArrayLike):
        """
        get_indexer_non_unique but after unwrapping the target Index object.
        """
        # Note: we use engine.get_indexer_non_unique for get_indexer in addition
        #  to get_indexer_non_unique because, even if `target` is unique, any
        #  non-category entries in it will be encoded as -1, so `codes` may
        #  not be unique.

        if isinstance(values, Categorical):
            # Indexing on codes is more efficient if categories are the same,
            #  so we can apply some optimizations based on the degree of
            #  dtype-matching.
            cat = self._data._encode_with_my_categories(values)
            codes = cat._codes
        else:
            codes = self.categories.get_indexer(values)

        indexer, missing = self._engine.get_indexer_non_unique(codes)
        return ensure_platform_int(indexer), missing
Example 33
    def test_engineless_lookup(self):
        # GH 16685
        # Standard lookup on RangeIndex should not require the engine to be
        # created
        idx = RangeIndex(2, 10, 3)

        assert idx.get_loc(5) == 1
        tm.assert_numpy_array_equal(idx.get_indexer([2, 8]),
                                    ensure_platform_int(np.array([0, 2])))
        with pytest.raises(KeyError, match="3"):
            idx.get_loc(3)

        assert "_engine" not in idx._cache

        # Different types of scalars can be excluded immediately, no need to
        #  use the _engine
        with pytest.raises(KeyError, match="'a'"):
            idx.get_loc("a")

        assert "_engine" not in idx._cache
Example 34
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        method = missing.clean_reindex_fill_method(method)
        target = ibase.ensure_index(target)

        if self.is_unique and self.equals(target):
            return np.arange(len(self), dtype="intp")

        if method == "pad" or method == "backfill":
            raise NotImplementedError("method='pad' and method='backfill' not "
                                      "implemented yet for CategoricalIndex")
        elif method == "nearest":
            raise NotImplementedError(
                "method='nearest' not implemented yet for CategoricalIndex")

        # Note: we use engine.get_indexer_non_unique below because, even if
        #  `target` is unique, any non-category entries in it will be encoded
        #  as -1 by _get_codes_for_get_indexer, so `codes` may not be unique.
        codes = self._get_codes_for_get_indexer(target._values)
        indexer, _ = self._engine.get_indexer_non_unique(codes)
        return ensure_platform_int(indexer)
Example 35
    def unstack(self, unstacker, fill_value) -> ArrayManager:
        """
        Return an ArrayManager with all arrays unstacked.

        Parameters
        ----------
        unstacker : reshape._Unstacker
        fill_value : Any
            fill_value for newly introduced missing values.

        Returns
        -------
        unstacked : ArrayManager
        """
        indexer, _ = unstacker._indexer_and_to_sort
        if unstacker.mask.all():
            new_indexer = indexer
            allow_fill = False
        else:
            new_indexer = np.full(unstacker.mask.shape, -1)
            new_indexer[unstacker.mask] = indexer
            allow_fill = True
        new_indexer2D = new_indexer.reshape(*unstacker.full_shape)
        new_indexer2D = ensure_platform_int(new_indexer2D)

        new_arrays = []
        for arr in self.arrays:
            for i in range(unstacker.full_shape[1]):
                new_arr = take_1d(
                    arr,
                    new_indexer2D[:, i],
                    allow_fill=allow_fill,
                    fill_value=fill_value,
                )
                new_arrays.append(new_arr)

        new_index = unstacker.new_index
        new_columns = unstacker.get_new_columns(self._axes[1])
        new_axes = [new_index, new_columns]

        return type(self)(new_arrays, new_axes, verify_integrity=False)
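A small check of the indexer widening above (made-up mask): positions where the unstacker mask is False receive -1, so the subsequent take_1d call fills them with fill_value.

import numpy as np

indexer = np.array([2, 0, 1])
mask = np.array([True, False, True, True])
new_indexer = np.full(mask.shape, -1)
new_indexer[mask] = indexer
print(new_indexer)  # [ 2 -1  0  1]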
Example 36
def _take_nd_object(
    arr: np.ndarray,
    indexer: np.ndarray,
    out: np.ndarray,
    axis: int,
    fill_value,
    mask_info,
):
    if mask_info is not None:
        mask, needs_masking = mask_info
    else:
        mask = indexer == -1
        needs_masking = mask.any()
    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)
    if arr.shape[axis] > 0:
        arr.take(ensure_platform_int(indexer), axis=axis, out=out)
    if needs_masking:
        outindexer = [slice(None)] * arr.ndim
        outindexer[axis] = mask
        out[tuple(outindexer)] = fill_value
Example 37
    def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                       limit=None, takeable=False):
        if level is not None:
            raise TypeError('Reindex by level not supported for sparse')

        if self.index.equals(index):
            if copy:
                return self.copy()
            else:
                return self

        if len(self.index) == 0:
            return self._constructor(
                index=index, columns=self.columns).__finalize__(self)

        indexer = self.index.get_indexer(index, method, limit=limit)
        indexer = ensure_platform_int(indexer)
        mask = indexer == -1
        need_mask = mask.any()

        new_series = {}
        for col, series in self.iteritems():
            if mask.all():
                continue

            values = series.values
            # .take returns SparseArray
            new = values.take(indexer)
            if need_mask:
                new = new.values
                # convert integer to float if necessary. need to do a lot
                # more than that, handle boolean etc also
                new, fill_value = maybe_upcast(new, fill_value=fill_value)
                np.putmask(new, mask, fill_value)

            new_series[col] = new

        return self._constructor(
            new_series, index=index, columns=self.columns,
            default_fill_value=self._default_fill_value).__finalize__(self)
Example 38
    def _get_indexer(
        self,
        target: Index,
        method: str | None = None,
        limit: int | None = None,
        tolerance: Any | None = None,
    ) -> np.ndarray:
        # returned ndarray is np.intp

        if isinstance(target, IntervalIndex):
            # equal indexes -> 1:1 positional match
            if self.equals(target):
                return np.arange(len(self), dtype="intp")

            if not self._should_compare(target):
                return self._get_indexer_non_comparable(target,
                                                        method,
                                                        unique=True)

            # non-overlapping -> at most one match per interval in target
            # want exact matches -> need both left/right to match, so defer to
            # left/right get_indexer, compare elementwise, equality -> match
            left_indexer = self.left.get_indexer(target.left)
            right_indexer = self.right.get_indexer(target.right)
            indexer = np.where(left_indexer == right_indexer, left_indexer, -1)
        elif is_categorical_dtype(target.dtype):
            target = cast("CategoricalIndex", target)
            # get an indexer for unique categories then propagate to codes via take_nd
            categories_indexer = self.get_indexer(target.categories)
            indexer = take_nd(categories_indexer, target.codes, fill_value=-1)
        elif not is_object_dtype(target):
            # homogeneous scalar index: use IntervalTree
            target = self._maybe_convert_i8(target)
            indexer = self._engine.get_indexer(target.values)
        else:
            # heterogeneous scalar index: defer elementwise to get_loc
            return self._get_indexer_pointwise(target)[0]

        return ensure_platform_int(indexer)
Example 39
def get_group_index_sorter(group_index, ngroups):
    """
    algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of unique
    values of groupby keys. This can be huge when doing multi-key groupby.
    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame;
    Both algorithms are `stable` sort and that is necessary for correctness of
    groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters
    do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))
    if do_groupsort:
        sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
        return ensure_platform_int(sorter)
    else:
        return group_index.argsort(kind="mergesort")
Example 40
    def _get_indexer(
        self,
        target: Index,
        method: str | None = None,
        limit: int | None = None,
        tolerance=None,
    ) -> np.ndarray:
        # -> np.ndarray[np.intp]
        if com.any_not_none(method, tolerance, limit):
            return super()._get_indexer(target,
                                        method=method,
                                        tolerance=tolerance,
                                        limit=limit)

        if self.step > 0:
            start, stop, step = self.start, self.stop, self.step
        else:
            # GH 28678: work on reversed range for simplicity
            reverse = self._range[::-1]
            start, stop, step = reverse.start, reverse.stop, reverse.step

        if not is_signed_integer_dtype(target):
            # checks/conversions/roundings are delegated to general method
            return super()._get_indexer(target,
                                        method=method,
                                        tolerance=tolerance)

        target_array = np.asarray(target)
        locs = target_array - start
        valid = (locs % step == 0) & (locs >= 0) & (target_array < stop)
        locs[~valid] = -1
        locs[valid] = locs[valid] / step

        if step != self.step:
            # We reversed this range: transform to original locs
            locs[valid] = len(self) - 1 - locs[valid]
        return ensure_platform_int(locs)
Example 41
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        if not (method is None and tolerance is None and is_list_like(target)):
            return super().get_indexer(target, method=method, tolerance=tolerance)

        if self.step > 0:
            start, stop, step = self.start, self.stop, self.step
        else:
            # Work on reversed range for simplicity:
            start, stop, step = (self.stop - self.step, self.start + 1, -self.step)

        target_array = np.asarray(target)
        if not (is_integer_dtype(target_array) and target_array.ndim == 1):
            # checks/conversions/roundings are delegated to general method
            return super().get_indexer(target, method=method, tolerance=tolerance)

        locs = target_array - start
        valid = (locs % step == 0) & (locs >= 0) & (target_array < stop)
        locs[~valid] = -1
        locs[valid] = locs[valid] / step

        if step != self.step:
            # We reversed this range: transform to original locs
            locs[valid] = len(self) - 1 - locs[valid]
        return ensure_platform_int(locs)
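The arithmetic above can be checked standalone; a sketch assuming a range equivalent to RangeIndex(2, 10, 3), i.e. values [2, 5, 8]:

import numpy as np

start, stop, step = 2, 10, 3
target = np.asarray([2, 8, 3])

locs = target - start
valid = (locs % step == 0) & (locs >= 0) & (target < stop)
locs[~valid] = -1
locs[valid] = locs[valid] // step
print(locs)  # [ 0  2 -1]: 2 and 8 sit at positions 0 and 2; 3 is not in the range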
Example 42
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list-like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError("Only list-like objects are allowed to be passed to"
                        "safe_sort as values")

    if not isinstance(values, np.ndarray):

        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, string_types) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    sorter = None
    if PY3 and lib.infer_dtype(values, skipna=False) == 'mixed-integer':
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be"
                        "passed to safe_sort as labels")
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    reverse_indexer = np.empty(len(sorter), dtype=np.int_)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = (labels < -len(values)) | (labels >= len(values)) | \
        (labels == na_sentinel)

    # (Out of bound indices will be masked with `na_sentinel` next, so we may
    # deal with them here without performance loss using `mode='wrap'`.)
    new_labels = reverse_indexer.take(labels, mode='wrap')
    np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)
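A brief usage sketch of safe_sort as defined above; the import path is an assumption (the function has lived in pandas.core.sorting in this era of the codebase):

import numpy as np
from pandas.core.sorting import safe_sort  # assumed location

values = np.array([3, 1, 2])
labels = np.array([0, 1, -1, 2])  # -1 is the na_sentinel ("not found")

ordered, new_labels = safe_sort(values, labels)
print(ordered)     # [1 2 3]
print(new_labels)  # [ 2  0 -1  1]: each label re-pointed at its value's sorted position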