Ejemplo n.º 1
0
    def test_labels(self):
        """safe_sort remaps labels to sorted positions, honoring sentinels."""
        values = [3, 1, 2, 0, 4]
        expected = np.array([0, 1, 2, 3, 4])

        # default na_sentinel (-1) passes through untouched
        labels = [0, 1, 1, 2, 3, 0, -1, 4]
        result, result_labels = algos.safe_sort(values, labels)
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_labels,
                                    np.array([3, 1, 1, 2, 0, 3, -1, 4]))

        # a custom na_sentinel is likewise preserved in the output
        labels = [0, 1, 1, 2, 3, 0, 99, 4]
        result, result_labels = algos.safe_sort(values, labels,
                                                na_sentinel=99)
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_labels,
                                    np.array([3, 1, 1, 2, 0, 3, 99, 4]))

        # out-of-bound indices are mapped to the sentinel
        labels = [0, 101, 102, 2, 3, 0, 99, 4]
        result, result_labels = algos.safe_sort(values, labels)
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_labels,
                                    np.array([3, -1, -1, 2, 0, 3, -1, 4]))

        # empty labels yield an empty integer label array
        result, result_labels = algos.safe_sort(values, [])
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_labels, np.array([], dtype=np.int_))
Ejemplo n.º 2
0
    def test_mixed_integer(self):
        """safe_sort on object arrays mixing ints and strings: ints first."""
        values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object)
        result = algos.safe_sort(values)
        expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

        # With labels: sorted order is [0, 1, 'a', 'b'], so original
        # positions map 0 ('b') -> 3, 1 (1) -> 1, 2 (0) -> 0, 3 ('a') -> 2.
        # The original test computed result/result_labels but never
        # asserted on them; add the missing expected labels and checks.
        values = np.array(['b', 1, 0, 'a'], dtype=object)
        labels = [0, 1, 2, 3, 0, -1, 1]
        result, result_labels = algos.safe_sort(values, labels)
        expected = np.array([0, 1, 'a', 'b'], dtype=object)
        expected_labels = np.array([3, 1, 0, 2, 3, -1, 1])
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_labels, expected_labels)
Ejemplo n.º 3
0
    def test_basic_sort(self):
        """safe_sort returns a sorted ndarray for ints, strings, and empty."""
        cases = [
            ([3, 1, 2, 0, 4], np.array([0, 1, 2, 3, 4])),
            (list("baaacb"), np.array(list("aaabbc"))),
            ([], np.array([])),
        ]
        for values, expected in cases:
            result = algos.safe_sort(values)
            tm.assert_numpy_array_equal(result, expected)
Ejemplo n.º 4
0
    def test_exceptions(self):
        """Invalid inputs to safe_sort raise informative errors."""
        cases = [
            (TypeError, "Only list-like objects are allowed",
             dict(values=1)),
            (TypeError, "Only list-like objects or None",
             dict(values=[0, 1, 2], labels=1)),
            (ValueError, "values should be unique",
             dict(values=[0, 1, 2, 1], labels=[0, 1])),
        ]
        for err, regexp, kwargs in cases:
            with tm.assertRaisesRegexp(err, regexp):
                algos.safe_sort(**kwargs)
Ejemplo n.º 5
0
    def normalize_dictlike_arg(
        self, how: str, obj: DataFrame | Series, func: AggFuncTypeDict
    ) -> AggFuncTypeDict:
        """
        Handler for dict-like argument.

        Ensures that necessary columns exist if obj is a DataFrame, and
        that a nested renamer is not passed. Also normalizes to all lists
        when values consists of a mix of list and non-lists.
        """
        assert how in ("apply", "agg", "transform")

        # Can't use func.values(); wouldn't work for a Series
        is_series_agg = how == "agg" and isinstance(obj, ABCSeries)
        if (
            is_series_agg and any(is_list_like(v) for _, v in func.items())
        ) or any(is_dict_like(v) for _, v in func.items()):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        if obj.ndim != 1:
            # Frame: every key of func must name an existing column
            missing = set(func.keys()) - set(obj.columns)
            if missing:
                raise KeyError(
                    f"Column(s) {list(safe_sort(list(missing)))} do not exist"
                )

        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

        # If any value is a non-scalar (e.g. {'A': ['mean']}), normalize
        # every value to a list-like so downstream handling is uniform.
        # Cannot use func.values() because arg may be a Series.
        if any(is_aggregator(x) for _, x in func.items()):
            normalized: AggFuncTypeDict = {
                # mypy can't realize v is not a list here
                k: v if is_aggregator(v) else [v]  # type: ignore[list-item]
                for k, v in func.items()
            }
            func = normalized
        return func
Ejemplo n.º 6
0
def transform_dict_like(
    obj: FrameOrSeries,
    func: AggFuncTypeDict,
    *args,
    **kwargs,
):
    """
    Compute transform in the case of a dict-like func
    """
    from pandas.core.reshape.concat import concat

    if len(func) == 0:
        raise ValueError("No transform functions were provided")

    if obj.ndim != 1:
        # Frame: every key of func must name an existing column
        missing = set(func.keys()) - set(obj.columns)
        if missing:
            missing_sorted = list(safe_sort(list(missing)))
            raise SpecificationError(f"Column(s) {missing_sorted} do not exist")

    # Can't use func.values(); wouldn't work for a Series
    if any(is_dict_like(v) for _, v in func.items()):
        # GH 15931 - deprecation of renaming keys
        raise SpecificationError("nested renamer is not supported")

    results: Dict[Hashable, FrameOrSeriesUnion] = {}
    for name, how in func.items():
        colg = obj._gotitem(name, ndim=1)
        try:
            results[name] = transform(colg, how, 0, *args, **kwargs)
        except Exception as err:
            # Only these two messages signal a real usage error; any
            # other per-column failure is deliberately skipped so the
            # remaining columns can still be transformed.
            if str(err) in (
                "Function did not transform",
                "No transform functions were provided",
            ):
                raise err

    # combine per-column results into one frame
    if len(results) == 0:
        raise ValueError("Transform function failed")
    return concat(results, axis=1)
Ejemplo n.º 7
0
    def validate_dictlike_arg(self, how: str, obj: FrameOrSeriesUnion,
                              func: AggFuncTypeDict) -> None:
        """
        Raise if dict-like argument is invalid.

        Ensures that necessary columns exist if obj is a DataFrame, and
        that a nested renamer is not passed.
        """
        assert how in ("apply", "agg", "transform")

        # Can't use func.values(); wouldn't work for a Series
        series_agg = how == "agg" and isinstance(obj, ABCSeries)
        if ((series_agg
             and any(is_list_like(v) for _, v in func.items()))
                or any(is_dict_like(v) for _, v in func.items())):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        if obj.ndim != 1:
            # Frame: every key of func must name an existing column
            missing = set(func.keys()) - set(obj.columns)
            if missing:
                raise KeyError(
                    f"Column(s) {list(safe_sort(list(missing)))} do not exist")
Ejemplo n.º 8
0
 def test_mixed_integer_from_list(self):
     """Mixed int/str list sorts with all ints before strings."""
     mixed = ["b", 1, 0, "a", 0, "b"]
     expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object)
     tm.assert_numpy_array_equal(safe_sort(mixed), expected)
Ejemplo n.º 9
0
def assert_index_equal(
    left: Index,
    right: Index,
    exact: bool | str = "equiv",
    check_names: bool = True,
    check_less_precise: bool | int | NoDefault = no_default,
    check_exact: bool = True,
    check_categorical: bool = True,
    check_order: bool = True,
    rtol: float = 1.0e-5,
    atol: float = 1.0e-8,
    obj: str = "Index",
) -> None:
    """
    Check that left and right Index are equal.

    Parameters
    ----------
    left : Index
    right : Index
    exact : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical. If 'equiv', then RangeIndex can be substituted for
        Int64Index as well.
    check_names : bool, default True
        Whether to check the names attribute.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_exact : bool, default True
        Whether to compare number exactly.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_order : bool, default True
        Whether to compare the order of index entries as well as their values.
        If True, both indexes must contain the same elements, in the same order.
        If False, both indexes must contain the same elements, but in any order.

        .. versionadded:: 1.2.0
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Index'
        Specify object name being compared, internally used to show appropriate
        assertion message.

    Examples
    --------
    >>> from pandas import testing as tm
    >>> a = pd.Index([1, 2, 3])
    >>> b = pd.Index([1, 2, 3])
    >>> tm.assert_index_equal(a, b)
    """
    # Hide this frame from pytest tracebacks so failures point at the caller.
    __tracebackhide__ = True

    def _check_types(left, right, obj="Index") -> None:
        """Assert class, inferred_type and dtype agreement, honoring `exact`."""
        if not exact:
            return

        assert_class_equal(left, right, exact=exact, obj=obj)
        assert_attr_equal("inferred_type", left, right, obj=obj)

        # Skip exact dtype checking when `check_categorical` is False
        if is_categorical_dtype(left.dtype) and is_categorical_dtype(
                right.dtype):
            if check_categorical:
                assert_attr_equal("dtype", left, right, obj=obj)
                # recurse into the categories themselves
                assert_index_equal(left.categories,
                                   right.categories,
                                   exact=exact)
            return

        assert_attr_equal("dtype", left, right, obj=obj)

    def _get_ilevel_values(index, level):
        """Materialize one MultiIndex level's values via its codes."""
        # accept level number only
        unique = index.levels[level]
        level_codes = index.codes[level]
        # take with fill_value keeps missing entries as the level's NA value
        filled = take_nd(unique._values,
                         level_codes,
                         fill_value=unique._na_value)
        return unique._shallow_copy(filled, name=index.names[level])

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # https://github.com/python/mypy/issues/7642
        # error: Argument 1 to "_get_tol_from_less_precise" has incompatible
        # type "Union[bool, int, NoDefault]"; expected "Union[bool, int]"
        rtol = atol = _get_tol_from_less_precise(
            check_less_precise  # type: ignore[arg-type]
        )

    # instance validation
    _check_isinstance(left, right, Index)

    # class / dtype comparison
    _check_types(left, right, obj=obj)

    # level comparison
    if left.nlevels != right.nlevels:
        msg1 = f"{obj} levels are different"
        msg2 = f"{left.nlevels}, {left}"
        msg3 = f"{right.nlevels}, {right}"
        raise_assert_detail(obj, msg1, msg2, msg3)

    # length comparison
    if len(left) != len(right):
        msg1 = f"{obj} length are different"
        msg2 = f"{len(left)}, {left}"
        msg3 = f"{len(right)}, {right}"
        raise_assert_detail(obj, msg1, msg2, msg3)

    # If order doesn't matter then sort the index entries
    if not check_order:
        left = Index(safe_sort(left))
        right = Index(safe_sort(right))

    # MultiIndex special comparison for little-friendly error messages
    if left.nlevels > 1:
        left = cast(MultiIndex, left)
        right = cast(MultiIndex, right)

        for level in range(left.nlevels):
            # cannot use get_level_values here because it can change dtype
            llevel = _get_ilevel_values(left, level)
            rlevel = _get_ilevel_values(right, level)

            lobj = f"MultiIndex level [{level}]"
            # compare each level as its own Index (recursive call)
            assert_index_equal(
                llevel,
                rlevel,
                exact=exact,
                check_names=check_names,
                check_exact=check_exact,
                rtol=rtol,
                atol=atol,
                obj=lobj,
            )
            # get_level_values may change dtype
            _check_types(left.levels[level], right.levels[level], obj=obj)

    # skip exact index checking when `check_categorical` is False
    if check_exact and check_categorical:
        if not left.equals(right):
            mismatch = left._values != right._values

            # percentage of differing positions, for the error message
            diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
            msg = f"{obj} values are different ({np.round(diff, 5)} %)"
            raise_assert_detail(obj, msg, left, right)
    else:

        # if we have "equiv", this becomes True
        exact_bool = bool(exact)
        _testing.assert_almost_equal(
            left.values,
            right.values,
            rtol=rtol,
            atol=atol,
            check_dtype=exact_bool,
            obj=obj,
            lobj=left,
            robj=right,
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("names", left, right, obj=obj)
    if isinstance(left, PeriodIndex) or isinstance(right, PeriodIndex):
        assert_attr_equal("freq", left, right, obj=obj)
    if isinstance(left, IntervalIndex) or isinstance(right, IntervalIndex):
        assert_interval_array_equal(left._values, right._values)

    if check_categorical:
        if is_categorical_dtype(left.dtype) or is_categorical_dtype(
                right.dtype):
            assert_categorical_equal(left._values,
                                     right._values,
                                     obj=f"{obj} category")
Ejemplo n.º 10
0
    def agg_dict_like(self, _axis: int) -> FrameOrSeriesUnion:
        """
        Compute aggregation in the case of a dict-like argument.

        Parameters
        ----------
        _axis : int, 0 or 1
            Axis to compute aggregation on.

        Returns
        -------
        Result of aggregation.
        """
        obj = self.obj
        arg = cast(AggFuncTypeDict, self.f)

        # values of these shapes are "aggregator specs" and trigger
        # normalization/validation below
        is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

        if _axis != 0:  # pragma: no cover
            raise ValueError("Can only pass dict with axis=0")

        selected_obj = obj._selected_obj

        # if we have a dict of any non-scalars
        # eg. {'A' : ['mean']}, normalize all to
        # be list-likes
        # Cannot use arg.values() because arg may be a Series
        if any(is_aggregator(x) for _, x in arg.items()):
            new_arg: AggFuncTypeDict = {}
            for k, v in arg.items():
                if not isinstance(v, (tuple, list, dict)):
                    # scalar spec -> single-element list
                    new_arg[k] = [v]
                else:
                    new_arg[k] = v

                # the keys must be in the columns
                # for ndim=2, or renamers for ndim=1

                # ok for now, but deprecated
                # {'A': { 'ra': 'mean' }}
                # {'A': { 'ra': ['mean'] }}
                # {'ra': ['mean']}

                # not ok
                # {'ra' : { 'A' : 'mean' }}
                if isinstance(v, dict):
                    raise SpecificationError("nested renamer is not supported")
                elif isinstance(selected_obj, ABCSeries):
                    raise SpecificationError("nested renamer is not supported")
                elif (isinstance(selected_obj, ABCDataFrame)
                      and k not in selected_obj.columns):
                    raise KeyError(f"Column '{k}' does not exist!")

            arg = new_arg

        else:
            # deprecation of renaming keys
            # GH 15931
            # report, sorted, which requested keys are not frame columns
            keys = list(arg.keys())
            if isinstance(selected_obj, ABCDataFrame) and len(
                    selected_obj.columns.intersection(keys)) != len(keys):
                cols = list(
                    safe_sort(
                        list(
                            set(keys) -
                            set(selected_obj.columns.intersection(keys))), ))
                raise SpecificationError(f"Column(s) {cols} do not exist")

        from pandas.core.reshape.concat import concat

        if selected_obj.ndim == 1:
            # key only used for output
            colg = obj._gotitem(obj._selection, ndim=1)
            results = {key: colg.agg(how) for key, how in arg.items()}
        else:
            # key used for column selection and output
            results = {
                key: obj._gotitem(key, ndim=1).agg(how)
                for key, how in arg.items()
            }

        # set the final keys
        keys = list(arg.keys())

        # Avoid making two isinstance calls in all and any below
        is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()]

        # combine results
        if all(is_ndframe):
            # every result is a Series/DataFrame: concat them
            keys_to_use = [k for k in keys if not results[k].empty]
            # Have to check, if at least one DataFrame is not empty.
            keys_to_use = keys_to_use if keys_to_use != [] else keys
            axis = 0 if isinstance(obj, ABCSeries) else 1
            result = concat({k: results[k] for k in keys_to_use}, axis=axis)
        elif any(is_ndframe):
            # There is a mix of NDFrames and scalars
            raise ValueError("cannot perform both aggregation "
                             "and transformation operations "
                             "simultaneously")
        else:
            from pandas import Series

            # we have a dict of scalars
            # GH 36212 use name only if obj is a series
            if obj.ndim == 1:
                obj = cast("Series", obj)
                name = obj.name
            else:
                name = None

            result = Series(results, name=name)

        return result
Ejemplo n.º 11
0
def test_mixed_str_nan():
    """NaN sorts to the front of an object array of strings."""
    data = np.array(["b", np.nan, "a", "b"], dtype=object)
    expected = np.array([np.nan, "a", "b", "b"], dtype=object)
    tm.assert_numpy_array_equal(safe_sort(data), expected)
Ejemplo n.º 12
0
 def test_extension_array(self, arg, exp):
     """safe_sort on an Int64 extension array returns a sorted Int64 array."""
     values = array(arg, dtype="Int64")
     expected = array(exp, dtype="Int64")
     tm.assert_extension_array_equal(safe_sort(values), expected)
Ejemplo n.º 13
0
 def test_exceptions(self, arg, codes, err, msg):
     """Parametrized: invalid values/codes raise the expected error."""
     with pytest.raises(err, match=msg):
         safe_sort(codes=codes, values=arg)
Ejemplo n.º 14
0
 def test_unsortable(self):
     # GH 13714: mixing datetimes with ints cannot be ordered
     pattern = "'[<>]' not supported between instances of .*"
     data = np.array([1, 2, datetime.now(), 0, 3], dtype=object)
     with pytest.raises(TypeError, match=pattern):
         safe_sort(data)
Ejemplo n.º 15
0
 def test_mixed_integer(self, box):
     """Mixed int/str values sort ints-first regardless of container type."""
     data = box(["b", 1, 0, "a", 0, "b"])
     expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object)
     tm.assert_numpy_array_equal(safe_sort(data), expected)
Ejemplo n.º 16
0
 def test_basic_sort(self, arg, exp):
     """Parametrized: safe_sort returns the expected sorted ndarray."""
     expected = np.array(exp)
     tm.assert_numpy_array_equal(safe_sort(arg), expected)