Esempio n. 1
0
def test_is_interval_dtype():
    assert not com.is_interval_dtype(object)
    assert not com.is_interval_dtype([1, 2, 3])

    assert com.is_interval_dtype(IntervalDtype())

    interval = pd.Interval(1, 2, closed="right")
    assert not com.is_interval_dtype(interval)
    assert com.is_interval_dtype(pd.IntervalIndex([interval]))
Esempio n. 2
0
    def test_construction_generic(self):
        # generic
        i = IntervalDtype('interval')
        assert i.subtype == ''
        assert is_interval_dtype(i)
        assert str(i) == 'interval[]'

        i = IntervalDtype()
        assert i.subtype is None
        assert is_interval_dtype(i)
        assert str(i) == 'interval'
Esempio n. 3
0
    def test_construction_generic(self):
        # generic
        i = IntervalDtype('interval')
        self.assertIs(i.subtype, None)
        self.assertTrue(is_interval_dtype(i))
        self.assertTrue(str(i) == 'interval')

        i = IntervalDtype()
        self.assertIs(i.subtype, None)
        self.assertTrue(is_interval_dtype(i))
        self.assertTrue(str(i) == 'interval')
Esempio n. 4
0
    def test_basic(self):
        assert is_interval_dtype(self.dtype)

        ii = IntervalIndex.from_breaks(range(3))

        assert is_interval_dtype(ii.dtype)
        assert is_interval_dtype(ii)

        s = Series(ii, name='A')

        assert is_interval_dtype(s.dtype)
        assert is_interval_dtype(s)
Esempio n. 5
0
 def test_basic_dtype(self):
     assert is_interval_dtype('interval[int64]')
     assert is_interval_dtype(IntervalIndex.from_tuples([(0, 1)]))
     assert is_interval_dtype(IntervalIndex.from_breaks(np.arange(4)))
     assert is_interval_dtype(IntervalIndex.from_breaks(
         date_range('20130101', periods=3)))
     assert not is_interval_dtype('U')
     assert not is_interval_dtype('S')
     assert not is_interval_dtype('foo')
     assert not is_interval_dtype(np.object_)
     assert not is_interval_dtype(np.int64)
     assert not is_interval_dtype(np.float64)
Esempio n. 6
0
    def test_basic(self):
        assert is_interval_dtype(self.dtype)

        ii = IntervalIndex.from_breaks(range(3))

        assert is_interval_dtype(ii.dtype)
        assert is_interval_dtype(ii)

        s = Series(ii, name='A')

        # dtypes
        # series results in object dtype currently,
        assert not is_interval_dtype(s.dtype)
        assert not is_interval_dtype(s)
Esempio n. 7
0
    def test_basic(self):
        self.assertTrue(is_interval_dtype(self.dtype))

        ii = IntervalIndex.from_breaks(range(3))

        self.assertTrue(is_interval_dtype(ii.dtype))
        self.assertTrue(is_interval_dtype(ii))

        s = Series(ii, name='A')

        # dtypes
        # series results in object dtype currently,
        self.assertFalse(is_interval_dtype(s.dtype))
        self.assertFalse(is_interval_dtype(s))
Esempio n. 8
0
    def __new__(cls, data, closed=None, dtype=None, copy=False,
                verify_integrity=True):

        if isinstance(data, ABCSeries) and is_interval_dtype(data):
            data = data.values

        if isinstance(data, (cls, ABCIntervalIndex)):
            left = data.left
            right = data.right
            closed = closed or data.closed
        else:

            # don't allow scalars
            if is_scalar(data):
                msg = ("{}(...) must be called with a collection of some kind,"
                       " {} was passed")
                raise TypeError(msg.format(cls.__name__, data))

            # might need to convert empty or purely na data
            data = maybe_convert_platform_interval(data)
            left, right, infer_closed = intervals_to_interval_bounds(
                data, validate_closed=closed is None)
            closed = closed or infer_closed

        return cls._simple_new(left, right, closed, copy=copy, dtype=dtype,
                               verify_integrity=verify_integrity)
Esempio n. 9
0
    def test_construction(self):
        with tm.assertRaises(ValueError):
            IntervalDtype('xx')

        for s in ['interval[int64]', 'Interval[int64]', 'int64']:
            i = IntervalDtype(s)
            self.assertEqual(i.subtype, np.dtype('int64'))
            self.assertTrue(is_interval_dtype(i))
Esempio n. 10
0
    def test_construction(self):
        with pytest.raises(ValueError):
            IntervalDtype('xx')

        for s in ['interval[int64]', 'Interval[int64]', 'int64']:
            i = IntervalDtype(s)
            assert i.subtype == np.dtype('int64')
            assert is_interval_dtype(i)
Esempio n. 11
0
    def astype(self, dtype, copy=True):
        if is_interval_dtype(dtype):
            from pandas import IntervalIndex
            return IntervalIndex(np.array(self))
        elif is_categorical_dtype(dtype):
            # GH 18630
            dtype = self.dtype.update_dtype(dtype)
            if dtype == self.dtype:
                return self.copy() if copy else self

        return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
Esempio n. 12
0
 def astype(self, dtype, copy=True):
     if is_interval_dtype(dtype):
         if copy:
             self = self.copy()
         return self
     elif is_object_dtype(dtype):
         return Index(self.values, dtype=object)
     elif is_categorical_dtype(dtype):
         from pandas import Categorical
         return Categorical(self, ordered=True)
     raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype)
Esempio n. 13
0
    def _maybe_convert_i8(self, key):
        """
        Maybe convert a given key to it's equivalent i8 value(s). Used as a
        preprocessing step prior to IntervalTree queries (self._engine), which
        expects numeric data.

        Parameters
        ----------
        key : scalar or list-like
            The key that should maybe be converted to i8.

        Returns
        -------
        key: scalar or list-like
            The original key if no conversion occured, int if converted scalar,
            Int64Index if converted list-like.
        """
        original = key
        if is_list_like(key):
            key = ensure_index(key)

        if not self._needs_i8_conversion(key):
            return original

        scalar = is_scalar(key)
        if is_interval_dtype(key) or isinstance(key, Interval):
            # convert left/right and reconstruct
            left = self._maybe_convert_i8(key.left)
            right = self._maybe_convert_i8(key.right)
            constructor = Interval if scalar else IntervalIndex.from_arrays
            return constructor(left, right, closed=self.closed)

        if scalar:
            # Timestamp/Timedelta
            key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True)
        else:
            # DatetimeIndex/TimedeltaIndex
            key_dtype, key_i8 = key.dtype, Index(key.asi8)
            if key.hasnans:
                # convert NaT from it's i8 value to np.nan so it's not viewed
                # as a valid value, maybe causing errors (e.g. is_overlapping)
                key_i8 = key_i8.where(~key._isnan)

        # ensure consistency with IntervalIndex subtype
        subtype = self.dtype.subtype
        msg = ('Cannot index an IntervalIndex of subtype {subtype} with '
               'values of dtype {other}')
        if not is_dtype_equal(subtype, key_dtype):
            raise ValueError(msg.format(subtype=subtype, other=key_dtype))

        return key_i8
Esempio n. 14
0
    def test_setitem(self):

        df = DataFrame({'A': range(10)})
        s = pd.cut(df.A, 5)
        assert isinstance(s.cat.categories, IntervalIndex)

        # B & D end up as Categoricals
        # the remainer are converted to in-line objects
        # contining an IntervalIndex.values
        df['B'] = s
        df['C'] = np.array(s)
        df['D'] = s.values
        df['E'] = np.array(s.values)

        assert is_categorical_dtype(df['B'])
        assert is_interval_dtype(df['B'].cat.categories)
        assert is_categorical_dtype(df['D'])
        assert is_interval_dtype(df['D'].cat.categories)

        assert is_object_dtype(df['C'])
        assert is_object_dtype(df['E'])

        # they compare equal as Index
        # when converted to numpy objects
        c = lambda x: Index(np.array(x))
        tm.assert_index_equal(c(df.B), c(df.B), check_names=False)
        tm.assert_index_equal(c(df.B), c(df.C), check_names=False)
        tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
        tm.assert_index_equal(c(df.B), c(df.D), check_names=False)

        # B & D are the same Series
        tm.assert_series_equal(df['B'], df['B'], check_names=False)
        tm.assert_series_equal(df['B'], df['D'], check_names=False)

        # C & E are the same Series
        tm.assert_series_equal(df['C'], df['C'], check_names=False)
        tm.assert_series_equal(df['C'], df['E'], check_names=False)
Esempio n. 15
0
    def _simple_new(cls, left, right, closed=None,
                    copy=False, dtype=None, verify_integrity=True):
        result = IntervalMixin.__new__(cls)

        closed = closed or 'right'
        left = ensure_index(left, copy=copy)
        right = ensure_index(right, copy=copy)

        if dtype is not None:
            # GH 19262: dtype must be an IntervalDtype to override inferred
            dtype = pandas_dtype(dtype)
            if not is_interval_dtype(dtype):
                msg = 'dtype must be an IntervalDtype, got {dtype}'
                raise TypeError(msg.format(dtype=dtype))
            elif dtype.subtype is not None:
                left = left.astype(dtype.subtype)
                right = right.astype(dtype.subtype)

        # coerce dtypes to match if needed
        if is_float_dtype(left) and is_integer_dtype(right):
            right = right.astype(left.dtype)
        elif is_float_dtype(right) and is_integer_dtype(left):
            left = left.astype(right.dtype)

        if type(left) != type(right):
            msg = ('must not have differing left [{ltype}] and right '
                   '[{rtype}] types')
            raise ValueError(msg.format(ltype=type(left).__name__,
                                        rtype=type(right).__name__))
        elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
            # GH 19016
            msg = ('category, object, and string subtypes are not supported '
                   'for IntervalArray')
            raise TypeError(msg)
        elif isinstance(left, ABCPeriodIndex):
            msg = 'Period dtypes are not supported, use a PeriodIndex instead'
            raise ValueError(msg)
        elif (isinstance(left, ABCDatetimeIndex) and
                str(left.tz) != str(right.tz)):
            msg = ("left and right must have the same time zone, got "
                   "'{left_tz}' and '{right_tz}'")
            raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz))

        result._left = left
        result._right = right
        result._closed = closed
        if verify_integrity:
            result._validate()
        return result
Esempio n. 16
0
    def equals(self, other):

        if self.is_(other):
            return True

        # if we can coerce to an II
        # then we can compare
        if not isinstance(other, IntervalIndex):
            if not is_interval_dtype(other):
                return False
            other = Index(getattr(other, '.values', other))

        return (self.left.equals(other.left) and
                self.right.equals(other.right) and
                self.closed == other.closed)
Esempio n. 17
0
    def equals(self, other):
        """
        Determines if two IntervalIndex objects contain the same elements
        """
        if self.is_(other):
            return True

        # if we can coerce to an II
        # then we can compare
        if not isinstance(other, IntervalIndex):
            if not is_interval_dtype(other):
                return False
            other = Index(getattr(other, '.values', other))

        return (self.left.equals(other.left) and
                self.right.equals(other.right) and
                self.closed == other.closed)
Esempio n. 18
0
    def __setitem__(self, key, value):
        # na value: need special casing to set directly on numpy arrays
        needs_float_conversion = False
        if is_scalar(value) and isna(value):
            if is_integer_dtype(self.dtype.subtype):
                # can't set NaN on a numpy integer array
                needs_float_conversion = True
            elif is_datetime64_any_dtype(self.dtype.subtype):
                # need proper NaT to set directly on the numpy array
                value = np.datetime64('NaT')
            elif is_timedelta64_dtype(self.dtype.subtype):
                # need proper NaT to set directly on the numpy array
                value = np.timedelta64('NaT')
            value_left, value_right = value, value

        # scalar interval
        elif is_interval_dtype(value) or isinstance(value, ABCInterval):
            self._check_closed_matches(value, name="value")
            value_left, value_right = value.left, value.right

        else:
            # list-like of intervals
            try:
                array = IntervalArray(value)
                value_left, value_right = array.left, array.right
            except TypeError:
                # wrong type: not interval or NA
                msg = "'value' should be an interval type, got {} instead."
                raise TypeError(msg.format(type(value)))

        # Need to ensure that left and right are updated atomically, so we're
        # forced to copy, update the copy, and swap in the new values.
        left = self.left.copy(deep=True)
        if needs_float_conversion:
            left = left.astype('float')
        left.values[key] = value_left
        self._left = left

        right = self.right.copy(deep=True)
        if needs_float_conversion:
            right = right.astype('float')
        right.values[key] = value_right
        self._right = right
Esempio n. 19
0
def get_dtype_kinds(l):
    """
    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """

    typs = set()
    for arr in l:

        dtype = arr.dtype
        if is_categorical_dtype(dtype):
            typ = 'category'
        elif is_sparse(arr):
            typ = 'sparse'
        elif isinstance(arr, ABCRangeIndex):
            typ = 'range'
        elif is_datetimetz(arr):
            # if to_concat contains different tz,
            # the result must be object dtype
            typ = str(arr.dtype)
        elif is_datetime64_dtype(dtype):
            typ = 'datetime'
        elif is_timedelta64_dtype(dtype):
            typ = 'timedelta'
        elif is_object_dtype(dtype):
            typ = 'object'
        elif is_bool_dtype(dtype):
            typ = 'bool'
        elif is_period_dtype(dtype):
            typ = str(arr.dtype)
        elif is_interval_dtype(dtype):
            typ = str(arr.dtype)
        else:
            typ = dtype.kind
        typs.add(typ)
    return typs
Esempio n. 20
0
    def astype(self, dtype, copy=True):
        """
        Cast to an ExtensionArray or NumPy array with dtype 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.

        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        array : ExtensionArray or ndarray
            ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
        """
        dtype = pandas_dtype(dtype)
        if is_interval_dtype(dtype):
            if dtype == self.dtype:
                return self.copy() if copy else self

            # need to cast to different subtype
            try:
                new_left = self.left.astype(dtype.subtype)
                new_right = self.right.astype(dtype.subtype)
            except TypeError:
                msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are '
                       'incompatible')
                raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype))
            return self._shallow_copy(new_left, new_right)
        elif is_categorical_dtype(dtype):
            return Categorical(np.asarray(self))
        # TODO: This try/except will be repeated.
        try:
            return np.asarray(self).astype(dtype, copy=copy)
        except (TypeError, ValueError):
            msg = 'Cannot cast {name} to dtype {dtype}'
            raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
Esempio n. 21
0
    def _needs_i8_conversion(self, key):
        """
        Check if a given key needs i8 conversion. Conversion is necessary for
        Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An
        Interval-like requires conversion if it's endpoints are one of the
        aforementioned types.

        Assumes that any list-like data has already been cast to an Index.

        Parameters
        ----------
        key : scalar or Index-like
            The key that should be checked for i8 conversion

        Returns
        -------
        boolean
        """
        if is_interval_dtype(key) or isinstance(key, Interval):
            return self._needs_i8_conversion(key.left)

        i8_types = (Timestamp, Timedelta, DatetimeIndex, TimedeltaIndex)
        return isinstance(key, i8_types)
Esempio n. 22
0
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
        else:
            series = pd.Series(data)
        answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)}

        ns = series.convert_dtypes(*params)
        expected_dtype = answers[tuple(params)]
        expected = pd.Series(series.values, dtype=expected_dtype)
        tm.assert_series_equal(ns, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]:
            msg = "Cannot set float NaN to integer-backed IntervalArray"
            with pytest.raises(ValueError, match=msg):
                ns[ns.notna()] = np.nan
        else:
            ns[ns.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)
Esempio n. 23
0
    def test_convert_dtypes(self, data, maindtype, params, expected_default,
                            expected_other):
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
        else:
            series = pd.Series(data)

        result = series.convert_dtypes(*params)

        param_names = [
            "infer_objects",
            "convert_string",
            "convert_integer",
            "convert_boolean",
        ]
        params_dict = dict(zip(param_names, params))

        expected_dtype = expected_default
        for (key, val), dtype in expected_other.items():
            if params_dict[key] is val:
                expected_dtype = dtype

        expected = pd.Series(data, dtype=expected_dtype)
        tm.assert_series_equal(result, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        if is_interval_dtype(
                result.dtype) and result.dtype.subtype.kind in ["i", "u"]:
            msg = "Cannot set float NaN to integer-backed IntervalArray"
            with pytest.raises(ValueError, match=msg):
                result[result.notna()] = np.nan
        else:
            result[result.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)
Esempio n. 24
0
    def __new__(cls,
                data,
                closed=None,
                dtype=None,
                copy=False,
                verify_integrity=True):

        if isinstance(data, ABCSeries) and is_interval_dtype(data):
            data = data.values

        if isinstance(data, (cls, ABCIntervalIndex)):
            left = data.left
            right = data.right
            closed = closed or data.closed
        else:

            # don't allow scalars
            if is_scalar(data):
                msg = (f"{cls.__name__}(...) must be called with a collection "
                       f"of some kind, {data} was passed")
                raise TypeError(msg)

            # might need to convert empty or purely na data
            data = maybe_convert_platform_interval(data)
            left, right, infer_closed = intervals_to_interval_bounds(
                data, validate_closed=closed is None)
            closed = closed or infer_closed

        return cls._simple_new(
            left,
            right,
            closed,
            copy=copy,
            dtype=dtype,
            verify_integrity=verify_integrity,
        )
Esempio n. 25
0
 def astype(self, dtype, copy=True):
     if is_interval_dtype(dtype):
         from pandas import IntervalIndex
         return IntervalIndex.from_intervals(np.array(self))
     return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
Esempio n. 26
0
 def test_construction_generic(self, subtype):
     # generic
     i = IntervalDtype(subtype)
     assert i.subtype is None
     assert is_interval_dtype(i)
Esempio n. 27
0
 def test_construction(self, subtype):
     i = IntervalDtype(subtype)
     assert i.subtype == np.dtype('int64')
     assert is_interval_dtype(i)
Esempio n. 28
0
def assert_series_equal(
    left,
    right,
    check_dtype=True,
    check_index_type="equiv",
    check_series_type=True,
    check_less_precise=no_default,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    check_freq=True,
    check_flags=True,
    rtol=1.0e-5,
    atol=1.0e-8,
    obj="Series",
    *,
    check_index=True,
):
    """
    Check that left and right Series are equal.

    Parameters
    ----------
    left : Series
    right : Series
    check_dtype : bool, default True
        Whether to check the Series dtype is identical.
    check_index_type : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical.
    check_series_type : bool, default True
         Whether to check the Series class is identical.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        When comparing two numbers, if the first number has magnitude less
        than 1e-5, we compare the two numbers directly and check whether
        they are equivalent within the specified precision. Otherwise, we
        compare the **ratio** of the second number to the first number and
        check whether it is equivalent to 1 within the specified precision.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_names : bool, default True
        Whether to check the Series and Index names attribute.
    check_exact : bool, default False
        Whether to compare number exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals.

        .. versionadded:: 1.0.2
    check_freq : bool, default True
        Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.

        .. versionadded:: 1.1.0
    check_flags : bool, default True
        Whether to check the `flags` attribute.

        .. versionadded:: 1.2.0

    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Series'
        Specify object name being compared, internally used to show appropriate
        assertion message.
    check_index : bool, default True
        Whether to check index equivalence. If False, then compare only values.

        .. versionadded:: 1.3.0

    Examples
    --------
    >>> from pandas import testing as tm
    >>> a = pd.Series([1, 2, 3, 4])
    >>> b = pd.Series([1, 2, 3, 4])
    >>> tm.assert_series_equal(a, b)
    """
    __tracebackhide__ = True

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        rtol = atol = _get_tol_from_less_precise(check_less_precise)

    # instance validation
    _check_isinstance(left, right, Series)

    if check_series_type:
        assert_class_equal(left, right, obj=obj)

    # length comparison
    if len(left) != len(right):
        msg1 = f"{len(left)}, {left.index}"
        msg2 = f"{len(right)}, {right.index}"
        raise_assert_detail(obj, "Series length are different", msg1, msg2)

    if check_flags:
        assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}"

    if check_index:
        # GH #38183
        assert_index_equal(
            left.index,
            right.index,
            exact=check_index_type,
            check_names=check_names,
            check_exact=check_exact,
            check_categorical=check_categorical,
            rtol=rtol,
            atol=atol,
            obj=f"{obj}.index",
        )

    if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)):
        lidx = left.index
        ridx = right.index
        assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq)

    if check_dtype:
        # We want to skip exact dtype checking when `check_categorical`
        # is False. We'll still raise if only one is a `Categorical`,
        # regardless of `check_categorical`
        if (isinstance(left.dtype, CategoricalDtype)
                and isinstance(right.dtype, CategoricalDtype)
                and not check_categorical):
            pass
        else:
            assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")

    if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(
            right.dtype):
        left_values = left._values
        right_values = right._values
        # Only check exact if dtype is numeric
        if isinstance(left_values, ExtensionArray) and isinstance(
                right_values, ExtensionArray):
            assert_extension_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                index_values=np.asarray(left.index),
            )
        else:
            assert_numpy_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                obj=str(obj),
                index_values=np.asarray(left.index),
            )
    elif check_datetimelike_compat and (needs_i8_conversion(left.dtype)
                                        or needs_i8_conversion(right.dtype)):
        # we want to check only if we have compat dtypes
        # e.g. integer and M|m are NOT compat, but we can simply check
        # the values in that case

        # datetimelike may have different objects (e.g. datetime.datetime
        # vs Timestamp) but will compare equal
        if not Index(left._values).equals(Index(right._values)):
            msg = (f"[datetimelike_compat=True] {left._values} "
                   f"is not equal to {right._values}.")
            raise AssertionError(msg)
    elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype):
        assert_interval_array_equal(left.array, right.array)
    elif isinstance(left.dtype, CategoricalDtype) or isinstance(
            right.dtype, CategoricalDtype):
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(
            right.dtype):
        assert_extension_array_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype_and_needs_i8_conversion(
            left.dtype,
            right.dtype) or is_extension_array_dtype_and_needs_i8_conversion(
                right.dtype, left.dtype):
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
        # DatetimeArray or TimedeltaArray
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    else:
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("name", left, right, obj=obj)

    if check_categorical:
        if isinstance(left.dtype, CategoricalDtype) or isinstance(
                right.dtype, CategoricalDtype):
            assert_categorical_equal(
                left._values,
                right._values,
                obj=f"{obj} category",
                check_category_order=check_category_order,
            )
Esempio n. 29
0
 def astype(self, dtype, copy=True):
     with rewrite_exception('IntervalArray', self.__class__.__name__):
         new_values = self.values.astype(dtype, copy=copy)
     if is_interval_dtype(new_values):
         return self._shallow_copy(new_values.left, new_values.right)
     return super(IntervalIndex, self).astype(dtype, copy=copy)
Esempio n. 30
0
 def test_construction(self, subtype):
     i = IntervalDtype(subtype)
     assert i.subtype == np.dtype('int64')
     assert is_interval_dtype(i)
Esempio n. 31
0
def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        a bytes object is returned.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``infer`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    if index is None:
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # If unsupported dtypes are found raise NotImplementedError
    # In Pyarrow 9.0.0 this check will no longer be needed
    for dtype in df.dtypes:
        if (is_categorical_dtype(dtype) or is_interval_dtype(dtype)
                or is_period_dtype(dtype) or is_unsigned_integer_dtype(dtype)):
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet.")

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    engine = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    was_none = path is None
    if was_none:
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)  # For mypy
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if was_none:
        assert isinstance(path, io.BytesIO)  # For mypy
        return path.getvalue()
    return None
Esempio n. 32
0
 def astype(self, dtype, copy=True):
     with rewrite_exception("IntervalArray", type(self).__name__):
         new_values = self._values.astype(dtype, copy=copy)
     if is_interval_dtype(new_values.dtype):
         return self._shallow_copy(new_values)
     return Index.astype(self, dtype, copy=copy)
Esempio n. 33
0
    def _maybe_convert_i8(self, key):
        """
        Maybe convert a given key to its equivalent i8 value(s). Used as a
        preprocessing step prior to IntervalTree queries (self._engine), which
        expects numeric data.

        Parameters
        ----------
        key : scalar or list-like
            The key that should maybe be converted to i8.

        Returns
        -------
        scalar or list-like
            The original key if no conversion occurred, int if converted scalar,
            Int64Index if converted list-like.
        """
        original = key
        if is_list_like(key):
            key = ensure_index(key)

        if not self._needs_i8_conversion(key):
            return original

        scalar = is_scalar(key)
        if is_interval_dtype(key) or isinstance(key, Interval):
            # convert left/right and reconstruct
            left = self._maybe_convert_i8(key.left)
            right = self._maybe_convert_i8(key.right)
            constructor = Interval if scalar else IntervalIndex.from_arrays
            # error: "object" not callable
            return constructor(left, right,
                               closed=self.closed)  # type: ignore[operator]

        if scalar:
            # Timestamp/Timedelta
            key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True)
            if lib.is_period(key):
                key_i8 = key.ordinal
            elif isinstance(key_i8, Timestamp):
                key_i8 = key_i8.value
            elif isinstance(key_i8, (np.datetime64, np.timedelta64)):
                key_i8 = key_i8.view("i8")
        else:
            # DatetimeIndex/TimedeltaIndex
            key_dtype, key_i8 = key.dtype, Index(key.asi8)
            if key.hasnans:
                # convert NaT from its i8 value to np.nan so it's not viewed
                # as a valid value, maybe causing errors (e.g. is_overlapping)
                key_i8 = key_i8.where(~key._isnan)

        # ensure consistency with IntervalIndex subtype
        # error: Item "ExtensionDtype"/"dtype[Any]" of "Union[dtype[Any],
        # ExtensionDtype]" has no attribute "subtype"
        subtype = self.dtype.subtype  # type: ignore[union-attr]

        if not is_dtype_equal(subtype, key_dtype):
            raise ValueError(
                f"Cannot index an IntervalIndex of subtype {subtype} with "
                f"values of dtype {key_dtype}")

        return key_i8
Esempio n. 34
0
 def astype(self, dtype, copy=True):
     with rewrite_exception('IntervalArray', self.__class__.__name__):
         new_values = self.values.astype(dtype, copy=copy)
     if is_interval_dtype(new_values):
         return self._shallow_copy(new_values.left, new_values.right)
     return super().astype(dtype, copy=copy)
Esempio n. 35
0
    def _cmp_method(self, other, op):
        # ensure pandas array for list-like and eliminate non-interval scalars
        if is_list_like(other):
            if len(self) != len(other):
                raise ValueError("Lengths must match to compare")
            other = array(other)
        elif not isinstance(other, Interval):
            # non-interval scalar -> no matches
            return invalid_comparison(self, other, op)

        # determine the dtype of the elements we want to compare
        if isinstance(other, Interval):
            other_dtype = pandas_dtype("interval")
        elif not is_categorical_dtype(other.dtype):
            other_dtype = other.dtype
        else:
            # for categorical defer to categories for dtype
            other_dtype = other.categories.dtype

            # extract intervals if we have interval categories with matching closed
            if is_interval_dtype(other_dtype):
                if self.closed != other.categories.closed:
                    return invalid_comparison(self, other, op)

                other = other.categories.take(
                    other.codes,
                    allow_fill=True,
                    fill_value=other.categories._na_value)

        # interval-like -> need same closed and matching endpoints
        if is_interval_dtype(other_dtype):
            if self.closed != other.closed:
                return invalid_comparison(self, other, op)
            elif not isinstance(other, Interval):
                other = type(self)(other)

            if op is operator.eq:
                return (self._left == other.left) & (self._right
                                                     == other.right)
            elif op is operator.ne:
                return (self._left != other.left) | (self._right !=
                                                     other.right)
            elif op is operator.gt:
                return (self._left > other.left) | (
                    (self._left == other.left) & (self._right > other.right))
            elif op is operator.ge:
                return (self == other) | (self > other)
            elif op is operator.lt:
                return (self._left < other.left) | (
                    (self._left == other.left) & (self._right < other.right))
            else:
                # operator.lt
                return (self == other) | (self < other)

        # non-interval/non-object dtype -> no matches
        if not is_object_dtype(other_dtype):
            return invalid_comparison(self, other, op)

        # object dtype -> iteratively check for intervals
        result = np.zeros(len(self), dtype=bool)
        for i, obj in enumerate(other):
            try:
                result[i] = op(self[i], obj)
            except TypeError:
                if obj is NA:
                    # comparison with np.nan returns NA
                    # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092
                    result[i] = op is operator.ne
                else:
                    raise
        return result
Esempio n. 36
0
 def test_construction(self, subtype):
     i = IntervalDtype(subtype, closed="right")
     assert i.subtype == np.dtype("int64")
     assert is_interval_dtype(i)
Esempio n. 37
0
 def astype(self, dtype, copy=True):
     if is_interval_dtype(dtype):
         from pandas import IntervalIndex
         return IntervalIndex.from_intervals(np.array(self))
     return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
Esempio n. 38
0
 def astype(self, dtype, copy=True):
     if is_interval_dtype(dtype):
         return self.copy() if copy else self
     return super(IntervalIndex, self).astype(dtype, copy=copy)
Esempio n. 39
0
 def test_construction_generic(self, subtype):
     # generic
     i = IntervalDtype(subtype)
     assert i.subtype is None
     assert is_interval_dtype(i)
Esempio n. 40
0
 def test_construction_string_regex(self, subtype):
     i = IntervalDtype(subtype=subtype)
     assert i.subtype == np.dtype("int64")
     assert i.inclusive == "right"
     assert is_interval_dtype(i)
Esempio n. 41
0
 def astype(self, dtype, copy=True):
     if is_interval_dtype(dtype):
         return self.copy() if copy else self
     return super(IntervalIndex, self).astype(dtype, copy=copy)
Esempio n. 42
0
    def _simple_new(
        cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True
    ):
        result = IntervalMixin.__new__(cls)

        closed = closed or "right"
        left = ensure_index(left, copy=copy)
        right = ensure_index(right, copy=copy)

        if dtype is not None:
            # GH 19262: dtype must be an IntervalDtype to override inferred
            dtype = pandas_dtype(dtype)
            if not is_interval_dtype(dtype):
                msg = f"dtype must be an IntervalDtype, got {dtype}"
                raise TypeError(msg)
            elif dtype.subtype is not None:
                left = left.astype(dtype.subtype)
                right = right.astype(dtype.subtype)

        # coerce dtypes to match if needed
        if is_float_dtype(left) and is_integer_dtype(right):
            right = right.astype(left.dtype)
        elif is_float_dtype(right) and is_integer_dtype(left):
            left = left.astype(right.dtype)

        if type(left) != type(right):
            msg = (
                f"must not have differing left [{type(left).__name__}] and "
                f"right [{type(right).__name__}] types"
            )
            raise ValueError(msg)
        elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
            # GH 19016
            msg = (
                "category, object, and string subtypes are not supported "
                "for IntervalArray"
            )
            raise TypeError(msg)
        elif isinstance(left, ABCPeriodIndex):
            msg = "Period dtypes are not supported, use a PeriodIndex instead"
            raise ValueError(msg)
        elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz):
            msg = (
                "left and right must have the same time zone, got "
                f"'{left.tz}' and '{right.tz}'"
            )
            raise ValueError(msg)

        # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray
        from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array

        left = maybe_upcast_datetimelike_array(left)
        left = extract_array(left, extract_numpy=True)
        right = maybe_upcast_datetimelike_array(right)
        right = extract_array(right, extract_numpy=True)

        lbase = getattr(left, "_ndarray", left).base
        rbase = getattr(right, "_ndarray", right).base
        if lbase is not None and lbase is rbase:
            # If these share data, then setitem could corrupt our IA
            right = right.copy()

        result._left = left
        result._right = right
        result._closed = closed
        if verify_integrity:
            result._validate()
        return result