def test_is_interval_dtype(): assert not com.is_interval_dtype(object) assert not com.is_interval_dtype([1, 2, 3]) assert com.is_interval_dtype(IntervalDtype()) interval = pd.Interval(1, 2, closed="right") assert not com.is_interval_dtype(interval) assert com.is_interval_dtype(pd.IntervalIndex([interval]))
def test_construction_generic(self): # generic i = IntervalDtype('interval') assert i.subtype == '' assert is_interval_dtype(i) assert str(i) == 'interval[]' i = IntervalDtype() assert i.subtype is None assert is_interval_dtype(i) assert str(i) == 'interval'
def test_construction_generic(self): # generic i = IntervalDtype('interval') self.assertIs(i.subtype, None) self.assertTrue(is_interval_dtype(i)) self.assertTrue(str(i) == 'interval') i = IntervalDtype() self.assertIs(i.subtype, None) self.assertTrue(is_interval_dtype(i)) self.assertTrue(str(i) == 'interval')
def test_basic(self): assert is_interval_dtype(self.dtype) ii = IntervalIndex.from_breaks(range(3)) assert is_interval_dtype(ii.dtype) assert is_interval_dtype(ii) s = Series(ii, name='A') assert is_interval_dtype(s.dtype) assert is_interval_dtype(s)
def test_basic_dtype(self): assert is_interval_dtype('interval[int64]') assert is_interval_dtype(IntervalIndex.from_tuples([(0, 1)])) assert is_interval_dtype(IntervalIndex.from_breaks(np.arange(4))) assert is_interval_dtype(IntervalIndex.from_breaks( date_range('20130101', periods=3))) assert not is_interval_dtype('U') assert not is_interval_dtype('S') assert not is_interval_dtype('foo') assert not is_interval_dtype(np.object_) assert not is_interval_dtype(np.int64) assert not is_interval_dtype(np.float64)
def test_basic(self): assert is_interval_dtype(self.dtype) ii = IntervalIndex.from_breaks(range(3)) assert is_interval_dtype(ii.dtype) assert is_interval_dtype(ii) s = Series(ii, name='A') # dtypes # series results in object dtype currently, assert not is_interval_dtype(s.dtype) assert not is_interval_dtype(s)
def test_basic(self): self.assertTrue(is_interval_dtype(self.dtype)) ii = IntervalIndex.from_breaks(range(3)) self.assertTrue(is_interval_dtype(ii.dtype)) self.assertTrue(is_interval_dtype(ii)) s = Series(ii, name='A') # dtypes # series results in object dtype currently, self.assertFalse(is_interval_dtype(s.dtype)) self.assertFalse(is_interval_dtype(s))
def __new__(cls, data, closed=None, dtype=None, copy=False, verify_integrity=True): if isinstance(data, ABCSeries) and is_interval_dtype(data): data = data.values if isinstance(data, (cls, ABCIntervalIndex)): left = data.left right = data.right closed = closed or data.closed else: # don't allow scalars if is_scalar(data): msg = ("{}(...) must be called with a collection of some kind," " {} was passed") raise TypeError(msg.format(cls.__name__, data)) # might need to convert empty or purely na data data = maybe_convert_platform_interval(data) left, right, infer_closed = intervals_to_interval_bounds( data, validate_closed=closed is None) closed = closed or infer_closed return cls._simple_new(left, right, closed, copy=copy, dtype=dtype, verify_integrity=verify_integrity)
def test_construction(self): with tm.assertRaises(ValueError): IntervalDtype('xx') for s in ['interval[int64]', 'Interval[int64]', 'int64']: i = IntervalDtype(s) self.assertEqual(i.subtype, np.dtype('int64')) self.assertTrue(is_interval_dtype(i))
def test_construction(self): with pytest.raises(ValueError): IntervalDtype('xx') for s in ['interval[int64]', 'Interval[int64]', 'int64']: i = IntervalDtype(s) assert i.subtype == np.dtype('int64') assert is_interval_dtype(i)
def astype(self, dtype, copy=True): if is_interval_dtype(dtype): from pandas import IntervalIndex return IntervalIndex(np.array(self)) elif is_categorical_dtype(dtype): # GH 18630 dtype = self.dtype.update_dtype(dtype) if dtype == self.dtype: return self.copy() if copy else self return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
def astype(self, dtype, copy=True): if is_interval_dtype(dtype): if copy: self = self.copy() return self elif is_object_dtype(dtype): return Index(self.values, dtype=object) elif is_categorical_dtype(dtype): from pandas import Categorical return Categorical(self, ordered=True) raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype)
def _maybe_convert_i8(self, key): """ Maybe convert a given key to it's equivalent i8 value(s). Used as a preprocessing step prior to IntervalTree queries (self._engine), which expects numeric data. Parameters ---------- key : scalar or list-like The key that should maybe be converted to i8. Returns ------- key: scalar or list-like The original key if no conversion occured, int if converted scalar, Int64Index if converted list-like. """ original = key if is_list_like(key): key = ensure_index(key) if not self._needs_i8_conversion(key): return original scalar = is_scalar(key) if is_interval_dtype(key) or isinstance(key, Interval): # convert left/right and reconstruct left = self._maybe_convert_i8(key.left) right = self._maybe_convert_i8(key.right) constructor = Interval if scalar else IntervalIndex.from_arrays return constructor(left, right, closed=self.closed) if scalar: # Timestamp/Timedelta key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True) else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) if key.hasnans: # convert NaT from it's i8 value to np.nan so it's not viewed # as a valid value, maybe causing errors (e.g. is_overlapping) key_i8 = key_i8.where(~key._isnan) # ensure consistency with IntervalIndex subtype subtype = self.dtype.subtype msg = ('Cannot index an IntervalIndex of subtype {subtype} with ' 'values of dtype {other}') if not is_dtype_equal(subtype, key_dtype): raise ValueError(msg.format(subtype=subtype, other=key_dtype)) return key_i8
def test_setitem(self): df = DataFrame({'A': range(10)}) s = pd.cut(df.A, 5) assert isinstance(s.cat.categories, IntervalIndex) # B & D end up as Categoricals # the remainer are converted to in-line objects # contining an IntervalIndex.values df['B'] = s df['C'] = np.array(s) df['D'] = s.values df['E'] = np.array(s.values) assert is_categorical_dtype(df['B']) assert is_interval_dtype(df['B'].cat.categories) assert is_categorical_dtype(df['D']) assert is_interval_dtype(df['D'].cat.categories) assert is_object_dtype(df['C']) assert is_object_dtype(df['E']) # they compare equal as Index # when converted to numpy objects c = lambda x: Index(np.array(x)) tm.assert_index_equal(c(df.B), c(df.B), check_names=False) tm.assert_index_equal(c(df.B), c(df.C), check_names=False) tm.assert_index_equal(c(df.B), c(df.D), check_names=False) tm.assert_index_equal(c(df.B), c(df.D), check_names=False) # B & D are the same Series tm.assert_series_equal(df['B'], df['B'], check_names=False) tm.assert_series_equal(df['B'], df['D'], check_names=False) # C & E are the same Series tm.assert_series_equal(df['C'], df['C'], check_names=False) tm.assert_series_equal(df['C'], df['E'], check_names=False)
def _simple_new(cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True): result = IntervalMixin.__new__(cls) closed = closed or 'right' left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) if dtype is not None: # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) if not is_interval_dtype(dtype): msg = 'dtype must be an IntervalDtype, got {dtype}' raise TypeError(msg.format(dtype=dtype)) elif dtype.subtype is not None: left = left.astype(dtype.subtype) right = right.astype(dtype.subtype) # coerce dtypes to match if needed if is_float_dtype(left) and is_integer_dtype(right): right = right.astype(left.dtype) elif is_float_dtype(right) and is_integer_dtype(left): left = left.astype(right.dtype) if type(left) != type(right): msg = ('must not have differing left [{ltype}] and right ' '[{rtype}] types') raise ValueError(msg.format(ltype=type(left).__name__, rtype=type(right).__name__)) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 msg = ('category, object, and string subtypes are not supported ' 'for IntervalArray') raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): msg = 'Period dtypes are not supported, use a PeriodIndex instead' raise ValueError(msg) elif (isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz)): msg = ("left and right must have the same time zone, got " "'{left_tz}' and '{right_tz}'") raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) result._left = left result._right = right result._closed = closed if verify_integrity: result._validate() return result
def equals(self, other): if self.is_(other): return True # if we can coerce to an II # then we can compare if not isinstance(other, IntervalIndex): if not is_interval_dtype(other): return False other = Index(getattr(other, '.values', other)) return (self.left.equals(other.left) and self.right.equals(other.right) and self.closed == other.closed)
def equals(self, other): """ Determines if two IntervalIndex objects contain the same elements """ if self.is_(other): return True # if we can coerce to an II # then we can compare if not isinstance(other, IntervalIndex): if not is_interval_dtype(other): return False other = Index(getattr(other, '.values', other)) return (self.left.equals(other.left) and self.right.equals(other.right) and self.closed == other.closed)
def __setitem__(self, key, value): # na value: need special casing to set directly on numpy arrays needs_float_conversion = False if is_scalar(value) and isna(value): if is_integer_dtype(self.dtype.subtype): # can't set NaN on a numpy integer array needs_float_conversion = True elif is_datetime64_any_dtype(self.dtype.subtype): # need proper NaT to set directly on the numpy array value = np.datetime64('NaT') elif is_timedelta64_dtype(self.dtype.subtype): # need proper NaT to set directly on the numpy array value = np.timedelta64('NaT') value_left, value_right = value, value # scalar interval elif is_interval_dtype(value) or isinstance(value, ABCInterval): self._check_closed_matches(value, name="value") value_left, value_right = value.left, value.right else: # list-like of intervals try: array = IntervalArray(value) value_left, value_right = array.left, array.right except TypeError: # wrong type: not interval or NA msg = "'value' should be an interval type, got {} instead." raise TypeError(msg.format(type(value))) # Need to ensure that left and right are updated atomically, so we're # forced to copy, update the copy, and swap in the new values. left = self.left.copy(deep=True) if needs_float_conversion: left = left.astype('float') left.values[key] = value_left self._left = left right = self.right.copy(deep=True) if needs_float_conversion: right = right.astype('float') right.values[key] = value_right self._right = right
def get_dtype_kinds(l): """ Parameters ---------- l : list of arrays Returns ------- a set of kinds that exist in this list of arrays """ typs = set() for arr in l: dtype = arr.dtype if is_categorical_dtype(dtype): typ = 'category' elif is_sparse(arr): typ = 'sparse' elif isinstance(arr, ABCRangeIndex): typ = 'range' elif is_datetimetz(arr): # if to_concat contains different tz, # the result must be object dtype typ = str(arr.dtype) elif is_datetime64_dtype(dtype): typ = 'datetime' elif is_timedelta64_dtype(dtype): typ = 'timedelta' elif is_object_dtype(dtype): typ = 'object' elif is_bool_dtype(dtype): typ = 'bool' elif is_period_dtype(dtype): typ = str(arr.dtype) elif is_interval_dtype(dtype): typ = str(arr.dtype) else: typ = dtype.kind typs.add(typ) return typs
def astype(self, dtype, copy=True): """ Cast to an ExtensionArray or NumPy array with dtype 'dtype'. Parameters ---------- dtype : str or dtype Typecode or data-type to which the array is cast. copy : bool, default True Whether to copy the data, even if not necessary. If False, a copy is made only if the old dtype does not match the new dtype. Returns ------- array : ExtensionArray or ndarray ExtensionArray or NumPy ndarray with 'dtype' for its dtype. """ dtype = pandas_dtype(dtype) if is_interval_dtype(dtype): if dtype == self.dtype: return self.copy() if copy else self # need to cast to different subtype try: new_left = self.left.astype(dtype.subtype) new_right = self.right.astype(dtype.subtype) except TypeError: msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are ' 'incompatible') raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self)) # TODO: This try/except will be repeated. try: return np.asarray(self).astype(dtype, copy=copy) except (TypeError, ValueError): msg = 'Cannot cast {name} to dtype {dtype}' raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
def _needs_i8_conversion(self, key): """ Check if a given key needs i8 conversion. Conversion is necessary for Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An Interval-like requires conversion if it's endpoints are one of the aforementioned types. Assumes that any list-like data has already been cast to an Index. Parameters ---------- key : scalar or Index-like The key that should be checked for i8 conversion Returns ------- boolean """ if is_interval_dtype(key) or isinstance(key, Interval): return self._needs_i8_conversion(key.left) i8_types = (Timestamp, Timedelta, DatetimeIndex, TimedeltaIndex) return isinstance(key, i8_types)
def test_convert_dtypes(self, data, maindtype, params, answerdict): if maindtype is not None: series = pd.Series(data, dtype=maindtype) else: series = pd.Series(data) answers = {k: a for (kk, a) in answerdict.items() for k in product(*kk)} ns = series.convert_dtypes(*params) expected_dtype = answers[tuple(params)] expected = pd.Series(series.values, dtype=expected_dtype) tm.assert_series_equal(ns, expected) # Test that it is a copy copy = series.copy(deep=True) if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]: msg = "Cannot set float NaN to integer-backed IntervalArray" with pytest.raises(ValueError, match=msg): ns[ns.notna()] = np.nan else: ns[ns.notna()] = np.nan # Make sure original not changed tm.assert_series_equal(series, copy)
def test_convert_dtypes(self, data, maindtype, params, expected_default, expected_other): if maindtype is not None: series = pd.Series(data, dtype=maindtype) else: series = pd.Series(data) result = series.convert_dtypes(*params) param_names = [ "infer_objects", "convert_string", "convert_integer", "convert_boolean", ] params_dict = dict(zip(param_names, params)) expected_dtype = expected_default for (key, val), dtype in expected_other.items(): if params_dict[key] is val: expected_dtype = dtype expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) # Test that it is a copy copy = series.copy(deep=True) if is_interval_dtype( result.dtype) and result.dtype.subtype.kind in ["i", "u"]: msg = "Cannot set float NaN to integer-backed IntervalArray" with pytest.raises(ValueError, match=msg): result[result.notna()] = np.nan else: result[result.notna()] = np.nan # Make sure original not changed tm.assert_series_equal(series, copy)
def __new__(cls, data, closed=None, dtype=None, copy=False, verify_integrity=True): if isinstance(data, ABCSeries) and is_interval_dtype(data): data = data.values if isinstance(data, (cls, ABCIntervalIndex)): left = data.left right = data.right closed = closed or data.closed else: # don't allow scalars if is_scalar(data): msg = (f"{cls.__name__}(...) must be called with a collection " f"of some kind, {data} was passed") raise TypeError(msg) # might need to convert empty or purely na data data = maybe_convert_platform_interval(data) left, right, infer_closed = intervals_to_interval_bounds( data, validate_closed=closed is None) closed = closed or infer_closed return cls._simple_new( left, right, closed, copy=copy, dtype=dtype, verify_integrity=verify_integrity, )
def astype(self, dtype, copy=True): if is_interval_dtype(dtype): from pandas import IntervalIndex return IntervalIndex.from_intervals(np.array(self)) return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
def test_construction_generic(self, subtype): # generic i = IntervalDtype(subtype) assert i.subtype is None assert is_interval_dtype(i)
def test_construction(self, subtype): i = IntervalDtype(subtype) assert i.subtype == np.dtype('int64') assert is_interval_dtype(i)
def assert_series_equal( left, right, check_dtype=True, check_index_type="equiv", check_series_type=True, check_less_precise=no_default, check_names=True, check_exact=False, check_datetimelike_compat=False, check_categorical=True, check_category_order=True, check_freq=True, check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", *, check_index=True, ): """ Check that left and right Series are equal. Parameters ---------- left : Series right : Series check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True Whether to check the Series class is identical. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the digits to compare. When comparing two numbers, if the first number has magnitude less than 1e-5, we compare the two numbers directly and check whether they are equivalent within the specified precision. Otherwise, we compare the **ratio** of the second number to the first number and check whether it is equivalent to 1 within the specified precision. .. deprecated:: 1.1.0 Use `rtol` and `atol` instead to define relative/absolute tolerance, respectively. Similar to :func:`math.isclose`. check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False Whether to compare number exactly. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals. .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. check_index : bool, default True Whether to check index equivalence. If False, then compare only values. .. versionadded:: 1.3.0 Examples -------- >>> from pandas import testing as tm >>> a = pd.Series([1, 2, 3, 4]) >>> b = pd.Series([1, 2, 3, 4]) >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True if check_less_precise is not no_default: warnings.warn( "The 'check_less_precise' keyword in testing.assert_*_equal " "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, stacklevel=find_stack_level(), ) rtol = atol = _get_tol_from_less_precise(check_less_precise) # instance validation _check_isinstance(left, right, Series) if check_series_type: assert_class_equal(left, right, obj=obj) # length comparison if len(left) != len(right): msg1 = f"{len(left)}, {left.index}" msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) if check_flags: assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" if check_index: # GH #38183 assert_index_equal( left.index, right.index, exact=check_index_type, check_names=check_names, check_exact=check_exact, check_categorical=check_categorical, rtol=rtol, atol=atol, obj=f"{obj}.index", ) if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)): lidx = left.index ridx = right.index assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq) if check_dtype: # We want to skip exact dtype checking when `check_categorical` # is False. We'll still raise if only one is a `Categorical`, # regardless of `check_categorical` if (isinstance(left.dtype, CategoricalDtype) and isinstance(right.dtype, CategoricalDtype) and not check_categorical): pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype( right.dtype): left_values = left._values right_values = right._values # Only check exact if dtype is numeric if isinstance(left_values, ExtensionArray) and isinstance( right_values, ExtensionArray): assert_extension_array_equal( left_values, right_values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) else: assert_numpy_array_equal( left_values, right_values, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif check_datetimelike_compat and (needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)): # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check # the values in that case # datetimelike may have different objects (e.g. datetime.datetime # vs Timestamp) but will compare equal if not Index(left._values).equals(Index(right._values)): msg = (f"[datetimelike_compat=True] {left._values} " f"is not equal to {right._values}.") raise AssertionError(msg) elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype): assert_interval_array_equal(left.array, right.array) elif isinstance(left.dtype, CategoricalDtype) or isinstance( right.dtype, CategoricalDtype): _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype( right.dtype): assert_extension_array_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif is_extension_array_dtype_and_needs_i8_conversion( left.dtype, right.dtype) or is_extension_array_dtype_and_needs_i8_conversion( right.dtype, left.dtype): assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) # metadata comparison if check_names: assert_attr_equal("name", left, right, obj=obj) if check_categorical: if isinstance(left.dtype, CategoricalDtype) or isinstance( right.dtype, CategoricalDtype): assert_categorical_equal( left._values, right._values, obj=f"{obj} category", check_category_order=check_category_order, )
def astype(self, dtype, copy=True): with rewrite_exception('IntervalArray', self.__class__.__name__): new_values = self.values.astype(dtype, copy=copy) if is_interval_dtype(new_values): return self._shallow_copy(new_values.left, new_values.right) return super(IntervalIndex, self).astype(dtype, copy=copy)
def test_construction(self, subtype): i = IntervalDtype(subtype) assert i.subtype == np.dtype('int64') assert is_interval_dtype(i)
def to_orc( df: DataFrame, path: FilePath | WriteBuffer[bytes] | None = None, *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: """ Write a DataFrame to the ORC format. .. versionadded:: 1.5.0 Parameters ---------- df : DataFrame The dataframe to be written to ORC. Raises NotImplementedError if dtype of one or more columns is category, unsigned integers, intervals, periods or sparse. path : str, file-like object or None, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, a bytes object is returned. engine : str, default 'pyarrow' ORC library to use. Pyarrow must be >= 7.0.0. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. If ``None``, similar to ``infer`` the dataframe's index(es) will be saved. However, instead of being saved as values, the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. engine_kwargs : dict[str, Any] or None, default None Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. Returns ------- bytes if no path argument is provided else None Raises ------ NotImplementedError Dtype of one or more columns is category, unsigned integers, interval, period or sparse. ValueError engine is not pyarrow. Notes ----- * Before using this function you should read the :ref:`user guide about ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`. * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_ library. * For supported dtypes please refer to `supported ORC features in Arrow <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__. * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. """ if index is None: index = df.index.names[0] is not None if engine_kwargs is None: engine_kwargs = {} # If unsupported dtypes are found raise NotImplementedError # In Pyarrow 9.0.0 this check will no longer be needed for dtype in df.dtypes: if (is_categorical_dtype(dtype) or is_interval_dtype(dtype) or is_period_dtype(dtype) or is_unsigned_integer_dtype(dtype)): raise NotImplementedError( "The dtype of one or more columns is not supported yet.") if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") engine = import_optional_dependency(engine, min_version="7.0.0") orc = import_optional_dependency("pyarrow.orc") was_none = path is None if was_none: path = io.BytesIO() assert path is not None # For mypy with get_handle(path, "wb", is_text=False) as handles: assert isinstance(engine, ModuleType) # For mypy try: orc.write_table( engine.Table.from_pandas(df, preserve_index=index), handles.handle, **engine_kwargs, ) except TypeError as e: raise NotImplementedError( "The dtype of one or more columns is not supported yet." ) from e if was_none: assert isinstance(path, io.BytesIO) # For mypy return path.getvalue() return None
def astype(self, dtype, copy=True): with rewrite_exception("IntervalArray", type(self).__name__): new_values = self._values.astype(dtype, copy=copy) if is_interval_dtype(new_values.dtype): return self._shallow_copy(new_values) return Index.astype(self, dtype, copy=copy)
def _maybe_convert_i8(self, key): """ Maybe convert a given key to its equivalent i8 value(s). Used as a preprocessing step prior to IntervalTree queries (self._engine), which expects numeric data. Parameters ---------- key : scalar or list-like The key that should maybe be converted to i8. Returns ------- scalar or list-like The original key if no conversion occurred, int if converted scalar, Int64Index if converted list-like. """ original = key if is_list_like(key): key = ensure_index(key) if not self._needs_i8_conversion(key): return original scalar = is_scalar(key) if is_interval_dtype(key) or isinstance(key, Interval): # convert left/right and reconstruct left = self._maybe_convert_i8(key.left) right = self._maybe_convert_i8(key.right) constructor = Interval if scalar else IntervalIndex.from_arrays # error: "object" not callable return constructor(left, right, closed=self.closed) # type: ignore[operator] if scalar: # Timestamp/Timedelta key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True) if lib.is_period(key): key_i8 = key.ordinal elif isinstance(key_i8, Timestamp): key_i8 = key_i8.value elif isinstance(key_i8, (np.datetime64, np.timedelta64)): key_i8 = key_i8.view("i8") else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) if key.hasnans: # convert NaT from its i8 value to np.nan so it's not viewed # as a valid value, maybe causing errors (e.g. is_overlapping) key_i8 = key_i8.where(~key._isnan) # ensure consistency with IntervalIndex subtype # error: Item "ExtensionDtype"/"dtype[Any]" of "Union[dtype[Any], # ExtensionDtype]" has no attribute "subtype" subtype = self.dtype.subtype # type: ignore[union-attr] if not is_dtype_equal(subtype, key_dtype): raise ValueError( f"Cannot index an IntervalIndex of subtype {subtype} with " f"values of dtype {key_dtype}") return key_i8
def astype(self, dtype, copy=True): with rewrite_exception('IntervalArray', self.__class__.__name__): new_values = self.values.astype(dtype, copy=copy) if is_interval_dtype(new_values): return self._shallow_copy(new_values.left, new_values.right) return super().astype(dtype, copy=copy)
def _cmp_method(self, other, op): # ensure pandas array for list-like and eliminate non-interval scalars if is_list_like(other): if len(self) != len(other): raise ValueError("Lengths must match to compare") other = array(other) elif not isinstance(other, Interval): # non-interval scalar -> no matches return invalid_comparison(self, other, op) # determine the dtype of the elements we want to compare if isinstance(other, Interval): other_dtype = pandas_dtype("interval") elif not is_categorical_dtype(other.dtype): other_dtype = other.dtype else: # for categorical defer to categories for dtype other_dtype = other.categories.dtype # extract intervals if we have interval categories with matching closed if is_interval_dtype(other_dtype): if self.closed != other.categories.closed: return invalid_comparison(self, other, op) other = other.categories.take( other.codes, allow_fill=True, fill_value=other.categories._na_value) # interval-like -> need same closed and matching endpoints if is_interval_dtype(other_dtype): if self.closed != other.closed: return invalid_comparison(self, other, op) elif not isinstance(other, Interval): other = type(self)(other) if op is operator.eq: return (self._left == other.left) & (self._right == other.right) elif op is operator.ne: return (self._left != other.left) | (self._right != other.right) elif op is operator.gt: return (self._left > other.left) | ( (self._left == other.left) & (self._right > other.right)) elif op is operator.ge: return (self == other) | (self > other) elif op is operator.lt: return (self._left < other.left) | ( (self._left == other.left) & (self._right < other.right)) else: # operator.lt return (self == other) | (self < other) # non-interval/non-object dtype -> no matches if not is_object_dtype(other_dtype): return invalid_comparison(self, other, op) # object dtype -> iteratively check for intervals result = np.zeros(len(self), dtype=bool) for i, obj in enumerate(other): try: result[i] = op(self[i], obj) except TypeError: if obj is NA: # comparison with np.nan returns NA # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092 result[i] = op is operator.ne else: raise return result
def test_construction(self, subtype): i = IntervalDtype(subtype, closed="right") assert i.subtype == np.dtype("int64") assert is_interval_dtype(i)
def astype(self, dtype, copy=True): if is_interval_dtype(dtype): from pandas import IntervalIndex return IntervalIndex.from_intervals(np.array(self)) return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
def astype(self, dtype, copy=True): if is_interval_dtype(dtype): return self.copy() if copy else self return super(IntervalIndex, self).astype(dtype, copy=copy)
def test_construction_generic(self, subtype): # generic i = IntervalDtype(subtype) assert i.subtype is None assert is_interval_dtype(i)
def test_construction_string_regex(self, subtype): i = IntervalDtype(subtype=subtype) assert i.subtype == np.dtype("int64") assert i.inclusive == "right" assert is_interval_dtype(i)
def astype(self, dtype, copy=True): if is_interval_dtype(dtype): return self.copy() if copy else self return super(IntervalIndex, self).astype(dtype, copy=copy)
def _simple_new( cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True ): result = IntervalMixin.__new__(cls) closed = closed or "right" left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) if dtype is not None: # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) if not is_interval_dtype(dtype): msg = f"dtype must be an IntervalDtype, got {dtype}" raise TypeError(msg) elif dtype.subtype is not None: left = left.astype(dtype.subtype) right = right.astype(dtype.subtype) # coerce dtypes to match if needed if is_float_dtype(left) and is_integer_dtype(right): right = right.astype(left.dtype) elif is_float_dtype(right) and is_integer_dtype(left): left = left.astype(right.dtype) if type(left) != type(right): msg = ( f"must not have differing left [{type(left).__name__}] and " f"right [{type(right).__name__}] types" ) raise ValueError(msg) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 msg = ( "category, object, and string subtypes are not supported " "for IntervalArray" ) raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): msg = "Period dtypes are not supported, use a PeriodIndex instead" raise ValueError(msg) elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): msg = ( "left and right must have the same time zone, got " f"'{left.tz}' and '{right.tz}'" ) raise ValueError(msg) # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array left = maybe_upcast_datetimelike_array(left) left = extract_array(left, extract_numpy=True) right = maybe_upcast_datetimelike_array(right) right = extract_array(right, extract_numpy=True) lbase = getattr(left, "_ndarray", left).base rbase = getattr(right, "_ndarray", right).base if lbase is not None and lbase is rbase: # If these share data, then setitem could corrupt our IA right = right.copy() result._left = left result._right = right result._closed = closed if verify_integrity: result._validate() return result