def __init__(
    self,
    objs,
    axis=0,
    join: str = "outer",
    keys=None,
    levels=None,
    names=None,
    ignore_index: bool = False,
    verify_integrity: bool = False,
    copy: bool = True,
    sort=False,
):
    if isinstance(objs, (NDFrame, str)):
        raise TypeError(
            "first argument must be an iterable of pandas "
            f'objects, you passed an object of type "{type(objs).__name__}"'
        )

    if join == "outer":
        self.intersect = False
    elif join == "inner":
        self.intersect = True
    else:  # pragma: no cover
        raise ValueError(
            "Only can inner (intersect) or outer (union) join the other axis"
        )

    if isinstance(objs, abc.Mapping):
        if keys is None:
            keys = list(objs.keys())
        objs = [objs[k] for k in keys]
    else:
        objs = list(objs)

    if len(objs) == 0:
        raise ValueError("No objects to concatenate")

    if keys is None:
        objs = list(com.not_none(*objs))
    else:
        # #1649
        clean_keys = []
        clean_objs = []
        for k, v in zip(keys, objs):
            if v is None:
                continue
            clean_keys.append(k)
            clean_objs.append(v)
        objs = clean_objs
        name = getattr(keys, "name", None)
        keys = Index(clean_keys, name=name)

    if len(objs) == 0:
        raise ValueError("All objects passed were None")

    # consolidate data & figure out what our result ndim is going to be
    ndims = set()
    for obj in objs:
        if not isinstance(obj, (Series, DataFrame)):
            msg = (
                f"cannot concatenate object of type '{type(obj)}'; "
                "only Series and DataFrame objs are valid"
            )
            raise TypeError(msg)

        # consolidate
        obj._consolidate(inplace=True)
        ndims.add(obj.ndim)

    # get the sample
    # want the highest ndim that we have, and must be non-empty
    # unless all objs are empty
    sample = None
    if len(ndims) > 1:
        max_ndim = max(ndims)
        for obj in objs:
            if obj.ndim == max_ndim and np.sum(obj.shape):
                sample = obj
                break
    else:
        # filter out the empties if we don't have multi-index possibilities
        # note: keep empty Series, as they affect the result columns / name
        non_empties = [
            obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series)
        ]

        if len(non_empties) and (
            keys is None and names is None and levels is None and not self.intersect
        ):
            objs = non_empties
            sample = objs[0]

    if sample is None:
        sample = objs[0]
    self.objs = objs

    # Standardize axis parameter to int
    if isinstance(sample, Series):
        axis = DataFrame._get_axis_number(axis)
    else:
        axis = sample._get_axis_number(axis)

    # Need to flip BlockManager axis in the DataFrame special case
    self._is_frame = isinstance(sample, ABCDataFrame)
    if self._is_frame:
        axis = 1 if axis == 0 else 0

    self._is_series = isinstance(sample, ABCSeries)
    if not 0 <= axis <= sample.ndim:
        raise AssertionError(
            f"axis must be between 0 and {sample.ndim}, input was {axis}"
        )

    # if we have mixed ndims, then convert to highest ndim
    # creating column numbers as needed
    if len(ndims) > 1:
        current_column = 0
        max_ndim = sample.ndim
        self.objs, objs = [], self.objs
        for obj in objs:
            ndim = obj.ndim
            if ndim == max_ndim:
                pass
            elif ndim != max_ndim - 1:
                raise ValueError(
                    "cannot concatenate unaligned mixed "
                    "dimensional NDFrame objects"
                )
            else:
                name = getattr(obj, "name", None)
                if ignore_index or name is None:
                    name = current_column
                    current_column += 1

                # doing a row-wise concatenation so need everything
                # to line up
                if self._is_frame and axis == 1:
                    name = 0
                obj = sample._constructor({name: obj})

            self.objs.append(obj)

    # note: this is the BlockManager axis (since DataFrame is transposed)
    self.axis = axis
    self.keys = keys
    self.names = names or getattr(keys, "names", None)
    self.levels = levels
    self.sort = sort

    self.ignore_index = ignore_index
    self.verify_integrity = verify_integrity
    self.copy = copy

    self.new_axes = self._get_new_axes()
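# Illustrative only (not part of the pandas source): a minimal sketch of how
# the `join` argument above maps to union vs. intersection of the
# non-concatenation axis, and how a named Series is promoted to a column when
# mixed with a DataFrame.
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df2 = pd.DataFrame({"b": [5, 6], "c": [7, 8]})

outer = pd.concat([df1, df2], join="outer")  # union: columns a, b, c
inner = pd.concat([df1, df2], join="inner")  # intersection: column b only
assert list(outer.columns) == ["a", "b", "c"]
assert list(inner.columns) == ["b"]

s = pd.Series([9, 10], name="d")
mixed = pd.concat([df1, s], axis=1)  # mixed ndims: Series becomes column "d"
assert list(mixed.columns) == ["a", "b", "d"]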
@pytest.mark.parametrize(
    "obj",
    [
        tm.makeTimeDataFrame(),
        tm.makeTimeSeries(),
        Series(tm.makePeriodIndex()),
        Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
    ],
)
def test_hash_pandas_object_diff_index_non_empty(obj):
    a = hash_pandas_object(obj, index=True)
    b = hash_pandas_object(obj, index=False)
    assert not (a == b).all()


@pytest.mark.parametrize(
    "obj",
    [
        Index([1, 2, 3]),
        Index([True, False, True]),
        tm.makeTimedeltaIndex(),
        tm.makePeriodIndex(),
        MultiIndex.from_product(
            [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)]
        ),
        MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]),
    ],
)
def test_hash_pandas_index(obj, index):
    a = hash_pandas_object(obj, index=index)
    b = hash_pandas_object(obj, index=index)
    tm.assert_series_equal(a, b)
def test_insert(self):
    idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02'],
                        name='idx')

    result = idx.insert(2, datetime(2000, 1, 5))
    exp = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-05',
                         '2000-01-02'], name='idx')
    tm.assert_index_equal(result, exp)

    # insertion of non-datetime should coerce to object index
    result = idx.insert(1, 'inserted')
    expected = Index([datetime(2000, 1, 4), 'inserted',
                      datetime(2000, 1, 1), datetime(2000, 1, 2)],
                     name='idx')
    assert not isinstance(result, DatetimeIndex)
    tm.assert_index_equal(result, expected)
    assert result.name == expected.name

    idx = date_range('1/1/2000', periods=3, freq='M', name='idx')

    # preserve freq
    expected_0 = DatetimeIndex(['1999-12-31', '2000-01-31', '2000-02-29',
                                '2000-03-31'], name='idx', freq='M')
    expected_3 = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31',
                                '2000-04-30'], name='idx', freq='M')

    # reset freq to None
    expected_1_nofreq = DatetimeIndex(['2000-01-31', '2000-01-31',
                                       '2000-02-29', '2000-03-31'],
                                      name='idx', freq=None)
    expected_3_nofreq = DatetimeIndex(['2000-01-31', '2000-02-29',
                                       '2000-03-31', '2000-01-02'],
                                      name='idx', freq=None)

    cases = [(0, datetime(1999, 12, 31), expected_0),
             (-3, datetime(1999, 12, 31), expected_0),
             (3, datetime(2000, 4, 30), expected_3),
             (1, datetime(2000, 1, 31), expected_1_nofreq),
             (3, datetime(2000, 1, 2), expected_3_nofreq)]

    for n, d, expected in cases:
        result = idx.insert(n, d)
        tm.assert_index_equal(result, expected)
        assert result.name == expected.name
        assert result.freq == expected.freq

    # reset freq to None
    result = idx.insert(3, datetime(2000, 1, 2))
    expected = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31',
                              '2000-01-02'], name='idx', freq=None)
    tm.assert_index_equal(result, expected)
    assert result.name == expected.name
    assert result.freq is None

    # see gh-7299
    idx = date_range('1/1/2000', periods=3, freq='D', tz='Asia/Tokyo',
                     name='idx')
    with pytest.raises(ValueError):
        idx.insert(3, pd.Timestamp('2000-01-04'))
    with pytest.raises(ValueError):
        idx.insert(3, datetime(2000, 1, 4))
    with pytest.raises(ValueError):
        idx.insert(3, pd.Timestamp('2000-01-04', tz='US/Eastern'))
    with pytest.raises(ValueError):
        idx.insert(3, datetime(2000, 1, 4,
                               tzinfo=pytz.timezone('US/Eastern')))

    for tz in ['US/Pacific', 'Asia/Singapore']:
        idx = date_range('1/1/2000 09:00', periods=6, freq='H', tz=tz,
                         name='idx')
        # preserve freq
        expected = date_range('1/1/2000 09:00', periods=7, freq='H', tz=tz,
                              name='idx')
        for d in [pd.Timestamp('2000-01-01 15:00', tz=tz),
                  pytz.timezone(tz).localize(datetime(2000, 1, 1, 15))]:
            result = idx.insert(6, d)
            tm.assert_index_equal(result, expected)
            assert result.name == expected.name
            assert result.freq == expected.freq
            assert result.tz == expected.tz

        expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 10:00',
                                  '2000-01-01 11:00', '2000-01-01 12:00',
                                  '2000-01-01 13:00', '2000-01-01 14:00',
                                  '2000-01-01 10:00'], name='idx',
                                 tz=tz, freq=None)
        # reset freq to None
        for d in [pd.Timestamp('2000-01-01 10:00', tz=tz),
                  pytz.timezone(tz).localize(datetime(2000, 1, 1, 10))]:
            result = idx.insert(6, d)
            tm.assert_index_equal(result, expected)
            assert result.name == expected.name
            assert result.tz == expected.tz
            assert result.freq is None
def test_nanops(self):
    # GH#7261
    for opname in ['max', 'min']:
        for klass in [Index, Series]:
            arg_op = 'arg' + opname if klass is Index else 'idx' + opname

            obj = klass([np.nan, 2.0])
            assert getattr(obj, opname)() == 2.0

            obj = klass([np.nan])
            assert pd.isna(getattr(obj, opname)())
            assert pd.isna(getattr(obj, opname)(skipna=False))

            obj = klass([])
            assert pd.isna(getattr(obj, opname)())
            assert pd.isna(getattr(obj, opname)(skipna=False))

            obj = klass([pd.NaT, datetime(2011, 11, 1)])
            # check DatetimeIndex monotonic path
            assert getattr(obj, opname)() == datetime(2011, 11, 1)
            assert getattr(obj, opname)(skipna=False) is pd.NaT

            assert getattr(obj, arg_op)() == 1
            result = getattr(obj, arg_op)(skipna=False)
            if klass is Series:
                assert np.isnan(result)
            else:
                assert result == -1

            obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT])
            # check DatetimeIndex non-monotonic path
            assert getattr(obj, opname)() == datetime(2011, 11, 1)
            assert getattr(obj, opname)(skipna=False) is pd.NaT

            assert getattr(obj, arg_op)() == 1
            result = getattr(obj, arg_op)(skipna=False)
            if klass is Series:
                assert np.isnan(result)
            else:
                assert result == -1

            for dtype in ["M8[ns]", "datetime64[ns, UTC]"]:
                # cases with empty Series/DatetimeIndex
                obj = klass([], dtype=dtype)

                assert getattr(obj, opname)() is pd.NaT
                assert getattr(obj, opname)(skipna=False) is pd.NaT

                with pytest.raises(ValueError, match="empty sequence"):
                    getattr(obj, arg_op)()
                with pytest.raises(ValueError, match="empty sequence"):
                    getattr(obj, arg_op)(skipna=False)

    # argmin/max
    obj = Index(np.arange(5, dtype='int64'))
    assert obj.argmin() == 0
    assert obj.argmax() == 4

    obj = Index([np.nan, 1, np.nan, 2])
    assert obj.argmin() == 1
    assert obj.argmax() == 3
    assert obj.argmin(skipna=False) == -1
    assert obj.argmax(skipna=False) == -1

    obj = Index([np.nan])
    assert obj.argmin() == -1
    assert obj.argmax() == -1
    assert obj.argmin(skipna=False) == -1
    assert obj.argmax(skipna=False) == -1

    obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2),
                 pd.NaT])
    assert obj.argmin() == 1
    assert obj.argmax() == 2
    assert obj.argmin(skipna=False) == -1
    assert obj.argmax(skipna=False) == -1

    obj = Index([pd.NaT])
    assert obj.argmin() == -1
    assert obj.argmax() == -1
    assert obj.argmin(skipna=False) == -1
    assert obj.argmax(skipna=False) == -1
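# A small demonstration (not from the pandas source) of the asymmetry the
# test above asserts for the pandas version these tests target: with
# skipna=False, Series.idxmax propagates NaN while Index.argmax returns -1.
import numpy as np
import pandas as pd

idx = pd.Index([np.nan, 1.0, 2.0])
ser = pd.Series([np.nan, 1.0, 2.0])
assert idx.argmax(skipna=False) == -1       # Index signals NaN with -1
assert np.isnan(ser.idxmax(skipna=False))   # Series propagates NaN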
def test_align_float(self, float_frame):
    af, bf = float_frame.align(float_frame)
    assert af._mgr is not float_frame._mgr

    af, bf = float_frame.align(float_frame, copy=False)
    assert af._mgr is float_frame._mgr

    # axis = 0
    other = float_frame.iloc[:-5, :3]
    af, bf = float_frame.align(other, axis=0, fill_value=-1)

    tm.assert_index_equal(bf.columns, other.columns)

    # test fill value
    join_idx = float_frame.index.join(other.index)
    diff_a = float_frame.index.difference(join_idx)
    diff_a_vals = af.reindex(diff_a).values
    assert (diff_a_vals == -1).all()

    af, bf = float_frame.align(other, join="right", axis=0)
    tm.assert_index_equal(bf.columns, other.columns)
    tm.assert_index_equal(bf.index, other.index)
    tm.assert_index_equal(af.index, other.index)

    # axis = 1
    other = float_frame.iloc[:-5, :3].copy()
    af, bf = float_frame.align(other, axis=1)
    tm.assert_index_equal(bf.columns, float_frame.columns)
    tm.assert_index_equal(bf.index, other.index)

    # test fill value
    join_idx = float_frame.index.join(other.index)
    diff_a = float_frame.index.difference(join_idx)
    diff_a_vals = af.reindex(diff_a).values
    assert (diff_a_vals == -1).all()

    af, bf = float_frame.align(other, join="inner", axis=1)
    tm.assert_index_equal(bf.columns, other.columns)

    af, bf = float_frame.align(other, join="inner", axis=1, method="pad")
    tm.assert_index_equal(bf.columns, other.columns)

    af, bf = float_frame.align(
        other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None
    )
    tm.assert_index_equal(bf.index, Index([]))

    af, bf = float_frame.align(
        other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
    )
    tm.assert_index_equal(bf.index, Index([]))

    # Try to align DataFrame to Series along bad axis
    msg = "No axis named 2 for object type DataFrame"
    with pytest.raises(ValueError, match=msg):
        float_frame.align(af.iloc[0, :3], join="inner", axis=2)

    # align dataframe to series with broadcast or not
    idx = float_frame.index
    s = Series(range(len(idx)), index=idx)

    left, right = float_frame.align(s, axis=0)
    tm.assert_index_equal(left.index, float_frame.index)
    tm.assert_index_equal(right.index, float_frame.index)
    assert isinstance(right, Series)

    left, right = float_frame.align(s, broadcast_axis=1)
    tm.assert_index_equal(left.index, float_frame.index)
    expected = {c: s for c in float_frame.columns}
    expected = DataFrame(
        expected, index=float_frame.index, columns=float_frame.columns
    )
    tm.assert_frame_equal(right, expected)

    # see gh-9558
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    result = df[df["a"] == 2]
    expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    result = df.where(df["a"] == 2, 0)
    expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]})
    tm.assert_frame_equal(result, expected)
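# A minimal sketch (not part of the test file) of the default align behavior
# exercised above: with no axis given, both the index and the columns are
# aligned to their unions.
import pandas as pd

left = pd.DataFrame({"a": [1, 2]}, index=[0, 1])
right = pd.DataFrame({"b": [3, 4]}, index=[1, 2])
l2, r2 = left.align(right, join="outer")
assert list(l2.index) == [0, 1, 2]
assert list(l2.columns) == ["a", "b"]  # r2 shares the same labels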
def makeUIntIndex(k=10, name=None):
    return Index([2**63 + i for i in range(k)], name=name)
def makeObjectSeries(name=None):
    data = makeStringIndex(_N)
    data = Index(data, dtype=object)
    index = makeStringIndex(_N)
    return Series(data, index=index, name=name)
def test_string_datetimelike_compat(self):
    # GH 6463
    expected = infer_freq(['2004-01', '2004-02', '2004-03', '2004-04'])
    result = infer_freq(Index(['2004-01', '2004-02', '2004-03', '2004-04']))
    self.assertEqual(result, expected)
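# For context (illustrative, not from the original file): infer_freq normally
# takes a DatetimeIndex; the test above checks that date-like strings are
# coerced the same way whether passed as a plain list or wrapped in an Index.
import pandas as pd

assert pd.infer_freq(pd.date_range("2004-01-01", periods=4, freq="MS")) == "MS"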
def _convert_listlike_datetimes(
    arg,
    format,
    name=None,
    tz=None,
    unit=None,
    errors=None,
    infer_datetime_format=None,
    dayfirst=None,
    yearfirst=None,
    exact=None,
):
    """
    Helper function for to_datetime. Performs the conversions of 1D listlike
    of dates

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        date to be parsed
    name : object
        None or string for the Index name
    tz : object
        None or 'utc'
    unit : string
        None or string of the frequency of the passed data
    errors : string
        error handling behaviors from to_datetime, 'raise', 'coerce', 'ignore'
    infer_datetime_format : boolean
        inferring format behavior from to_datetime
    dayfirst : boolean
        dayfirst parsing behavior from to_datetime
    yearfirst : boolean
        yearfirst parsing behavior from to_datetime
    exact : boolean
        exact format matching behavior from to_datetime

    Returns
    -------
    Index-like of parsed dates
    """
    from pandas import DatetimeIndex
    from pandas.core.arrays import DatetimeArray
    from pandas.core.arrays.datetimes import (
        maybe_convert_dtype,
        objects_to_datetime64ns,
    )

    if isinstance(arg, (list, tuple)):
        arg = np.array(arg, dtype="O")

    # these are shortcutable
    if is_datetime64tz_dtype(arg):
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        if tz == "utc":
            arg = arg.tz_convert(None).tz_localize(tz)
        return arg

    elif is_datetime64_ns_dtype(arg):
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            try:
                return DatetimeIndex(arg, tz=tz, name=name)
            except ValueError:
                pass
        elif tz:
            # DatetimeArray, DatetimeIndex
            return arg.tz_localize(tz)

        return arg

    elif unit is not None:
        if format is not None:
            raise ValueError("cannot specify both format and unit")
        arg = getattr(arg, "_values", arg)

        # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime
        # because it expects an ndarray argument
        if isinstance(arg, IntegerArray):
            # Explicitly pass NaT mask to array_with_unit_to_datetime
            mask = arg.isna()
            arg = arg._ndarray_values
        else:
            mask = None

        result, tz_parsed = tslib.array_with_unit_to_datetime(
            arg, mask, unit, errors=errors
        )

        if errors == "ignore":
            from pandas import Index

            result = Index(result, name=name)
        else:
            result = DatetimeIndex(result, name=name)

        # GH 23758: We may still need to localize the result with tz
        # GH 25546: Apply tz_parsed first (from arg), then tz (from caller)
        # result will be naive but in UTC
        try:
            result = result.tz_localize("UTC").tz_convert(tz_parsed)
        except AttributeError:
            # Regular Index from 'ignore' path
            return result
        if tz is not None:
            if result.tz is None:
                result = result.tz_localize(tz)
            else:
                result = result.tz_convert(tz)
        return result

    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError(
            "arg must be a string, datetime, list, tuple, 1-d array, or Series"
        )

    # warn if passing timedelta64, raise for PeriodDtype
    # NB: this must come after unit transformation
    orig_arg = arg
    arg, _ = maybe_convert_dtype(arg, copy=False)

    arg = ensure_object(arg)
    require_iso8601 = False

    if infer_datetime_format and format is None:
        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

    if format is not None:
        # There is a special fast-path for iso8601 formatted
        # datetime strings, so in those cases don't use the inferred
        # format because this path makes process slower in this
        # special case
        format_is_iso8601 = _format_is_iso(format)
        if format_is_iso8601:
            require_iso8601 = not infer_datetime_format
            format = None

    tz_parsed = None
    result = None

    if format is not None:
        try:
            # shortcut formatting here
            if format == "%Y%m%d":
                try:
                    # pass orig_arg as float-dtype may have been converted to
                    # datetime64[ns]
                    orig_arg = ensure_object(orig_arg)
                    result = _attempt_YYYYMMDD(orig_arg, errors=errors)
                except (ValueError, TypeError, tslibs.OutOfBoundsDatetime):
                    raise ValueError(
                        "cannot convert the input to '%Y%m%d' date format"
                    )

            # fallback
            if result is None:
                try:
                    result, timezones = array_strptime(
                        arg, format, exact=exact, errors=errors
                    )
                    if "%Z" in format or "%z" in format:
                        return _return_parsed_timezone_results(
                            result, timezones, tz, name
                        )
                except tslibs.OutOfBoundsDatetime:
                    if errors == "raise":
                        raise
                    elif errors == "coerce":
                        result = np.empty(arg.shape, dtype="M8[ns]")
                        iresult = result.view("i8")
                        iresult.fill(tslibs.iNaT)
                    else:
                        result = arg
                except ValueError:
                    # if format was inferred, try falling back
                    # to array_to_datetime - terminate here
                    # for specified formats
                    if not infer_datetime_format:
                        if errors == "raise":
                            raise
                        elif errors == "coerce":
                            result = np.empty(arg.shape, dtype="M8[ns]")
                            iresult = result.view("i8")
                            iresult.fill(tslibs.iNaT)
                        else:
                            result = arg
        except ValueError as e:
            # Fallback to try to convert datetime objects if timezone-aware
            # datetime objects are found without passing `utc=True`
            try:
                values, tz = conversion.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, name=name, tz=tz)
            except (ValueError, TypeError):
                raise e

    if result is None:
        assert format is None or infer_datetime_format
        utc = tz == "utc"
        result, tz_parsed = objects_to_datetime64ns(
            arg,
            dayfirst=dayfirst,
            yearfirst=yearfirst,
            utc=utc,
            errors=errors,
            require_iso8601=require_iso8601,
            allow_object=True,
        )

    if tz_parsed is not None:
        # We can take a shortcut since the datetime64 numpy array
        # is in UTC
        return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed)

    utc = tz == "utc"
    return _box_as_indexlike(result, utc=utc, name=name)
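# A usage sketch (not part of the module) for the `unit` branch above:
# integer inputs are interpreted as counts of `unit` since the epoch, and
# tz localization is applied afterwards.
import pandas as pd

result = pd.to_datetime([0, 1], unit="D", utc=True)
assert str(result[0]) == "1970-01-01 00:00:00+00:00"
assert str(result[1]) == "1970-01-02 00:00:00+00:00"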
class TestRangeIndexConstructors:
    @pytest.mark.parametrize("name", [None, "foo"])
    @pytest.mark.parametrize(
        "args, kwargs, start, stop, step",
        [
            ((5,), {}, 0, 5, 1),
            ((1, 5), {}, 1, 5, 1),
            ((1, 5, 2), {}, 1, 5, 2),
            ((0,), {}, 0, 0, 1),
            ((0, 0), {}, 0, 0, 1),
            ((), {"start": 0}, 0, 0, 1),
            ((), {"stop": 0}, 0, 0, 1),
        ],
    )
    def test_constructor(self, args, kwargs, start, stop, step, name):
        result = RangeIndex(*args, name=name, **kwargs)
        expected = Index(np.arange(start, stop, step, dtype=np.int64), name=name)
        assert isinstance(result, RangeIndex)
        assert result.name is name
        assert result._range == range(start, stop, step)
        tm.assert_index_equal(result, expected, exact="equiv")

    def test_constructor_invalid_args(self):
        msg = "RangeIndex\\(\\.\\.\\.\\) must be called with integers"
        with pytest.raises(TypeError, match=msg):
            RangeIndex()

        with pytest.raises(TypeError, match=msg):
            RangeIndex(name="Foo")

        # we don't allow on a bare Index
        msg = (
            r"Index\(\.\.\.\) must be called with a collection of some "
            r"kind, 0 was passed"
        )
        with pytest.raises(TypeError, match=msg):
            Index(0)

    @pytest.mark.parametrize(
        "args",
        [
            Index(["a", "b"]),
            Series(["a", "b"]),
            np.array(["a", "b"]),
            [],
            np.arange(0, 10),
            np.array([1]),
            [1],
        ],
    )
    def test_constructor_additional_invalid_args(self, args):
        msg = f"Value needs to be a scalar value, was type {type(args).__name__}"
        with pytest.raises(TypeError, match=msg):
            RangeIndex(args)

    @pytest.mark.parametrize("args", ["foo", datetime(2000, 1, 1, 0, 0)])
    def test_constructor_invalid_args_wrong_type(self, args):
        msg = f"Wrong type {type(args)} for value {args}"
        with pytest.raises(TypeError, match=msg):
            RangeIndex(args)

    def test_constructor_same(self):
        # pass thru w and w/o copy
        index = RangeIndex(1, 5, 2)
        result = RangeIndex(index, copy=False)
        assert result.identical(index)

        result = RangeIndex(index, copy=True)
        tm.assert_index_equal(result, index, exact=True)

        result = RangeIndex(index)
        tm.assert_index_equal(result, index, exact=True)

        with pytest.raises(
            ValueError,
            match="Incorrect `dtype` passed: expected signed integer, "
            "received float64",
        ):
            RangeIndex(index, dtype="float64")

    def test_constructor_range_object(self):
        result = RangeIndex(range(1, 5, 2))
        expected = RangeIndex(1, 5, 2)
        tm.assert_index_equal(result, expected, exact=True)

    def test_constructor_range(self):
        result = RangeIndex.from_range(range(1, 5, 2))
        expected = RangeIndex(1, 5, 2)
        tm.assert_index_equal(result, expected, exact=True)

        result = RangeIndex.from_range(range(5, 6))
        expected = RangeIndex(5, 6, 1)
        tm.assert_index_equal(result, expected, exact=True)

        # an invalid range
        result = RangeIndex.from_range(range(5, 1))
        expected = RangeIndex(0, 0, 1)
        tm.assert_index_equal(result, expected, exact=True)

        result = RangeIndex.from_range(range(5))
        expected = RangeIndex(0, 5, 1)
        tm.assert_index_equal(result, expected, exact=True)

        result = Index(range(1, 5, 2))
        expected = RangeIndex(1, 5, 2)
        tm.assert_index_equal(result, expected, exact=True)

        msg = (
            r"(RangeIndex.)?from_range\(\) got an unexpected keyword argument"
            r"( 'copy')?"
        )
        with pytest.raises(TypeError, match=msg):
            RangeIndex.from_range(range(10), copy=True)

    def test_constructor_name(self):
        # GH#12288
        orig = RangeIndex(10)
        orig.name = "original"

        copy = RangeIndex(orig)
        copy.name = "copy"

        assert orig.name == "original"
        assert copy.name == "copy"

        new = Index(copy)
        assert new.name == "copy"

        new.name = "new"
        assert orig.name == "original"
        assert copy.name == "copy"
        assert new.name == "new"

    def test_constructor_corner(self):
        arr = np.array([1, 2, 3, 4], dtype=object)
        index = RangeIndex(1, 5)
        assert index.values.dtype == np.int64
        with tm.assert_produces_warning(FutureWarning, match="will not infer"):
            expected = Index(arr).astype("int64")

        tm.assert_index_equal(index, expected, exact="equiv")

        # non-int raise Exception
        with pytest.raises(TypeError, match=r"Wrong type \<class 'str'\>"):
            RangeIndex("1", "10", "1")
        with pytest.raises(TypeError, match=r"Wrong type \<class 'float'\>"):
            RangeIndex(1.1, 10.2, 1.3)

        # invalid passed type
        with pytest.raises(
            ValueError,
            match="Incorrect `dtype` passed: expected signed integer, "
            "received float64",
        ):
            RangeIndex(1, 5, dtype="float64")
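# Why RangeIndex exists (illustrative, not from the test file): it stores only
# start/stop/step, so an equivalent materialized Index uses far more memory.
import numpy as np
import pandas as pd

ri = pd.RangeIndex(1_000_000)
full = pd.Index(np.arange(1_000_000))
assert ri.equals(full)
assert ri.memory_usage() < full.memory_usage()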
def test_intersection(self, sort):
    # intersect with Int64Index
    index = RangeIndex(start=0, stop=20, step=2)
    other = Index(np.arange(1, 6))
    result = index.intersection(other, sort=sort)
    expected = Index(np.sort(np.intersect1d(index.values, other.values)))
    tm.assert_index_equal(result, expected)

    result = other.intersection(index, sort=sort)
    expected = Index(
        np.sort(np.asarray(np.intersect1d(index.values, other.values)))
    )
    tm.assert_index_equal(result, expected)

    # intersect with increasing RangeIndex
    other = RangeIndex(1, 6)
    result = index.intersection(other, sort=sort)
    expected = Index(np.sort(np.intersect1d(index.values, other.values)))
    tm.assert_index_equal(result, expected)

    # intersect with decreasing RangeIndex
    other = RangeIndex(5, 0, -1)
    result = index.intersection(other, sort=sort)
    expected = Index(np.sort(np.intersect1d(index.values, other.values)))
    tm.assert_index_equal(result, expected)

    # reversed (GH 17296)
    result = other.intersection(index, sort=sort)
    tm.assert_index_equal(result, expected)

    # GH 17296: intersect two decreasing RangeIndexes
    first = RangeIndex(10, -2, -2)
    other = RangeIndex(5, -4, -1)
    expected = first.astype(int).intersection(other.astype(int), sort=sort)
    result = first.intersection(other, sort=sort).astype(int)
    tm.assert_index_equal(result, expected)

    # reversed
    result = other.intersection(first, sort=sort).astype(int)
    tm.assert_index_equal(result, expected)

    index = RangeIndex(5)

    # intersect of non-overlapping indices
    other = RangeIndex(5, 10, 1)
    result = index.intersection(other, sort=sort)
    expected = RangeIndex(0, 0, 1)
    tm.assert_index_equal(result, expected)

    other = RangeIndex(-1, -5, -1)
    result = index.intersection(other, sort=sort)
    expected = RangeIndex(0, 0, 1)
    tm.assert_index_equal(result, expected)

    # intersection of empty indices
    other = RangeIndex(0, 0, 1)
    result = index.intersection(other, sort=sort)
    expected = RangeIndex(0, 0, 1)
    tm.assert_index_equal(result, expected)

    result = other.intersection(index, sort=sort)
    tm.assert_index_equal(result, expected)

    # intersection of non-overlapping values based on start value and gcd
    index = RangeIndex(1, 10, 2)
    other = RangeIndex(0, 10, 4)
    result = index.intersection(other, sort=sort)
    expected = RangeIndex(0, 0, 1)
    tm.assert_index_equal(result, expected)
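# A short aside (not from the test file) on the arithmetic behind the last
# block: RangeIndex(1, 10, 2) holds numbers congruent to 1 (mod 2) while
# RangeIndex(0, 10, 4) holds multiples of 4; two arithmetic progressions
# intersect only if gcd(step1, step2) divides the difference of their starts,
# and gcd(2, 4) = 2 does not divide 1 - 0, so the intersection is empty.
from math import gcd

assert (1 - 0) % gcd(2, 4) != 0
assert set(range(1, 10, 2)) & set(range(0, 10, 4)) == set()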
def test_per_axis_per_level_getitem(self):

    # GH6134
    # example test case
    ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7),
                                  _mklbl('C', 4), _mklbl('D', 2)])
    df = DataFrame(np.arange(len(ix.get_values())), index=ix)

    result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :]
    expected = df.loc[[tuple([a, b, c, d])
                       for a, b, c, d in df.index.values
                       if (a == 'A1' or a == 'A2' or a == 'A3') and
                       (c == 'C1' or c == 'C3')]]
    tm.assert_frame_equal(result, expected)

    expected = df.loc[[tuple([a, b, c, d])
                       for a, b, c, d in df.index.values
                       if (a == 'A1' or a == 'A2' or a == 'A3') and
                       (c == 'C1' or c == 'C2' or c == 'C3')]]
    result = df.loc[(slice('A1', 'A3'), slice(None),
                     slice('C1', 'C3')), :]
    tm.assert_frame_equal(result, expected)

    # test multi-index slicing with per axis and per index controls
    index = MultiIndex.from_tuples([('A', 1), ('A', 2),
                                    ('A', 3), ('B', 1)],
                                   names=['one', 'two'])
    columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'),
                                      ('b', 'foo'), ('b', 'bah')],
                                     names=['lvl0', 'lvl1'])

    df = DataFrame(np.arange(16, dtype='int64').reshape(4, 4),
                   index=index, columns=columns)
    df = df.sort_index(axis=0).sort_index(axis=1)

    # identity
    result = df.loc[(slice(None), slice(None)), :]
    tm.assert_frame_equal(result, df)
    result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))]
    tm.assert_frame_equal(result, df)
    result = df.loc[:, (slice(None), slice(None))]
    tm.assert_frame_equal(result, df)

    # index
    result = df.loc[(slice(None), [1]), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.loc[(slice(None), 1), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    # columns
    result = df.loc[:, (slice(None), ['foo'])]
    expected = df.iloc[:, [1, 3]]
    tm.assert_frame_equal(result, expected)

    # both
    result = df.loc[(slice(None), 1), (slice(None), ['foo'])]
    expected = df.iloc[[0, 3], [1, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.loc['A', 'a']
    expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]),
                         index=Index([1, 2, 3], name='two'),
                         columns=Index(['bar', 'foo'], name='lvl1'))
    tm.assert_frame_equal(result, expected)

    result = df.loc[(slice(None), [1, 2]), :]
    expected = df.iloc[[0, 1, 3]]
    tm.assert_frame_equal(result, expected)

    # multi-level series
    s = Series(np.arange(len(ix.get_values())), index=ix)
    result = s.loc['A1':'A3', :, ['C1', 'C3']]
    expected = s.loc[[tuple([a, b, c, d])
                      for a, b, c, d in s.index.values
                      if (a == 'A1' or a == 'A2' or a == 'A3') and
                      (c == 'C1' or c == 'C3')]]
    tm.assert_series_equal(result, expected)

    # boolean indexers
    result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :]
    expected = df.iloc[[2, 3]]
    tm.assert_frame_equal(result, expected)

    def f():
        df.loc[(slice(None), np.array([True, False])), :]

    self.assertRaises(ValueError, f)

    # ambiguous cases
    # these can be multiply interpreted (e.g. in this case
    # as df.loc[slice(None), [1]] as well)
    self.assertRaises(KeyError, lambda: df.loc[slice(None), [1]])

    result = df.loc[(slice(None), [1]), :]
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    # not lexsorted
    self.assertEqual(df.index.lexsort_depth, 2)
    df = df.sort_index(level=1, axis=0)
    self.assertEqual(df.index.lexsort_depth, 0)
    with tm.assertRaisesRegexp(
            UnsortedIndexError,
            'MultiIndex Slicing requires the index to be fully '
            r'lexsorted tuple len \(2\), lexsort depth \(0\)'):
        df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :]
def test_getitem_duplicates_multiindex(self):
    # GH 5725 the 'A' happens to be a valid Timestamp so it doesn't raise
    # the appropriate error, only in PY3 of course!

    index = MultiIndex(levels=[['D', 'B', 'C'],
                               [0, 26, 27, 37, 57, 67, 75, 82]],
                       labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
                               [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
                       names=['tag', 'day'])
    arr = np.random.randn(len(index), 1)
    df = DataFrame(arr, index=index, columns=['val'])
    result = df.val['D']
    expected = Series(arr.ravel()[0:3], name='val',
                      index=Index([26, 37, 57], name='day'))
    tm.assert_series_equal(result, expected)

    def f():
        df.val['A']

    self.assertRaises(KeyError, f)

    def f():
        df.val['X']

    self.assertRaises(KeyError, f)

    # A is treated as a special Timestamp
    index = MultiIndex(levels=[['A', 'B', 'C'],
                               [0, 26, 27, 37, 57, 67, 75, 82]],
                       labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2],
                               [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]],
                       names=['tag', 'day'])
    df = DataFrame(arr, index=index, columns=['val'])
    result = df.val['A']
    expected = Series(arr.ravel()[0:3], name='val',
                      index=Index([26, 37, 57], name='day'))
    tm.assert_series_equal(result, expected)

    def f():
        df.val['X']

    self.assertRaises(KeyError, f)

    # GH 7866
    # multi-index slicing with missing indexers
    idx = pd.MultiIndex.from_product([['A', 'B', 'C'],
                                      ['foo', 'bar', 'baz']],
                                     names=['one', 'two'])
    s = pd.Series(np.arange(9, dtype='int64'), index=idx).sort_index()

    exp_idx = pd.MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']],
                                         names=['one', 'two'])
    expected = pd.Series(np.arange(3, dtype='int64'),
                         index=exp_idx).sort_index()

    result = s.loc[['A']]
    tm.assert_series_equal(result, expected)
    result = s.loc[['A', 'D']]
    tm.assert_series_equal(result, expected)

    # not any values found
    self.assertRaises(KeyError, lambda: s.loc[['D']])

    # empty ok
    result = s.loc[[]]
    expected = s.iloc[[]]
    tm.assert_series_equal(result, expected)

    idx = pd.IndexSlice
    expected = pd.Series([0, 3, 6], index=pd.MultiIndex.from_product(
        [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index()

    result = s.loc[idx[:, ['foo']]]
    tm.assert_series_equal(result, expected)
    result = s.loc[idx[:, ['foo', 'bah']]]
    tm.assert_series_equal(result, expected)

    # GH 8737
    # empty indexer
    multi_index = pd.MultiIndex.from_product((['foo', 'bar', 'baz'],
                                              ['alpha', 'beta']))
    df = DataFrame(np.random.randn(5, 6), index=range(5),
                   columns=multi_index)
    df = df.sort_index(level=0, axis=1)

    expected = DataFrame(index=range(5),
                         columns=multi_index.reindex([])[0])
    result1 = df.loc[:, ([], slice(None))]
    result2 = df.loc[:, (['foo'], [])]
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, expected)

    # regression from < 0.14.0
    # GH 7914
    df = DataFrame([[np.mean, np.median], ['mean', 'median']],
                   columns=MultiIndex.from_tuples([('functs', 'mean'),
                                                   ('functs', 'median')]),
                   index=['function', 'name'])
    result = df.loc['function', ('functs', 'mean')]
    self.assertEqual(result, np.mean)
from pandas import (
    Index,
    Interval,
    IntervalIndex,
    Timedelta,
    Timestamp,
    date_range,
    timedelta_range,
)
import pandas._testing as tm
from pandas.core.arrays import IntervalArray


@pytest.fixture(
    params=[
        (Index([0, 2, 4]), Index([1, 3, 5])),
        (Index([0.0, 1.0, 2.0]), Index([1.0, 2.0, 3.0])),
        (timedelta_range("0 days", periods=3), timedelta_range("1 day", periods=3)),
        (date_range("20170101", periods=3), date_range("20170102", periods=3)),
        (
            date_range("20170101", periods=3, tz="US/Eastern"),
            date_range("20170102", periods=3, tz="US/Eastern"),
        ),
    ],
    ids=lambda x: str(x[0].dtype),
)
def left_right_dtypes(request):
    """
    Fixture for building an IntervalArray from various dtypes
    """
    return request.param
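# Example consumption of this fixture (hypothetical test name, shown for
# illustration): each left/right pair feeds IntervalArray.from_arrays.
def test_from_arrays_sketch(left_right_dtypes):
    left, right = left_right_dtypes
    arr = IntervalArray.from_arrays(left, right, closed="right")
    assert len(arr) == 3
    assert arr[0].left == left[0] and arr[0].right == right[0]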
def makeBoolIndex(k=10, name=None):
    if k == 1:
        return Index([True], name=name)
    elif k == 2:
        return Index([False, True], name=name)
    return Index([False, True] + [False] * (k - 2), name=name)
def hash_array(
    vals,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
):
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    1d uint64 numpy array of hash values, same length as vals
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        return _hash_categorical(vals, encoding, hash_key)
    elif is_extension_array_dtype(dtype):
        vals, _ = vals._values_for_factorize()
        dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    elif dtype == bool:
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import factorize, Categorical, Index

            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories), ordered=False,
                              fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hashing.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hashing.hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals
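# A quick determinism check (illustrative; assumes the public re-export
# pandas.util.hash_array points at this function).
import numpy as np
from pandas.util import hash_array as public_hash_array

vals = np.array(["a", "b", "a"], dtype=object)
h = public_hash_array(vals)
assert h.dtype == np.uint64 and len(h) == len(vals)
assert h[0] == h[2]                           # equal values hash equal
assert (h == public_hash_array(vals)).all()  # deterministic across calls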
def makeIntIndex(k=10, name=None):
    return Index(list(range(k)), name=name)
def test_to_tuples(self, tuples):
    # GH 18756
    idx = IntervalIndex.from_tuples(tuples)
    result = idx.to_tuples()
    expected = Index(com.asarray_tuplesafe(tuples))
    tm.assert_index_equal(result, expected)
def makeFloatIndex(k=10, name=None):
    values = sorted(np.random.random_sample(k)) - np.random.random_sample(1)
    return Index(values * (10 ** np.random.randint(0, 9)), name=name)
class TestSeriesMisc(TestData, SharedWithSparse):

    series_klass = Series
    # SharedWithSparse tests use generic, series_klass-agnostic assertion
    _assert_series_equal = staticmethod(tm.assert_series_equal)

    def test_tab_completion(self):
        # GH 9910
        s = Series(list('abcd'))
        # Series of str values should have .str but not .dt/.cat in __dir__
        assert 'str' in dir(s)
        assert 'dt' not in dir(s)
        assert 'cat' not in dir(s)

        # similarly for .dt
        s = Series(date_range('1/1/2015', periods=5))
        assert 'dt' in dir(s)
        assert 'str' not in dir(s)
        assert 'cat' not in dir(s)

        # Similarly for .cat, but with the twist that str and dt should be
        # there if the categories are of that type.
        # first cat and str:
        s = Series(list('abbcd'), dtype="category")
        assert 'cat' in dir(s)
        assert 'str' in dir(s)  # as it is a string categorical
        assert 'dt' not in dir(s)

        # similar to cat and str
        s = Series(date_range('1/1/2015', periods=5)).astype("category")
        assert 'cat' in dir(s)
        assert 'str' not in dir(s)
        assert 'dt' in dir(s)  # as it is a datetime categorical

    def test_tab_completion_with_categorical(self):
        # test the tab completion display
        ok_for_cat = ['name', 'index', 'categorical', 'categories', 'codes',
                      'ordered', 'set_categories', 'add_categories',
                      'remove_categories', 'rename_categories',
                      'reorder_categories', 'remove_unused_categories',
                      'as_ordered', 'as_unordered']

        def get_dir(s):
            results = [r for r in s.cat.__dir__() if not r.startswith('_')]
            return list(sorted(set(results)))

        s = Series(list('aabbcde')).astype('category')
        results = get_dir(s)
        tm.assert_almost_equal(results, list(sorted(set(ok_for_cat))))

    @pytest.mark.parametrize("index", [
        tm.makeUnicodeIndex(10),
        tm.makeStringIndex(10),
        tm.makeCategoricalIndex(10),
        Index(['foo', 'bar', 'baz'] * 2),
        tm.makeDateIndex(10),
        tm.makePeriodIndex(10),
        tm.makeTimedeltaIndex(10),
        tm.makeIntIndex(10),
        tm.makeUIntIndex(10),
        tm.makeFloatIndex(10),
        Index([True, False]),
        Index(['a{}'.format(i) for i in range(101)]),
        pd.MultiIndex.from_tuples(zip('ABCD', 'EFGH')),
        pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], 'EFGH')),
    ])
    def test_index_tab_completion(self, index):
        # dir contains string-like values of the Index.
        s = pd.Series(index=index)
        dir_s = dir(s)
        for i, x in enumerate(s.index.unique(level=0)):
            if i < 100:
                assert (not isinstance(x, str) or
                        not x.isidentifier() or x in dir_s)
            else:
                assert x not in dir_s

    def test_not_hashable(self):
        s_empty = Series()
        s = Series([1])
        msg = "'Series' objects are mutable, thus they cannot be hashed"
        with pytest.raises(TypeError, match=msg):
            hash(s_empty)
        with pytest.raises(TypeError, match=msg):
            hash(s)

    def test_contains(self):
        tm.assert_contains_all(self.ts.index, self.ts)

    def test_iter(self):
        for i, val in enumerate(self.series):
            assert val == self.series[i]

        for i, val in enumerate(self.ts):
            assert val == self.ts[i]

    def test_keys(self):
        # HACK: By doing this in two stages, we avoid 2to3 wrapping the call
        # to .keys() in a list()
        getkeys = self.ts.keys
        assert getkeys() is self.ts.index

    def test_values(self):
        tm.assert_almost_equal(self.ts.values, self.ts, check_dtype=False)

    def test_iteritems(self):
        for idx, val in self.series.items():
            assert val == self.series[idx]

        for idx, val in self.ts.items():
            assert val == self.ts[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.iteritems(), 'reverse')

    def test_items(self):
        for idx, val in self.series.items():
            assert val == self.series[idx]

        for idx, val in self.ts.items():
            assert val == self.ts[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.items(), 'reverse')

    def test_raise_on_info(self):
        s = Series(np.random.randn(10))
        msg = "'Series' object has no attribute 'info'"
        with pytest.raises(AttributeError, match=msg):
            s.info()

    def test_copy(self):

        for deep in [None, False, True]:
            s = Series(np.arange(10), dtype='float64')

            # default deep is True
            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[::2] = np.NaN

            if deep is None or deep is True:
                # Did not modify original Series
                assert np.isnan(s2[0])
                assert not np.isnan(s[0])
            else:
                # we DID modify the original Series
                assert np.isnan(s2[0])
                assert np.isnan(s[0])

        # GH 11794
        # copy of tz-aware
        expected = Series([Timestamp('2012/01/01', tz='UTC')])
        expected2 = Series([Timestamp('1999/01/01', tz='UTC')])

        for deep in [None, False, True]:
            s = Series([Timestamp('2012/01/01', tz='UTC')])

            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[0] = pd.Timestamp('1999/01/01', tz='UTC')

            # default deep is True
            if deep is None or deep is True:
                # Did not modify original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected)
            else:
                # we DID modify the original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected2)

    def test_axis_alias(self):
        s = Series([1, 2, np.nan])
        assert_series_equal(s.dropna(axis='rows'), s.dropna(axis='index'))
        assert s.dropna().sum('rows') == 3
        assert s._get_axis_number('rows') == 0
        assert s._get_axis_name('rows') == 'index'

    def test_class_axis(self):
        # https://github.com/pandas-dev/pandas/issues/18147
        # no exception and no empty docstring
        assert pydoc.getdoc(Series.index)

    def test_numpy_unique(self):
        # it works!
        np.unique(self.ts)

    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
                         index=date_range('1/1/2000', periods=1000))

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        s = Series([1])
        result = s.item()
        assert result == 1
        assert s.item() == s.iloc[0]

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype='float64')
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order='F'), s.values.ravel(order='F'))

        # compress
        # GH 6658
        s = Series([0, 1., -1], index=list('abc'))
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=['b']))

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result is empty, with the same Index(dtype=object) as the original
        exp = Series([], dtype='float64', index=Index([], dtype='object'))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1., -1], index=[.1, .2, .3])
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=[.2]))

        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result is empty, with the same Float64Index as the original
        exp = Series([], dtype='float64', index=Index([], dtype='float64'))
        tm.assert_series_equal(result, exp)

    def test_str_accessor_updates_on_inplace(self):
        s = pd.Series(list('abc'))
        s.drop([0], inplace=True)
        assert len(s.str.lower()) == 2

    def test_str_attribute(self):
        # GH9068
        methods = ['strip', 'rstrip', 'lstrip']
        s = Series([' jack', 'jill ', ' jesse ', 'frank'])
        for method in methods:
            expected = Series([getattr(str, method)(x) for x in s.values])
            assert_series_equal(getattr(Series.str, method)(s.str), expected)

        # str accessor only valid with string values
        s = Series(range(5))
        with pytest.raises(AttributeError, match='only use .str accessor'):
            s.str.repeat(2)

    def test_empty_method(self):
        s_empty = pd.Series()
        assert s_empty.empty

        for full_series in [pd.Series([1]), pd.Series(index=[1])]:
            assert not full_series.empty

    def test_tab_complete_warning(self, ip):
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip('IPython', minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        code = "import pandas as pd; s = pd.Series()"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter('ignore'):
                list(ip.Completer.completions('s.', 1))

    def test_integer_series_size(self):
        # GH 25580
        s = Series(range(9))
        assert s.size == 9
        s = Series(range(9), dtype="Int64")
        assert s.size == 9
def makeCustomIndex(
    nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None
):
    """
    Create an index/multiindex with given dimensions, levels, names, etc.

    nentries - number of entries in index
    nlevels - number of levels (> 1 produces multiindex)
    prefix - a string prefix for labels
    names - (Optional), bool or list of strings. if True will use default
       names, if False will use no names, if a list is given, the name of
       each level in the index will be taken from the list.
    ndupe_l - (Optional), list of ints, the number of rows for which the
       label will repeated at the corresponding level, you can specify just
       the first few, the rest will use the default ndupe_l of 1.
       len(ndupe_l) <= nlevels.
    idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td".
       If idx_type is not None, `idx_nlevels` must be 1.
       "i"/"f" creates an integer/float index,
       "s"/"u" creates a string/unicode index
       "dt" create a datetime index.
       "td" create a timedelta index.

       if unspecified, string labels will be generated.
    """
    if ndupe_l is None:
        ndupe_l = [1] * nlevels
    assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels
    assert names is None or names is False or names is True or len(names) == nlevels
    assert idx_type is None or (
        idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1
    )

    if names is True:
        # build default names
        names = [prefix + str(i) for i in range(nlevels)]
    if names is False:
        # pass None to index constructor for no name
        names = None

    # make singleton case uniform
    if isinstance(names, str) and nlevels == 1:
        names = [names]

    # specific 1D index type requested?
    idx_func = {
        "i": makeIntIndex,
        "f": makeFloatIndex,
        "s": makeStringIndex,
        "u": makeUnicodeIndex,
        "dt": makeDateIndex,
        "td": makeTimedeltaIndex,
        "p": makePeriodIndex,
    }.get(idx_type)
    if idx_func:
        # error: Cannot call function of unknown type
        idx = idx_func(nentries)  # type: ignore[operator]
        # but we need to fill in the name
        if names:
            idx.name = names[0]
        return idx
    elif idx_type is not None:
        raise ValueError(
            f"{repr(idx_type)} is not a legal value for `idx_type`, "
            "use 'i'/'f'/'s'/'u'/'dt'/'p'/'td'."
        )

    if len(ndupe_l) < nlevels:
        ndupe_l.extend([1] * (nlevels - len(ndupe_l)))
    assert len(ndupe_l) == nlevels

    assert all(x > 0 for x in ndupe_l)

    list_of_lists = []
    for i in range(nlevels):

        def keyfunc(x):
            import re

            numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_")
            return [int(num) for num in numeric_tuple]

        # build a list of lists to create the index from
        div_factor = nentries // ndupe_l[i] + 1

        # typing.Counter is deprecated since Python 3.9: collections.Counter
        # now supports [] directly. See PEP 585 and Generic Alias Type.
        cnt: Counter[str] = collections.Counter()
        for j in range(div_factor):
            label = f"{prefix}_l{i}_g{j}"
            cnt[label] = ndupe_l[i]
        # cute Counter trick
        result = sorted(cnt.elements(), key=keyfunc)[:nentries]
        list_of_lists.append(result)

    tuples = list(zip(*list_of_lists))

    # convert tuples to index
    if nentries == 1:
        # we have a single level of tuples, i.e. a regular Index
        name = None if names is None else names[0]
        index = Index(tuples[0], name=name)
    elif nlevels == 1:
        name = None if names is None else names[0]
        index = Index((x[0] for x in tuples), name=name)
    else:
        index = MultiIndex.from_tuples(tuples, names=names)
    return index
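# Example call (illustrative): two levels, default names, level-0 labels
# duplicated in pairs via ndupe_l.
idx = makeCustomIndex(nentries=6, nlevels=2, names=True, ndupe_l=[2])
assert idx.nlevels == 2 and len(idx) == 6
assert list(idx.names) == ["#0", "#1"]
assert idx.get_level_values(0)[0] == idx.get_level_values(0)[1]  # ndupe_l=[2]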
def test_categorical_delegations(self):

    # invalid accessor
    msg = r"Can only use \.cat accessor with a 'category' dtype"
    with pytest.raises(AttributeError, match=msg):
        Series([1, 2, 3]).cat
    with pytest.raises(AttributeError, match=msg):
        Series([1, 2, 3]).cat()
    with pytest.raises(AttributeError, match=msg):
        Series(['a', 'b', 'c']).cat
    with pytest.raises(AttributeError, match=msg):
        Series(np.arange(5.)).cat
    with pytest.raises(AttributeError, match=msg):
        Series([Timestamp('20130101')]).cat

    # Series should delegate calls to '.categories', '.codes', '.ordered'
    # and the methods '.set_categories()', '.remove_unused_categories()'
    # to the categorical
    s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
    exp_categories = Index(["a", "b", "c"])
    tm.assert_index_equal(s.cat.categories, exp_categories)
    s.cat.categories = [1, 2, 3]
    exp_categories = Index([1, 2, 3])
    tm.assert_index_equal(s.cat.categories, exp_categories)

    exp_codes = Series([0, 1, 2, 0], dtype='int8')
    tm.assert_series_equal(s.cat.codes, exp_codes)

    assert s.cat.ordered
    s = s.cat.as_unordered()
    assert not s.cat.ordered
    s.cat.as_ordered(inplace=True)
    assert s.cat.ordered

    # reorder
    s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
    exp_categories = Index(["c", "b", "a"])
    exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
    s = s.cat.set_categories(["c", "b", "a"])
    tm.assert_index_equal(s.cat.categories, exp_categories)
    tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
    tm.assert_numpy_array_equal(s.__array__(), exp_values)

    # remove unused categories
    s = Series(Categorical(["a", "b", "b", "a"],
                           categories=["a", "b", "c"]))
    exp_categories = Index(["a", "b"])
    exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_)
    s = s.cat.remove_unused_categories()
    tm.assert_index_equal(s.cat.categories, exp_categories)
    tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
    tm.assert_numpy_array_equal(s.__array__(), exp_values)

    # This method is likely to be confused, so test that it raises an error
    # on wrong inputs:
    msg = "'Series' object has no attribute 'set_categories'"
    with pytest.raises(AttributeError, match=msg):
        s.set_categories([4, 3, 2, 1])

    # right: s.cat.set_categories([4, 3, 2, 1])

    # GH18862 (let Series.cat.rename_categories take callables)
    s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
    result = s.cat.rename_categories(lambda x: x.upper())
    expected = Series(Categorical(["A", "B", "C", "A"],
                                  categories=["A", "B", "C"],
                                  ordered=True))
    tm.assert_series_equal(result, expected)
def test_prod_cumprod(self, df, method):
    expected_columns = Index(["int", "float", "category_int"])
    expected_columns_numeric = expected_columns
    self._check(df, method, expected_columns, expected_columns_numeric)
def test_excel_old_index_format(self, read_ext):
    # see gh-4679
    filename = "test_index_name_pre17" + read_ext

    # We detect headers to determine if index names exist, so
    # that "index" name in the "names" version of the data will
    # now be interpreted as rows that include null data.
    data = np.array(
        [
            [None, None, None, None, None],
            ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
            ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
            ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
            ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
            ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
        ]
    )
    columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
    mi = MultiIndex(
        levels=[
            ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
            ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
        ],
        codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]],
        names=[None, None],
    )
    si = Index(
        ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None
    )

    expected = pd.DataFrame(data, index=si, columns=columns)

    actual = pd.read_excel(filename, "single_names", index_col=0)
    tm.assert_frame_equal(actual, expected)

    expected.index = mi

    actual = pd.read_excel(filename, "multi_names", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected)

    # The analogous versions of the "names" version data
    # where there are explicitly no names for the indices.
    data = np.array(
        [
            ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
            ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
            ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
            ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
            ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
        ]
    )
    columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
    mi = MultiIndex(
        levels=[
            ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
            ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
        ],
        codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
        names=[None, None],
    )
    si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
               name=None)

    expected = pd.DataFrame(data, index=si, columns=columns)

    actual = pd.read_excel(filename, "single_no_names", index_col=0)
    tm.assert_frame_equal(actual, expected)

    expected.index = mi

    actual = pd.read_excel(filename, "multi_no_names", index_col=[0, 1])
    tm.assert_frame_equal(actual, expected, check_names=False)
def test_date(self):
    import datetime

    dates = [datetime.date(2012, 1, x) for x in range(1, 20)]
    index = Index(dates)
    self.assertEqual(index.inferred_type, 'date')
def makeStringIndex(k=10, name=None):
    return Index(rands_array(nchars=10, size=k), name=name)
def test_map_str(self):
    # GH 31202
    index = self.create_index()
    result = index.map(str)
    expected = Index([str(x) for x in index], dtype=object)
    tm.assert_index_equal(result, expected)
def makeUnicodeIndex(k=10, name=None):
    return Index(randu_array(nchars=10, size=k), name=name)
def obj(self, dtype):
    i8vals = date_range("2016-01-01", periods=3).asi8
    idx = Index(i8vals, dtype=dtype)
    assert idx.dtype == dtype
    return Series(idx)
def test_index_cast_datetime64_other_units(self):
    arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]")
    idx = Index(arr)
    assert (idx.values == conversion.ensure_datetime64ns(arr)).all()