def test_join_non_int_index(self): other = Index([3, 6, 7, 8, 10], dtype=object) outer = self.index.join(other, how='outer') outer2 = other.join(self.index, how='outer') expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) inner = self.index.join(other, how='inner') inner2 = other.join(self.index, how='inner') expected = Index([6, 8, 10]) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) left = self.index.join(other, how='left') tm.assert_index_equal(left, self.index.astype(object)) left2 = other.join(self.index, how='left') tm.assert_index_equal(left2, other) right = self.index.join(other, how='right') tm.assert_index_equal(right, other) right2 = other.join(self.index, how='right') tm.assert_index_equal(right2, self.index.astype(object))
def test_join_left(self): other = Int64Index([7, 12, 25, 1, 2, 5]) other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) eres = self.index eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) self.assertIsNone(lidx) tm.assert_numpy_array_equal(ridx, eridx) # monotonic res, lidx, ridx = self.index.join(other_mono, how="left", return_indexers=True) eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) self.assertIsNone(lidx) tm.assert_numpy_array_equal(ridx, eridx) # non-unique idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx)
def test_join_non_int_index(self): other = Index([3, 6, 7, 8, 10], dtype=object) outer = self.index.join(other, how='outer') outer2 = other.join(self.index, how='outer') expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18], dtype=object) self.assertTrue(outer.equals(outer2)) self.assertTrue(outer.equals(expected)) inner = self.index.join(other, how='inner') inner2 = other.join(self.index, how='inner') expected = Index([6, 8, 10], dtype=object) self.assertTrue(inner.equals(inner2)) self.assertTrue(inner.equals(expected)) left = self.index.join(other, how='left') self.assertTrue(left.equals(self.index)) left2 = other.join(self.index, how='left') self.assertTrue(left2.equals(other)) right = self.index.join(other, how='right') self.assertTrue(right.equals(other)) right2 = other.join(self.index, how='right') self.assertTrue(right2.equals(self.index))
def test_join_non_int_index(self): other = Index(2**63 + np.array( [1, 5, 7, 10, 20], dtype='uint64'), dtype=object) outer = self.index.join(other, how='outer') outer2 = other.join(self.index, how='outer') expected = Index(2**63 + np.array( [0, 1, 5, 7, 10, 15, 20, 25], dtype='uint64')) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) inner = self.index.join(other, how='inner') inner2 = other.join(self.index, how='inner') expected = Index(2**63 + np.array([10, 20], dtype='uint64')) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) left = self.index.join(other, how='left') tm.assert_index_equal(left, self.index.astype(object)) left2 = other.join(self.index, how='left') tm.assert_index_equal(left2, other) right = self.index.join(other, how='right') tm.assert_index_equal(right, other) right2 = other.join(self.index, how='right') tm.assert_index_equal(right2, self.index.astype(object))
def process_pandas(data, endog_idx=0, exog_idx=None, index_idx=None): names = data.columns if isinstance(endog_idx, (int, long)): endog_name = names[endog_idx] endog = data[endog_name].copy() if exog_idx is None: exog = data.drop([endog_name], axis=1) else: exog = data[names[exog_idx]].copy() else: endog = data.loc[:, endog_idx].copy() endog_name = list(endog.columns) if exog_idx is None: exog = data.drop(endog_name, axis=1) elif isinstance(exog_idx, (int, long)): exog = data[names[exog_idx]].copy() else: exog = data[names[exog_idx]].copy() if index_idx is not None: # NOTE: will have to be improved for dates index = Index(data.iloc[:, index_idx]) endog.index = index exog.index = index.copy() data = data.set_index(names[index_idx]) exog_name = list(exog.columns) dataset = Dataset(data=data, names=list(names), endog=endog, exog=exog, endog_name=endog_name, exog_name=exog_name) return dataset
def test_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence if time_locale is None: # If the time_locale is None, day-name and month_name should # return the english attributes expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] expected_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] else: with tm.set_locale(time_locale, locale.LC_TIME): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] # GH#11128 dti = pd.date_range(freq='D', start=datetime(1998, 1, 1), periods=365) english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() assert dti.weekday_name[day] == eng_name assert dti.day_name(locale=time_locale)[day] == name ts = Timestamp(datetime(2016, 4, day)) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): assert ts.weekday_name == eng_name assert ts.day_name(locale=time_locale) == name dti = dti.append(DatetimeIndex([pd.NaT])) assert np.isnan(dti.day_name(locale=time_locale)[-1]) ts = Timestamp(pd.NaT) assert np.isnan(ts.day_name(locale=time_locale)) # GH#12805 dti = pd.date_range(freq='M', start='2012', end='2013') result = dti.month_name(locale=time_locale) expected = Index([month.capitalize() for month in expected_months]) # work around different normalization schemes # https://github.com/pandas-dev/pandas/issues/22342 if not compat.PY2: result = result.str.normalize("NFD") expected = expected.str.normalize("NFD") tm.assert_index_equal(result, expected) for date, expected in zip(dti, expected_months): result = date.month_name(locale=time_locale) expected = expected.capitalize() if not compat.PY2: result = unicodedata.normalize("NFD", result) expected = unicodedata.normalize("NFD", result) assert result == expected dti = dti.append(DatetimeIndex([pd.NaT])) assert np.isnan(dti.month_name(locale=time_locale)[-1])
def test_intersection(self): # intersect with Int64Index other = Index(np.arange(1, 6)) result = self.index.intersection(other) expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) self.assert_index_equal(result, expected) result = other.intersection(self.index) expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, other.values)))) self.assert_index_equal(result, expected) # intersect with increasing RangeIndex other = RangeIndex(1, 6) result = self.index.intersection(other) expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) self.assert_index_equal(result, expected) # intersect with decreasing RangeIndex other = RangeIndex(5, 0, -1) result = self.index.intersection(other) expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) self.assert_index_equal(result, expected)
def test_join_right(self): other = Int64Index([7, 12, 25, 1, 2, 5]) other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic res, lidx, ridx = self.index.join(other, how='right', return_indexers=True) eres = other elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp) assert isinstance(other, Int64Index) tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) assert ridx is None # monotonic res, lidx, ridx = self.index.join(other_mono, how='right', return_indexers=True) eres = other_mono elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp) assert isinstance(other, Int64Index) tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) assert ridx is None # non-unique idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx)
def _maybe_cache(arg, format, cache, convert_listlike): """ Create a cache of unique dates from an array of dates Parameters ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series format : string Strftime format to parse time cache : boolean True attempts to create a cache of converted values convert_listlike : function Conversion function to apply on dates Returns ------- cache_array : Series Cache of converted, unique dates. Can be empty """ from pandas import Series cache_array = Series() if cache: # Perform a quicker unique check from pandas import Index unique_dates = Index(arg).unique() if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates.to_numpy(), True, format) cache_array = Series(cache_dates, index=unique_dates) return cache_array
def test_to_tuples(self, tuples): # GH 18756 idx = IntervalIndex.from_tuples(tuples) result = idx.to_tuples() expected = Index(com.asarray_tuplesafe(tuples)) tm.assert_index_equal(result, expected)
def test_get_loc(self): # GH 12531 cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc")) idx1 = Index(list("abcde")) assert cidx1.get_loc("a") == idx1.get_loc("a") assert cidx1.get_loc("e") == idx1.get_loc("e") for i in [cidx1, idx1]: with pytest.raises(KeyError, match="'NOT-EXIST'"): i.get_loc("NOT-EXIST") # non-unique cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc")) idx2 = Index(list("aacded")) # results in bool array res = cidx2.get_loc("d") tm.assert_numpy_array_equal(res, idx2.get_loc("d")) tm.assert_numpy_array_equal( res, np.array([False, False, False, True, False, True])) # unique element results in scalar res = cidx2.get_loc("e") assert res == idx2.get_loc("e") assert res == 4 for i in [cidx2, idx2]: with pytest.raises(KeyError, match="'NOT-EXIST'"): i.get_loc("NOT-EXIST") # non-unique, sliceable cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc")) idx3 = Index(list("aabbb")) # results in slice res = cidx3.get_loc("a") assert res == idx3.get_loc("a") assert res == slice(0, 2, None) res = cidx3.get_loc("b") assert res == idx3.get_loc("b") assert res == slice(2, 5, None) for i in [cidx3, idx3]: with pytest.raises(KeyError, match="'c'"): i.get_loc("c")
def test_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence if time_locale is None: # If the time_locale is None, day-name and month_name should # return the english attributes expected_days = [ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", ] expected_months = [ "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", ] else: with tm.set_locale(time_locale, locale.LC_TIME): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] # GH#11128 dti = pd.date_range(freq="D", start=datetime(1998, 1, 1), periods=365) english_days = [ "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", ] for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() assert dti.day_name(locale=time_locale)[day] == name ts = Timestamp(datetime(2016, 4, day)) assert ts.day_name(locale=time_locale) == name dti = dti.append(DatetimeIndex([pd.NaT])) assert np.isnan(dti.day_name(locale=time_locale)[-1]) ts = Timestamp(pd.NaT) assert np.isnan(ts.day_name(locale=time_locale)) # GH#12805 dti = pd.date_range(freq="M", start="2012", end="2013") result = dti.month_name(locale=time_locale) expected = Index([month.capitalize() for month in expected_months]) # work around different normalization schemes # https://github.com/pandas-dev/pandas/issues/22342 result = result.str.normalize("NFD") expected = expected.str.normalize("NFD") tm.assert_index_equal(result, expected) for date, expected in zip(dti, expected_months): result = date.month_name(locale=time_locale) expected = expected.capitalize() result = unicodedata.normalize("NFD", result) expected = unicodedata.normalize("NFD", result) assert result == expected dti = dti.append(DatetimeIndex([pd.NaT])) assert np.isnan(dti.month_name(locale=time_locale)[-1])
def test_argminmax(self): obj = Index(np.arange(5, dtype="int64")) assert obj.argmin() == 0 assert obj.argmax() == 4 obj = Index([np.nan, 1, np.nan, 2]) assert obj.argmin() == 1 assert obj.argmax() == 3 assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 obj = Index([np.nan]) assert obj.argmin() == -1 assert obj.argmax() == -1 assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), pd.NaT]) assert obj.argmin() == 1 assert obj.argmax() == 2 assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 obj = Index([pd.NaT]) assert obj.argmin() == -1 assert obj.argmax() == -1 assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1
def test_slicing_and_getting_ops(self): # systematically test the slicing operations: # for all slicing ops: # - returning a dataframe # - returning a column # - returning a row # - returning a single value cats = Categorical(["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"]) idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 2, 3, 4, 5, 6, 7] df = DataFrame({"cats": cats, "values": values}, index=idx) # the expected values cats2 = Categorical(["b", "c"], categories=["a", "b", "c"]) idx2 = Index(["j", "k"]) values2 = [3, 4] # 2:4,: | "j":"k",: exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2) # :,"cats" | :,0 exp_col = Series(cats, index=idx, name='cats') # "j",: | 2,: exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", name="j") # "j","cats | 2,0 exp_val = "b" # iloc # frame res_df = df.iloc[2:4, :] tm.assert_frame_equal(res_df, exp_df) assert is_categorical_dtype(res_df["cats"]) # row res_row = df.iloc[2, :] tm.assert_series_equal(res_row, exp_row) assert isinstance(res_row["cats"], str) # col res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) assert is_categorical_dtype(res_col) # single value res_val = df.iloc[2, 0] assert res_val == exp_val # loc # frame res_df = df.loc["j":"k", :] tm.assert_frame_equal(res_df, exp_df) assert is_categorical_dtype(res_df["cats"]) # row res_row = df.loc["j", :] tm.assert_series_equal(res_row, exp_row) assert isinstance(res_row["cats"], str) # col res_col = df.loc[:, "cats"] tm.assert_series_equal(res_col, exp_col) assert is_categorical_dtype(res_col) # single value res_val = df.loc["j", "cats"] assert res_val == exp_val # ix # frame # res_df = df.loc["j":"k",[0,1]] # doesn't work? res_df = df.loc["j":"k", :] tm.assert_frame_equal(res_df, exp_df) assert is_categorical_dtype(res_df["cats"]) # row res_row = df.loc["j", :] tm.assert_series_equal(res_row, exp_row) assert isinstance(res_row["cats"], str) # col res_col = df.loc[:, "cats"] tm.assert_series_equal(res_col, exp_col) assert is_categorical_dtype(res_col) # single value res_val = df.loc["j", df.columns[0]] assert res_val == exp_val # iat res_val = df.iat[2, 0] assert res_val == exp_val # at res_val = df.at["j", "cats"] assert res_val == exp_val # fancy indexing exp_fancy = df.iloc[[2]] res_fancy = df[df["cats"] == "b"] tm.assert_frame_equal(res_fancy, exp_fancy) res_fancy = df[df["values"] == 3] tm.assert_frame_equal(res_fancy, exp_fancy) # get_value res_val = df.at["j", "cats"] assert res_val == exp_val # i : int, slice, or sequence of integers res_row = df.iloc[2] tm.assert_series_equal(res_row, exp_row) assert isinstance(res_row["cats"], str) res_df = df.iloc[slice(2, 4)] tm.assert_frame_equal(res_df, exp_df) assert is_categorical_dtype(res_df["cats"]) res_df = df.iloc[[2, 3]] tm.assert_frame_equal(res_df, exp_df) assert is_categorical_dtype(res_df["cats"]) res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) assert is_categorical_dtype(res_col) res_df = df.iloc[:, slice(0, 2)] tm.assert_frame_equal(res_df, df) assert is_categorical_dtype(res_df["cats"]) res_df = df.iloc[:, [0, 1]] tm.assert_frame_equal(res_df, df) assert is_categorical_dtype(res_df["cats"])
def assert_series_equal( left, right, check_dtype=True, check_index_type="equiv", check_series_type=True, check_less_precise=no_default, check_names=True, check_exact=False, check_datetimelike_compat=False, check_categorical=True, check_category_order=True, check_freq=True, check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", *, check_index=True, ): """ Check that left and right Series are equal. Parameters ---------- left : Series right : Series check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True Whether to check the Series class is identical. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the digits to compare. When comparing two numbers, if the first number has magnitude less than 1e-5, we compare the two numbers directly and check whether they are equivalent within the specified precision. Otherwise, we compare the **ratio** of the second number to the first number and check whether it is equivalent to 1 within the specified precision. .. deprecated:: 1.1.0 Use `rtol` and `atol` instead to define relative/absolute tolerance, respectively. Similar to :func:`math.isclose`. check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False Whether to compare number exactly. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals. .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. check_index : bool, default True Whether to check index equivalence. If False, then compare only values. .. versionadded:: 1.3.0 Examples -------- >>> from pandas import testing as tm >>> a = pd.Series([1, 2, 3, 4]) >>> b = pd.Series([1, 2, 3, 4]) >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True if check_less_precise is not no_default: warnings.warn( "The 'check_less_precise' keyword in testing.assert_*_equal " "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, stacklevel=2, ) rtol = atol = _get_tol_from_less_precise(check_less_precise) # instance validation _check_isinstance(left, right, Series) if check_series_type: assert_class_equal(left, right, obj=obj) # length comparison if len(left) != len(right): msg1 = f"{len(left)}, {left.index}" msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) if check_flags: assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" if check_index: # GH #38183 assert_index_equal( left.index, right.index, exact=check_index_type, check_names=check_names, check_exact=check_exact, check_categorical=check_categorical, rtol=rtol, atol=atol, obj=f"{obj}.index", ) if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)): lidx = left.index ridx = right.index assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq) if check_dtype: # We want to skip exact dtype checking when `check_categorical` # is False. We'll still raise if only one is a `Categorical`, # regardless of `check_categorical` if (is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype) and not check_categorical): pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype( right.dtype): left_values = left._values right_values = right._values # Only check exact if dtype is numeric if isinstance(left_values, ExtensionArray) and isinstance( right_values, ExtensionArray): assert_extension_array_equal( left_values, right_values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) else: assert_numpy_array_equal( left_values, right_values, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif check_datetimelike_compat and (needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)): # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check # the values in that case # datetimelike may have different objects (e.g. datetime.datetime # vs Timestamp) but will compare equal if not Index(left._values).equals(Index(right._values)): msg = (f"[datetimelike_compat=True] {left._values} " f"is not equal to {right._values}.") raise AssertionError(msg) elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype): assert_interval_array_equal(left.array, right.array) elif is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype): _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype( right.dtype): assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif is_extension_array_dtype_and_needs_i8_conversion( left.dtype, right.dtype) or is_extension_array_dtype_and_needs_i8_conversion( right.dtype, left.dtype): assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) # metadata comparison if check_names: assert_attr_equal("name", left, right, obj=obj) if check_categorical: if is_categorical_dtype(left.dtype) or is_categorical_dtype( right.dtype): assert_categorical_equal( left._values, right._values, obj=f"{obj} category", check_category_order=check_category_order, )
assert Index(mi.values).equals(Index(recons.values)) @pytest.mark.parametrize( "obj", [ Series([1, 2, 3]), Series([1.0, 1.5, 3.2]), Series([1.0, 1.5, np.nan]), Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), Series(["a", "b", "c"]), Series(["a", np.nan, "c"]), Series(["a", None, "c"]), Series([True, False, True]), Series(dtype=object), Index([1, 2, 3]), Index([True, False, True]), DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), DataFrame(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeTimedeltaIndex(), tm.makePeriodIndex(), Series(tm.makePeriodIndex()), Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), MultiIndex.from_product( [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)] ), MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]),
def test_excel_old_index_format(self, read_ext): # see gh-4679 filename = "test_index_name_pre17" + read_ext # We detect headers to determine if index names exist, so # that "index" name in the "names" version of the data will # now be interpreted as rows that include null data. data = np.array( [ [None, None, None, None, None], ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], ] ) columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] mi = MultiIndex( levels=[ ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], ], codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], names=[None, None], ) si = Index( ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None ) expected = pd.DataFrame(data, index=si, columns=columns) actual = pd.read_excel(filename, "single_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi actual = pd.read_excel(filename, "multi_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) # The analogous versions of the "names" version data # where there are explicitly no names for the indices. data = np.array( [ ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], ] ) columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] mi = MultiIndex( levels=[ ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], ], codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], names=[None, None], ) si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) expected = pd.DataFrame(data, index=si, columns=columns) actual = pd.read_excel(filename, "single_no_names", index_col=0) tm.assert_frame_equal(actual, expected) expected.index = mi actual = pd.read_excel(filename, "multi_no_names", index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False)
def test_nanops(self): # GH#7261 for opname in ['max', 'min']: for klass in [Index, Series]: arg_op = 'arg' + opname if klass is Index else 'idx' + opname obj = klass([np.nan, 2.0]) assert getattr(obj, opname)() == 2.0 obj = klass([np.nan]) assert pd.isna(getattr(obj, opname)()) assert pd.isna(getattr(obj, opname)(skipna=False)) obj = klass([]) assert pd.isna(getattr(obj, opname)()) assert pd.isna(getattr(obj, opname)(skipna=False)) obj = klass([pd.NaT, datetime(2011, 11, 1)]) # check DatetimeIndex monotonic path assert getattr(obj, opname)() == datetime(2011, 11, 1) assert getattr(obj, opname)(skipna=False) is pd.NaT assert getattr(obj, arg_op)() == 1 result = getattr(obj, arg_op)(skipna=False) if klass is Series: assert np.isnan(result) else: assert result == -1 obj = klass([pd.NaT, datetime(2011, 11, 1), pd.NaT]) # check DatetimeIndex non-monotonic path assert getattr(obj, opname)(), datetime(2011, 11, 1) assert getattr(obj, opname)(skipna=False) is pd.NaT assert getattr(obj, arg_op)() == 1 result = getattr(obj, arg_op)(skipna=False) if klass is Series: assert np.isnan(result) else: assert result == -1 for dtype in ["M8[ns]", "datetime64[ns, UTC]"]: # cases with empty Series/DatetimeIndex obj = klass([], dtype=dtype) assert getattr(obj, opname)() is pd.NaT assert getattr(obj, opname)(skipna=False) is pd.NaT with pytest.raises(ValueError, match="empty sequence"): getattr(obj, arg_op)() with pytest.raises(ValueError, match="empty sequence"): getattr(obj, arg_op)(skipna=False) # argmin/max obj = Index(np.arange(5, dtype='int64')) assert obj.argmin() == 0 assert obj.argmax() == 4 obj = Index([np.nan, 1, np.nan, 2]) assert obj.argmin() == 1 assert obj.argmax() == 3 assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 obj = Index([np.nan]) assert obj.argmin() == -1 assert obj.argmax() == -1 assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), pd.NaT]) assert obj.argmin() == 1 assert obj.argmax() == 2 assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 obj = Index([pd.NaT]) assert obj.argmin() == -1 assert obj.argmax() == -1 assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1
def _have_same_elements(idx1: pd.Index, idx2: pd.Index) -> bool: return idx1.sort_values().equals(idx2.sort_values())
def test_astype_object(self, index): result = index.astype(object) expected = Index(index.values, dtype="object") tm.assert_index_equal(result, expected) assert not result.equals(index)
def test_set_columns(self, float_string_frame): cols = Index(np.arange(len(float_string_frame.columns))) float_string_frame.columns = cols with pytest.raises(ValueError, match="Length mismatch"): float_string_frame.columns = cols[::2]
def test_assigning_ops(self): # systematically test the assigning operations: # for all slicing ops: # for value in categories and value not in categories: # - assign a single value -> exp_single_cats_value # - assign a complete row (mixed values) -> exp_single_row # assign multiple rows (mixed values) (-> array) -> exp_multi_row # assign a part of a column with dtype == categorical -> # exp_parts_cats_col # assign a part of a column with dtype != categorical -> # exp_parts_cats_col cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 1, 1, 1, 1, 1, 1] orig = DataFrame({"cats": cats, "values": values}, index=idx) # the expected values # changed single row cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) values1 = [1, 1, 2, 1, 1, 1, 1] exp_single_row = DataFrame({ "cats": cats1, "values": values1 }, index=idx1) # changed multiple rows cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) values2 = [1, 1, 2, 2, 1, 1, 1] exp_multi_row = DataFrame({ "cats": cats2, "values": values2 }, index=idx2) # changed part of the cats column cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) values3 = [1, 1, 1, 1, 1, 1, 1] exp_parts_cats_col = DataFrame({ "cats": cats3, "values": values3 }, index=idx3) # changed single value in cats col cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) values4 = [1, 1, 1, 1, 1, 1, 1] exp_single_cats_value = DataFrame({ "cats": cats4, "values": values4 }, index=idx4) # iloc # ############### # - assign a single value -> exp_single_cats_value df = orig.copy() df.iloc[2, 0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) df = orig.copy() df.iloc[df.index == "j", 0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set with pytest.raises(ValueError): df = orig.copy() df.iloc[2, 0] = "c" # - assign a complete row (mixed values) -> exp_single_row df = orig.copy() df.iloc[2, :] = ["b", 2] tm.assert_frame_equal(df, exp_single_row) # - assign a complete row (mixed values) not in categories set with pytest.raises(ValueError): df = orig.copy() df.iloc[2, :] = ["c", 2] # - assign multiple rows (mixed values) -> exp_multi_row df = orig.copy() df.iloc[2:4, :] = [["b", 2], ["b", 2]] tm.assert_frame_equal(df, exp_multi_row) with pytest.raises(ValueError): df = orig.copy() df.iloc[2:4, :] = [["c", 2], ["c", 2]] # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) with pytest.raises(ValueError): # different values df = orig.copy() df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col df = orig.copy() df.iloc[2:4, 0] = ["b", "b"] tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): df.iloc[2:4, 0] = ["c", "c"] # loc # ############## # - assign a single value -> exp_single_cats_value df = orig.copy() df.loc["j", "cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) df = orig.copy() df.loc[df.index == "j", "cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set with pytest.raises(ValueError): df = orig.copy() df.loc["j", "cats"] = "c" # - assign a complete row (mixed values) -> exp_single_row df = orig.copy() df.loc["j", :] = ["b", 2] tm.assert_frame_equal(df, exp_single_row) # - assign a complete row (mixed values) not in categories set with pytest.raises(ValueError): df = orig.copy() df.loc["j", :] = ["c", 2] # - assign multiple rows (mixed values) -> exp_multi_row df = orig.copy() df.loc["j":"k", :] = [["b", 2], ["b", 2]] tm.assert_frame_equal(df, exp_multi_row) with pytest.raises(ValueError): df = orig.copy() df.loc["j":"k", :] = [["c", 2], ["c", 2]] # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b", "c"]) with pytest.raises(ValueError): # different values df = orig.copy() df.loc["j":"k", "cats"] = Categorical(["c", "c"], categories=["a", "b", "c"]) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col df = orig.copy() df.loc["j":"k", "cats"] = ["b", "b"] tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): df.loc["j":"k", "cats"] = ["c", "c"] # loc # ############## # - assign a single value -> exp_single_cats_value df = orig.copy() df.loc["j", df.columns[0]] = "b" tm.assert_frame_equal(df, exp_single_cats_value) df = orig.copy() df.loc[df.index == "j", df.columns[0]] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set with pytest.raises(ValueError): df = orig.copy() df.loc["j", df.columns[0]] = "c" # - assign a complete row (mixed values) -> exp_single_row df = orig.copy() df.loc["j", :] = ["b", 2] tm.assert_frame_equal(df, exp_single_row) # - assign a complete row (mixed values) not in categories set with pytest.raises(ValueError): df = orig.copy() df.loc["j", :] = ["c", 2] # - assign multiple rows (mixed values) -> exp_multi_row df = orig.copy() df.loc["j":"k", :] = [["b", 2], ["b", 2]] tm.assert_frame_equal(df, exp_multi_row) with pytest.raises(ValueError): df = orig.copy() df.loc["j":"k", :] = [["c", 2], ["c", 2]] # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b", "c"]) with pytest.raises(ValueError): # different values df = orig.copy() df.loc["j":"k", df.columns[0]] = Categorical(["c", "c"], categories=["a", "b", "c"]) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col df = orig.copy() df.loc["j":"k", df.columns[0]] = ["b", "b"] tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): df.loc["j":"k", df.columns[0]] = ["c", "c"] # iat df = orig.copy() df.iat[2, 0] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set with pytest.raises(ValueError): df = orig.copy() df.iat[2, 0] = "c" # at # - assign a single value -> exp_single_cats_value df = orig.copy() df.at["j", "cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) # - assign a single value not in the current categories set with pytest.raises(ValueError): df = orig.copy() df.at["j", "cats"] = "c" # fancy indexing catsf = Categorical(["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"]) idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) valuesf = [1, 1, 3, 3, 1, 1, 1] df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) exp_fancy = exp_multi_row.copy() exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) df[df["cats"] == "c"] = ["b", 2] # category c is kept in .categories tm.assert_frame_equal(df, exp_fancy) # set_value df = orig.copy() df.at["j", "cats"] = "b" tm.assert_frame_equal(df, exp_single_cats_value) with pytest.raises(ValueError): df = orig.copy() df.at["j", "cats"] = "c" # Assigning a Category to parts of a int/... column uses the values of # the Categorical df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")}) df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp)
def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, ignore_index=False, verify_integrity=False, copy=True): if isinstance(objs, (NDFrame, compat.string_types)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' '"{name}"'.format(name=type(objs).__name__)) if join == 'outer': self.intersect = False elif join == 'inner': self.intersect = True else: # pragma: no cover raise ValueError('Only can inner (intersect) or outer (union) ' 'join the other axis') if isinstance(objs, dict): if keys is None: keys = sorted(objs) objs = [objs[k] for k in keys] else: objs = list(objs) if len(objs) == 0: raise ValueError('No objects to concatenate') if keys is None: objs = [obj for obj in objs if obj is not None] else: # #1649 clean_keys = [] clean_objs = [] for k, v in zip(keys, objs): if v is None: continue clean_keys.append(k) clean_objs.append(v) objs = clean_objs name = getattr(keys, 'name', None) keys = Index(clean_keys, name=name) if len(objs) == 0: raise ValueError('All objects passed were None') # consolidate data & figure out what our result ndim is going to be ndims = set() for obj in objs: if not isinstance(obj, NDFrame): raise TypeError("cannot concatenate a non-NDFrame object") # consolidate obj._consolidate(inplace=True) ndims.add(obj.ndim) # get the sample # want the higest ndim that we have, and must be non-empty # unless all objs are empty sample = None if len(ndims) > 1: max_ndim = max(ndims) for obj in objs: if obj.ndim == max_ndim and np.sum(obj.shape): sample = obj break else: # filter out the empties if we have not multi-index possibilities # note to keep empty Series as it affect to result columns / name non_empties = [ obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series) ] if (len(non_empties) and (keys is None and names is None and levels is None and join_axes is None and not self.intersect)): objs = non_empties sample = objs[0] if sample is None: sample = objs[0] self.objs = objs # Standardize axis parameter to int if isinstance(sample, Series): axis = DataFrame()._get_axis_number(axis) else: axis = sample._get_axis_number(axis) # Need to flip BlockManager axis in the DataFrame special case self._is_frame = isinstance(sample, DataFrame) if self._is_frame: axis = 1 if axis == 0 else 0 self._is_series = isinstance(sample, Series) if not 0 <= axis <= sample.ndim: raise AssertionError("axis must be between 0 and {ndim}, input was" " {axis}".format(ndim=sample.ndim, axis=axis)) # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed if len(ndims) > 1: current_column = 0 max_ndim = sample.ndim self.objs, objs = [], self.objs for obj in objs: ndim = obj.ndim if ndim == max_ndim: pass elif ndim != max_ndim - 1: raise ValueError("cannot concatenate unaligned mixed " "dimensional NDFrame objects") else: name = getattr(obj, 'name', None) if ignore_index or name is None: name = current_column current_column += 1 # doing a row-wise concatenation so need everything # to line up if self._is_frame and axis == 1: name = 0 obj = sample._constructor({name: obj}) self.objs.append(obj) # note: this is the BlockManager axis (since DataFrame is transposed) self.axis = axis self.join_axes = join_axes self.keys = keys self.names = names or getattr(keys, 'names', None) self.levels = levels self.ignore_index = ignore_index self.verify_integrity = verify_integrity self.copy = copy self.new_axes = self._get_new_axes()
def test_map_str(self, simple_index): # GH 31202 idx = simple_index result = idx.map(str) expected = Index([str(x) for x in idx], dtype=object) tm.assert_index_equal(result, expected)
def test_construction_index_with_mixed_timezones_with_NaT(self): # GH 11488 result = Index([pd.NaT, Timestamp('2011-01-01'), pd.NaT, Timestamp('2011-01-02')], name='idx') exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01'), pd.NaT, Timestamp('2011-01-02')], name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) self.assertIsNone(result.tz) # same tz results in DatetimeIndex result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), pd.NaT, Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')], name='idx') exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 10:00'), pd.NaT, Timestamp('2011-01-02 10:00')], tz='Asia/Tokyo', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) self.assertIsNotNone(result.tz) self.assertEqual(result.tz, exp.tz) # same tz results in DatetimeIndex (DST) result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'), pd.NaT, Timestamp('2011-08-01 10:00', tz='US/Eastern')], name='idx') exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), pd.NaT, Timestamp('2011-08-01 10:00')], tz='US/Eastern', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) self.assertIsNotNone(result.tz) self.assertEqual(result.tz, exp.tz) # different tz results in Index(dtype=object) result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], name='idx') exp = Index([pd.NaT, Timestamp('2011-01-01 10:00'), pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], dtype='object', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], name='idx') exp = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], dtype='object', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) # passing tz results in DatetimeIndex result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='Asia/Tokyo', name='idx') exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 19:00'), pd.NaT, Timestamp('2011-01-03 00:00')], tz='Asia/Tokyo', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) # all NaT result = Index([pd.NaT, pd.NaT], name='idx') exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) self.assertIsNone(result.tz) # all NaT with tz result = Index([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') exp = DatetimeIndex([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) self.assertIsNotNone(result.tz) self.assertEqual(result.tz, exp.tz)
def assert_index_equal( left: Index, right: Index, exact: bool | str = "equiv", check_names: bool = True, check_less_precise: bool | int | NoDefault = no_default, check_exact: bool = True, check_categorical: bool = True, check_order: bool = True, rtol: float = 1.0e-5, atol: float = 1.0e-8, obj: str = "Index", ) -> None: """ Check that left and right Index are equal. Parameters ---------- left : Index right : Index exact : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. If 'equiv', then RangeIndex can be substituted for Int64Index as well. check_names : bool, default True Whether to check the names attribute. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the digits to compare. .. deprecated:: 1.1.0 Use `rtol` and `atol` instead to define relative/absolute tolerance, respectively. Similar to :func:`math.isclose`. check_exact : bool, default True Whether to compare number exactly. check_categorical : bool, default True Whether to compare internal Categorical exactly. check_order : bool, default True Whether to compare the order of index entries as well as their values. If True, both indexes must contain the same elements, in the same order. If False, both indexes must contain the same elements, but in any order. .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 obj : str, default 'Index' Specify object name being compared, internally used to show appropriate assertion message. Examples -------- >>> from pandas import testing as tm >>> a = pd.Index([1, 2, 3]) >>> b = pd.Index([1, 2, 3]) >>> tm.assert_index_equal(a, b) """ __tracebackhide__ = True def _check_types(left, right, obj="Index") -> None: if not exact: return assert_class_equal(left, right, exact=exact, obj=obj) assert_attr_equal("inferred_type", left, right, obj=obj) # Skip exact dtype checking when `check_categorical` is False if is_categorical_dtype(left.dtype) and is_categorical_dtype( right.dtype): if check_categorical: assert_attr_equal("dtype", left, right, obj=obj) assert_index_equal(left.categories, right.categories, exact=exact) return assert_attr_equal("dtype", left, right, obj=obj) def _get_ilevel_values(index, level): # accept level number only unique = index.levels[level] level_codes = index.codes[level] filled = take_nd(unique._values, level_codes, fill_value=unique._na_value) return unique._shallow_copy(filled, name=index.names[level]) if check_less_precise is not no_default: warnings.warn( "The 'check_less_precise' keyword in testing.assert_*_equal " "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, stacklevel=2, ) # https://github.com/python/mypy/issues/7642 # error: Argument 1 to "_get_tol_from_less_precise" has incompatible # type "Union[bool, int, NoDefault]"; expected "Union[bool, int]" rtol = atol = _get_tol_from_less_precise( check_less_precise # type: ignore[arg-type] ) # instance validation _check_isinstance(left, right, Index) # class / dtype comparison _check_types(left, right, obj=obj) # level comparison if left.nlevels != right.nlevels: msg1 = f"{obj} levels are different" msg2 = f"{left.nlevels}, {left}" msg3 = f"{right.nlevels}, {right}" raise_assert_detail(obj, msg1, msg2, msg3) # length comparison if len(left) != len(right): msg1 = f"{obj} length are different" msg2 = f"{len(left)}, {left}" msg3 = f"{len(right)}, {right}" raise_assert_detail(obj, msg1, msg2, msg3) # If order doesn't matter then sort the index entries if not check_order: left = Index(safe_sort(left)) right = Index(safe_sort(right)) # MultiIndex special comparison for little-friendly error messages if left.nlevels > 1: left = cast(MultiIndex, left) right = cast(MultiIndex, right) for level in range(left.nlevels): # cannot use get_level_values here because it can change dtype llevel = _get_ilevel_values(left, level) rlevel = _get_ilevel_values(right, level) lobj = f"MultiIndex level [{level}]" assert_index_equal( llevel, rlevel, exact=exact, check_names=check_names, check_exact=check_exact, rtol=rtol, atol=atol, obj=lobj, ) # get_level_values may change dtype _check_types(left.levels[level], right.levels[level], obj=obj) # skip exact index checking when `check_categorical` is False if check_exact and check_categorical: if not left.equals(right): diff = (np.sum((left._values != right._values).astype(int)) * 100.0 / len(left)) msg = f"{obj} values are different ({np.round(diff, 5)} %)" raise_assert_detail(obj, msg, left, right) else: # if we have "equiv", this becomes True exact_bool = bool(exact) _testing.assert_almost_equal( left.values, right.values, rtol=rtol, atol=atol, check_dtype=exact_bool, obj=obj, lobj=left, robj=right, ) # metadata comparison if check_names: assert_attr_equal("names", left, right, obj=obj) if isinstance(left, PeriodIndex) or isinstance(right, PeriodIndex): assert_attr_equal("freq", left, right, obj=obj) if isinstance(left, IntervalIndex) or isinstance(right, IntervalIndex): assert_interval_array_equal(left._values, right._values) if check_categorical: if is_categorical_dtype(left.dtype) or is_categorical_dtype( right.dtype): assert_categorical_equal(left._values, right._values, obj=f"{obj} category")
def test_order_compat(self): def _check_freq(index, expected_index): if isinstance(index, PeriodIndex): assert index.freq == expected_index.freq pidx = PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A") # for compatibility check iidx = Index([2011, 2012, 2013], name="idx") for idx in [pidx, iidx]: ordered = idx.sort_values() tm.assert_index_equal(ordered, idx) _check_freq(ordered, idx) ordered = idx.sort_values(ascending=False) tm.assert_index_equal(ordered, idx[::-1]) _check_freq(ordered, idx[::-1]) ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, idx) tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) _check_freq(ordered, idx) ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, idx[::-1]) tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) _check_freq(ordered, idx[::-1]) pidx = PeriodIndex( ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A" ) pexpected = PeriodIndex( ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A" ) # for compatibility check iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") iexpected = Index([2011, 2011, 2012, 2013, 2015], name="idx") for idx, expected in [(pidx, pexpected), (iidx, iexpected)]: ordered = idx.sort_values() tm.assert_index_equal(ordered, expected) _check_freq(ordered, idx) ordered = idx.sort_values(ascending=False) tm.assert_index_equal(ordered, expected[::-1]) _check_freq(ordered, idx) ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, expected) exp = np.array([0, 4, 3, 1, 2]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) _check_freq(ordered, idx) pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") result = pidx.sort_values() expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D") tm.assert_index_equal(result, expected) assert result.freq == "D" result = pidx.sort_values(ascending=False) expected = PeriodIndex(["2013", "2011", "2011", "NaT"], name="pidx", freq="D") tm.assert_index_equal(result, expected) assert result.freq == "D"
class TestSeriesMisc: def test_tab_completion(self): # GH 9910 s = Series(list("abcd")) # Series of str values should have .str but not .dt/.cat in __dir__ assert "str" in dir(s) assert "dt" not in dir(s) assert "cat" not in dir(s) def test_tab_completion_dt(self): # similarly for .dt s = Series(date_range("1/1/2015", periods=5)) assert "dt" in dir(s) assert "str" not in dir(s) assert "cat" not in dir(s) def test_tab_completion_cat(self): # Similarly for .cat, but with the twist that str and dt should be # there if the categories are of that type first cat and str. s = Series(list("abbcd"), dtype="category") assert "cat" in dir(s) assert "str" in dir(s) # as it is a string categorical assert "dt" not in dir(s) def test_tab_completion_cat_str(self): # similar to cat and str s = Series(date_range("1/1/2015", periods=5)).astype("category") assert "cat" in dir(s) assert "str" not in dir(s) assert "dt" in dir(s) # as it is a datetime categorical def test_tab_completion_with_categorical(self): # test the tab completion display ok_for_cat = [ "categories", "codes", "ordered", "set_categories", "add_categories", "remove_categories", "rename_categories", "reorder_categories", "remove_unused_categories", "as_ordered", "as_unordered", ] s = Series(list("aabbcde")).astype("category") results = sorted({r for r in s.cat.__dir__() if not r.startswith("_")}) tm.assert_almost_equal(results, sorted(set(ok_for_cat))) @pytest.mark.parametrize( "index", [ tm.makeStringIndex(10), tm.makeCategoricalIndex(10), Index(["foo", "bar", "baz"] * 2), tm.makeDateIndex(10), tm.makePeriodIndex(10), tm.makeTimedeltaIndex(10), tm.makeIntIndex(10), tm.makeUIntIndex(10), tm.makeIntIndex(10), tm.makeFloatIndex(10), Index([True, False]), Index([f"a{i}" for i in range(101)]), pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")), pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")), ], ) def test_index_tab_completion(self, index): # dir contains string-like values of the Index. s = Series(index=index, dtype=object) dir_s = dir(s) for i, x in enumerate(s.index.unique(level=0)): if i < 100: assert not isinstance(x, str) or not x.isidentifier() or x in dir_s else: assert x not in dir_s @pytest.mark.parametrize("ser", [Series(dtype=object), Series([1])]) def test_not_hashable(self, ser): msg = "unhashable type: 'Series'" with pytest.raises(TypeError, match=msg): hash(ser) def test_contains(self, datetime_series): tm.assert_contains_all(datetime_series.index, datetime_series) def test_axis_alias(self): s = Series([1, 2, np.nan]) tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index")) assert s.dropna().sum("rows") == 3 assert s._get_axis_number("rows") == 0 assert s._get_axis_name("rows") == "index" def test_class_axis(self): # https://github.com/pandas-dev/pandas/issues/18147 # no exception and no empty docstring assert pydoc.getdoc(Series.index) def test_ndarray_compat(self): # test numpy compat with Series as sub-class of NDFrame tsdf = DataFrame( np.random.randn(1000, 3), columns=["A", "B", "C"], index=date_range("1/1/2000", periods=1000), ) def f(x): return x[x.idxmax()] result = tsdf.apply(f) expected = tsdf.max() tm.assert_series_equal(result, expected) def test_ndarray_compat_like_func(self): # using an ndarray like function s = Series(np.random.randn(10)) result = Series(np.ones_like(s)) expected = Series(1, index=range(10), dtype="float64") tm.assert_series_equal(result, expected) def test_ndarray_compat_ravel(self): # ravel s = Series(np.random.randn(10)) tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F")) def test_empty_method(self): s_empty = Series(dtype=object) assert s_empty.empty @pytest.mark.parametrize("dtype", ["int64", object]) def test_empty_method_full_series(self, dtype): full_series = Series(index=[1], dtype=dtype) assert not full_series.empty @pytest.mark.parametrize("dtype", [None, "Int64"]) def test_integer_series_size(self, dtype): # GH 25580 s = Series(range(9), dtype=dtype) assert s.size == 9 def test_attrs(self): s = Series([0, 1], name="abc") assert s.attrs == {} s.attrs["version"] = 1 result = s + 1 assert result.attrs == {"version": 1} @skip_if_no("jinja2") def test_inspect_getmembers(self): # GH38782 ser = Series(dtype=object) # TODO(2.0): Change to None once is_monotonic deprecation # is enforced with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): inspect.getmembers(ser) def test_unknown_attribute(self): # GH#9680 tdi = pd.timedelta_range(start=0, periods=10, freq="1s") ser = Series(np.random.normal(size=10), index=tdi) assert "foo" not in ser.__dict__.keys() msg = "'Series' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): ser.foo @pytest.mark.parametrize("op", ["year", "day", "second", "weekday"]) def test_datetime_series_no_datelike_attrs(self, op, datetime_series): # GH#7206 msg = f"'Series' object has no attribute '{op}'" with pytest.raises(AttributeError, match=msg): getattr(datetime_series, op) def test_series_datetimelike_attribute_access(self): # attribute access should still work! ser = Series({"year": 2000, "month": 1, "day": 10}) assert ser.year == 2000 assert ser.month == 1 assert ser.day == 10 def test_series_datetimelike_attribute_access_invalid(self): ser = Series({"year": 2000, "month": 1, "day": 10}) msg = "'Series' object has no attribute 'weekday'" with pytest.raises(AttributeError, match=msg): ser.weekday def test_series_iteritems_deprecated(self): ser = Series([1]) with tm.assert_produces_warning(FutureWarning): next(ser.iteritems())
class TestFloat64Index(Numeric): _holder = Float64Index @pytest.fixture( params=[ [1.5, 2, 3, 4, 5], [0.0, 2.5, 5.0, 7.5, 10.0], [5, 4, 3, 2, 1.5], [10.0, 7.5, 5.0, 2.5, 0.0], ], ids=["mixed", "float", "mixed_dec", "float_dec"], ) def index(self, request): return Float64Index(request.param) @pytest.fixture def mixed_index(self): return Float64Index([1.5, 2, 3, 4, 5]) @pytest.fixture def float_index(self): return Float64Index([0.0, 2.5, 5.0, 7.5, 10.0]) def create_index(self) -> Float64Index: return Float64Index(np.arange(5, dtype="float64")) def test_repr_roundtrip(self, index): tm.assert_index_equal(eval(repr(index)), index) def check_is_index(self, i): assert isinstance(i, Index) assert not isinstance(i, Float64Index) def check_coerce(self, a, b, is_float_index=True): assert a.equals(b) tm.assert_index_equal(a, b, exact=False) if is_float_index: assert isinstance(b, Float64Index) else: self.check_is_index(b) def test_constructor(self): # explicit construction index = Float64Index([1, 2, 3, 4, 5]) assert isinstance(index, Float64Index) expected = np.array([1, 2, 3, 4, 5], dtype="float64") tm.assert_numpy_array_equal(index.values, expected) index = Float64Index(np.array([1, 2, 3, 4, 5])) assert isinstance(index, Float64Index) index = Float64Index([1.0, 2, 3, 4, 5]) assert isinstance(index, Float64Index) index = Float64Index(np.array([1.0, 2, 3, 4, 5])) assert isinstance(index, Float64Index) assert index.dtype == float index = Float64Index(np.array([1.0, 2, 3, 4, 5]), dtype=np.float32) assert isinstance(index, Float64Index) assert index.dtype == np.float64 index = Float64Index(np.array([1, 2, 3, 4, 5]), dtype=np.float32) assert isinstance(index, Float64Index) assert index.dtype == np.float64 # nan handling result = Float64Index([np.nan, np.nan]) assert pd.isna(result.values).all() result = Float64Index(np.array([np.nan])) assert pd.isna(result.values).all() result = Index(np.array([np.nan])) assert pd.isna(result.values).all() @pytest.mark.parametrize( "index, dtype", [ (pd.Int64Index, "float64"), (pd.UInt64Index, "categorical"), (pd.Float64Index, "datetime64"), (pd.RangeIndex, "float64"), ], ) def test_invalid_dtype(self, index, dtype): # GH 29539 with pytest.raises( ValueError, match=rf"Incorrect `dtype` passed: expected \w+(?: \w+)?, received {dtype}", ): index([1, 2, 3], dtype=dtype) def test_constructor_invalid(self): # invalid msg = ( r"Float64Index\(\.\.\.\) must be called with a collection of " r"some kind, 0\.0 was passed" ) with pytest.raises(TypeError, match=msg): Float64Index(0.0) msg = ( "String dtype not supported, " "you may need to explicitly cast to a numeric type" ) with pytest.raises(TypeError, match=msg): Float64Index(["a", "b", 0.0]) msg = r"float\(\) argument must be a string or a number, not 'Timestamp'" with pytest.raises(TypeError, match=msg): Float64Index([Timestamp("20130101")]) def test_constructor_coerce(self, mixed_index, float_index): self.check_coerce(mixed_index, Index([1.5, 2, 3, 4, 5])) self.check_coerce(float_index, Index(np.arange(5) * 2.5)) self.check_coerce( float_index, Index(np.array(np.arange(5) * 2.5, dtype=object)) ) def test_constructor_explicit(self, mixed_index, float_index): # these don't auto convert self.check_coerce( float_index, Index((np.arange(5) * 2.5), dtype=object), is_float_index=False ) self.check_coerce( mixed_index, Index([1.5, 2, 3, 4, 5], dtype=object), is_float_index=False ) def test_type_coercion_fail(self, any_int_dtype): # see gh-15832 msg = "Trying to coerce float values to integers" with pytest.raises(ValueError, match=msg): Index([1, 2, 3.5], dtype=any_int_dtype) def test_type_coercion_valid(self, float_dtype): # There is no Float32Index, so we always # generate Float64Index. i = Index([1, 2, 3.5], dtype=float_dtype) tm.assert_index_equal(i, Index([1, 2, 3.5])) def test_equals_numeric(self): i = Float64Index([1.0, 2.0]) assert i.equals(i) assert i.identical(i) i2 = Float64Index([1.0, 2.0]) assert i.equals(i2) i = Float64Index([1.0, np.nan]) assert i.equals(i) assert i.identical(i) i2 = Float64Index([1.0, np.nan]) assert i.equals(i2) @pytest.mark.parametrize( "other", ( Int64Index([1, 2]), Index([1.0, 2.0], dtype=object), Index([1, 2], dtype=object), ), ) def test_equals_numeric_other_index_type(self, other): i = Float64Index([1.0, 2.0]) assert i.equals(other) assert other.equals(i) @pytest.mark.parametrize( "vals", [ pd.date_range("2016-01-01", periods=3), pd.timedelta_range("1 Day", periods=3), ], ) def test_lookups_datetimelike_values(self, vals): # If we have datetime64 or timedelta64 values, make sure they are # wrappped correctly GH#31163 ser = pd.Series(vals, index=range(3, 6)) ser.index = ser.index.astype("float64") expected = vals[1] with tm.assert_produces_warning(FutureWarning): result = ser.index.get_value(ser, 4.0) assert isinstance(result, type(expected)) and result == expected with tm.assert_produces_warning(FutureWarning): result = ser.index.get_value(ser, 4) assert isinstance(result, type(expected)) and result == expected result = ser[4.0] assert isinstance(result, type(expected)) and result == expected result = ser[4] assert isinstance(result, type(expected)) and result == expected result = ser.loc[4.0] assert isinstance(result, type(expected)) and result == expected result = ser.loc[4] assert isinstance(result, type(expected)) and result == expected result = ser.at[4.0] assert isinstance(result, type(expected)) and result == expected # GH#31329 .at[4] should cast to 4.0, matching .loc behavior result = ser.at[4] assert isinstance(result, type(expected)) and result == expected result = ser.iloc[1] assert isinstance(result, type(expected)) and result == expected result = ser.iat[1] assert isinstance(result, type(expected)) and result == expected def test_doesnt_contain_all_the_things(self): i = Float64Index([np.nan]) assert not i.isin([0]).item() assert not i.isin([1]).item() assert i.isin([np.nan]).item() def test_nan_multiple_containment(self): i = Float64Index([1.0, np.nan]) tm.assert_numpy_array_equal(i.isin([1.0]), np.array([True, False])) tm.assert_numpy_array_equal(i.isin([2.0, np.pi]), np.array([False, False])) tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, True])) tm.assert_numpy_array_equal(i.isin([1.0, np.nan]), np.array([True, True])) i = Float64Index([1.0, 2.0]) tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, False])) def test_fillna_float64(self): # GH 11343 idx = Index([1.0, np.nan, 3.0], dtype=float, name="x") # can't downcast exp = Index([1.0, 0.1, 3.0], name="x") tm.assert_index_equal(idx.fillna(0.1), exp) # downcast exp = Float64Index([1.0, 2.0, 3.0], name="x") tm.assert_index_equal(idx.fillna(2), exp) # object exp = Index([1.0, "obj", 3.0], name="x") tm.assert_index_equal(idx.fillna("obj"), exp)
def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categorical, CategoricalIndex, or Series with dtype='category' sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order: boolean, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. .. versionadded:: 0.20.0 Returns ------- result : Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed Notes ----- To learn more about categories, see `link <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__ Examples -------- >>> from pandas.api.types import union_categoricals If you want to combine categoricals that do not necessarily have the same categories, `union_categoricals` will combine a list-like of categoricals. The new categories will be the union of the categories being combined. >>> a = pd.Categorical(["b", "c"]) >>> b = pd.Categorical(["a", "b"]) >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] By default, the resulting categories will be ordered as they appear in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. >>> union_categoricals([a, b], sort_categories=True) [b, c, a, b] Categories (3, object): [a, b, c] `union_categoricals` also works with the case of combining two categoricals of the same categories and order information (e.g. what you could also `append` for). >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "a"], ordered=True) >>> union_categoricals([a, b]) [a, b, a, b, a] Categories (2, object): [a < b] Raises `TypeError` because the categories are ordered and not identical. >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "c"], ordered=True) >>> union_categoricals([a, b]) TypeError: to union ordered Categoricals, all categories must be the same New in version 0.20.0 Ordered categoricals with different categories or orderings can be combined by using the `ignore_ordered=True` argument. >>> a = pd.Categorical(["a", "b", "c"], ordered=True) >>> b = pd.Categorical(["c", "b", "a"], ordered=True) >>> union_categoricals([a, b], ignore_order=True) [a, b, c, c, b, a] Categories (3, object): [a, b, c] `union_categoricals` also works with a `CategoricalIndex`, or `Series` containing categorical data, but note that the resulting array will always be a plain `Categorical` >>> a = pd.Series(["b", "c"], dtype='category') >>> b = pd.Series(["a", "b"], dtype='category') >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] """ from pandas import Index, Categorical, CategoricalIndex, Series from pandas.core.arrays.categorical import _recode_for_categories if len(to_union) == 0: raise ValueError('No Categoricals to union') def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): return x.values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:]): raise TypeError("dtype of categories must be the same") ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered if all(first.categories.equals(other.categories) for other in to_union[1:]): new_codes = np.concatenate([c.codes for c in to_union]) else: codes = [first.codes] + [_recode_for_categories(other.codes, other.categories, first.categories) for other in to_union[1:]] new_codes = np.concatenate(codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_1d new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) if sort_categories: categories = categories.sort_values() new_codes = [] for c in to_union: new_codes.append(_recode_for_categories(c.codes, c.categories, categories)) new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def test_floating_misc(self): # related 236 # scalar/slicing of a float index s = Series(np.arange(5), index=np.arange(5) * 2.5, dtype=np.int64) # label based slicing result1 = s[1.0:3.0] result2 = s.loc[1.0:3.0] result3 = s.loc[1.0:3.0] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) # exact indexing when found result1 = s[5.0] result2 = s.loc[5.0] result3 = s.loc[5.0] assert result1 == result2 assert result1 == result3 result1 = s[5] result2 = s.loc[5] result3 = s.loc[5] assert result1 == result2 assert result1 == result3 assert s[5.0] == s[5] # value not found (and no fallbacking at all) # scalar integers with pytest.raises(KeyError, match=r"^4\.0$"): s.loc[4] with pytest.raises(KeyError, match=r"^4\.0$"): s.loc[4] with pytest.raises(KeyError, match=r"^4\.0$"): s[4] # fancy floats/integers create the correct entry (as nan) # fancy tests expected = Series([2, 0], index=Float64Index([5.0, 0.0])) for fancy_idx in [[5.0, 0.0], np.array([5.0, 0.0])]: # float tm.assert_series_equal(s[fancy_idx], expected) tm.assert_series_equal(s.loc[fancy_idx], expected) tm.assert_series_equal(s.loc[fancy_idx], expected) expected = Series([2, 0], index=Index([5, 0], dtype="int64")) for fancy_idx in [[5, 0], np.array([5, 0])]: # int tm.assert_series_equal(s[fancy_idx], expected) tm.assert_series_equal(s.loc[fancy_idx], expected) tm.assert_series_equal(s.loc[fancy_idx], expected) # all should return the same as we are slicing 'the same' result1 = s.loc[2:5] result2 = s.loc[2.0:5.0] result3 = s.loc[2.0:5] result4 = s.loc[2.1:5] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) tm.assert_series_equal(result1, result4) # previously this did fallback indexing result1 = s[2:5] result2 = s[2.0:5.0] result3 = s[2.0:5] result4 = s[2.1:5] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) tm.assert_series_equal(result1, result4) result1 = s.loc[2:5] result2 = s.loc[2.0:5.0] result3 = s.loc[2.0:5] result4 = s.loc[2.1:5] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) tm.assert_series_equal(result1, result4) # combined test result1 = s.loc[2:5] result2 = s.loc[2:5] result3 = s[2:5] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) # list selection result1 = s[[0.0, 5, 10]] result2 = s.loc[[0.0, 5, 10]] result3 = s.loc[[0.0, 5, 10]] result4 = s.iloc[[0, 2, 4]] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) tm.assert_series_equal(result1, result4) with pytest.raises(KeyError, match="with any missing labels"): s[[1.6, 5, 10]] with pytest.raises(KeyError, match="with any missing labels"): s.loc[[1.6, 5, 10]] with pytest.raises(KeyError, match="with any missing labels"): s[[0, 1, 2]] with pytest.raises(KeyError, match="with any missing labels"): s.loc[[0, 1, 2]] result1 = s.loc[[2.5, 5]] result2 = s.loc[[2.5, 5]] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, Series([1, 2], index=[2.5, 5.0])) result1 = s[[2.5]] result2 = s.loc[[2.5]] result3 = s.loc[[2.5]] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) tm.assert_series_equal(result1, Series([1], index=[2.5]))
def test_type_coercion_valid(self, float_dtype): # There is no Float32Index, so we always # generate Float64Index. i = Index([1, 2, 3.5], dtype=float_dtype) tm.assert_index_equal(i, Index([1, 2, 3.5]))
def test_identical(self): index = self.create_index() i = Index(index.copy()) assert i.identical(index) same_values_different_type = Index(i, dtype=object) assert not i.identical(same_values_different_type) i = index.copy(dtype=object) i = i.rename("foo") same_values = Index(i, dtype=object) assert same_values.identical(i) assert not i.identical(index) assert Index(same_values, name="foo", dtype=object).identical(i) assert not index.copy(dtype=object).identical(index.copy(dtype=self._dtype))
def test_arg_passthru(): # make sure that we are passing thru kwargs # to our agg functions # GH3668 # GH5724 df = pd.DataFrame( { "group": [1, 1, 2], "int": [1, 2, 3], "float": [4.0, 5.0, 6.0], "string": list("abc"), "category_string": pd.Series(list("abc")).astype("category"), "category_int": [7, 8, 9], "datetime": pd.date_range("20130101", periods=3), "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), }, columns=[ "group", "int", "float", "string", "category_string", "category_int", "datetime", "datetimetz", "timedelta", ], ) expected_columns_numeric = Index(["int", "float", "category_int"]) # mean / median expected = pd.DataFrame( { "category_int": [7.5, 9], "float": [4.5, 6.0], "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")], "int": [1.5, 3], "datetime": [ pd.Timestamp("2013-01-01 12:00:00"), pd.Timestamp("2013-01-03 00:00:00"), ], "datetimetz": [ pd.Timestamp("2013-01-01 12:00:00", tz="US/Eastern"), pd.Timestamp("2013-01-03 00:00:00", tz="US/Eastern"), ], }, index=Index([1, 2], name="group"), columns=[ "int", "float", "category_int", "datetime", "datetimetz", "timedelta" ], ) for attr in ["mean", "median"]: f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_frame_equal(result.reindex_like(expected), expected) # TODO: min, max *should* handle # categorical (ordered) dtype expected_columns = Index([ "int", "float", "string", "category_int", "datetime", "datetimetz", "timedelta", ]) for attr in ["min", "max"]: f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) expected_columns = Index([ "int", "float", "string", "category_string", "category_int", "datetime", "datetimetz", "timedelta", ]) for attr in ["first", "last"]: f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) expected_columns = Index( ["int", "float", "string", "category_int", "timedelta"]) for attr in ["sum"]: f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) expected_columns = Index(["int", "float", "category_int"]) for attr in ["prod", "cumprod"]: f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) # like min, max, but don't include strings expected_columns = Index([ "int", "float", "category_int", "datetime", "datetimetz", "timedelta" ]) for attr in ["cummin", "cummax"]: f = getattr(df.groupby("group"), attr) result = f() # GH 15561: numeric_only=False set by default like min/max tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) expected_columns = Index(["int", "float", "category_int", "timedelta"]) for attr in ["cumsum"]: f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns)
def test_type_coercion_fail(self, any_int_dtype): # see gh-15832 msg = "Trying to coerce float values to integers" with pytest.raises(ValueError, match=msg): Index([1, 2, 3.5], dtype=any_int_dtype)
def test_categorical_delegations(self): # invalid accessor pytest.raises(AttributeError, lambda: Series([1, 2, 3]).cat) tm.assert_raises_regex( AttributeError, r"Can only use .cat accessor with a 'category' dtype", lambda: Series([1, 2, 3]).cat) pytest.raises(AttributeError, lambda: Series(['a', 'b', 'c']).cat) pytest.raises(AttributeError, lambda: Series(np.arange(5.)).cat) pytest.raises(AttributeError, lambda: Series([Timestamp('20130101')]).cat) # Series should delegate calls to '.categories', '.codes', '.ordered' # and the methods '.set_categories()' 'drop_unused_categories()' to the # categorical# -*- coding: utf-8 -*- s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) exp_categories = Index(["a", "b", "c"]) tm.assert_index_equal(s.cat.categories, exp_categories) s.cat.categories = [1, 2, 3] exp_categories = Index([1, 2, 3]) tm.assert_index_equal(s.cat.categories, exp_categories) exp_codes = Series([0, 1, 2, 0], dtype='int8') tm.assert_series_equal(s.cat.codes, exp_codes) assert s.cat.ordered s = s.cat.as_unordered() assert not s.cat.ordered s.cat.as_ordered(inplace=True) assert s.cat.ordered # reorder s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) exp_categories = Index(["c", "b", "a"]) exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_) s = s.cat.set_categories(["c", "b", "a"]) tm.assert_index_equal(s.cat.categories, exp_categories) tm.assert_numpy_array_equal(s.values.__array__(), exp_values) tm.assert_numpy_array_equal(s.__array__(), exp_values) # remove unused categories s = Series( Categorical(["a", "b", "b", "a"], categories=["a", "b", "c"])) exp_categories = Index(["a", "b"]) exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_) s = s.cat.remove_unused_categories() tm.assert_index_equal(s.cat.categories, exp_categories) tm.assert_numpy_array_equal(s.values.__array__(), exp_values) tm.assert_numpy_array_equal(s.__array__(), exp_values) # This method is likely to be confused, so test that it raises an error # on wrong inputs: def f(): s.set_categories([4, 3, 2, 1]) pytest.raises(Exception, f) # right: s.cat.set_categories([4,3,2,1]) # GH18862 (let Series.cat.rename_categories take callables) s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) result = s.cat.rename_categories(lambda x: x.upper()) expected = Series( Categorical(["A", "B", "C", "A"], categories=["A", "B", "C"], ordered=True)) tm.assert_series_equal(result, expected)
def test_intersection(self, sort): # intersect with Int64Index index = RangeIndex(start=0, stop=20, step=2) other = Index(np.arange(1, 6)) result = index.intersection(other, sort=sort) expected = Index(np.sort(np.intersect1d(index.values, other.values))) tm.assert_index_equal(result, expected) result = other.intersection(index, sort=sort) expected = Index( np.sort(np.asarray(np.intersect1d(index.values, other.values))) ) tm.assert_index_equal(result, expected) # intersect with increasing RangeIndex other = RangeIndex(1, 6) result = index.intersection(other, sort=sort) expected = Index(np.sort(np.intersect1d(index.values, other.values))) tm.assert_index_equal(result, expected) # intersect with decreasing RangeIndex other = RangeIndex(5, 0, -1) result = index.intersection(other, sort=sort) expected = Index(np.sort(np.intersect1d(index.values, other.values))) tm.assert_index_equal(result, expected) # reversed (GH 17296) result = other.intersection(index, sort=sort) tm.assert_index_equal(result, expected) # GH 17296: intersect two decreasing RangeIndexes first = RangeIndex(10, -2, -2) other = RangeIndex(5, -4, -1) expected = first.astype(int).intersection(other.astype(int), sort=sort) result = first.intersection(other, sort=sort).astype(int) tm.assert_index_equal(result, expected) # reversed result = other.intersection(first, sort=sort).astype(int) tm.assert_index_equal(result, expected) index = RangeIndex(5, name="foo") # intersect of non-overlapping indices other = RangeIndex(5, 10, 1, name="foo") result = index.intersection(other, sort=sort) expected = RangeIndex(0, 0, 1, name="foo") tm.assert_index_equal(result, expected) other = RangeIndex(-1, -5, -1) result = index.intersection(other, sort=sort) expected = RangeIndex(0, 0, 1) tm.assert_index_equal(result, expected) # intersection of empty indices other = RangeIndex(0, 0, 1) result = index.intersection(other, sort=sort) expected = RangeIndex(0, 0, 1) tm.assert_index_equal(result, expected) result = other.intersection(index, sort=sort) tm.assert_index_equal(result, expected)
def test_strftime(self): # GH 10086 s = Series(date_range("20130101", periods=5)) result = s.dt.strftime("%Y/%m/%d") expected = Series( ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] ) tm.assert_series_equal(result, expected) s = Series(date_range("2015-02-03 11:22:33.4567", periods=5)) result = s.dt.strftime("%Y/%m/%d %H-%M-%S") expected = Series( [ "2015/02/03 11-22-33", "2015/02/04 11-22-33", "2015/02/05 11-22-33", "2015/02/06 11-22-33", "2015/02/07 11-22-33", ] ) tm.assert_series_equal(result, expected) s = Series(period_range("20130101", periods=5)) result = s.dt.strftime("%Y/%m/%d") expected = Series( ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] ) tm.assert_series_equal(result, expected) s = Series(period_range("2015-02-03 11:22:33.4567", periods=5, freq="s")) result = s.dt.strftime("%Y/%m/%d %H-%M-%S") expected = Series( [ "2015/02/03 11-22-33", "2015/02/03 11-22-34", "2015/02/03 11-22-35", "2015/02/03 11-22-36", "2015/02/03 11-22-37", ] ) tm.assert_series_equal(result, expected) s = Series(date_range("20130101", periods=5)) s.iloc[0] = pd.NaT result = s.dt.strftime("%Y/%m/%d") expected = Series( [np.nan, "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] ) tm.assert_series_equal(result, expected) datetime_index = date_range("20150301", periods=5) result = datetime_index.strftime("%Y/%m/%d") expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], dtype=np.object_, ) # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) period_index = period_range("20150301", periods=5) result = period_index.strftime("%Y/%m/%d") expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], dtype="=U10", ) tm.assert_index_equal(result, expected) s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) result = s.dt.strftime("%Y-%m-%d %H:%M:%S") expected = Series(["2013-01-01 02:32:59", "2013-01-02 14:32:01"]) tm.assert_series_equal(result, expected) s = Series(period_range("20130101", periods=4, freq="H")) result = s.dt.strftime("%Y/%m/%d %H:%M:%S") expected = Series( [ "2013/01/01 00:00:00", "2013/01/01 01:00:00", "2013/01/01 02:00:00", "2013/01/01 03:00:00", ] ) s = Series(period_range("20130101", periods=4, freq="L")) result = s.dt.strftime("%Y/%m/%d %H:%M:%S.%l") expected = Series( [ "2013/01/01 00:00:00.000", "2013/01/01 00:00:00.001", "2013/01/01 00:00:00.002", "2013/01/01 00:00:00.003", ] ) tm.assert_series_equal(result, expected)
class Grouping(object): def __init__(self, index, names=None): """ index : index-like Can be pandas MultiIndex or Index or array-like. If array-like and is a MultipleIndex (more than one grouping variable), groups are expected to be in each row. E.g., [('red', 1), ('red', 2), ('green', 1), ('green', 2)] names : list or str, optional The names to use for the groups. Should be a str if only one grouping variable is used. Notes ----- If index is already a pandas Index then there is no copy. """ if isinstance(index, (Index, MultiIndex)): if names is not None: if hasattr(index, 'set_names'): # newer pandas index.set_names(names, inplace=True) else: index.names = names self.index = index else: # array_like if _is_hierarchical(index): self.index = _make_hierarchical_index(index, names) else: self.index = Index(index, name=names) if names is None: names = _make_generic_names(self.index) if hasattr(self.index, 'set_names'): self.index.set_names(names, inplace=True) else: self.index.names = names self.nobs = len(self.index) self.nlevels = len(self.index.names) self.slices = None @property def index_shape(self): if hasattr(self.index, 'levshape'): return self.index.levshape else: return self.index.shape @property def levels(self): if hasattr(self.index, 'levels'): return self.index.levels else: return pd.Categorical(self.index).levels @property def labels(self): # this was index_int, but that's not a very good name... codes = getattr(self.index, 'codes', None) if codes is None: if hasattr(self.index, 'labels'): codes = self.index.labels else: codes = pd.Categorical(self.index).codes[None] return codes @property def group_names(self): return self.index.names def reindex(self, index=None, names=None): """ Resets the index in-place. """ # NOTE: this isn't of much use if the rest of the data doesn't change # This needs to reset cache if names is None: names = self.group_names self = Grouping(index, names) def get_slices(self, level=0): """ Sets the slices attribute to be a list of indices of the sorted groups for the first index level. I.e., self.slices[0] is the index where each observation is in the first (sorted) group. """ # TODO: refactor this groups = self.index.get_level_values(level).unique() groups = np.array(groups) groups.sort() if isinstance(self.index, MultiIndex): self.slices = [ self.index.get_loc_level(x, level=level)[0] for x in groups ] else: self.slices = [self.index.get_loc(x) for x in groups] def count_categories(self, level=0): """ Sets the attribute counts to equal the bincount of the (integer-valued) labels. """ # TODO: refactor this not to set an attribute. Why would we do this? self.counts = np.bincount(self.labels[level]) def check_index(self, is_sorted=True, unique=True, index=None): """Sanity checks""" if not index: index = self.index if is_sorted: test = pd.DataFrame(lrange(len(index)), index=index) test_sorted = test.sort() if not test.index.equals(test_sorted.index): raise Exception('Data is not be sorted') if unique: if len(index) != len(index.unique()): raise Exception('Duplicate index entries') def sort(self, data, index=None): """Applies a (potentially hierarchical) sort operation on a numpy array or pandas series/dataframe based on the grouping index or a user-supplied index. Returns an object of the same type as the original data as well as the matching (sorted) Pandas index. """ if index is None: index = self.index if data_util._is_using_ndarray_type(data, None): if data.ndim == 1: out = pd.Series(data, index=index, copy=True) out = out.sort_index() else: out = pd.DataFrame(data, index=index) out = out.sort_index(inplace=False) # copies return np.array(out), out.index elif data_util._is_using_pandas(data, None): out = data out = out.reindex(index) # copies? out = out.sort_index() return out, out.index else: msg = 'data must be a Numpy array or a Pandas Series/DataFrame' raise ValueError(msg) def transform_dataframe(self, dataframe, function, level=0, **kwargs): """Apply function to each column, by group Assumes that the dataframe already has a proper index""" if dataframe.shape[0] != self.nobs: raise Exception('dataframe does not have the same shape as index') out = dataframe.groupby(level=level).apply(function, **kwargs) if 1 in out.shape: return np.ravel(out) else: return np.array(out) def transform_array(self, array, function, level=0, **kwargs): """Apply function to each column, by group """ if array.shape[0] != self.nobs: raise Exception('array does not have the same shape as index') dataframe = pd.DataFrame(array, index=self.index) return self.transform_dataframe(dataframe, function, level=level, **kwargs) def transform_slices(self, array, function, level=0, **kwargs): """Apply function to each group. Similar to transform_array but does not coerce array to a DataFrame and back and only works on a 1D or 2D numpy array. function is called function(group, group_idx, **kwargs). """ array = np.asarray(array) if array.shape[0] != self.nobs: raise Exception('array does not have the same shape as index') # always reset because level is given. need to refactor this. self.get_slices(level=level) processed = [] for s in self.slices: if array.ndim == 2: subset = array[s, :] elif array.ndim == 1: subset = array[s] processed.append(function(subset, s, **kwargs)) processed = np.array(processed) return processed.reshape(-1, processed.shape[-1]) # TODO: this isn't general needs to be a PanelGrouping object def dummies_time(self): self.dummy_sparse(level=1) return self._dummies def dummies_groups(self, level=0): self.dummy_sparse(level=level) return self._dummies def dummy_sparse(self, level=0): """create a sparse indicator from a group array with integer labels Parameters ---------- groups: ndarray, int, 1d (nobs,) an array of group indicators for each observation. Group levels are assumed to be defined as consecutive integers, i.e. range(n_groups) where n_groups is the number of group levels. A group level with no observations for it will still produce a column of zeros. Returns ------- indi : ndarray, int8, 2d (nobs, n_groups) an indicator array with one row per observation, that has 1 in the column of the group level for that observation Examples -------- >>> g = np.array([0, 0, 2, 1, 1, 2, 0]) >>> indi = dummy_sparse(g) >>> indi <7x3 sparse matrix of type '<type 'numpy.int8'>' with 7 stored elements in Compressed Sparse Row format> >>> indi.todense() matrix([[1, 0, 0], [1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=int8) current behavior with missing groups >>> g = np.array([0, 0, 2, 0, 2, 0]) >>> indi = dummy_sparse(g) >>> indi.todense() matrix([[1, 0, 0], [1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [1, 0, 0]], dtype=int8) """ indi = dummy_sparse(self.labels[level]) self._dummies = indi