def test_value_counts_unique_nunique(self):
    """Exercise value_counts/unique/nunique on every fixture object in self.objs.

    Each object is rebuilt so its n-th element appears n+1 times, making the
    expected value counts a strictly descending run.
    NOTE(review): the expected Series hard-codes ``range(10, 0, -1)``, so this
    assumes every obj in self.objs has exactly 10 distinct values — confirm
    against the fixture definition (not visible in this chunk).
    """
    for orig in self.objs:
        o = orig.copy()
        klass = type(o)  # used below to rebuild a Series of the same class
        values = o._values

        if isinstance(values, Index):
            # reset name not to affect latter process
            values.name = None

        # create repeated values, 'n'th element is repeated by n+1 times
        # skip boolean, because it only has 2 values at most
        if isinstance(o, Index) and o.is_boolean():
            continue
        elif isinstance(o, Index):
            expected_index = Index(o[::-1])
            expected_index.name = None
            o = o.repeat(range(1, len(o) + 1))
            o.name = "a"
        else:
            expected_index = Index(values[::-1])
            idx = o.index.repeat(range(1, len(o) + 1))
            # take-based repeat
            indices = np.repeat(np.arange(len(o)), range(1, len(o) + 1))
            rep = values.take(indices)
            o = klass(rep, index=idx, name="a")

        # check values has the same dtype as the original
        assert o.dtype == orig.dtype

        # most-repeated value first: counts run 10 down to 1
        expected_s = Series(
            range(10, 0, -1), index=expected_index, dtype="int64", name="a"
        )

        result = o.value_counts()
        tm.assert_series_equal(result, expected_s)
        assert result.index.name is None
        assert result.name == "a"

        result = o.unique()
        if isinstance(o, Index):
            assert isinstance(result, type(o))
            tm.assert_index_equal(result, orig)
            assert result.dtype == orig.dtype
        elif is_datetime64tz_dtype(o):
            # datetimetz Series returns array of Timestamp
            assert result[0] == orig[0]
            for r in result:
                assert isinstance(r, Timestamp)
            tm.assert_numpy_array_equal(
                result.astype(object), orig._values.astype(object)
            )
        else:
            tm.assert_numpy_array_equal(result, orig.values)
            assert result.dtype == orig.dtype

        assert o.nunique() == len(np.unique(o.values))
def test_index(self) -> None:
    """The backend exposes the expected default index, and set_index builds
    the expected two-level tuple index from ExampleStore.ab_index."""
    default = PandasIndex(PIndex([0, 1, 2], name="index"), [])
    assert_that(self.data_backend.index.equals(default), equal_to(True))

    reindexed = self.data_backend.set_index(ExampleStore.ab_index)
    tuples = PIndex([("a", 1), ("b", 2), ("c", 3)])
    tuples.name = "ab_index"
    combined = PandasIndex(tuples, ["a", "b"])
    assert_that(reindexed.index.equals(combined), equal_to(True))
def test_intersection_name_preservation(self, index2, keeps_name, sort): index1 = Index([1, 2, 3, 4, 5], name="index") expected = Index([3, 4, 5]) result = index1.intersection(index2, sort) if keeps_name: expected.name = "index" assert result.name == expected.name tm.assert_index_equal(result, expected)
def test_value_counts_unique_nunique(self, index_or_series_obj):
    """Repeat the n-th element n+1 times, then check value_counts/unique/nunique."""
    original = index_or_series_obj
    obj = original.copy()
    obj_type = type(obj)
    values = obj._values

    if original.duplicated().any():
        pytest.xfail(
            "The test implementation isn't flexible enough to deal "
            "with duplicated values. This isn't a bug in the "
            "application code, but in the test code."
        )

    # create repeated values, 'n'th element is repeated by n+1 times
    if isinstance(obj, Index):
        expected_index = Index(obj[::-1])
        expected_index.name = None
        obj = obj.repeat(range(1, len(obj) + 1))
    else:
        expected_index = Index(values[::-1])
        repeated_index = obj.index.repeat(range(1, len(obj) + 1))
        # take-based repeat
        take_positions = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1))
        repeated_values = values.take(take_positions)
        obj = obj_type(repeated_values, index=repeated_index)

    # repeating must not change the dtype
    assert obj.dtype == original.dtype

    expected_counts = Series(
        range(len(original), 0, -1), index=expected_index, dtype="int64"
    )
    result = obj.value_counts()
    tm.assert_series_equal(result, expected_counts)
    assert result.index.name is None

    result = obj.unique()
    if isinstance(obj, Index):
        assert isinstance(result, type(obj))
        tm.assert_index_equal(result, original)
        assert result.dtype == original.dtype
    elif is_datetime64tz_dtype(obj):
        # datetimetz Series returns array of Timestamp
        assert result[0] == original[0]
        for r in result:
            assert isinstance(r, Timestamp)
        tm.assert_numpy_array_equal(
            result.astype(object), original._values.astype(object)
        )
    else:
        tm.assert_numpy_array_equal(result, original.values)
        assert result.dtype == original.dtype

    # dropna=True would break for MultiIndex
    assert obj.nunique(dropna=False) == len(np.unique(obj.values))
def test_value_counts_unique_nunique(self):
    """Exercise value_counts/unique/nunique on every fixture object in self.objs.

    Each object is rebuilt so its n-th element appears n+1 times.
    NOTE(review): the expected Series hard-codes ``range(10, 0, -1)``, so this
    assumes every obj in self.objs has exactly 10 distinct values — confirm
    against the fixture definition (not visible in this chunk).
    """
    for orig in self.objs:
        o = orig.copy()
        klass = type(o)  # used below to rebuild a Series of the same class
        values = o._values

        if isinstance(values, Index):
            # reset name not to affect latter process
            values.name = None

        # create repeated values, 'n'th element is repeated by n+1 times
        # skip boolean, because it only has 2 values at most
        if isinstance(o, Index) and o.is_boolean():
            continue
        elif isinstance(o, Index):
            expected_index = Index(o[::-1])
            expected_index.name = None
            o = o.repeat(range(1, len(o) + 1))
            o.name = 'a'
        else:
            expected_index = Index(values[::-1])
            idx = o.index.repeat(range(1, len(o) + 1))
            # take-based repeat
            indices = np.repeat(np.arange(len(o)), range(1, len(o) + 1))
            rep = values.take(indices)
            o = klass(rep, index=idx, name='a')

        # check values has the same dtype as the original
        assert o.dtype == orig.dtype

        # most-repeated value first: counts run 10 down to 1
        expected_s = Series(range(10, 0, -1), index=expected_index,
                            dtype='int64', name='a')

        result = o.value_counts()
        tm.assert_series_equal(result, expected_s)
        assert result.index.name is None
        assert result.name == 'a'

        result = o.unique()
        if isinstance(o, Index):
            assert isinstance(result, o.__class__)
            tm.assert_index_equal(result, orig)
        elif is_datetime64tz_dtype(o):
            # datetimetz Series returns array of Timestamp
            assert result[0] == orig[0]
            for r in result:
                assert isinstance(r, Timestamp)
            tm.assert_numpy_array_equal(
                result.astype(object), orig._values.astype(object))
        else:
            tm.assert_numpy_array_equal(result, orig.values)

        assert o.nunique() == len(np.unique(o.values))
def test_intersection_monotonic(self, index2, keeps_name, sort): index1 = Index([5, 3, 2, 4, 1], name="index") expected = Index([5, 3, 4]) if keeps_name: expected.name = "index" result = index1.intersection(index2, sort=sort) if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected)
def _read_index_node(self, node):
    """Rebuild a pandas Index from a stored node.

    Reads the raw values and the stored ``kind`` attribute, converts them back
    via ``_unconvert_index``, and reattaches the stored name (``None`` when the
    node carries no name attribute).

    Returns a ``(name, index)`` tuple.
    """
    raw = node[:]
    kind = node._v_attrs.kind
    name = node._v_attrs.name if 'name' in node._v_attrs else None

    index = Index(_unconvert_index(raw, kind))
    index.name = name
    return name, index
def _read_index_node(self, node):
    """Rebuild a pandas Index from a stored node.

    Reads the raw values and the stored ``kind`` attribute, converts them back
    via ``_unconvert_index``, and reattaches the stored name.

    Returns a ``(name, index)`` tuple; ``name`` is ``None`` when the node has
    no stored name attribute.
    """
    data = node[:]
    kind = node._v_attrs.kind
    # The original wrapped this in a broad ``except Exception`` just to
    # handle a missing attribute, which would also mask unrelated errors.
    # ``getattr`` with a default only covers the missing-attribute case.
    name = getattr(node._v_attrs, 'name', None)
    index = Index(_unconvert_index(data, kind))
    index.name = name
    return name, index
def from_pandas_index(cls, index: pd.Index, dim: Hashable):
    """Wrap a pandas Index as (cls instance, {name: IndexVariable}).

    When the index has no name, a copy named after ``dim`` is used so the
    caller's index is never mutated; otherwise the existing name is kept.
    """
    from .variable import IndexVariable

    if index.name is not None:
        name = index.name
    else:
        # Unnamed index: work on a copy named after the dimension.
        name = dim
        index = index.copy()
        index.name = dim

    adapter = PandasIndexingAdapter(index)
    variable = IndexVariable(dim, adapter, fastpath=True)
    return cls(index, dim), {name: variable}
def _read_index_node(self, node):
    """Rebuild a pandas Index from a stored node.

    date/datetime kinds are rebuilt with ``dtype=object`` so the converted
    Python date objects are not re-coerced; other kinds use the default dtype.

    Returns a ``(name, index)`` tuple.
    """
    raw = node[:]
    kind = node._v_attrs.kind
    name = node._v_attrs.name if 'name' in node._v_attrs else None

    index_kwargs = {'dtype': object} if kind in ('date', 'datetime') else {}
    index = Index(_unconvert_index(raw, kind), **index_kwargs)
    index.name = name
    return name, index
def test_constructor_name(self): # GH#12288 orig = RangeIndex(10) orig.name = "original" copy = RangeIndex(orig) copy.name = "copy" assert orig.name == "original" assert copy.name == "copy" new = Index(copy) assert new.name == "copy" new.name = "new" assert orig.name == "original" assert copy.name == "copy" assert new.name == "new"
def test_constructor_name(self): # GH12288 orig = RangeIndex(10) orig.name = 'original' copy = RangeIndex(orig) copy.name = 'copy' assert orig.name == 'original' assert copy.name == 'copy' new = Index(copy) assert new.name == 'copy' new.name = 'new' assert orig.name == 'original' assert copy.name == 'copy' assert new.name == 'new'
def test_constructor_name(self): # GH12288 orig = RangeIndex(10) orig.name = 'original' copy = RangeIndex(orig) copy.name = 'copy' self.assertTrue(orig.name, 'original') self.assertTrue(copy.name, 'copy') new = Index(copy) self.assertTrue(new.name, 'copy') new.name = 'new' self.assertTrue(orig.name, 'original') self.assertTrue(new.name, 'copy') self.assertTrue(new.name, 'new')
def test_value_counts_unique_nunique_null(self):
    """Like test_value_counts_unique_nunique, but with the first values nulled.

    The first two raw values are replaced by NaN/None/iNaT (as the dtype
    requires) before repeating, so the null appears 1+2=3 times and is dropped
    or kept depending on ``dropna``.
    NOTE(review): the hard-coded counts (10..3, nunique 8/9) assume each obj
    in self.objs has 10 distinct values — confirm against the fixture.

    Fixed: ``dtype=np.bool`` — the ``np.bool`` alias was removed in
    NumPy 1.24; the builtin ``bool`` is the documented replacement.
    """
    for null_obj in [np.nan, None]:
        for orig in self.objs:
            o = orig.copy()
            klass = type(o)
            values = o._ndarray_values
            if not self._allow_na_ops(o):
                continue

            # special assign to the numpy array
            if is_datetimetz(o):
                if isinstance(o, DatetimeIndex):
                    v = o.asi8
                    v[0:2] = iNaT
                    values = o._shallow_copy(v)
                else:
                    o = o.copy()
                    o[0:2] = iNaT
                    values = o._values
            elif needs_i8_conversion(o):
                values[0:2] = iNaT
                values = o._shallow_copy(values)
            else:
                values[0:2] = null_obj

            # check values has the same dtype as the original
            assert values.dtype == o.dtype

            # create repeated values, 'n'th element is repeated by n+1
            # times
            if isinstance(o, (DatetimeIndex, PeriodIndex)):
                expected_index = o.copy()
                expected_index.name = None
                # attach name to klass
                o = klass(values.repeat(range(1, len(o) + 1)))
                o.name = 'a'
            else:
                if is_datetimetz(o):
                    expected_index = orig._values._shallow_copy(values)
                else:
                    expected_index = Index(values)
                expected_index.name = None
                o = o.repeat(range(1, len(o) + 1))
                o.name = 'a'

            # check values has the same dtype as the original
            assert o.dtype == orig.dtype

            # check values correctly have NaN
            # (bool, not the removed np.bool alias)
            nanloc = np.zeros(len(o), dtype=bool)
            nanloc[:3] = True
            if isinstance(o, Index):
                tm.assert_numpy_array_equal(pd.isna(o), nanloc)
            else:
                exp = Series(nanloc, o.index, name='a')
                tm.assert_series_equal(pd.isna(o), exp)

            # with dropna=False the null bucket (count 3) is included
            expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                   index=expected_index[9:0:-1],
                                   dtype='int64', name='a')
            expected_s = Series(list(range(10, 2, -1)),
                                index=expected_index[9:1:-1],
                                dtype='int64', name='a')

            result_s_na = o.value_counts(dropna=False)
            tm.assert_series_equal(result_s_na, expected_s_na)
            assert result_s_na.index.name is None
            assert result_s_na.name == 'a'
            result_s = o.value_counts()
            tm.assert_series_equal(o.value_counts(), expected_s)
            assert result_s.index.name is None
            assert result_s.name == 'a'

            result = o.unique()
            if isinstance(o, Index):
                tm.assert_index_equal(result, Index(values[1:], name='a'))
            elif is_datetimetz(o):
                # unable to compare NaT / nan
                vals = values[2:].astype(object).values
                tm.assert_numpy_array_equal(result[1:], vals)
                assert result[0] is pd.NaT
            else:
                tm.assert_numpy_array_equal(result[1:], values[2:])
                assert pd.isna(result[0])
                assert result.dtype == orig.dtype

            assert o.nunique() == 8
            assert o.nunique(dropna=False) == 9
def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj):
    """Like test_value_counts_unique_nunique, but with the first values nulled.

    The first two raw values are replaced by NaN/None/NaT (as the dtype
    requires) before repeating, so the null bucket has count 1+2=3 and its
    presence depends on ``dropna``.

    Fixed: ``dtype=np.bool`` — the ``np.bool`` alias was removed in
    NumPy 1.24; the builtin ``bool`` is the documented replacement.
    """
    orig = index_or_series_obj
    obj = orig.copy()
    klass = type(obj)
    values = obj._ndarray_values
    num_values = len(orig)

    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)):
        pytest.skip(f"values of {klass} cannot be changed")
    elif isinstance(orig, pd.MultiIndex):
        pytest.skip("MultiIndex doesn't support isna")
    elif orig.duplicated().any():
        pytest.xfail(
            "The test implementation isn't flexible enough to deal "
            "with duplicated values. This isn't a bug in the "
            "application code, but in the test code."
        )

    # special assign to the numpy array
    if is_datetime64tz_dtype(obj):
        if isinstance(obj, DatetimeIndex):
            v = obj.asi8
            v[0:2] = iNaT
            values = obj._shallow_copy(v)
        else:
            obj = obj.copy()
            obj[0:2] = pd.NaT
            values = obj._values
    elif needs_i8_conversion(obj):
        values[0:2] = iNaT
        values = obj._shallow_copy(values)
    else:
        values[0:2] = null_obj

    # check values has the same dtype as the original
    assert values.dtype == obj.dtype

    # create repeated values, 'n'th element is repeated by n+1
    # times
    if isinstance(obj, (DatetimeIndex, PeriodIndex)):
        expected_index = obj.copy()
        expected_index.name = None

        # attach name to klass
        obj = klass(values.repeat(range(1, len(obj) + 1)))
        obj.name = "a"
    else:
        if isinstance(obj, DatetimeIndex):
            expected_index = orig._values._shallow_copy(values)
        else:
            expected_index = Index(values)
        expected_index.name = None
        obj = obj.repeat(range(1, len(obj) + 1))
        obj.name = "a"

    # check values has the same dtype as the original
    assert obj.dtype == orig.dtype

    # check values correctly have NaN
    # (bool, not the removed np.bool alias)
    nanloc = np.zeros(len(obj), dtype=bool)
    nanloc[:3] = True
    if isinstance(obj, Index):
        tm.assert_numpy_array_equal(pd.isna(obj), nanloc)
    else:
        exp = Series(nanloc, obj.index, name="a")
        tm.assert_series_equal(pd.isna(obj), exp)

    # with dropna=False the null bucket (count 3) is appended
    expected_data = list(range(num_values, 2, -1))
    expected_data_na = expected_data.copy()
    if expected_data_na:
        expected_data_na.append(3)
    expected_s_na = Series(
        expected_data_na,
        index=expected_index[num_values - 1:0:-1],
        dtype="int64",
        name="a",
    )
    expected_s = Series(
        expected_data,
        index=expected_index[num_values - 1:1:-1],
        dtype="int64",
        name="a",
    )

    result_s_na = obj.value_counts(dropna=False)
    tm.assert_series_equal(result_s_na, expected_s_na)
    assert result_s_na.index.name is None
    assert result_s_na.name == "a"
    result_s = obj.value_counts()
    tm.assert_series_equal(obj.value_counts(), expected_s)
    assert result_s.index.name is None
    assert result_s.name == "a"

    result = obj.unique()
    if isinstance(obj, Index):
        tm.assert_index_equal(result, Index(values[1:], name="a"))
    elif is_datetime64tz_dtype(obj):
        # unable to compare NaT / nan
        tm.assert_extension_array_equal(result[1:], values[2:])
        assert result[0] is pd.NaT
    elif len(obj) > 0:
        tm.assert_numpy_array_equal(result[1:], values[2:])
        assert pd.isna(result[0])
        assert result.dtype == orig.dtype

    assert obj.nunique() == max(0, num_values - 2)
    assert obj.nunique(dropna=False) == max(0, num_values - 1)
def replace_multi_index_level(
    df: "classes.BeliefsDataFrame",
    level: str,
    index: pd.Index,
    intersection: bool = False,
) -> "classes.BeliefsDataFrame":
    """Replace one of the index levels of the multi-indexed DataFrame. Returns a new DataFrame object.

    :param df: a BeliefsDataFrame (or just a multi-indexed DataFrame).
    :param level: the name of the index level to replace.
    :param index: the new index.
    :param intersection: policy for replacing the index level.
    If intersection is False then simply replace (note that the new index should have the same length as the old index).
    If intersection is True then add indices not contained in the old index and delete indices not contained in the new
    index. New rows have nan columns values and copies of the first row for other index levels (note that the resulting
    index is usually longer and contains values that were both in the old and new index, i.e. the intersection).
    :raises ValueError: when intersection is False and the new index length differs from the old one.
    """
    # Todo: check whether timezone information is copied over correctly

    # Check input
    if not intersection and len(index) != len(df.index):
        raise ValueError(
            "Cannot simply replace multi-index level with an index of different length than the original. "
            "Use intersection instead?"
        )
    if index.name is None:
        # Fixed: the original did ``index.name = level``, mutating the
        # caller's Index in place; ``rename`` returns a named copy instead.
        index = index.rename(level)

    new_index_values = []
    new_index_names = []
    if intersection:
        contained_in_old = index.isin(df.index.get_level_values(level))
        new_index_not_in_old = index[~contained_in_old]
        contained_in_new = df.index.get_level_values(level).isin(index)
        for i in df.index.names:
            if i == level:
                # For the index level that should be replaced:
                # copy old values that the new index contains, and add new
                # values that the old index does not contain.
                new_index_values.append(
                    df.index.get_level_values(i)[contained_in_new].append(
                        new_index_not_in_old
                    )
                )
                new_index_names.append(index.name)
            else:
                # For the other index levels:
                # copy old values that the new index contains, and fill the
                # new rows with this level's first value.
                new_row_values = pd.Index(
                    [df.index.get_level_values(i)[0]] * len(new_index_not_in_old)
                )
                new_index_values.append(
                    df.index.get_level_values(i)[contained_in_new].append(
                        new_row_values
                    )
                )
                new_index_names.append(i)
    else:
        for i in df.index.names:
            if i == level:
                # Replace the target level with the new index outright.
                new_index_values.append(index)
                new_index_names.append(index.name)
            else:
                # Copy the other index levels unchanged.
                new_index_values.append(df.index.get_level_values(i))
                new_index_names.append(i)

    # Construct new MultiIndex
    mux = pd.MultiIndex.from_arrays(new_index_values, names=new_index_names)

    df = df.copy(deep=True)
    # Apply new MultiIndex
    if intersection:
        # Reindex such that new rows get nan column values
        df = df.reindex(mux)
    else:
        # Replace the index
        df.index = mux
    return df.sort_index()