def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list('AB'), dtype=np.int32) df.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] expected = DataFrame(rows, index=list('xyz'), dtype=np.int32) expected.columns = MultiIndex.from_tuples( [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected) # From a mixed type dataframe df['A'] = df['A'].astype(np.int16) df['B'] = df['B'].astype(np.float64) result = df.unstack(fill_value=-1) expected['A'] = expected['A'].astype(np.int16) expected['B'] = expected['B'].astype(np.float64) assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] expected = DataFrame(rows, index=list('xyz'), dtype=np.float) expected.columns = MultiIndex.from_tuples( [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected)
def create_data():
    """ create the pickle/msgpack data """
    # raw column values reused by several of the series / frame entries
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    index = {
        'int': Index(np.arange(10)),
        'date': date_range('20130101', periods=10),
        'period': period_range('2013-01-01', freq='M', periods=10),
    }

    reg2_first = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
    reg2_second = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
    mi = {
        'reg2': MultiIndex.from_tuples(tuple(zip(reg2_first, reg2_second)),
                                       names=['first', 'second']),
    }

    series_mi_index = MultiIndex.from_tuples(
        tuple(zip([1, 1, 2, 2, 2], [3, 4, 3, 4, 5])),
        names=['one', 'two'])
    series = {
        'float': Series(data['A']),
        'int': Series(data['B']),
        'mixed': Series(data['E']),
        'ts': TimeSeries(np.arange(10).astype(np.int64),
                         index=date_range('20130101', periods=10)),
        'mi': Series(np.arange(5).astype(np.float64),
                     index=series_mi_index),
        'dup': Series(np.arange(5).astype(np.float64),
                      index=['A', 'B', 'C', 'D', 'A']),
        'cat': Series(Categorical(['foo', 'bar', 'baz'])),
        'per': Series([Period('2000Q1')] * 5),
    }

    # frame whose column labels contain a duplicate ('A' appears twice)
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    frame_mi_index = MultiIndex.from_tuples(
        tuple(zip(['bar', 'bar', 'baz', 'baz', 'baz'],
                  ['one', 'two', 'one', 'two', 'three'])),
        names=['first', 'second'])
    frame = {
        'float': DataFrame({'A': series['float'],
                            'B': series['float'] + 1}),
        'int': DataFrame({'A': series['int'],
                          'B': series['int'] + 1}),
        'mixed': DataFrame({k: data[k] for k in ['A', 'B', 'C', 'D']}),
        'mi': DataFrame({'A': np.arange(5).astype(np.float64),
                         'B': np.arange(5).astype(np.int64)},
                        index=frame_mi_index),
        'dup': DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                         columns=['A', 'B', 'A']),
        'cat_onecol': DataFrame({'A': Categorical(['foo', 'bar'])}),
        'cat_and_float': DataFrame({
            'A': Categorical(['foo', 'bar', 'baz']),
            'B': np.arange(3).astype(np.int64)}),
        'mixed_dup': mixed_dup_df,
    }

    # panel whose item labels are duplicated after construction
    mixed_dup_panel = Panel({'ItemA': frame['float'],
                             'ItemB': frame['int']})
    mixed_dup_panel.items = ['ItemA', 'ItemA']

    panel = {
        'float': Panel({'ItemA': frame['float'],
                        'ItemB': frame['float'] + 1}),
        'dup': Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                     items=['A', 'B', 'A']),
        'mixed_dup': mixed_dup_panel,
    }

    sparse_series = {'float': _create_sp_series(),
                     'ts': _create_sp_tsseries()}
    sparse_frame = {'float': _create_sp_frame()}

    return {
        'series': series,
        'frame': frame,
        'panel': panel,
        'index': index,
        'mi': mi,
        'sp_series': sparse_series,
        'sp_frame': sparse_frame,
    }
def test_mangles_multi_index(self):
    """Duplicate labels in a two-row header are de-duplicated with suffixes."""
    # See GH 18062
    # Each case: (csv text, parsed data rows, expected column tuples)
    cases = [
        (
            "A,A,A,B\none,one,one,two\n0,40,34,0.1",
            [[0, 40, 34, 0.1]],
            [('A', 'one'), ('A', 'one.1'), ('A', 'one.2'), ('B', 'two')],
        ),
        (
            "A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
            [[0, 40, 34, 0.1]],
            [('A', 'one'), ('A', 'one.1'), ('A', 'one.1.1'), ('B', 'two')],
        ),
        (
            "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
            [[0, 40, 34, 0.1, 0.1]],
            [('A', 'one'), ('A', 'one.1'), ('A', 'one.1.1'),
             ('B', 'two'), ('B', 'two.1')],
        ),
    ]

    for text, rows, column_tuples in cases:
        df = self.read_csv(StringIO(text), header=[0, 1])
        expected = DataFrame(
            rows, columns=MultiIndex.from_tuples(column_tuples))
        tm.assert_frame_equal(df, expected)
def test_loc_getitem_int_slice(self): # GH 3053 # loc should treat integer slices like label slices index = MultiIndex.from_tuples([t for t in itertools.product( [6, 7, 8], ['a', 'b'])]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] expected = df tm.assert_frame_equal(result, expected) index = MultiIndex.from_tuples([t for t in itertools.product( [10, 20, 30], ['a', 'b'])]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] expected = df.iloc[2:] tm.assert_frame_equal(result, expected) # doc examples result = df.loc[10, :] expected = df.iloc[0:2] expected.index = ['a', 'b'] tm.assert_frame_equal(result, expected) result = df.loc[:, 10] expected = df[10] tm.assert_frame_equal(result, expected)
def create_data(): """ create the pickle/msgpack data """ data = { 'A': [0., 1., 2., 3., np.nan], 'B': [0, 1, 0, 1, 0], 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] } scalars = dict(timestamp=Timestamp('20130101')) if LooseVersion(pandas.__version__) >= '0.17.0': scalars['period'] = Period('2012','M') index = dict(int=Index(np.arange(10)), date=date_range('20130101', periods=10), period=period_range('2013-01-01', freq='M', periods=10)) mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), names=['first', 'second'])) series = dict(float=Series(data['A']), int=Series(data['B']), mixed=Series(data['E']), ts=Series(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)), mi=Series(np.arange(5).astype(np.float64), index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=['one', 'two'])), dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), <<<<<<< HEAD <<<<<<< HEAD cat=Series(Categorical(['foo', 'bar', 'baz']))) if LooseVersion(pandas.__version__) >= '0.17.0': series['period'] = Series([Period('2000Q1')] * 5)
def test_get_info_after_update(chunkstore_lib):
    """Write a frame, update it with a second frame, then check data and info.

    ``chunkstore_lib`` is the library fixture under test; the test writes the
    symbol ``'test_df'`` with chunk size ``'D'``, applies an update, and
    compares both the stored data and the metadata from ``get_info``.
    """
    index_names = ['date', 'id']
    df = DataFrame(
        data={'data': [1.1, 2.1, 3.1]},
        index=MultiIndex.from_tuples([(dt(2016, 1, 1), 1),
                                      (dt(2016, 1, 2), 1),
                                      (dt(2016, 1, 3), 1)],
                                     names=index_names))
    chunkstore_lib.write('test_df', df, 'D')

    df2 = DataFrame(
        data={'data': [1.1, 1.1, 1.1]},
        index=MultiIndex.from_tuples([(dt(2016, 1, 1), 2),
                                      (dt(2016, 1, 2), 2),
                                      (dt(2016, 1, 4), 1)],
                                     names=index_names))
    chunkstore_lib.update('test_df', df2)

    # ``DataFrame.sort()`` was deprecated in pandas 0.17 and later removed;
    # with no arguments it sorted by index, which ``sort_index()`` does
    # explicitly.
    expected = pd.concat([df, df2]).sort_index()
    assert_frame_equal(chunkstore_lib.read('test_df'), expected)

    info = {'rows': 6,
            'dtype': [('date', '<M8[ns]'), ('id', '<i8'), ('data', '<f8')],
            'chunk_count': 4,
            'col_names': {u'index': [u'date', u'id'],
                          u'index_tz': [None, None],
                          u'columns': [u'data']},
            'type': u'df',
            'size': 144}
    assert chunkstore_lib.get_info('test_df') == info
def test_sort_index_multiindex(self, level): # GH13496 # sort rows by specified level of multi-index mi = MultiIndex.from_tuples([ [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) expected_mi = MultiIndex.from_tuples([ [1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list('ABC')) expected = pd.DataFrame([ [5, 6], [3, 4], [1, 2]], index=expected_mi) result = df.sort_index(level=level) assert_frame_equal(result, expected) # sort_remaining=False expected_mi = MultiIndex.from_tuples([ [1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list('ABC')) expected = pd.DataFrame([ [5, 6], [1, 2], [3, 4]], index=expected_mi) result = df.sort_index(level=level, sort_remaining=False) assert_frame_equal(result, expected)
def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list("AB"), dtype=np.int32) df.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] expected = DataFrame(rows, index=list("xyz"), dtype=np.int32) expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]) assert_frame_equal(result, expected) # From a mixed type dataframe df["A"] = df["A"].astype(np.int16) df["B"] = df["B"].astype(np.float64) result = df.unstack(fill_value=-1) expected["A"] = expected["A"].astype(np.int16) expected["B"] = expected["B"].astype(np.float64) assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] expected = DataFrame(rows, index=list("xyz"), dtype=np.float) expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]) assert_frame_equal(result, expected)
def TreeMatrix(D, desc, L, Env=None, DshapeLarge=True):
    """
    Applying tree information (desc,L) on a given count matrix (D)
    and columns grouping (Env) to obtain a matrix of count over the tree

    Parameters
    ----------
    D : DataFrame
        Count table. When ``DshapeLarge`` is False it is read through its
        "Sample", "Taxon" and "Count" columns and its index is REPLACED in
        place by a (Sample, Taxon) MultiIndex. When ``DshapeLarge`` is True it
        is used as-is and its columns must already carry the
        "Sample"/"Group" levels reordered at the end.
    desc : iterable
        One entry per tree node; ``x[0]`` is used as the node key and
        ``x[-1]`` as the row selector into the wide count table.
    L : DataFrame
        Node table with "Name" and "Is_Leaf" columns, looked up by node key.
    Env : DataFrame, optional
        Column grouping, looked up by sample label; each row supplies a
        (Sample, Group) pair. Required when ``DshapeLarge`` is False.
    DshapeLarge : bool
        True when ``D`` is already in wide form.

    Returns
    -------
    DataFrame
        Rows indexed by (Name, Is_Leaf), columns by (Group, Sample).

    Raises
    ------
    ValueError
        If ``DshapeLarge`` is False and ``Env`` is None.
    """
    # I assume that D has correct columns name
    if not DshapeLarge:
        if Env is None:
            # Previously this path failed later with an AttributeError on
            # None, after D had already been modified.
            raise ValueError(
                "Env is required when DshapeLarge is False")
        # Tuples are materialised in a list: a lazy ``map`` object has no
        # length and is not accepted by every pandas version.
        sample_taxon = [tuple(row)
                        for row in D[["Sample", "Taxon"]].values]
        Z = MultiIndex.from_tuples(sample_taxon, names=["Sample", "Taxon"])
        D.index = Z
        Dlarge = D.Count.unstack(level=0)
        Dlarge.fillna(value=0, inplace=True)
        # I assume that Environment has correct index and columns names
        # NOTE(review): ``.ix`` no longer exists in current pandas; it is kept
        # because its handling of missing labels is relied on below.
        ExperimentalDesignColumns = MultiIndex.from_tuples(
            [tuple(row) for row in Env.ix[Dlarge.columns].values],
            names=["Sample", "Group"])
    else:
        # if D is already Large Environment information is already included
        Dlarge = D
        ExperimentalDesignColumns = Dlarge.columns

    # if taxon only present in tree but not in table, access mode .ix
    # correctly report NA for that line, that later will be converted to zero.
    NodeTableLarge = [[x[0], Dlarge.ix[x[-1]].sum()] for x in desc]
    # NOTE(review): ``DataFrame.from_items`` is likewise gone from current
    # pandas — confirm the targeted pandas version before porting.
    Dtree = DataFrame.from_items(NodeTableLarge).transpose()

    node_rows = L.loc[:, ["Name", "Is_Leaf"]].ix[Dtree.index].values
    NodeAndLeafNamesIndex = MultiIndex.from_tuples(
        [tuple(row) for row in node_rows],
        names=["Name", "Is_Leaf"])
    Dtree.index = NodeAndLeafNamesIndex
    Dtree.columns = ExperimentalDesignColumns
    Dtree.columns = Dtree.columns.reorder_levels(["Group", "Sample"])
    return Dtree
def test_na_value_dict(self): data = """A,B,C foo,bar,NA bar,foo,foo foo,bar,NA bar,foo,foo""" df = read_csv(StringIO(data), na_values={'A': ['foo'], 'B': ['bar']}) expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'], 'B': [np.nan, 'foo', np.nan, 'foo'], 'C': [np.nan, 'foo', np.nan, 'foo']}) assert_frame_equal(df, expected) data = """\ a,b,c,d 0,NA,1,5 """ xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0]) xp.index.name = 'a' df = read_csv(StringIO(data), na_values={}, index_col=0) assert_frame_equal(df, xp) xp = DataFrame({'b': [np.nan], 'd': [5]}, MultiIndex.from_tuples([(0, 1)])) df = read_csv(StringIO(data), na_values={}, index_col=[0, 2]) assert_frame_equal(df, xp) xp = DataFrame({'b': [np.nan], 'd': [5]}, MultiIndex.from_tuples([(0, 1)])) df = read_csv(StringIO(data), na_values={}, index_col=['a', 'c']) assert_frame_equal(df, xp)
def test_groupby_as_index_apply(df):
    """Compare ``head`` and ``apply`` result indexes for both ``as_index`` modes."""
    # GH #4648 and #3417
    # The incoming ``df`` argument is replaced by a dedicated frame.
    df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'user_id': [1, 2, 1, 1, 3, 1],
                    'time': range(6)})

    g_as = df.groupby('user_id', as_index=True)
    g_not_as = df.groupby('user_id', as_index=False)

    res_as = g_as.head(2).index
    res_not_as = g_not_as.head(2).index
    exp = Index([0, 1, 2, 4])
    for head_index in (res_as, res_not_as):
        tm.assert_index_equal(head_index, exp)

    res_as_apply = g_as.apply(lambda x: x.head(2)).index
    res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index

    # apply doesn't maintain the original ordering
    # changed in GH5610 as the as_index=False returns a MI here
    exp_not_as_apply = MultiIndex.from_tuples(
        [(0, 0), (0, 2), (1, 1), (2, 4)])
    exp_as_apply = MultiIndex.from_tuples(
        [(1, 0), (1, 2), (2, 1), (3, 4)], names=['user_id', None])

    tm.assert_index_equal(res_as_apply, exp_as_apply)
    tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)

    ind = Index(list('abcde'))
    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
    identity_applied = df.groupby(0, as_index=False).apply(lambda x: x)
    tm.assert_index_equal(identity_applied.index, ind)
def create_data():
    """ create the pickle data

    Returns a dict with the keys ``series``, ``frame``, ``panel``, ``index``,
    ``mi``, ``sp_series`` and ``sp_frame``; each value is itself a dict of
    named pandas objects.
    """
    import numpy as np
    import pandas
    from pandas import (Series,TimeSeries,DataFrame,Panel,
                        SparseSeries,SparseTimeSeries,SparseDataFrame,SparsePanel,
                        Index,MultiIndex,PeriodIndex,
                        date_range,period_range,bdate_range,Timestamp)

    # raw column values reused by several of the series / frame entries
    # (the unused local alias ``nan`` was dropped; ``np.nan`` is used directly)
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
    }

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    reg2_tuples = tuple(zip(
        ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
        ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']))
    mi = dict(reg2=MultiIndex.from_tuples(reg2_tuples,
                                          names=['first', 'second']))

    series_mi_index = MultiIndex.from_tuples(
        tuple(zip([1, 1, 2, 2, 2], [3, 4, 3, 4, 5])),
        names=['one', 'two'])
    series = dict(
        float=Series(data['A']),
        int=Series(data['B']),
        mixed=Series(data['E']),
        ts=TimeSeries(np.arange(10).astype(np.int64),
                      index=date_range('20130101', periods=10)),
        mi=Series(np.arange(5).astype(np.float64), index=series_mi_index),
        dup=Series(np.arange(5).astype(np.float64),
                   index=['A', 'B', 'C', 'D', 'A']))

    frame_mi_index = MultiIndex.from_tuples(
        tuple(zip(['bar', 'bar', 'baz', 'baz', 'baz'],
                  ['one', 'two', 'one', 'two', 'three'])),
        names=['first', 'second'])
    frame = dict(
        float=DataFrame(dict(A=series['float'], B=series['float'] + 1)),
        int=DataFrame(dict(A=series['int'], B=series['int'] + 1)),
        # dict comprehension instead of dict() over a temporary list of pairs
        mixed=DataFrame({k: data[k] for k in ['A', 'B', 'C', 'D']}),
        mi=DataFrame(dict(A=np.arange(5).astype(np.float64),
                          B=np.arange(5).astype(np.int64)),
                     index=frame_mi_index),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=['A', 'B', 'A']))

    panel = dict(
        float=Panel(dict(ItemA=frame['float'], ItemB=frame['float'] + 1)),
        dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                  items=['A', 'B', 'A']))

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
def test_getitem_bool_index_all(ind1, ind2): # GH#22533 idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3), (40, 4), (50, 5)]) tm.assert_index_equal(idx[ind1], idx) expected = MultiIndex.from_tuples([(10, 1), (30, 3)]) tm.assert_index_equal(idx[ind2], expected)
def test_drop(idx):
    """Exercise ``MultiIndex.drop`` with full keys, partial keys and ``errors``."""
    # full keys, given as a list of tuples and as a MultiIndex
    full_keys = [('foo', 'two'), ('qux', 'one')]
    dropped = idx.drop(full_keys)
    index = MultiIndex.from_tuples(full_keys)
    dropped2 = idx.drop(index)

    expected = idx[[0, 2, 3, 5]]
    tm.assert_index_equal(dropped, expected)
    tm.assert_index_equal(dropped2, expected)

    # first-level labels only
    for labels, kept_positions in ((['bar'], [0, 1, 3, 4, 5]),
                                   ('foo', [2, 3, 4, 5])):
        dropped = idx.drop(labels)
        tm.assert_index_equal(dropped, idx[kept_positions])

    # keys that are not present raise KeyError
    index = MultiIndex.from_tuples([('bar', 'two')])
    for missing in ([('bar', 'two')], index):
        with pytest.raises(KeyError, match=r"^10$"):
            idx.drop(missing)
    with pytest.raises(KeyError, match=r"^'two'$"):
        idx.drop(['foo', 'two'])

    # partially correct argument
    mixed_index = MultiIndex.from_tuples([('qux', 'one'), ('bar', 'two')])
    with pytest.raises(KeyError, match=r"^10$"):
        idx.drop(mixed_index)

    # error='ignore'
    ignore_cases = (
        (index, [0, 1, 2, 3, 4, 5]),
        (mixed_index, [0, 1, 2, 3, 5]),
        (['foo', 'two'], [2, 3, 4, 5]),
    )
    for labels, kept_positions in ignore_cases:
        dropped = idx.drop(labels, errors='ignore')
        tm.assert_index_equal(dropped, idx[kept_positions])

    # mixed partial / full drop
    dropped = idx.drop(['foo', ('qux', 'one')])
    tm.assert_index_equal(dropped, idx[[2, 3, 5]])

    # mixed partial / full drop / error='ignore'
    mixed_index = ['foo', ('qux', 'one'), 'two']
    with pytest.raises(KeyError, match=r"^'two'$"):
        idx.drop(mixed_index)
    dropped = idx.drop(mixed_index, errors='ignore')
    tm.assert_index_equal(dropped, idx[[2, 3, 5]])
def test_stack_datetime_column_multiIndex(self): # GH 8039 t = datetime(2014, 1, 1) df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) result = df.stack() eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) assert_frame_equal(result, expected)
def test_boolean_context_compat2():
    """``bool()`` of a MultiIndex intersection must raise ValueError."""
    # boolean context compat
    # GH7897
    left = MultiIndex.from_tuples([('A', 1), ('A', 2)])
    right = MultiIndex.from_tuples([('A', 1), ('A', 3)])
    shared = left.intersection(right)

    with pytest.raises(ValueError):
        bool(shared)
def test_indexing_ambiguity_bug_1678():
    """Positional column 1 equals the ('Ohio', 'Red') label selection."""
    # GH 1678
    column_tuples = [('Ohio', 'Green'), ('Ohio', 'Red'),
                     ('Colorado', 'Green')]
    row_tuples = [('a', 1), ('a', 2), ('b', 1), ('b', 2)]
    df = DataFrame(np.arange(12).reshape((4, 3)),
                   index=MultiIndex.from_tuples(row_tuples),
                   columns=MultiIndex.from_tuples(column_tuples))

    by_position = df.iloc[:, 1]
    by_label = df.loc[:, ('Ohio', 'Red')]
    tm.assert_series_equal(by_position, by_label)
def test_multiindex_get(self):
    """Selecting a first-level key from a Panel whose items are a MultiIndex.

    ``wp["a"]`` and ``wp.ix["a"]`` must give equal panels whose items are the
    remaining second-level labels ``[1, 2]``.
    """
    level_names = ["first", "second"]
    ind = MultiIndex.from_tuples(
        [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=level_names)
    wp = Panel(np.random.random((4, 5, 5)),
               items=ind,
               major_axis=np.arange(5),
               minor_axis=np.arange(5))
    f1 = wp["a"]
    f2 = wp.ix["a"]
    assert_panel_equal(f1, f2)

    # ``assertTrue`` replaces the ``assert_`` alias, which unittest
    # deprecated long ago and removed in Python 3.12.
    self.assertTrue((f1.items == [1, 2]).all())
    self.assertTrue((f2.items == [1, 2]).all())

    # NOTE(review): this index is not used in the visible part of the test;
    # kept in case the definition continues beyond what is shown.
    ind = MultiIndex.from_tuples(
        [("a", 1), ("a", 2), ("b", 1)], names=level_names)
def test_from_tuples():
    """``from_tuples`` rejects an empty list and builds from a tuple of tuples."""
    with pytest.raises(
            TypeError, match='Cannot infer number of levels from empty list'):
        MultiIndex.from_tuples([])

    level_names = ['a', 'b']
    expected = MultiIndex(levels=[[1, 3], [2, 4]],
                          codes=[[0, 1], [0, 1]],
                          names=level_names)

    # input tuples
    result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=level_names)
    tm.assert_index_equal(result, expected)
def test_sort_index_multiindex(self): # GH13496 # sort rows by specified level of multi-index mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4]], mi) # MI sort, but no level: sort_level has no effect mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4]], mi) result = df.sort_index(sort_remaining=False) expected = df.sort_index() assert_frame_equal(result, expected)
def test_indexing_ambiguity_bug_1678(self): columns = MultiIndex.from_tuples([('Ohio', 'Green'), ('Ohio', 'Red'), ( 'Colorado', 'Green')]) index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2) ]) frame = DataFrame(np.arange(12).reshape((4, 3)), index=index, columns=columns) result = frame.iloc[:, 1] exp = frame.loc[:, ('Ohio', 'Red')] assert isinstance(result, Series) tm.assert_series_equal(result, exp)
def test_subclass_stack_multi_mixed(self): # GH 15564 df = tm.SubclassedDataFrame([ [10, 11, 12.0, 13.0], [20, 21, 22.0, 23.0], [30, 31, 32.0, 33.0], [40, 41, 42.0, 43.0]], index=MultiIndex.from_tuples( list(zip(list('AABB'), list('cdcd'))), names=['aaa', 'ccc']), columns=MultiIndex.from_tuples( list(zip(list('WWXX'), list('yzyz'))), names=['www', 'yyy'])) exp = tm.SubclassedDataFrame([ [10, 12.0], [11, 13.0], [20, 22.0], [21, 23.0], [30, 32.0], [31, 33.0], [40, 42.0], [41, 43.0]], index=MultiIndex.from_tuples(list(zip( list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))), names=['aaa', 'ccc', 'yyy']), columns=Index(['W', 'X'], name='www')) res = df.stack() tm.assert_frame_equal(res, exp) res = df.stack('yyy') tm.assert_frame_equal(res, exp) exp = tm.SubclassedDataFrame([ [10.0, 11.0], [12.0, 13.0], [20.0, 21.0], [22.0, 23.0], [30.0, 31.0], [32.0, 33.0], [40.0, 41.0], [42.0, 43.0]], index=MultiIndex.from_tuples(list(zip( list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))), names=['aaa', 'ccc', 'www']), columns=Index(['y', 'z'], name='yyy')) res = df.stack('www') tm.assert_frame_equal(res, exp)
def test_index_equal_values_mismatch(check_exact):
    """A differing value in level 1 must raise with the level-specific message."""
    msg = """MultiIndex level \\[1\\] are different

MultiIndex level \\[1\\] values are different \\(25\\.0 %\\)
\\[left\\]:  Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""

    # The two indexes differ only in the second element of the first tuple.
    shared_tail = [("A", 2), ("B", 3), ("B", 4)]
    idx1 = MultiIndex.from_tuples([("A", 2)] + shared_tail)
    idx2 = MultiIndex.from_tuples([("A", 1)] + shared_tail)

    with pytest.raises(AssertionError, match=msg):
        assert_index_equal(idx1, idx2, check_exact=check_exact)
def test_from_tuples_iterator():
    """``from_tuples`` accepts an iterator of tuples and rejects non-iterables."""
    # GH 18434
    # input iterator for tuples
    level_names = ['a', 'b']
    # ``codes`` is the current name of the constructor argument formerly
    # called ``labels`` (renamed in pandas 0.24).
    expected = MultiIndex(levels=[[1, 3], [2, 4]],
                          codes=[[0, 1], [0, 1]],
                          names=level_names)

    result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=level_names)
    tm.assert_index_equal(result, expected)

    # input non-iterables
    # ``pytest.raises(match=...)`` replaces ``tm.assert_raises_regex``, which
    # is no longer provided by pandas' testing module.
    msg = 'Input must be a list / sequence of tuple-likes.'
    with pytest.raises(TypeError, match=msg):
        MultiIndex.from_tuples(0)
def test_iloc_getitem_panel_multiindex(self):
    """``Panel.iloc`` with a MultiIndex on the minor or major axis."""
    with catch_warnings(record=True):

        # GH 7199
        # Panel with multi-index
        multi_index = MultiIndex.from_tuples([('ONE', 'one'),
                                              ('TWO', 'two'),
                                              ('THREE', 'three')],
                                             names=['UPPER', 'lower'])
        simple_index = [x[0] for x in multi_index]

        item_labels = ['First', 'Second']
        major_labels = ['a', 'b', 'c', 'd']
        wd1 = Panel(items=item_labels, major_axis=major_labels,
                    minor_axis=multi_index)
        wd2 = Panel(items=item_labels, major_axis=major_labels,
                    minor_axis=simple_index)

        # boolean row mask plus positional columns
        row_mask = [True, True, True, False]
        for panel in (wd1, wd2):
            expected = panel['First'].iloc[row_mask, [0, 2]]
            result = panel.iloc[0, row_mask, [0, 2]]  # WRONG
            tm.assert_frame_equal(result, expected)

        # single row, all columns
        for panel, column_labels in ((wd1, multi_index),
                                     (wd2, simple_index)):
            expected = DataFrame(index=['a'], columns=column_labels,
                                 dtype='float64')
            result = panel.iloc[0, [0], [0, 1, 2]]
            tm.assert_frame_equal(result, expected)

        # GH 7516
        mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')])
        p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3),
                  items=['a', 'b', 'c'], major_axis=mi,
                  minor_axis=['u', 'v', 'w'])
        expected = Series([3, 12, 21], index=['a', 'b', 'c'], name='u')

        result = p.iloc[:, 1, 0]
        tm.assert_series_equal(result, expected)

        result = p.loc[:, (1, 'y'), 'u']
        tm.assert_series_equal(result, expected)
def test_dateparser_resolution_if_not_ns(self):
    """A ``date_parser`` returning second-resolution datetimes feeds a MultiIndex."""
    # GH 10245
    data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

    def date_parser(date, time):
        # combine the two text columns into one datetime64[s] value
        return np_array_datetime64_compat(date + 'T' + time + 'Z',
                                          dtype='datetime64[s]')

    df = self.read_csv(StringIO(data),
                       date_parser=date_parser,
                       parse_dates={'datetime': ['date', 'time']},
                       index_col=['datetime', 'prn'])

    datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3,
                                           dtype='datetime64[s]')
    expected_index = MultiIndex.from_tuples(
        [(datetimes[0], 126),
         (datetimes[1], 23),
         (datetimes[2], 13)],
        names=['datetime', 'prn'])
    df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3},
                           index=expected_index)
    tm.assert_frame_equal(df, df_correct)
def test_iloc_getitem_multiindex2(self):
    """Positional selection on frames with MultiIndex axes (currently skipped)."""
    # TODO(wesm): fix this
    pytest.skip('this test was being suppressed, '
                'needs to be fixed')

    # Everything below is not executed while the skip above is in place.
    arr = np.random.randn(3, 3)
    df = DataFrame(arr,
                   columns=[[2, 2, 4], [6, 8, 10]],
                   index=[[4, 4, 8], [8, 10, 12]])

    # single row
    tm.assert_series_equal(df.iloc[2], Series(arr[2], index=df.columns))

    # single column
    tm.assert_series_equal(df.iloc[:, 2],
                           Series(arr[:, 2], index=df.index))

    # single cell
    rs = df.iloc[2, 2]
    xp = df.values[2, 2]
    assert rs == xp

    # for multiple items
    # GH 5528
    tm.assert_frame_equal(df.iloc[[0, 1]], df.xs(4, drop_level=False))

    tup = zip(['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y'])
    index = MultiIndex.from_tuples(tup)
    df = DataFrame(np.random.randn(4, 4), index=index)
    tm.assert_frame_equal(df.iloc[[2, 3]], df.xs('b', drop_level=False))
def test_constructor_dict_of_tuples(self): data = {(1, 2): 3, (None, 5): 6} result = Series(data).sort_values() expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) tm.assert_series_equal(result, expected)
def test_boxplot_legacy(self):
    """Grouped boxplots: one axes per group, or a single combined axes."""

    def check_combined(groupby_obj):
        # subplots=False draws all groups on one axes
        combined = _check_plot_works(groupby_obj.boxplot, subplots=False,
                                     return_type='axes')
        self._check_axes_shape(combined, axes_num=1, layout=(1, 1))

    # grouped by a column
    grouped = self.hist_df.groupby(by='gender')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        axes = _check_plot_works(grouped.boxplot, return_type='axes')
    self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2))
    check_combined(grouped)

    # grouped by an index level
    tuples = lzip(string.ascii_letters[:10], range(10))
    df = DataFrame(np.random.rand(10, 3),
                   index=MultiIndex.from_tuples(tuples))

    grouped = df.groupby(level=1)
    axes = _check_plot_works(grouped.boxplot, return_type='axes')
    self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3))
    check_combined(grouped)

    # grouped along the columns
    grouped = df.unstack(level=1).groupby(level=0, axis=1)
    axes = _check_plot_works(grouped.boxplot, return_type='axes')
    self._check_axes_shape(list(axes.values()), axes_num=3, layout=(2, 2))
    check_combined(grouped)
def test_drop_multiindex_not_lexsorted(self):
    """Dropping a column gives the same frame whether or not columns are lexsorted."""
    # GH 11640

    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    self.assertTrue(lexsorted_df.columns.is_lexsorted())

    # define the non-lexsorted version
    raw = DataFrame(columns=['a', 'b', 'c', 'd'],
                    data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]])
    pivoted = raw.pivot_table(index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = pivoted.reset_index()
    self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.drop('a', axis=1)
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.drop('a', axis=1)

    tm.assert_frame_equal(result, expected)
def test_comprehensive(df_ext, environment):
    """Render a styled MultiIndex frame to LaTeX using many options at once."""
    # test as many low level features simultaneously as possible
    cidx = MultiIndex.from_tuples([("Z", "a"), ("Z", "b"), ("Y", "c")])
    ridx = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "c")])
    df_ext.index, df_ext.columns = ridx, cidx

    selector_props = [
        ("label", ":{fig§item}"),
        ("position", ":h!"),
        ("position_float", ":centering"),
        ("column_format", ":rlrlr"),
        ("toprule", ":toprule"),
        ("midrule", ":midrule"),
        ("bottomrule", ":bottomrule"),
        ("rowcolors", ":{3}{pink}{}"),  # custom command
    ]
    table_styles = [
        {"selector": selector, "props": props}
        for selector, props in selector_props
    ]

    stlr = df_ext.style
    stlr.set_caption("mycap")
    stlr.set_table_styles(table_styles)
    stlr.highlight_max(
        axis=0, props="textbf:--rwrap;cellcolor:[rgb]{1,1,0.6}--rwrap")
    stlr.highlight_max(
        axis=None, props="Huge:--wrap;", subset=[("Z", "a"), ("Z", "b")])

    # every occurrence of "table" is swapped for the requested environment
    env_name = environment if environment else "table"
    expected = (
        """\
\\begin{table}[h!]
\\centering
\\caption{mycap}
\\label{fig:item}
\\rowcolors{3}{pink}{}
\\begin{tabular}{rlrlr}
\\toprule
 &  & \\multicolumn{2}{r}{Z} & Y \\\\
 &  & a & b & c \\\\
\\midrule
\\multirow[c]{2}{*}{A} & a & 0 & \\textbf{\\cellcolor[rgb]{1,1,0.6}{-0.61}} & ab \\\\
 & b & 1 & -1.22 & cd \\\\
B & c & \\textbf{\\cellcolor[rgb]{1,1,0.6}{{\\Huge 2}}} & -2.22 & """
        """\
\\textbf{\\cellcolor[rgb]{1,1,0.6}{de}} \\\\
\\bottomrule
\\end{tabular}
\\end{table}
"""
    ).replace("table", env_name)

    result = stlr.format(precision=2).to_latex(environment=environment)
    assert result == expected
def test_sortlevel_not_sort_remaining():
    """Sorting level "A" with ``sort_remaining=False`` leaves this index as it is."""
    mi = MultiIndex.from_tuples(
        [[1, 1, 3], [1, 1, 1]],
        names=list("ABC"),
    )
    outcome = mi.sortlevel("A", sort_remaining=False)
    sorted_idx = outcome[0]
    assert sorted_idx.equals(mi)
) result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes expected = Series( [np.dtype("datetime64[ns]"), object, object, np.int64, object], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "index", [ pd.CategoricalIndex(list("abc")), pd.interval_range(0, 3), pd.period_range("2020", periods=3, freq="D"), MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]), ], ) def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) result = df.groupby("group").apply(lambda x: x) tm.assert_frame_equal(result, df) @pytest.mark.parametrize( "function, expected_values", [ (lambda x: x.index.to_list(), [[0, 1], [2, 3]]), (lambda x: set(x.index.to_list()), [{0, 1}, {2, 3}]), (lambda x: tuple(x.index.to_list()), [(0, 1), (2, 3)]),
def test_rename_mi(self): s = Series( [11, 21, 31], index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), ) s.rename(str.lower)
def test_get_indexer_nearest():
    """``get_indexer`` with 'nearest' or with a tolerance is not implemented."""
    midx = MultiIndex.from_tuples([('a', 1), ('b', 2)])

    unsupported_kwargs = (
        {'method': 'nearest'},
        {'method': 'pad', 'tolerance': 2},
    )
    for kwargs in unsupported_kwargs:
        with pytest.raises(NotImplementedError):
            midx.get_indexer(['a'], **kwargs)
def test_where():
    """``MultiIndex.where`` must raise NotImplementedError with its message."""
    index = MultiIndex.from_tuples([('A', 1), ('A', 2)])

    expected_message = r"\.where is not supported for MultiIndex operations"
    with pytest.raises(NotImplementedError, match=expected_message):
        index.where(True)
def test_from_tuples_index_values(idx): result = MultiIndex.from_tuples(idx) assert (result.values == idx.values).all()
def test_from_tuples_empty():
    """An empty tuple list plus names equals ``from_arrays`` with empty arrays."""
    # GH 16777
    level_names = ["a", "b"]
    from_empty_tuples = MultiIndex.from_tuples([], names=level_names)
    from_empty_arrays = MultiIndex.from_arrays(arrays=[[], []],
                                               names=level_names)
    tm.assert_index_equal(from_empty_tuples, from_empty_arrays)
def test_sort_index_nan_multiindex(self): # GH#14784 # incorrect sorting w.r.t. nans tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] mi = MultiIndex.from_tuples(tuples) df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD")) s = Series(np.arange(4), index=mi) df2 = DataFrame({ "date": pd.DatetimeIndex([ "20121002", "20121007", "20130130", "20130202", "20130305", "20121002", "20121207", "20130130", "20130202", "20130305", "20130202", "20130305", ]), "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], "whole_cost": [ 1790, np.nan, 280, 259, np.nan, 623, 90, 312, np.nan, 301, 359, 801, ], "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12], }).set_index(["date", "user_id"]) # sorting frame, default nan position is last result = df.sort_index() expected = df.iloc[[3, 0, 2, 1], :] tm.assert_frame_equal(result, expected) # sorting frame, nan position last result = df.sort_index(na_position="last") expected = df.iloc[[3, 0, 2, 1], :] tm.assert_frame_equal(result, expected) # sorting frame, nan position first result = df.sort_index(na_position="first") expected = df.iloc[[1, 2, 3, 0], :] tm.assert_frame_equal(result, expected) # sorting frame with removed rows result = df2.dropna().sort_index() expected = df2.sort_index().dropna() tm.assert_frame_equal(result, expected) # sorting series, default nan position is last result = s.sort_index() expected = s.iloc[[3, 0, 2, 1]] tm.assert_series_equal(result, expected) # sorting series, nan position last result = s.sort_index(na_position="last") expected = s.iloc[[3, 0, 2, 1]] tm.assert_series_equal(result, expected) # sorting series, nan position first result = s.sort_index(na_position="first") expected = s.iloc[[1, 2, 3, 0]] tm.assert_series_equal(result, expected)
class TestDataFrameSortIndex:
    """Tests for ``DataFrame.sort_index`` (with a few ``Series`` checks)."""

    def test_sort_index_and_reconstruction_doc_example(self):
        """Sort the doc-example frame and rebuild its levels monotonically."""
        # doc example
        df = DataFrame(
            {"value": [1, 2, 3, 4]},
            index=MultiIndex(
                levels=[["a", "b"], ["bb", "aa"]],
                codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
            ),
        )
        assert df.index._is_lexsorted()
        assert not df.index.is_monotonic

        # sort it
        expected = DataFrame(
            {"value": [2, 1, 4, 3]},
            index=MultiIndex(
                levels=[["a", "b"], ["aa", "bb"]],
                codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
            ),
        )
        result = df.sort_index()
        assert result.index.is_monotonic
        tm.assert_frame_equal(result, expected)

        # reconstruct
        result = df.sort_index().copy()
        result.index = result.index._sort_levels_monotonic()
        assert result.index.is_monotonic
        tm.assert_frame_equal(result, expected)

    def test_sort_index_non_existent_label_multiindex(self):
        """Labels added via ``.loc`` to an empty MultiIndex sort monotonically."""
        # GH#12261
        df = DataFrame(0, columns=[], index=MultiIndex.from_product([[], []]))
        df.loc["b", "2"] = 1
        df.loc["a", "3"] = 1
        result = df.sort_index().index.is_monotonic
        assert result is True

    def test_sort_index_reorder_on_ops(self):
        """Sort the index produced by a groupby-apply that relabels groups."""
        # GH#15687
        df = DataFrame(
            np.random.randn(8, 2),
            index=MultiIndex.from_product(
                [["a", "b"], ["big", "small"], ["red", "blu"]],
                names=["letter", "size", "color"],
            ),
            columns=["near", "far"],
        )
        df = df.sort_index()

        def my_func(group):
            # Replace each group's index with two unsorted labels.
            group.index = ["newz", "newa"]
            return group

        result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index()
        expected = MultiIndex.from_product(
            [["a", "b"], ["big", "small"], ["newa", "newz"]],
            names=["letter", "size", None],
        )

        tm.assert_index_equal(result.index, expected)

    def test_sort_index_nan_multiindex(self):
        """Check NaN placement when sorting on a MultiIndex containing NaN."""
        # GH#14784
        # incorrect sorting w.r.t. nans
        tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
        mi = MultiIndex.from_tuples(tuples)

        df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD"))
        s = Series(np.arange(4), index=mi)

        df2 = DataFrame(
            {
                "date": pd.DatetimeIndex(
                    [
                        "20121002",
                        "20121007",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20121002",
                        "20121207",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20130202",
                        "20130305",
                    ]
                ),
                "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
                "whole_cost": [
                    1790,
                    np.nan,
                    280,
                    259,
                    np.nan,
                    623,
                    90,
                    312,
                    np.nan,
                    301,
                    359,
                    801,
                ],
                "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12],
            }
        ).set_index(["date", "user_id"])

        # sorting frame, default nan position is last
        result = df.sort_index()
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position last
        result = df.sort_index(na_position="last")
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position first
        result = df.sort_index(na_position="first")
        expected = df.iloc[[1, 2, 3, 0], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame with removed rows
        result = df2.dropna().sort_index()
        expected = df2.sort_index().dropna()
        tm.assert_frame_equal(result, expected)

        # sorting series, default nan position is last
        result = s.sort_index()
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position last
        result = s.sort_index(na_position="last")
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position first
        result = s.sort_index(na_position="first")
        expected = s.iloc[[1, 2, 3, 0]]
        tm.assert_series_equal(result, expected)

    def test_sort_index_nan(self):
        """Check the position of a NaN label for each ascending/na_position."""
        # GH#3917

        # Test DataFrame with nan label
        df = DataFrame(
            {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]},
            index=[1, 2, 3, 4, 5, 6, np.nan],
        )

        # NaN label, ascending=True, na_position='last'
        sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last")
        expected = DataFrame(
            {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]},
            index=[1, 2, 3, 4, 5, 6, np.nan],
        )
        tm.assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=True, na_position='first'
        sorted_df = df.sort_index(na_position="first")
        expected = DataFrame(
            {"A": [4, 1, 2, np.nan, 1, 6, 8], "B": [5, 9, np.nan, 5, 2, 5, 4]},
            index=[np.nan, 1, 2, 3, 4, 5, 6],
        )
        tm.assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=False, na_position='last'
        sorted_df = df.sort_index(kind="quicksort", ascending=False)
        expected = DataFrame(
            {"A": [8, 6, 1, np.nan, 2, 1, 4], "B": [4, 5, 2, 5, np.nan, 9, 5]},
            index=[6, 5, 4, 3, 2, 1, np.nan],
        )
        tm.assert_frame_equal(sorted_df, expected)

        # NaN label, ascending=False, na_position='first'
        sorted_df = df.sort_index(
            kind="quicksort", ascending=False, na_position="first"
        )
        expected = DataFrame(
            {"A": [4, 8, 6, 1, np.nan, 2, 1], "B": [5, 4, 5, 2, 5, np.nan, 9]},
            index=[np.nan, 6, 5, 4, 3, 2, 1],
        )
        tm.assert_frame_equal(sorted_df, expected)

    def test_sort_index_multi_index(self):
        """Sort a three-level index by a list of level names."""
        # GH#25775, testing that sorting by index works with a multi-index.
        df = DataFrame(
            {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")}
        )
        result = df.set_index(list("abc")).sort_index(level=list("ba"))

        expected = DataFrame(
            {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")}
        )
        expected = expected.set_index(list("abc"))

        tm.assert_frame_equal(result, expected)

    def test_sort_index_inplace(self):
        """``inplace=True`` returns None and sorts the frame on either axis."""
        frame = DataFrame(
            np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"]
        )

        # axis=0
        unordered = frame.loc[[3, 2, 4, 1]]
        a_id = id(unordered["A"])
        df = unordered.copy()
        return_value = df.sort_index(inplace=True)
        assert return_value is None
        expected = frame
        tm.assert_frame_equal(df, expected)
        assert a_id != id(df["A"])

        df = unordered.copy()
        return_value = df.sort_index(ascending=False, inplace=True)
        assert return_value is None
        expected = frame[::-1]
        tm.assert_frame_equal(df, expected)

        # axis=1
        unordered = frame.loc[:, ["D", "B", "C", "A"]]
        df = unordered.copy()
        return_value = df.sort_index(axis=1, inplace=True)
        assert return_value is None
        expected = frame
        tm.assert_frame_equal(df, expected)

        df = unordered.copy()
        return_value = df.sort_index(axis=1, ascending=False, inplace=True)
        assert return_value is None
        expected = frame.iloc[:, ::-1]
        tm.assert_frame_equal(df, expected)

    def test_sort_index_different_sortorder(self):
        """Sort a two-level index with a different direction per level."""
        A = np.arange(20).repeat(5)
        B = np.tile(np.arange(5), 20)

        indexer = np.random.permutation(100)
        A = A.take(indexer)
        B = B.take(indexer)

        df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        # Reference ordering: A ascending, then B descending.
        ex_indexer = np.lexsort((df.B.max() - df.B, df.A))

        # test with multiindex, too
        idf = df.set_index(["A", "B"])

        result = idf.sort_index(ascending=[1, 0])
        expected = idf.take(ex_indexer)
        tm.assert_frame_equal(result, expected)

        # also, Series!
        result = idf["C"].sort_index(ascending=[1, 0])
        tm.assert_series_equal(result, expected["C"])

    def test_sort_index_level(self):
        """Sort by named levels, with and without ``sort_remaining``."""
        mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC"))
        df = DataFrame([[1, 2], [3, 4]], mi)

        result = df.sort_index(level="A", sort_remaining=False)
        expected = df
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(level=["A", "B"], sort_remaining=False)
        expected = df
        tm.assert_frame_equal(result, expected)

        # Error thrown by sort_index when
        # first index is sorted last (GH#26053)
        result = df.sort_index(level=["C", "B", "A"])
        expected = df.iloc[[1, 0]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(level=["B", "C", "A"])
        expected = df.iloc[[1, 0]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(level=["C", "A"])
        expected = df.iloc[[1, 0]]
        tm.assert_frame_equal(result, expected)

    def test_sort_index_categorical_index(self):
        """Sorting a categorical index follows the category order."""
        df = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
                "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))),
            }
        ).set_index("B")

        result = df.sort_index()
        expected = df.iloc[[4, 0, 1, 5, 2, 3]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_index(ascending=False)
        expected = df.iloc[[2, 3, 0, 1, 5, 4]]
        tm.assert_frame_equal(result, expected)

    def test_sort_index(self):
        """Sort rows and columns by label, ascending and descending."""
        # GH#13496

        frame = DataFrame(
            np.arange(16).reshape(4, 4),
            index=[1, 2, 3, 4],
            columns=["A", "B", "C", "D"],
        )

        # axis=0 : sort rows by index labels
        unordered = frame.loc[[3, 2, 4, 1]]
        result = unordered.sort_index(axis=0)
        expected = frame
        tm.assert_frame_equal(result, expected)

        result = unordered.sort_index(ascending=False)
        expected = frame[::-1]
        tm.assert_frame_equal(result, expected)

        # axis=1 : sort columns by column names
        unordered = frame.iloc[:, [2, 1, 3, 0]]
        result = unordered.sort_index(axis=1)
        tm.assert_frame_equal(result, frame)

        result = unordered.sort_index(axis=1, ascending=False)
        expected = frame.iloc[:, ::-1]
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("level", ["A", 0])  # GH#21052
    def test_sort_index_multiindex(self, level):
        """Sort rows by one MultiIndex level given by name or by position."""
        # GH#13496

        # sort rows by specified level of multi-index
        mi = MultiIndex.from_tuples(
            [[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC")
        )
        df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi)

        expected_mi = MultiIndex.from_tuples(
            [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC")
        )
        expected = DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi)
        result = df.sort_index(level=level)
        tm.assert_frame_equal(result, expected)

        # sort_remaining=False
        expected_mi = MultiIndex.from_tuples(
            [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC")
        )
        expected = DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi)
        result = df.sort_index(level=level, sort_remaining=False)
        tm.assert_frame_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) groupby
    def test_sort_index_intervalindex(self):
        """Interval categories come out of groupby/unstack in bin order."""
        # this is a de-facto sort via unstack
        # confirming that we sort in the order of the bins
        y = Series(np.random.randn(100))
        x1 = Series(np.sign(np.random.randn(100)))
        x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3])
        model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"])

        result = model.groupby(["X1", "X2"], observed=True).mean().unstack()
        expected = IntervalIndex.from_tuples(
            [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right"
        )
        result = result.columns.levels[1].categories
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize(
        "original_dict, sorted_dict, ascending, ignore_index, output_index",
        [
            ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, True, [0, 1, 2]),
            ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, True, [0, 1, 2]),
            ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, False, [5, 3, 2]),
            ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, False, [2, 3, 5]),
        ],
    )
    def test_sort_index_ignore_index(
        self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index
    ):
        """Check the resulting index with and without ``ignore_index``."""
        # GH 30114
        original_index = [2, 5, 3]
        df = DataFrame(original_dict, index=original_index)
        expected_df = DataFrame(sorted_dict, index=output_index)
        kwargs = {
            "ascending": ascending,
            "ignore_index": ignore_index,
            "inplace": inplace,
        }

        if inplace:
            # Sort a copy so the original frame can be checked afterwards.
            result_df = df.copy()
            result_df.sort_index(**kwargs)
        else:
            result_df = df.sort_index(**kwargs)

        tm.assert_frame_equal(result_df, expected_df)
        tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index))

    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize(
        "original_dict, sorted_dict, ascending, ignore_index, output_index",
        [
            (
                {"M1": [1, 2], "M2": [3, 4]},
                {"M1": [1, 2], "M2": [3, 4]},
                True,
                True,
                [0, 1],
            ),
            (
                {"M1": [1, 2], "M2": [3, 4]},
                {"M1": [2, 1], "M2": [4, 3]},
                False,
                True,
                [0, 1],
            ),
            (
                {"M1": [1, 2], "M2": [3, 4]},
                {"M1": [1, 2], "M2": [3, 4]},
                True,
                False,
                MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")),
            ),
            (
                {"M1": [1, 2], "M2": [3, 4]},
                {"M1": [2, 1], "M2": [4, 3]},
                False,
                False,
                MultiIndex.from_tuples([[3, 4], [2, 1]], names=list("AB")),
            ),
        ],
    )
    def test_sort_index_ignore_index_multi_index(
        self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index
    ):
        """Check ``ignore_index`` when the frame's index is a MultiIndex."""
        # GH 30114, this is to test ignore_index on MultiIndex of index
        mi = MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB"))
        df = DataFrame(original_dict, index=mi)
        expected_df = DataFrame(sorted_dict, index=output_index)

        kwargs = {
            "ascending": ascending,
            "ignore_index": ignore_index,
            "inplace": inplace,
        }

        if inplace:
            # Sort a copy so the original frame can be checked afterwards.
            result_df = df.copy()
            result_df.sort_index(**kwargs)
        else:
            result_df = df.sort_index(**kwargs)

        tm.assert_frame_equal(result_df, expected_df)
        tm.assert_frame_equal(df, DataFrame(original_dict, index=mi))

    def test_sort_index_categorical_multiindex(self):
        """Sort a MultiIndex whose first level is an ordered categorical."""
        # GH#15058
        df = DataFrame(
            {
                "a": range(6),
                "l1": pd.Categorical(
                    ["a", "a", "b", "b", "c", "c"],
                    categories=["c", "a", "b"],
                    ordered=True,
                ),
                "l2": [0, 1, 0, 1, 0, 1],
            }
        )
        result = df.set_index(["l1", "l2"]).sort_index()
        expected = DataFrame(
            [4, 5, 0, 1, 2, 3],
            columns=["a"],
            index=MultiIndex(
                levels=[
                    CategoricalIndex(
                        ["c", "a", "b"],
                        categories=["c", "a", "b"],
                        ordered=True,
                        name="l1",
                        dtype="category",
                    ),
                    [0, 1],
                ],
                codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
                names=["l1", "l2"],
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_sort_index_and_reconstruction(self):
        """Sorted results agree across MultiIndex construction methods."""
        # GH#15622
        # lexsortedness should be identical
        # across MultiIndex construction methods

        df = DataFrame([[1, 1], [2, 2]], index=list("ab"))
        expected = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_tuples(
                [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]
            ),
        )
        assert expected.index._is_lexsorted()

        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
        )
        result = result.sort_index()
        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        result = DataFrame(
            [[1, 1], [2, 2], [1, 1], [2, 2]],
            index=MultiIndex(
                levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
            ),
        )
        result = result.sort_index()
        assert result.index._is_lexsorted()

        tm.assert_frame_equal(result, expected)

        concatted = pd.concat([df, df], keys=[0.8, 0.5])
        result = concatted.sort_index()

        assert result.index.is_monotonic

        tm.assert_frame_equal(result, expected)

        # GH#14015
        df = DataFrame(
            [[1, 2], [6, 7]],
            columns=MultiIndex.from_tuples(
                [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")],
                names=["l1", "Date"],
            ),
        )

        df.columns = df.columns.set_levels(
            pd.to_datetime(df.columns.levels[1]), level=1
        )
        assert not df.columns.is_monotonic
        result = df.sort_index(axis=1)
        assert result.columns.is_monotonic
        result = df.sort_index(axis=1, level=1)
        assert result.columns.is_monotonic

    # TODO: better name, de-duplicate with test_sort_index_level above
    def test_sort_index_level2(self):
        """Level sort keeps index names and works with ``inplace=True``."""
        mi = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        frame = DataFrame(
            np.random.randn(10, 3),
            index=mi,
            columns=Index(["A", "B", "C"], name="exp"),
        )

        # series
        a_sorted = frame["A"].sort_index(level=0)

        # preserve names
        assert a_sorted.index.names == frame.index.names

        # inplace
        rs = frame.copy()
        return_value = rs.sort_index(level=0, inplace=True)
        assert return_value is None
        tm.assert_frame_equal(rs, frame.sort_index(level=0))

    def test_sort_index_level_large_cardinality(self):
        """Level sort on a 4000-row, three-level index gives lexsort depth 3."""
        # GH#2684 (int64)
        index = MultiIndex.from_arrays([np.arange(4000)] * 3)
        df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)

        # it works!
        result = df.sort_index(level=0)
        assert result.index._lexsort_depth == 3

        # GH#2684 (int32)
        index = MultiIndex.from_arrays([np.arange(4000)] * 3)
        df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)

        # it works!
        result = df.sort_index(level=0)
        assert (result.dtypes.values == df.dtypes.values).all()
        assert result.index._lexsort_depth == 3

    def test_sort_index_level_by_name(self):
        """Sorting by level name matches sorting by level position."""
        mi = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        frame = DataFrame(
            np.random.randn(10, 3),
            index=mi,
            columns=Index(["A", "B", "C"], name="exp"),
        )

        frame.index.names = ["first", "second"]
        result = frame.sort_index(level="second")
        expected = frame.sort_index(level=1)
        tm.assert_frame_equal(result, expected)

    def test_sort_index_level_mixed(self):
        """Adding a string column does not change the level-sort order."""
        mi = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        frame = DataFrame(
            np.random.randn(10, 3),
            index=mi,
            columns=Index(["A", "B", "C"], name="exp"),
        )

        sorted_before = frame.sort_index(level=1)

        df = frame.copy()
        df["foo"] = "bar"
        sorted_after = df.sort_index(level=1)
        tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1))

        dft = frame.T
        sorted_before = dft.sort_index(level=1, axis=1)
        dft["foo", "three"] = "bar"

        sorted_after = dft.sort_index(level=1, axis=1)
        tm.assert_frame_equal(
            sorted_before.drop([("foo", "three")], axis=1),
            sorted_after.drop([("foo", "three")], axis=1),
        )

    def test_sort_index_preserve_levels(self, multiindex_dataframe_random_data):
        """Sorting keeps the index level names."""
        frame = multiindex_dataframe_random_data

        result = frame.sort_index()
        assert result.index.names == frame.index.names

    @pytest.mark.parametrize(
        "gen,extra",
        [
            ([1.0, 3.0, 2.0, 5.0], 4.0),
            ([1, 3, 2, 5], 4),
            (
                [
                    Timestamp("20130101"),
                    Timestamp("20130103"),
                    Timestamp("20130102"),
                    Timestamp("20130105"),
                ],
                Timestamp("20130104"),
            ),
            (["1one", "3one", "2one", "5one"], "4one"),
        ],
    )
    def test_sort_index_multilevel_repr_8017(self, gen, extra):
        """Column sort still works after a column is added to the frame."""
        np.random.seed(0)
        data = np.random.randn(3, 4)

        columns = MultiIndex.from_tuples([("red", i) for i in gen])
        df = DataFrame(data, index=list("def"), columns=columns)
        df2 = pd.concat(
            [
                df,
                DataFrame(
                    "world",
                    index=list("def"),
                    columns=MultiIndex.from_tuples([("red", extra)]),
                ),
            ],
            axis=1,
        )

        # check that the repr is good
        # make sure that we have a correct sparsified repr
        # e.g. only 1 header of red
        assert str(df2).splitlines()[0].split() == ["red"]

        # GH 8017
        # sorting fails after columns added

        # construct single-dtype then sort
        result = df.copy().sort_index(axis=1)
        expected = df.iloc[:, [0, 2, 1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df2.sort_index(axis=1)
        expected = df2.iloc[:, [0, 2, 1, 4, 3]]
        tm.assert_frame_equal(result, expected)

        # setitem then sort
        result = df.copy()
        result[("red", extra)] = "world"

        result = result.sort_index(axis=1)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "categories",
        [
            pytest.param(["a", "b", "c"], id="str"),
            pytest.param(
                [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)],
                id="pd.Interval",
            ),
        ],
    )
    def test_sort_index_with_categories(self, categories):
        """Sorting follows the category order after it has been reversed."""
        # GH#23452
        df = DataFrame(
            {"foo": range(len(categories))},
            index=CategoricalIndex(
                data=categories, categories=categories, ordered=True
            ),
        )
        df.index = df.index.reorder_categories(df.index.categories[::-1])
        result = df.sort_index()
        expected = DataFrame(
            {"foo": reversed(range(len(categories)))},
            index=CategoricalIndex(
                data=categories[::-1], categories=categories[::-1], ordered=True
            ),
        )
        tm.assert_frame_equal(result, expected)
def test_to_html_index(self): index = ['foo', 'bar', 'baz'] df = DataFrame( { 'A': [1, 2, 3], 'B': [1.2, 3.4, 5.6], 'C': ['one', 'two', np.nan] }, columns=['A', 'B', 'C'], index=index) expected_with_index = ('<table border="1" class="dataframe">\n' ' <thead>\n' ' <tr style="text-align: right;">\n' ' <th></th>\n' ' <th>A</th>\n' ' <th>B</th>\n' ' <th>C</th>\n' ' </tr>\n' ' </thead>\n' ' <tbody>\n' ' <tr>\n' ' <th>foo</th>\n' ' <td>1</td>\n' ' <td>1.2</td>\n' ' <td>one</td>\n' ' </tr>\n' ' <tr>\n' ' <th>bar</th>\n' ' <td>2</td>\n' ' <td>3.4</td>\n' ' <td>two</td>\n' ' </tr>\n' ' <tr>\n' ' <th>baz</th>\n' ' <td>3</td>\n' ' <td>5.6</td>\n' ' <td>NaN</td>\n' ' </tr>\n' ' </tbody>\n' '</table>') assert df.to_html() == expected_with_index expected_without_index = ('<table border="1" class="dataframe">\n' ' <thead>\n' ' <tr style="text-align: right;">\n' ' <th>A</th>\n' ' <th>B</th>\n' ' <th>C</th>\n' ' </tr>\n' ' </thead>\n' ' <tbody>\n' ' <tr>\n' ' <td>1</td>\n' ' <td>1.2</td>\n' ' <td>one</td>\n' ' </tr>\n' ' <tr>\n' ' <td>2</td>\n' ' <td>3.4</td>\n' ' <td>two</td>\n' ' </tr>\n' ' <tr>\n' ' <td>3</td>\n' ' <td>5.6</td>\n' ' <td>NaN</td>\n' ' </tr>\n' ' </tbody>\n' '</table>') result = df.to_html(index=False) for i in index: assert i not in result assert result == expected_without_index df.index = Index(['foo', 'bar', 'baz'], name='idx') expected_with_index = ('<table border="1" class="dataframe">\n' ' <thead>\n' ' <tr style="text-align: right;">\n' ' <th></th>\n' ' <th>A</th>\n' ' <th>B</th>\n' ' <th>C</th>\n' ' </tr>\n' ' <tr>\n' ' <th>idx</th>\n' ' <th></th>\n' ' <th></th>\n' ' <th></th>\n' ' </tr>\n' ' </thead>\n' ' <tbody>\n' ' <tr>\n' ' <th>foo</th>\n' ' <td>1</td>\n' ' <td>1.2</td>\n' ' <td>one</td>\n' ' </tr>\n' ' <tr>\n' ' <th>bar</th>\n' ' <td>2</td>\n' ' <td>3.4</td>\n' ' <td>two</td>\n' ' </tr>\n' ' <tr>\n' ' <th>baz</th>\n' ' <td>3</td>\n' ' <td>5.6</td>\n' ' <td>NaN</td>\n' ' </tr>\n' ' </tbody>\n' '</table>') assert df.to_html() == expected_with_index 
assert df.to_html(index=False) == expected_without_index tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')] df.index = MultiIndex.from_tuples(tuples) expected_with_index = ('<table border="1" class="dataframe">\n' ' <thead>\n' ' <tr style="text-align: right;">\n' ' <th></th>\n' ' <th></th>\n' ' <th>A</th>\n' ' <th>B</th>\n' ' <th>C</th>\n' ' </tr>\n' ' </thead>\n' ' <tbody>\n' ' <tr>\n' ' <th rowspan="2" valign="top">foo</th>\n' ' <th>car</th>\n' ' <td>1</td>\n' ' <td>1.2</td>\n' ' <td>one</td>\n' ' </tr>\n' ' <tr>\n' ' <th>bike</th>\n' ' <td>2</td>\n' ' <td>3.4</td>\n' ' <td>two</td>\n' ' </tr>\n' ' <tr>\n' ' <th>bar</th>\n' ' <th>car</th>\n' ' <td>3</td>\n' ' <td>5.6</td>\n' ' <td>NaN</td>\n' ' </tr>\n' ' </tbody>\n' '</table>') assert df.to_html() == expected_with_index result = df.to_html(index=False) for i in ['foo', 'bar', 'car', 'bike']: assert i not in result # must be the same result as normal index assert result == expected_without_index df.index = MultiIndex.from_tuples(tuples, names=['idx1', 'idx2']) expected_with_index = ('<table border="1" class="dataframe">\n' ' <thead>\n' ' <tr style="text-align: right;">\n' ' <th></th>\n' ' <th></th>\n' ' <th>A</th>\n' ' <th>B</th>\n' ' <th>C</th>\n' ' </tr>\n' ' <tr>\n' ' <th>idx1</th>\n' ' <th>idx2</th>\n' ' <th></th>\n' ' <th></th>\n' ' <th></th>\n' ' </tr>\n' ' </thead>\n' ' <tbody>\n' ' <tr>\n' ' <th rowspan="2" valign="top">foo</th>\n' ' <th>car</th>\n' ' <td>1</td>\n' ' <td>1.2</td>\n' ' <td>one</td>\n' ' </tr>\n' ' <tr>\n' ' <th>bike</th>\n' ' <td>2</td>\n' ' <td>3.4</td>\n' ' <td>two</td>\n' ' </tr>\n' ' <tr>\n' ' <th>bar</th>\n' ' <th>car</th>\n' ' <td>3</td>\n' ' <td>5.6</td>\n' ' <td>NaN</td>\n' ' </tr>\n' ' </tbody>\n' '</table>') assert df.to_html() == expected_with_index assert df.to_html(index=False) == expected_without_index
def test_to_html_multiindex(self): columns = MultiIndex.from_tuples(list( zip(np.arange(2).repeat(2), np.mod(lrange(4), 2))), names=['CL0', 'CL1']) df = DataFrame([list('abcd'), list('efgh')], columns=columns) result = df.to_html(justify='left') expected = ('<table border="1" class="dataframe">\n' ' <thead>\n' ' <tr>\n' ' <th>CL0</th>\n' ' <th colspan="2" halign="left">0</th>\n' ' <th colspan="2" halign="left">1</th>\n' ' </tr>\n' ' <tr>\n' ' <th>CL1</th>\n' ' <th>0</th>\n' ' <th>1</th>\n' ' <th>0</th>\n' ' <th>1</th>\n' ' </tr>\n' ' </thead>\n' ' <tbody>\n' ' <tr>\n' ' <th>0</th>\n' ' <td>a</td>\n' ' <td>b</td>\n' ' <td>c</td>\n' ' <td>d</td>\n' ' </tr>\n' ' <tr>\n' ' <th>1</th>\n' ' <td>e</td>\n' ' <td>f</td>\n' ' <td>g</td>\n' ' <td>h</td>\n' ' </tr>\n' ' </tbody>\n' '</table>') self.assertEqual(result, expected) columns = MultiIndex.from_tuples( list(zip(range(4), np.mod(lrange(4), 2)))) df = DataFrame([list('abcd'), list('efgh')], columns=columns) result = df.to_html(justify='right') expected = ('<table border="1" class="dataframe">\n' ' <thead>\n' ' <tr>\n' ' <th></th>\n' ' <th>0</th>\n' ' <th>1</th>\n' ' <th>2</th>\n' ' <th>3</th>\n' ' </tr>\n' ' <tr>\n' ' <th></th>\n' ' <th>0</th>\n' ' <th>1</th>\n' ' <th>0</th>\n' ' <th>1</th>\n' ' </tr>\n' ' </thead>\n' ' <tbody>\n' ' <tr>\n' ' <th>0</th>\n' ' <td>a</td>\n' ' <td>b</td>\n' ' <td>c</td>\n' ' <td>d</td>\n' ' </tr>\n' ' <tr>\n' ' <th>1</th>\n' ' <td>e</td>\n' ' <td>f</td>\n' ' <td>g</td>\n' ' <td>h</td>\n' ' </tr>\n' ' </tbody>\n' '</table>') self.assertEqual(result, expected)
def test_rename_multiindex(self): tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns) # # without specifying level -> across all levels renamed = df.rename( index={"foo1": "foo3", "bar2": "bar3"}, columns={"fizz1": "fizz3", "buzz2": "buzz3"}, ) new_index = MultiIndex.from_tuples( [("foo3", "bar1"), ("foo2", "bar3")], names=["foo", "bar"] ) new_columns = MultiIndex.from_tuples( [("fizz3", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] ) tm.assert_index_equal(renamed.index, new_index) tm.assert_index_equal(renamed.columns, new_columns) assert renamed.index.names == df.index.names assert renamed.columns.names == df.columns.names # # with specifying a level (GH13766) # dict new_columns = MultiIndex.from_tuples( [("fizz3", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"] ) renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0) tm.assert_index_equal(renamed.columns, new_columns) renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz") tm.assert_index_equal(renamed.columns, new_columns) new_columns = MultiIndex.from_tuples( [("fizz1", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] ) renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1) tm.assert_index_equal(renamed.columns, new_columns) renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz") tm.assert_index_equal(renamed.columns, new_columns) # function func = str.upper new_columns = MultiIndex.from_tuples( [("FIZZ1", "buzz1"), ("FIZZ2", "buzz2")], names=["fizz", "buzz"] ) renamed = df.rename(columns=func, level=0) tm.assert_index_equal(renamed.columns, new_columns) renamed = df.rename(columns=func, level="fizz") tm.assert_index_equal(renamed.columns, 
new_columns) new_columns = MultiIndex.from_tuples( [("fizz1", "BUZZ1"), ("fizz2", "BUZZ2")], names=["fizz", "buzz"] ) renamed = df.rename(columns=func, level=1) tm.assert_index_equal(renamed.columns, new_columns) renamed = df.rename(columns=func, level="buzz") tm.assert_index_equal(renamed.columns, new_columns) # index new_index = MultiIndex.from_tuples( [("foo3", "bar1"), ("foo2", "bar2")], names=["foo", "bar"] ) renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index)
def test_per_axis_per_level_getitem(self):
    """Exercise ``.loc`` with per-axis, per-level MultiIndex indexers."""
    # GH6134
    # example test case
    ix = MultiIndex.from_product(
        [_mklbl("A", 5), _mklbl("B", 7), _mklbl("C", 4), _mklbl("D", 2)]
    )
    wide = DataFrame(np.arange(len(ix.to_numpy())), index=ix)
    first_level_keep = ("A1", "A2", "A3")

    result = wide.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :]
    wanted = [
        (a, b, c, d)
        for a, b, c, d in wide.index.values
        if a in first_level_keep and c in ("C1", "C3")
    ]
    expected = wide.loc[wanted]
    tm.assert_frame_equal(result, expected)

    wanted = [
        (a, b, c, d)
        for a, b, c, d in wide.index.values
        if a in first_level_keep and c in ("C1", "C2", "C3")
    ]
    expected = wide.loc[wanted]
    result = wide.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :]
    tm.assert_frame_equal(result, expected)

    # test multi-index slicing with per axis and per index controls
    index = MultiIndex.from_tuples(
        [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"]
    )
    columns = MultiIndex.from_tuples(
        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
        names=["lvl0", "lvl1"],
    )
    values = np.arange(16, dtype="int64").reshape(4, 4)
    frame = DataFrame(values, index=index, columns=columns)
    frame = frame.sort_index(axis=0).sort_index(axis=1)

    everything = slice(None)

    # identity
    result = frame.loc[(everything, everything), :]
    tm.assert_frame_equal(result, frame)
    result = frame.loc[(everything, everything), (everything, everything)]
    tm.assert_frame_equal(result, frame)
    result = frame.loc[:, (everything, everything)]
    tm.assert_frame_equal(result, frame)

    # index
    result = frame.loc[(everything, [1]), :]
    tm.assert_frame_equal(result, frame.iloc[[0, 3]])

    result = frame.loc[(everything, 1), :]
    tm.assert_frame_equal(result, frame.iloc[[0, 3]])

    # columns
    result = frame.loc[:, (everything, ["foo"])]
    tm.assert_frame_equal(result, frame.iloc[:, [1, 3]])

    # both
    result = frame.loc[(everything, 1), (everything, ["foo"])]
    tm.assert_frame_equal(result, frame.iloc[[0, 3], [1, 3]])

    result = frame.loc["A", "a"]
    expected = DataFrame(
        {"bar": [1, 5, 9], "foo": [0, 4, 8]},
        index=Index([1, 2, 3], name="two"),
        columns=Index(["bar", "foo"], name="lvl1"),
    )
    tm.assert_frame_equal(result, expected)

    result = frame.loc[(everything, [1, 2]), :]
    tm.assert_frame_equal(result, frame.iloc[[0, 1, 3]])

    # multi-level series
    ser = Series(np.arange(len(ix.to_numpy())), index=ix)
    result = ser.loc["A1":"A3", :, ["C1", "C3"]]
    wanted = [
        (a, b, c, d)
        for a, b, c, d in ser.index.values
        if a in first_level_keep and c in ("C1", "C3")
    ]
    expected = ser.loc[wanted]
    tm.assert_series_equal(result, expected)

    # boolean indexers
    result = frame.loc[(everything, frame.loc[:, ("a", "bar")] > 5), :]
    tm.assert_frame_equal(result, frame.iloc[[2, 3]])

    msg = (
        "cannot index with a boolean indexer "
        "that is not the same length as the index"
    )
    with pytest.raises(ValueError, match=msg):
        frame.loc[(everything, np.array([True, False])), :]

    with pytest.raises(KeyError, match=r"\[1\] not in index"):
        # slice(None) is on the index, [1] is on the columns, but 1 is
        # not in the columns, so we raise
        # This used to treat [1] as positional GH#16396
        frame.loc[everything, [1]]

    result = frame.loc[(everything, [1]), :]
    tm.assert_frame_equal(result, frame.iloc[[0, 3]])

    # not lexsorted
    assert frame.index._lexsort_depth == 2
    by_second = frame.sort_index(level=1, axis=0)
    assert by_second.index._lexsort_depth == 0

    msg = (
        "MultiIndex slicing requires the index to be "
        r"lexsorted: slicing on levels \[1\], lexsort depth 0"
    )
    with pytest.raises(UnsortedIndexError, match=msg):
        by_second.loc[(everything, slice("bar")), :]

    # GH 16734: not sorted, but no real slicing
    mask = by_second.loc[:, ("a", "bar")] > 5
    result = by_second.loc[(everything, mask), :]
    tm.assert_frame_equal(result, by_second.iloc[[1, 3], :])
def test_where_array_like(klass):
    """``MultiIndex.where`` raises NotImplementedError for array-like conds."""
    mi = MultiIndex.from_tuples([('A', 1), ('A', 2)])
    mask = [False, True]
    expected_message = r"\.where is not supported for MultiIndex operations"

    with pytest.raises(NotImplementedError, match=expected_message):
        mi.where(klass(mask))
def test_frame_select_complex2(setup_path):
    """Select from an HDF table with ``where`` strings that name local variables.

    The selection computed from one file (``parms``) is used as the
    criterion for a second file (``hist``), through both ``read_hdf`` and
    an open ``HDFStore``. Each store is opened in a ``with`` block so the
    handle is closed even when an assertion fails.
    """
    with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths:

        pp, hh = paths

        # use non-trivial selection criteria
        parms = DataFrame({"A": [1, 1, 2, 2, 3]})
        parms.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"])

        selection = read_hdf(pp, "df", where="A=[2,3]")
        hist = DataFrame(
            np.random.randn(25, 1),
            columns=["data"],
            index=MultiIndex.from_tuples(
                [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"]
            ),
        )

        hist.to_hdf(hh, "df", mode="w", format="table")

        expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")

        # scope with list like
        # NOTE: `l`, `index` and `selection` are named inside the `where`
        # strings below, so these locals must keep their names.
        l = selection.index.tolist()  # noqa
        with HDFStore(hh) as store:
            result = store.select("df", where="l1=l")
            tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=l")
        tm.assert_frame_equal(result, expected)

        # index
        index = selection.index  # noqa
        result = read_hdf(hh, "df", where="l1=index")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=selection.index")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=list(selection.index)")
        tm.assert_frame_equal(result, expected)

        # scope with index
        with HDFStore(hh) as store:
            result = store.select("df", where="l1=index")
            tm.assert_frame_equal(result, expected)

            result = store.select("df", where="l1=selection.index")
            tm.assert_frame_equal(result, expected)

            result = store.select("df", where="l1=selection.index.tolist()")
            tm.assert_frame_equal(result, expected)

            result = store.select("df", where="l1=list(selection.index)")
            tm.assert_frame_equal(result, expected)
def test_construction_list_tuples_nan(self, na_value, vtype):
    """``Index`` built from tuples holding ``na_value`` matches ``MultiIndex.from_tuples``.

    GH#18505 : valid tuples containing NaN.  ``vtype`` wraps the list of
    tuples before it is passed to ``Index``.
    """
    tuples = [(1, "two"), (3.0, na_value)]
    tm.assert_index_equal(
        Index(vtype(tuples)),
        MultiIndex.from_tuples(tuples),
    )
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Dispatch on ``obj["typ"]`` and rebuild the object that the tagged
    payload describes.

    Parameters
    ----------
    obj : mapping
        Payload to decode.  It must provide ``.get``; which other fields are
        read depends on the ``"typ"`` tag.

    Returns
    -------
    object
        The rebuilt object, or ``obj`` itself when it carries no ``"typ"``
        entry or the tag is not one of the values handled below.

    Raises
    ------
    KeyError
        If a field required by the tagged type is missing from a ``dict``
        payload.

    Notes
    -----
    Several branches resolve a class via ``globals()[obj["klass"]]``, i.e.
    the class name comes from the payload itself.  Only decode data from a
    trusted source.
    """
    typ = obj.get("typ")
    if typ is None:
        return obj
    elif typ == "timestamp":
        # fall back to the "offset" field when "freq" is absent
        freq = obj["freq"] if "freq" in obj else obj["offset"]
        return Timestamp(obj["value"], tz=obj["tz"], freq=freq)
    elif typ == "nat":
        return NaT
    elif typ == "period":
        return Period(ordinal=obj["ordinal"], freq=obj["freq"])
    elif typ == "index":
        dtype = dtype_for(obj["dtype"])
        data = unconvert(obj["data"], dtype, obj.get("compress"))
        return Index(data, dtype=dtype, name=obj["name"])
    elif typ == "range_index":
        return RangeIndex(obj["start"], obj["stop"], obj["step"], name=obj["name"])
    elif typ == "multi_index":
        dtype = dtype_for(obj["dtype"])
        data = unconvert(obj["data"], dtype, obj.get("compress"))
        data = [tuple(x) for x in data]
        return MultiIndex.from_tuples(data, names=obj["names"])
    elif typ == "period_index":
        data = unconvert(obj["data"], np.int64, obj.get("compress"))
        d = dict(name=obj["name"], freq=obj["freq"])
        # freq goes to the PeriodArray; only the name is forwarded as a keyword
        freq = d.pop("freq", None)
        return PeriodIndex(PeriodArray(data, freq), **d)
    elif typ == "datetime_index":
        data = unconvert(obj["data"], np.int64, obj.get("compress"))
        d = dict(name=obj["name"], freq=obj["freq"])
        result = DatetimeIndex(data, **d)
        tz = obj["tz"]

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize("UTC").tz_convert(tz)
        return result
    elif typ in ("interval_index", "interval_array"):
        return globals()[obj["klass"]].from_arrays(
            obj["left"], obj["right"], obj["closed"], name=obj["name"]
        )
    elif typ == "category":
        from_codes = globals()[obj["klass"]].from_codes
        return from_codes(
            codes=obj["codes"], categories=obj["categories"], ordered=obj["ordered"]
        )
    elif typ == "interval":
        return Interval(obj["left"], obj["right"], obj["closed"])
    elif typ == "series":
        dtype = dtype_for(obj["dtype"])
        index = obj["index"]
        data = unconvert(obj["data"], dtype, obj["compress"])
        return Series(data, index=index, dtype=dtype, name=obj["name"])
    elif typ == "block_manager":
        axes = obj["axes"]

        def create_block(b):
            """Rebuild one block from its serialized description ``b``."""
            values = _safe_reshape(
                unconvert(b["values"], dtype_for(b["dtype"]), b["compress"]),
                b["shape"],
            )

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if "locs" in b:
                placement = b["locs"]
            else:
                placement = axes[0].get_indexer(b["items"])

            if is_datetime64tz_dtype(b["dtype"]):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == "M8[ns]", values.dtype
                values = DatetimeArray(values, dtype=b["dtype"])

            return make_block(
                values=values,
                klass=getattr(internals, b["klass"]),
                placement=placement,
                dtype=b["dtype"],
            )

        blocks = [create_block(b) for b in obj["blocks"]]
        return globals()[obj["klass"]](BlockManager(blocks, axes))
    elif typ == "datetime":
        return parse(obj["data"])
    elif typ == "datetime64":
        return np.datetime64(parse(obj["data"]))
    elif typ == "date":
        return parse(obj["data"]).date()
    elif typ == "timedelta":
        return timedelta(*obj["data"])
    elif typ == "timedelta64":
        return np.timedelta64(int(obj["data"]))
    elif typ == "block_index":
        return globals()[obj["klass"]](obj["length"], obj["blocs"], obj["blengths"])
    elif typ == "int_index":
        return globals()[obj["klass"]](obj["length"], obj["indices"])
    elif typ == "ndarray":
        # ``np.sctypeDict`` is the same mapping as the deprecated alias
        # ``np.typeDict``, which newer NumPy releases no longer provide.
        return unconvert(
            obj["data"], np.sctypeDict[obj["dtype"]], obj.get("compress")
        ).reshape(obj["shape"])
    elif typ == "np_scalar":
        if obj.get("sub_typ") == "np_complex":
            return c2f(obj["real"], obj["imag"], obj["dtype"])
        else:
            dtype = dtype_for(obj["dtype"])
            try:
                return dtype(obj["data"])
            except (ValueError, TypeError):
                return dtype.type(obj["data"])
    elif typ == "np_complex":
        # Build the value from its two parts rather than parsing
        # "<real>+<imag>j": a negative imaginary part would produce
        # "...+-...j", which ``complex()`` rejects with ValueError.
        return complex(float(obj["real"]), float(obj["imag"]))

    # unrecognised tag: hand the payload back untouched
    return obj
def test_rename(self):
    """Exercise ``DataFrame.rename`` with dict and callable mappers on both axes."""
    # columns: an explicit dict and the equivalent callable must agree
    lower_map = {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'}
    via_dict = self.frame.rename(columns=lower_map)
    via_func = self.frame.rename(columns=str.lower)
    assert_frame_equal(via_dict, via_func)

    # upper-casing the lowered columns gives the original frame back
    round_trip = via_func.rename(columns=str.upper)
    assert_frame_equal(round_trip, self.frame, check_names=False)

    # index
    data = {'A': {'foo': 0, 'bar': 1}}

    # gets sorted alphabetical
    small = DataFrame(data)
    swap = {'foo': 'bar', 'bar': 'foo'}
    swapped = small.rename(index=swap)
    self.assert_numpy_array_equal(swapped.index, ['foo', 'bar'])

    uppered = small.rename(index=str.upper)
    self.assert_numpy_array_equal(uppered.index, ['BAR', 'FOO'])

    # have to pass something
    self.assertRaises(TypeError, self.frame.rename)

    # partial columns
    partial = {'C': 'foo', 'D': 'bar'}
    partial_labels = ['A', 'B', 'foo', 'bar']
    by_columns = self.frame.rename(columns=partial)
    self.assert_numpy_array_equal(by_columns.columns, partial_labels)

    # other axis
    by_index = self.frame.T.rename(index=partial)
    self.assert_numpy_array_equal(by_index.index, partial_labels)

    # index with name
    named = DataFrame(data, index=Index(['foo', 'bar'], name='name'))
    named_swapped = named.rename(index=swap)
    self.assert_numpy_array_equal(named_swapped.index, ['bar', 'foo'])
    self.assertEqual(named_swapped.index.name, named.index.name)

    # MultiIndex
    mi_rows = MultiIndex.from_tuples(
        [('foo1', 'bar1'), ('foo2', 'bar2')], names=['foo', 'bar'])
    mi_cols = MultiIndex.from_tuples(
        [('fizz1', 'buzz1'), ('fizz2', 'buzz2')], names=['fizz', 'buzz'])
    mi_frame = DataFrame([(0, 0), (1, 1)], index=mi_rows, columns=mi_cols)

    mi_renamed = mi_frame.rename(
        index={'foo1': 'foo3', 'bar2': 'bar3'},
        columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'},
    )

    want_rows = MultiIndex.from_tuples([('foo3', 'bar1'), ('foo2', 'bar3')])
    want_cols = MultiIndex.from_tuples([('fizz3', 'buzz1'),
                                        ('fizz2', 'buzz3')])
    self.assert_numpy_array_equal(mi_renamed.index, want_rows)
    self.assert_numpy_array_equal(mi_renamed.columns, want_cols)

    # level names are carried over unchanged
    self.assertEqual(mi_renamed.index.names, mi_frame.index.names)
    self.assertEqual(mi_renamed.columns.names, mi_frame.columns.names)
def test_difference(idx, sort):
    """Check ``MultiIndex.difference`` against slices of ``idx`` and list-likes.

    ``idx`` is the index under test and ``sort`` is forwarded unchanged to
    every ``difference`` call.
    """
    # dropping the last three entries must leave the leading part of idx
    head_values = idx[:-3].values
    expected = MultiIndex.from_tuples(
        sorted(head_values) if sort is None else head_values,
        sortorder=0,
        names=idx.names,
    )
    result = idx.difference(idx[-3:], sort=sort)

    assert isinstance(result, MultiIndex)
    assert result.equals(expected)
    assert result.names == idx.names
    tm.assert_index_equal(result, expected)

    # empty difference: reflexive, superset and degenerate left operands
    for left in (idx, idx[-3:], idx[:0]):
        result = left.difference(idx, sort=sort)
        assert result.equals(idx[:0])
        assert result.names == idx.names

    # names not the same
    renamed_tail = idx[-3:]
    renamed_tail.names = ["foo", "baz"]
    assert idx.difference(renamed_tail, sort=sort).names == (None, None)

    # empty, but non-equal
    resorted = idx.sortlevel(1)[0]
    assert len(idx.difference(resorted, sort=sort)) == 0

    # called with the raw values instead of a MultiIndex: result is empty
    result = idx.difference(idx.values, sort=sort)
    assert result.equals(idx[:0])

    # name from empty array
    result = idx.difference([], sort=sort)
    assert idx.equals(result)
    assert idx.names == result.names

    # name from non-empty array
    result = idx.difference([("foo", "one")], sort=sort)
    # NOTE(review): ``expected`` is built here but never compared with
    # ``result``; only the names are checked -- confirm whether a value
    # comparison was intended.
    expected = pd.MultiIndex.from_tuples(
        [("bar", "one"), ("baz", "two"), ("foo", "two"), ("qux", "one"), ("qux", "two")]
    )
    expected.names = idx.names
    assert idx.names == result.names

    # a flat list of scalars is rejected
    with pytest.raises(
        TypeError, match="other must be a MultiIndex or a list of tuples"
    ):
        idx.difference([1, 2, 3, 4, 5], sort=sort)
def test_column_dups_operations(self):
    """Run a sequence of column operations on frames with duplicate labels.

    The steps mutate ``df`` in place and build on one another (assignment,
    insert, delete, consolidate, rename, reindex, drop, describe, arithmetic),
    so their order matters.
    """

    def check(result, expected=None):
        # Compare against ``expected`` when given, then touch ``dtypes`` and
        # the string repr so that either raising fails the test.
        if expected is not None:
            tm.assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # assignment
    # GH 3687
    arr = np.random.randn(3, 2)
    idx = list(range(2))
    df = DataFrame(arr, columns=["A", "A"])
    df.columns = idx
    expected = DataFrame(arr, columns=idx)
    check(df, expected)

    idx = date_range("20130101", periods=4, freq="Q-NOV")
    df = DataFrame(
        [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
    )
    df.columns = idx
    expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
    check(df, expected)

    # insert
    df = DataFrame(
        [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
        columns=["foo", "bar", "foo", "hello"],
    )
    df["string"] = "bah"
    expected = DataFrame(
        [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
        columns=["foo", "bar", "foo", "hello", "string"],
    )
    check(df, expected)
    # a value one element shorter than the index must be rejected
    with pytest.raises(ValueError, match="Length of value"):
        df.insert(0, "AnotherColumn", range(len(df.index) - 1))

    # insert same dtype
    df["foo2"] = 3
    expected = DataFrame(
        [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
        columns=["foo", "bar", "foo", "hello", "string", "foo2"],
    )
    check(df, expected)

    # set (non-dup)
    df["foo2"] = 4
    expected = DataFrame(
        [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
        columns=["foo", "bar", "foo", "hello", "string", "foo2"],
    )
    check(df, expected)
    # restore the value the following expectations rely on
    df["foo2"] = 3

    # delete (non dup)
    del df["bar"]
    expected = DataFrame(
        [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
        columns=["foo", "foo", "hello", "string", "foo2"],
    )
    check(df, expected)

    # try to delete again (its not consolidated)
    del df["hello"]
    expected = DataFrame(
        [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
        columns=["foo", "foo", "string", "foo2"],
    )
    check(df, expected)

    # consolidate
    df = df._consolidate()
    expected = DataFrame(
        [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
        columns=["foo", "foo", "string", "foo2"],
    )
    check(df, expected)

    # insert
    df.insert(2, "new_col", 5.0)
    expected = DataFrame(
        [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
        columns=["foo", "foo", "new_col", "string", "foo2"],
    )
    check(df, expected)

    # insert a dup
    with pytest.raises(ValueError, match="cannot insert"):
        df.insert(2, "new_col", 4.0)

    # the same insert goes through once duplicates are explicitly allowed
    df.insert(2, "new_col", 4.0, allow_duplicates=True)
    expected = DataFrame(
        [
            [1, 1, 4.0, 5.0, "bah", 3],
            [1, 2, 4.0, 5.0, "bah", 3],
            [2, 3, 4.0, 5.0, "bah", 3],
        ],
        columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
    )
    check(df, expected)

    # delete (dup)
    del df["foo"]
    expected = DataFrame(
        [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
        columns=["new_col", "new_col", "string", "foo2"],
    )
    tm.assert_frame_equal(df, expected)

    # dup across dtypes
    df = DataFrame(
        [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
        columns=["foo", "bar", "foo", "hello"],
    )
    check(df)

    df["foo2"] = 7.0
    expected = DataFrame(
        [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
        columns=["foo", "bar", "foo", "hello", "foo2"],
    )
    check(df, expected)

    # selecting the duplicated label is compared against a two-column frame
    result = df["foo"]
    expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
    check(result, expected)

    # multiple replacements
    df["foo"] = "string"
    expected = DataFrame(
        [
            ["string", 1, "string", 5, 7.0],
            ["string", 1, "string", 5, 7.0],
            ["string", 1, "string", 5, 7.0],
        ],
        columns=["foo", "bar", "foo", "hello", "foo2"],
    )
    check(df, expected)

    del df["foo"]
    expected = DataFrame(
        [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
    )
    check(df, expected)

    # values
    df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
    result = df.values
    expected = np.array([[1, 2.5], [3, 4.5]])
    assert (result == expected).all().all()

    # rename, GH 4403
    df4 = DataFrame(
        {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]},
        index=MultiIndex.from_tuples(
            [(600809, 20130331)], names=["STK_ID", "RPT_Date"]
        ),
    )

    df5 = DataFrame(
        {
            "RPT_Date": [20120930, 20121231, 20130331],
            "STK_ID": [600809] * 3,
            "STK_Name": ["饡驦", "饡驦", "饡驦"],
            "TClose": [38.05, 41.66, 30.01],
        },
        index=MultiIndex.from_tuples(
            [(600809, 20120930), (600809, 20121231), (600809, 20130331)],
            names=["STK_ID", "RPT_Date"],
        ),
    )

    # both inputs have a "TClose" column; the rename below maps the
    # "TClose_x" / "TClose_y" labels of the merged frame
    k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True)
    result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"})
    str(result)
    result.dtypes

    expected = DataFrame(
        [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]],
        columns=[
            "RT",
            "TClose",
            "TExg",
            "RPT_Date",
            "STK_ID",
            "STK_Name",
            "QT_Close",
        ],
    ).set_index(["STK_ID", "RPT_Date"], drop=False)
    tm.assert_frame_equal(result, expected)

    # reindex is invalid!
    df = DataFrame(
        [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
    )
    msg = "cannot reindex from a duplicate axis"
    with pytest.raises(ValueError, match=msg):
        df.reindex(columns=["bar"])
    with pytest.raises(ValueError, match=msg):
        df.reindex(columns=["bar", "foo"])

    # drop
    df = DataFrame(
        [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
    )
    result = df.drop(["a"], axis=1)
    expected = DataFrame([[1], [1], [1]], columns=["bar"])
    check(result, expected)
    # a scalar label is checked against the same expectation as the list
    result = df.drop("a", axis=1)
    check(result, expected)

    # describe
    df = DataFrame(
        [[1, 1, 1], [2, 2, 2], [3, 3, 3]],
        columns=["bar", "a", "a"],
        dtype="float64",
    )
    result = df.describe()
    # all three columns hold the same data, so one summary is reused
    s = df.iloc[:, 0].describe()
    expected = pd.concat([s, s, s], keys=df.columns, axis=1)
    check(result, expected)

    # check column dups with index equal and not equal to df's index
    df = DataFrame(
        np.random.randn(5, 3),
        index=["a", "b", "c", "d", "e"],
        columns=["A", "B", "A"],
    )
    for index in [df.index, pd.Index(list("edcba"))]:
        this_df = df.copy()
        expected_ser = Series(index.values, index=this_df.index)
        # NOTE: the dict literal repeats the key "A" (the later entry wins);
        # the ``columns`` argument is what requests the label twice.
        expected_df = DataFrame(
            {"A": expected_ser, "B": this_df["B"], "A": expected_ser},
            columns=["A", "B", "A"],
        )
        this_df["A"] = index
        check(this_df, expected_df)

    # operations
    for op in ["__add__", "__mul__", "__sub__", "__truediv__"]:
        df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
        # compute with unique labels first, then relabel both sides
        expected = getattr(df, op)(df)
        expected.columns = ["A", "A"]
        df.columns = ["A", "A"]
        result = getattr(df, op)(df)
        check(result, expected)

    # multiple assignments that change dtypes
    # the location indexer is a slice
    # GH 6120
    df = DataFrame(np.random.randn(5, 2), columns=["that", "that"])
    expected = DataFrame(1.0, index=range(5), columns=["that", "that"])

    df["that"] = 1.0
    check(df, expected)

    df = DataFrame(np.random.rand(5, 2), columns=["that", "that"])
    expected = DataFrame(1, index=range(5), columns=["that", "that"])

    df["that"] = 1
    check(df, expected)
def test_column_dups_operations(self):
    """Run a sequence of column operations on frames with duplicate labels.

    The steps mutate ``df`` in place and build on one another (assignment,
    insert, delete, consolidate, rename, reindex, drop, describe, arithmetic),
    so their order matters.
    """

    def check(result, expected=None):
        # Compare against ``expected`` when given, then touch ``dtypes`` and
        # the string repr so that either raising fails the test.
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # assignment
    # GH 3687
    arr = np.random.randn(3, 2)
    idx = lrange(2)
    df = DataFrame(arr, columns=['A', 'A'])
    df.columns = idx
    expected = DataFrame(arr, columns=idx)
    check(df, expected)

    idx = date_range('20130101', periods=4, freq='Q-NOV')
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['a', 'a', 'a', 'a'])
    df.columns = idx
    expected = DataFrame(
        [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
    check(df, expected)

    # insert
    df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    df['string'] = 'bah'
    expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                          [2, 1, 3, 5, 'bah']],
                         columns=['foo', 'bar', 'foo', 'hello', 'string'])
    check(df, expected)
    # a value one element shorter than the index must be rejected
    with pytest.raises(ValueError, match='Length of value'):
        df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

    # insert same dtype
    df['foo2'] = 3
    expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                          [2, 1, 3, 5, 'bah', 3]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)

    # set (non-dup)
    df['foo2'] = 4
    expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                          [2, 1, 3, 5, 'bah', 4]],
                         columns=['foo', 'bar', 'foo', 'hello',
                                  'string', 'foo2'])
    check(df, expected)
    # restore the value the following expectations rely on
    df['foo2'] = 3

    # delete (non dup)
    del df['bar']
    expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                          [2, 3, 5, 'bah', 3]],
                         columns=['foo', 'foo', 'hello', 'string', 'foo2'])
    check(df, expected)

    # try to delete again (its not consolidated)
    del df['hello']
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # consolidate
    df = df._consolidate()
    expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                          [2, 3, 'bah', 3]],
                         columns=['foo', 'foo', 'string', 'foo2'])
    check(df, expected)

    # insert
    df.insert(2, 'new_col', 5.)
    expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                          [2, 3, 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col', 'string', 'foo2'])
    check(df, expected)

    # insert a dup
    with pytest.raises(ValueError, match='cannot insert'):
        df.insert(2, 'new_col', 4.)

    # the same insert goes through once duplicates are explicitly allowed
    df.insert(2, 'new_col', 4., allow_duplicates=True)
    expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                          [1, 2, 4., 5., 'bah', 3],
                          [2, 3, 4., 5., 'bah', 3]],
                         columns=['foo', 'foo', 'new_col',
                                  'new_col', 'string', 'foo2'])
    check(df, expected)

    # delete (dup)
    del df['foo']
    expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                          [4., 5., 'bah', 3]],
                         columns=['new_col', 'new_col', 'string', 'foo2'])
    assert_frame_equal(df, expected)

    # dup across dtypes
    df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                   columns=['foo', 'bar', 'foo', 'hello'])
    check(df)

    df['foo2'] = 7.
    expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                          [2, 1, 3., 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    # selecting the duplicated label is compared against a two-column frame
    result = df['foo']
    expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                         columns=['foo', 'foo'])
    check(result, expected)

    # multiple replacements
    df['foo'] = 'string'
    expected = DataFrame([['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.],
                          ['string', 1, 'string', 5, 7.]],
                         columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
    check(df, expected)

    del df['foo']
    expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
                         'bar', 'hello', 'foo2'])
    check(df, expected)

    # values
    df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
    result = df.values
    expected = np.array([[1, 2.5], [3, 4.5]])
    assert (result == expected).all().all()

    # rename, GH 4403
    df4 = DataFrame(
        {'RT': [0.0454], 'TClose': [22.02], 'TExg': [0.0422]},
        index=MultiIndex.from_tuples([(600809, 20130331)],
                                     names=['STK_ID', 'RPT_Date']))

    df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331],
                     'STK_ID': [600809] * 3,
                     'STK_Name': ['饡驦', '饡驦', '饡驦'],
                     'TClose': [38.05, 41.66, 30.01]},
                    index=MultiIndex.from_tuples(
                        [(600809, 20120930),
                         (600809, 20121231),
                         (600809, 20130331)],
                        names=['STK_ID', 'RPT_Date']))

    # both inputs have a 'TClose' column; the rename below maps the
    # 'TClose_x' / 'TClose_y' labels of the merged frame
    k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
    result = k.rename(
        columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
    str(result)
    result.dtypes

    expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                            '饡驦', 30.01]],
                          columns=['RT', 'TClose', 'TExg',
                                   'RPT_Date', 'STK_ID', 'STK_Name',
                                   'QT_Close'])
                .set_index(['STK_ID', 'RPT_Date'], drop=False))
    assert_frame_equal(result, expected)

    # reindex is invalid!
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    msg = "cannot reindex from a duplicate axis"
    with pytest.raises(ValueError, match=msg):
        df.reindex(columns=['bar'])
    with pytest.raises(ValueError, match=msg):
        df.reindex(columns=['bar', 'foo'])

    # drop
    df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                   columns=['bar', 'a', 'a'])
    result = df.drop(['a'], axis=1)
    expected = DataFrame([[1], [1], [1]], columns=['bar'])
    check(result, expected)
    # a scalar label is checked against the same expectation as the list
    result = df.drop('a', axis=1)
    check(result, expected)

    # describe
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                   columns=['bar', 'a', 'a'], dtype='float64')
    result = df.describe()
    # all three columns hold the same data, so one summary is reused
    s = df.iloc[:, 0].describe()
    expected = pd.concat([s, s, s], keys=df.columns, axis=1)
    check(result, expected)

    # check column dups with index equal and not equal to df's index
    df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'A'])
    for index in [df.index, pd.Index(list('edcba'))]:
        this_df = df.copy()
        expected_ser = pd.Series(index.values, index=this_df.index)
        # NOTE: the dict literal repeats the key 'A' (the later entry wins);
        # the ``columns`` argument is what requests the label twice.
        expected_df = DataFrame({'A': expected_ser,
                                 'B': this_df['B'],
                                 'A': expected_ser},
                                columns=['A', 'B', 'A'])
        this_df['A'] = index
        check(this_df, expected_df)

    # operations
    for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
        df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
        # compute with unique labels first, then relabel both sides
        expected = getattr(df, op)(df)
        expected.columns = ['A', 'A']
        df.columns = ['A', 'A']
        result = getattr(df, op)(df)
        check(result, expected)

    # multiple assignments that change dtypes
    # the location indexer is a slice
    # GH 6120
    df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
    expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

    df['that'] = 1.0
    check(df, expected)

    df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
    expected = DataFrame(1, index=range(5), columns=['that', 'that'])

    df['that'] = 1
    check(df, expected)
class TestDataFrameConvertTo(TestData):
    """Tests for ``DataFrame.to_dict`` and ``DataFrame.to_records``."""

    def test_to_dict_timestamp(self):
        """Timestamps must survive ``to_dict`` for records, series and split orients."""

        # GH11247
        # split/records producing np.datetime64 rather than Timestamps
        # on datetime64[ns] dtypes only

        tsmp = Timestamp("20130101")
        test_data = DataFrame({"A": [tsmp, tsmp], "B": [tsmp, tsmp]})
        test_data_mixed = DataFrame({"A": [tsmp, tsmp], "B": [1, 2]})

        expected_records = [{"A": tsmp, "B": tsmp}, {"A": tsmp, "B": tsmp}]
        expected_records_mixed = [{"A": tsmp, "B": 1}, {"A": tsmp, "B": 2}]

        assert test_data.to_dict(orient="records") == expected_records
        assert test_data_mixed.to_dict(
            orient="records") == expected_records_mixed

        expected_series = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([tsmp, tsmp], name="B"),
        }
        expected_series_mixed = {
            "A": Series([tsmp, tsmp], name="A"),
            "B": Series([1, 2], name="B"),
        }

        tm.assert_dict_equal(test_data.to_dict(orient="series"),
                             expected_series)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient="series"),
                             expected_series_mixed)

        expected_split = {
            "index": [0, 1],
            "data": [[tsmp, tsmp], [tsmp, tsmp]],
            "columns": ["A", "B"],
        }
        expected_split_mixed = {
            "index": [0, 1],
            "data": [[tsmp, 1], [tsmp, 2]],
            "columns": ["A", "B"],
        }

        tm.assert_dict_equal(test_data.to_dict(orient="split"),
                             expected_split)
        tm.assert_dict_equal(test_data_mixed.to_dict(orient="split"),
                             expected_split_mixed)

    def test_to_dict_index_not_unique_with_index_orient(self):
        """``orient="index"`` must raise ``ValueError`` on a non-unique index."""
        # GH22801
        # Data loss when indexes are not unique. Raise ValueError.
        df = DataFrame({"a": [1, 2], "b": [0.5, 0.75]}, index=["A", "A"])
        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="index")

    def test_to_dict_invalid_orient(self):
        """An unrecognised ``orient`` value must raise ``ValueError``."""
        df = DataFrame({"A": [0, 1]})
        msg = "orient 'xinvalid' not understood"
        with pytest.raises(ValueError, match=msg):
            df.to_dict(orient="xinvalid")

    def test_to_records_dt64(self):
        """Check the index value of ``to_records`` for each ``convert_datetime64`` setting."""
        df = DataFrame(
            [["one", "two", "three"], ["four", "five", "six"]],
            index=date_range("2012-01-01", "2012-01-02"),
        )

        # convert_datetime64 defaults to None
        expected = df.index.values[0]
        result = df.to_records()["index"][0]
        assert expected == result

        # check for FutureWarning if convert_datetime64=False is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index.values[0]
            result = df.to_records(convert_datetime64=False)["index"][0]
            assert expected == result

        # check for FutureWarning if convert_datetime64=True is passed
        with tm.assert_produces_warning(FutureWarning):
            expected = df.index[0]
            result = df.to_records(convert_datetime64=True)["index"][0]
            assert expected == result

    def test_to_records_with_multindex(self):
        """The ``level_0`` field must hold first-level labels only."""
        # GH3189
        index = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        data = np.zeros((8, 4))
        df = DataFrame(data, index=index)
        r = df.to_records(index=True)["level_0"]
        assert "bar" in r
        assert "one" not in r

    def test_to_records_with_Mapping_type(self):
        """Build a frame from an ``email`` message registered as a ``Mapping``."""
        import email
        from email.parser import Parser

        abc.Mapping.register(email.message.Message)

        headers = Parser().parsestr("From: <*****@*****.**>\n"
                                    "To: <*****@*****.**>\n"
                                    "Subject: Test message\n"
                                    "\n"
                                    "Body would go here\n")

        frame = DataFrame.from_records([headers])
        # NOTE(review): the result of ``all(...)`` is discarded, so this line
        # asserts nothing -- confirm whether an ``assert`` was intended.
        all(x in frame for x in ["Type", "Subject", "From"])

    def test_to_records_floats(self):
        """Smoke test: ``to_records`` on a float frame must not raise."""
        df = DataFrame(np.random.rand(10, 10))
        df.to_records()

    def test_to_records_index_name(self):
        """Check the record field name used for named, unnamed and multi-level indexes."""
        df = DataFrame(np.random.randn(3, 3))
        df.index.name = "X"
        rs = df.to_records()
        assert "X" in rs.dtype.fields

        df = DataFrame(np.random.randn(3, 3))
        rs = df.to_records()
        assert "index" in rs.dtype.fields

        df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"),
                                           ("b", "z")])
        df.index.names = ["A", None]
        rs = df.to_records()
        assert "level_0" in rs.dtype.fields

    def test_to_records_with_unicode_index(self):
        """``to_records`` after ``set_index`` on a string column."""
        # GH13172
        # unicode_literals conflict with to_records
        result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records()
        expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")])
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_unicode_column_names(self):
        """``to_records`` with a non-ASCII column name."""
        # xref issue: https://github.com/numpy/numpy/issues/2407
        # Issue #11879. to_records used to raise an exception when used
        # with column names containing non-ascii characters in Python 2
        result = DataFrame(data={"accented_name_é": [1.0]}).to_records()

        # Note that numpy allows for unicode field names but dtypes need
        # to be specified using dictionary instead of list of tuples.
        expected = np.rec.array(
            [(0, 1.0)],
            dtype={
                "names": ["index", "accented_name_é"],
                "formats": ["=i8", "=f8"]
            },
        )
        tm.assert_almost_equal(result, expected)

    def test_to_records_with_categorical(self):
        """Categorical frame construction, then ``to_records`` on the result."""

        # GH8626

        # dict creation
        df = DataFrame({"A": list("abc")}, dtype="category")
        expected = Series(list("abc"), dtype="category", name="A")
        tm.assert_series_equal(df["A"], expected)

        # list-like creation
        df = DataFrame(list("abc"), dtype="category")
        expected = Series(list("abc"), dtype="category", name=0)
        tm.assert_series_equal(df[0], expected)

        # to record array
        # this coerces
        result = df.to_records()
        expected = np.rec.array([(0, "a"), (1, "b"), (2, "c")],
                                dtype=[("index", "=i8"), ("0", "O")])
        tm.assert_almost_equal(result, expected)

    # Each case pairs the keyword arguments for ``to_records`` with either the
    # expected record array or an ``(exception type, message regex)`` tuple.
    @pytest.mark.parametrize(
        "kwargs,expected",
        [
            # No dtypes --> default to array dtypes.
            (
                dict(),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Should have no effect in this case.
            (
                dict(index=True),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "<i8"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Column dtype applied across the board. Index unaffected.
            (
                dict(column_dtypes="<U4"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U4"), ("B", "<U4"),
                           ("C", "<U4")],
                ),
            ),
            # Index dtype applied across the board. Columns unaffected.
            (
                dict(index_dtypes="<U1"),
                np.rec.array(
                    [("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
                    dtype=[("index", "<U1"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Pass in a type instance.
            (
                dict(column_dtypes=np.unicode),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dtype instance.
            (
                dict(column_dtypes=np.dtype("unicode")),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "<U"), ("B", "<U"),
                           ("C", "<U")],
                ),
            ),
            # Pass in a dictionary (name-only).
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32,
                    "C": "<U2"
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "<U2")],
                ),
            ),
            # Pass in a dictionary (indices-only).
            (
                dict(index_dtypes={0: "int16"}),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Ignore index mappings if index is not True.
            (
                dict(index=False, index_dtypes="<U2"),
                np.rec.array(
                    [(1, 0.2, "a"), (2, 1.5, "bc")],
                    dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")],
                ),
            ),
            # Non-existent names / indices in mapping should not error.
            (
                dict(index_dtypes={
                    0: "int16",
                    "not-there": "float32"
                }),
                np.rec.array(
                    [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
                    dtype=[("index", "i2"), ("A", "<i8"), ("B", "<f8"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Names / indices not in dtype mapping default to array dtype.
            (
                dict(column_dtypes={
                    "A": np.dtype("int8"),
                    "B": np.dtype("float32")
                }),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<i8"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Mixture of everything.
            (
                dict(column_dtypes={
                    "A": np.int8,
                    "B": np.float32
                }, index_dtypes="<U2"),
                np.rec.array(
                    [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
                    dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"),
                           ("C", "O")],
                ),
            ),
            # Invalid dtype values.
            (
                dict(index=False, column_dtypes=list()),
                (ValueError, "Invalid dtype \\[\\] specified for column A"),
            ),
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": 5
                }),
                (ValueError, "Invalid dtype 5 specified for column B"),
            ),
            # Numpy can't handle EA types, so check error is raised
            (
                dict(
                    index=False,
                    column_dtypes={
                        "A": "int32",
                        "B": CategoricalDtype(["a", "b"])
                    },
                ),
                (ValueError, "Invalid dtype category specified for column B"),
            ),
            # Check that bad types raise
            (
                dict(index=False, column_dtypes={
                    "A": "int32",
                    "B": "foo"
                }),
                (TypeError, 'data type "foo" not understood'),
            ),
        ],
    )
    @pytest.mark.skipif(not is_platform_little_endian(),
                        reason="expected values assume little-endian")
    def test_to_records_dtype(self, kwargs, expected):
        """``to_records(**kwargs)`` yields ``expected`` or raises the given error."""
        # see gh-18146
        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        # a non-recarray ``expected`` is an (exception type, message) pair
        if not isinstance(expected, np.recarray):
            with pytest.raises(expected[0], match=expected[1]):
                df.to_records(**kwargs)
        else:
            result = df.to_records(**kwargs)
            tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize(
        "df,kwargs,expected",
        [
            # MultiIndex in the index.
            (
                DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                          columns=list("abc")).set_index(["a", "b"]),
                dict(column_dtypes="float64", index_dtypes={
                    0: "int32",
                    1: "int8"
                }),
                np.rec.array(
                    [(1, 2, 3.0), (4, 5, 6.0), (7, 8, 9.0)],
                    dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")],
                ),
            ),
            # MultiIndex in the columns.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")]),
                ),
                dict(column_dtypes={
                    0: "<U1",
                    2: "float32"
                }, index_dtypes="float32"),
                np.rec.array(
                    [(0.0, "1", 2, 3.0), (1.0, "4", 5, 6.0),
                     (2.0, "7", 8, 9.0)],
                    dtype=[
                        ("index", "<f4"),
                        ("('a', 'd')", "<U1"),
                        ("('b', 'e')", "<i8"),
                        ("('c', 'f')", "<f4"),
                    ],
                ),
            ),
            # MultiIndex in both the columns and index.
            (
                DataFrame(
                    [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                    columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
                                                    ("c", "f")],
                                                   names=list("ab")),
                    index=MultiIndex.from_tuples([("d", -4), ("d", -5),
                                                  ("f", -6)],
                                                 names=list("cd")),
                ),
                dict(column_dtypes="float64", index_dtypes={
                    0: "<U2",
                    1: "int8"
                }),
                np.rec.array(
                    [
                        ("d", -4, 1.0, 2.0, 3.0),
                        ("d", -5, 4.0, 5.0, 6.0),
                        ("f", -6, 7, 8, 9.0),
                    ],
                    dtype=[
                        ("c", "<U2"),
                        ("d", "i1"),
                        ("('a', 'd')", "<f8"),
                        ("('b', 'e')", "<f8"),
                        ("('c', 'f')", "<f8"),
                    ],
                ),
            ),
        ],
    )
    @pytest.mark.skipif(not is_platform_little_endian(),
                        reason="expected values assume little-endian")
    def test_to_records_dtype_mi(self, df, kwargs, expected):
        """``to_records`` dtype mappings applied to MultiIndexed frames."""
        # see gh-18146
        result = df.to_records(**kwargs)
        tm.assert_almost_equal(result, expected)

    @pytest.mark.skipif(not is_platform_little_endian(),
                        reason="expected values assume little-endian")
    def test_to_records_dict_like(self):
        """``column_dtypes`` may be a dict-like object that is not a ``dict``."""

        # see gh-18146
        class DictLike:
            # minimal wrapper exposing only __getitem__, __contains__ and
            # keys() on top of an internal dict
            def __init__(self, **kwargs):
                self.d = kwargs.copy()

            def __getitem__(self, key):
                return self.d.__getitem__(key)

            def __contains__(self, key):
                return key in self.d

            def keys(self):
                return self.d.keys()

        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})

        dtype_mappings = dict(
            column_dtypes=DictLike(**{
                "A": np.int8,
                "B": np.float32
            }),
            index_dtypes="<U2",
        )

        result = df.to_records(**dtype_mappings)
        expected = np.rec.array(
            [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
            dtype=[("index", "<U2"), ("A", "i1"), ("B", "<f4"), ("C", "O")],
        )
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("mapping", [dict, defaultdict(list), OrderedDict])
    def test_to_dict(self, mapping):
        """Round-trip nested data through ``to_dict`` for several orients and ``into`` types."""
        test_data = {
            "A": {
                "1": 1,
                "2": 2
            },
            "B": {
                "1": "1",
                "2": "2",
                "3": "3"
            }
        }

        # GH16122
        recons_data = DataFrame(test_data).to_dict(into=mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        # "l" orient: values are looked up by position, hence int(k2) - 1
        recons_data = DataFrame(test_data).to_dict("l", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][int(k2) - 1]

        recons_data = DataFrame(test_data).to_dict("s", mapping)

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k][k2]

        recons_data = DataFrame(test_data).to_dict("sp", mapping)
        expected_split = {
            "columns": ["A", "B"],
            "index": ["1", "2", "3"],
            "data": [[1.0, "1"], [2.0, "2"], [np.nan, "3"]],
        }
        tm.assert_dict_equal(recons_data, expected_split)

        recons_data = DataFrame(test_data).to_dict("r", mapping)
        expected_records = [
            {
                "A": 1.0,
                "B": "1"
            },
            {
                "A": 2.0,
                "B": "2"
            },
            {
                "A": np.nan,
                "B": "3"
            },
        ]
        assert isinstance(recons_data, list)
        assert len(recons_data) == 3
        for l, r in zip(recons_data, expected_records):
            tm.assert_dict_equal(l, r)

        # GH10844
        # "i" orient: the outer key is the row label, the inner key the column
        recons_data = DataFrame(test_data).to_dict("i")

        for k, v in test_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

        df = DataFrame(test_data)
        df["duped"] = df[df.columns[0]]
        recons_data = df.to_dict("i")
        comp_data = test_data.copy()
        comp_data["duped"] = comp_data[df.columns[0]]
        for k, v in comp_data.items():
            for k2, v2 in v.items():
                assert v2 == recons_data[k2][k]

    @pytest.mark.parametrize("mapping", [list, defaultdict, []])
    def test_to_dict_errors(self, mapping):
        """Each parametrized ``into`` value must make ``to_dict`` raise ``TypeError``."""
        # GH16122
        df = DataFrame(np.random.randn(3, 3))
        with pytest.raises(TypeError):
            df.to_dict(into=mapping)

    def test_to_dict_not_unique_warning(self):
        """``to_dict`` on duplicate column names must emit ``UserWarning``."""
        # GH16927: When converting to a dict, if a column has a non-unique name
        # it will be dropped, throwing a warning.
        df = DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
        with tm.assert_produces_warning(UserWarning):
            df.to_dict()

    @pytest.mark.parametrize("tz", ["UTC", "GMT", "US/Eastern"])
    def test_to_records_datetimeindex_with_tz(self, tz):
        """``to_records`` gives the same result before and after ``tz_convert("UTC")``."""
        # GH13937
        dr = date_range("2016-01-01", periods=10, freq="S", tz=tz)

        df = DataFrame({"datetime": dr}, index=dr)

        expected = df.to_records()
        result = df.tz_convert("UTC").to_records()

        # both converted to UTC, so they are equal
        tm.assert_numpy_array_equal(result, expected)

    # orient - orient argument to to_dict function
    # item_getter - function for extracting value from
    # the resulting dict using column name and index
    @pytest.mark.parametrize(
        "orient,item_getter",
        [
            ("dict", lambda d, col, idx: d[col][idx]),
            ("records", lambda d, col, idx: d[idx][col]),
            ("list", lambda d, col, idx: d[col][idx]),
            ("split",
             lambda d, col, idx: d["data"][idx][d["columns"].index(col)]),
            ("index", lambda d, col, idx: d[idx][col]),
        ],
    )
    def test_to_dict_box_scalars(self, orient, item_getter):
        """Scalars returned by ``to_dict`` must be Python ``int`` / ``float``."""
        # 14216, 23753
        # make sure that we are boxing properly
        df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]})
        result = df.to_dict(orient=orient)
        assert isinstance(item_getter(result, "a", 0), int)
        assert isinstance(item_getter(result, "b", 0), float)

    def test_frame_to_dict_tz(self):
        """``orient="records"`` on a tz-aware datetime column yields tz-aware Timestamps."""
        # GH18372 When converting to dict with orient='records' columns of
        # datetime that are tz-aware were not converted to required arrays
        data = [
            (datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc), ),
            (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc), ),
        ]
        df = DataFrame(list(data), columns=["d"])

        result = df.to_dict(orient="records")
        expected = [
            {
                "d": Timestamp("2017-11-18 21:53:00.219225+0000", tz=pytz.utc)
            },
            {
                "d": Timestamp("2017-11-18 22:06:30.061810+0000", tz=pytz.utc)
            },
        ]
        tm.assert_dict_equal(result[0], expected[0])
        tm.assert_dict_equal(result[1], expected[1])

    @pytest.mark.parametrize(
        "into, expected",
        [
            (
                dict,
                {
                    0: {
                        "int_col": 1,
                        "float_col": 1.0
                    },
                    1: {
                        "int_col": 2,
                        "float_col": 2.0
                    },
                    2: {
                        "int_col": 3,
                        "float_col": 3.0
                    },
                },
            ),
            (
                OrderedDict,
                OrderedDict([
                    (0, {
                        "int_col": 1,
                        "float_col": 1.0
                    }),
                    (1, {
                        "int_col": 2,
                        "float_col": 2.0
                    }),
                    (2, {
                        "int_col": 3,
                        "float_col": 3.0
                    }),
                ]),
            ),
            (
                defaultdict(list),
                defaultdict(
                    list,
                    {
                        0: {
                            "int_col": 1,
                            "float_col": 1.0
                        },
                        1: {
                            "int_col": 2,
                            "float_col": 2.0
                        },
                        2: {
                            "int_col": 3,
                            "float_col": 3.0
                        },
                    },
                ),
            ),
        ],
    )
    def test_to_dict_index_dtypes(self, into, expected):
        """``orient="index"`` must keep int columns int when float columns are present."""
        # GH 18580
        # When using to_dict(orient='index') on a dataframe with int
        # and float columns only the int columns were cast to float

        df = DataFrame({"int_col": [1, 2, 3], "float_col": [1.0, 2.0, 3.0]})

        result = df.to_dict(orient="index", into=into)
        cols = ["int_col", "float_col"]
        # rebuild frames from both dicts so values and dtypes are compared
        result = DataFrame.from_dict(result, orient="index")[cols]
        expected = DataFrame.from_dict(expected, orient="index")[cols]
        tm.assert_frame_equal(result, expected)

    def test_to_dict_numeric_names(self):
        """Record keys must match the column labels when the labels are digit strings."""
        # https://github.com/pandas-dev/pandas/issues/24940
        df = DataFrame({str(i): [i] for i in range(5)})
        result = set(df.to_dict("records")[0].keys())
        expected = set(df.columns)
        assert result == expected

    def test_to_dict_wide(self):
        """``to_dict("records")`` on a frame with 256 columns."""
        # https://github.com/pandas-dev/pandas/issues/24939
        df = DataFrame({("A_{:d}".format(i)): [i] for i in range(256)})
        result = df.to_dict("records")[0]
        expected = {"A_{:d}".format(i): i for i in range(256)}
        assert result == expected
def test_rename_mi(self): df = DataFrame([11, 21, 31], index=MultiIndex.from_tuples([ ("A", x) for x in ["a", "B", "c"] ])) df.rename(str.lower)
def makeCustomIndex(nentries,
                    nlevels,
                    prefix="#",
                    names=False,
                    ndupe_l=None,
                    idx_type=None):
    """
    Create an index/multiindex with given dimensions, levels, names, etc.

    nentries - number of entries in index
    nlevels - number of levels (> 1 produces multiindex)
    prefix - a string prefix for labels
    names - (Optional), bool or list of strings. if True will use default
       names, if false will use no names, if a list is given, the name of
       each level in the index will be taken from the list.
    ndupe_l - (Optional), list of ints, the number of rows for which the
       label will repeated at the corresponding level, you can specify just
       the first few, the rest will use the default ndupe_l of 1.
       len(ndupe_l) <= nlevels. The sequence passed in is not modified.
    idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td".
       If idx_type is not None, `nlevels` must be 1.
       "i"/"f" creates an integer/float index,
       "s"/"u" creates a string/unicode index
       "dt" create a datetime index.
       "td" create a timedelta index.
       "p" create a period index.

        if unspecified, string labels will be generated.

    Returns the constructed Index (or MultiIndex when nlevels > 1 and
    nentries > 1).

    Raises ValueError if `idx_type` is not one of the supported codes
    (only reachable when assertions are disabled, since the argument is
    also checked with an assert).
    """
    # local import: only the label sort key below needs it
    import re

    if ndupe_l is None:
        ndupe_l = [1] * nlevels
    assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels
    # compare lengths by value; identity (`is`) on ints is only accidentally
    # true for small, cached integers
    assert (names is None or names is False or names is True
            or len(names) == nlevels)
    assert idx_type is None or (idx_type in ("i", "f", "s", "u", "dt", "p",
                                             "td") and nlevels == 1)

    if names is True:
        # build default names
        names = [prefix + str(i) for i in range(nlevels)]
    if names is False:
        # pass None to index constructor for no name
        names = None

    # make singleton case uniform
    if isinstance(names, str) and nlevels == 1:
        names = [names]

    # specific 1D index type requested?
    idx_func_dict: dict[str, Callable[..., Index]] = {
        "i": makeIntIndex,
        "f": makeFloatIndex,
        "s": makeStringIndex,
        "u": makeUnicodeIndex,
        "dt": makeDateIndex,
        "td": makeTimedeltaIndex,
        "p": makePeriodIndex,
    }
    idx_func = idx_func_dict.get(idx_type)
    if idx_func:
        idx = idx_func(nentries)
        # but we need to fill in the name
        if names:
            idx.name = names[0]
        return idx
    elif idx_type is not None:
        raise ValueError(
            f"{repr(idx_type)} is not a legal value for `idx_type`, "
            "use 'i'/'f'/'s'/'u'/'dt'/'p'/'td'.")

    # Pad the repeat counts out to one per level.  Work on a copy so the
    # caller's sequence is never extended behind its back.
    ndupe_l = list(ndupe_l) + [1] * (nlevels - len(ndupe_l))
    assert len(ndupe_l) == nlevels

    assert all(x > 0 for x in ndupe_l)

    def keyfunc(x):
        # reduce a label such as "#_l0_g12" to its numeric parts [0, 12] so
        # that labels sort numerically rather than lexically
        numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_")
        return [int(num) for num in numeric_tuple]

    list_of_lists = []
    for i in range(nlevels):
        ndupe = ndupe_l[i]

        # build a list of lists to create the index from
        div_factor = nentries // ndupe + 1

        # typing.Counter is deprecated since 3.9 in favour of subscripting
        # collections.Counter directly (PEP 585).
        cnt: Counter[str] = collections.Counter()
        for j in range(div_factor):
            label = f"{prefix}_l{i}_g{j}"
            cnt[label] = ndupe
        # cute Counter trick: elements() repeats each label `ndupe` times
        result = sorted(cnt.elements(), key=keyfunc)[:nentries]
        list_of_lists.append(result)

    tuples = list(zip(*list_of_lists))

    # convert tuples to index
    if nentries == 1:
        # we have a single level of tuples, i.e. a regular Index
        # (names is None on the default names=False path, so guard the
        # subscript instead of raising TypeError)
        name = None if names is None else names[0]
        index = Index(tuples[0], name=name)
    elif nlevels == 1:
        name = None if names is None else names[0]
        index = Index((x[0] for x in tuples), name=name)
    else:
        index = MultiIndex.from_tuples(tuples, names=names)
    return index
def test_per_axis_per_level_setitem(self):
    """Assign through ``.loc`` using per-axis / per-level MultiIndex keys.

    Every scenario writes into a fresh copy of a 4x4 int64 frame whose rows
    and columns are both two-level MultiIndexes, and compares the outcome
    with the same write expressed positionally through ``.iloc``.
    """
    # test index maker
    idx = pd.IndexSlice
    every = slice(None)

    # test multi-index slicing with per axis and per index controls
    row_labels = MultiIndex.from_tuples(
        [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"]
    )
    col_labels = MultiIndex.from_tuples(
        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
        names=["lvl0", "lvl1"],
    )
    grid = np.arange(16, dtype="int64").reshape(4, 4)
    df_orig = DataFrame(grid, index=row_labels, columns=col_labels)
    df_orig = df_orig.sort_index(axis=0).sort_index(axis=1)

    # label keys shared by several scenarios
    all_levels = (every, every)
    second_level_is_1 = (every, 1)
    foo_columns = (every, ["foo"])

    # positional counterparts used to build the expectations
    whole_frame = (every, every)
    outer_rows = [0, 3]
    foo_positions = (every, [1, 3])
    outer_rows_foo = ([0, 3], [1, 3])

    def assert_filled_like_iloc(frame, positions):
        # expectation: the same cells set to 100 by position
        expected = df_orig.copy()
        expected.iloc[positions] = 100
        tm.assert_frame_equal(frame, expected)

    # identity
    df = df_orig.copy()
    df.loc[all_levels, :] = 100
    assert_filled_like_iloc(df, whole_frame)

    df = df_orig.copy()
    df.loc(axis=0)[:, :] = 100
    assert_filled_like_iloc(df, whole_frame)

    df = df_orig.copy()
    df.loc[all_levels, all_levels] = 100
    assert_filled_like_iloc(df, whole_frame)

    df = df_orig.copy()
    df.loc[:, all_levels] = 100
    assert_filled_like_iloc(df, whole_frame)

    # index
    df = df_orig.copy()
    df.loc[(every, [1]), :] = 100
    assert_filled_like_iloc(df, outer_rows)

    df = df_orig.copy()
    df.loc[second_level_is_1, :] = 100
    assert_filled_like_iloc(df, outer_rows)

    df = df_orig.copy()
    df.loc(axis=0)[:, 1] = 100
    assert_filled_like_iloc(df, outer_rows)

    # columns
    df = df_orig.copy()
    df.loc[:, foo_columns] = 100
    assert_filled_like_iloc(df, foo_positions)

    # both
    df = df_orig.copy()
    df.loc[second_level_is_1, foo_columns] = 100
    assert_filled_like_iloc(df, outer_rows_foo)

    df = df_orig.copy()
    df.loc[idx[:, 1], idx[:, ["foo"]]] = 100
    assert_filled_like_iloc(df, outer_rows_foo)

    df = df_orig.copy()
    df.loc["A", "a"] = 100
    assert_filled_like_iloc(df, (slice(0, 3), slice(0, 2)))

    # setting with a list-like
    df = df_orig.copy()
    df.loc[second_level_is_1, foo_columns] = np.full(
        (2, 2), 100, dtype="int64"
    )
    assert_filled_like_iloc(df, outer_rows_foo)

    # not enough values
    df = df_orig.copy()

    msg = "setting an array element with a sequence."
    with pytest.raises(ValueError, match=msg):
        df.loc[second_level_is_1, foo_columns] = np.array(
            [[100], [100, 100]], dtype="int64"
        )

    msg = "Must have equal len keys and value when setting with an iterable"
    with pytest.raises(ValueError, match=msg):
        df.loc[second_level_is_1, foo_columns] = np.array(
            [100, 100, 100, 100], dtype="int64"
        )

    # with an alignable rhs
    df = df_orig.copy()
    df.loc[second_level_is_1, foo_columns] = (
        df.loc[second_level_is_1, foo_columns] * 5
    )
    expected = df_orig.copy()
    expected.iloc[outer_rows_foo] = expected.iloc[outer_rows_foo] * 5
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.loc[second_level_is_1, foo_columns] *= df.loc[
        second_level_is_1, foo_columns
    ]
    expected = df_orig.copy()
    expected.iloc[outer_rows_foo] *= expected.iloc[outer_rows_foo]
    tm.assert_frame_equal(df, expected)

    # rhs carries an additional ("c", "bah") column that is not part of the
    # assignment target; the expectation is built exactly as in the
    # previous scenario
    rhs = df_orig.loc[second_level_is_1, foo_columns].copy()
    rhs.loc[:, ("c", "bah")] = 10
    df = df_orig.copy()
    df.loc[second_level_is_1, foo_columns] *= rhs
    expected = df_orig.copy()
    expected.iloc[outer_rows_foo] *= expected.iloc[outer_rows_foo]
    tm.assert_frame_equal(df, expected)
def test_per_axis_per_level_doc_examples(self):
    """Run the per-level MultiIndex slicing examples from indexing.rst.

    Tuple-of-slices keys and their ``pd.IndexSlice`` spellings are checked
    against an explicit list of matching row labels; slicing unsorted
    columns must raise ``UnsortedIndexError``.
    """
    # test index maker
    idx = pd.IndexSlice
    every = slice(None)
    wanted_c = ("C1", "C3")

    # from indexing.rst / advanced
    row_levels = [
        _mklbl("A", 4),
        _mklbl("B", 2),
        _mklbl("C", 4),
        _mklbl("D", 2),
    ]
    index = MultiIndex.from_product(row_levels)
    columns = MultiIndex.from_tuples(
        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")],
        names=["lvl0", "lvl1"],
    )
    n_rows = len(index)
    n_cols = len(columns)
    df = DataFrame(
        np.arange(n_rows * n_cols, dtype="int64").reshape((n_rows, n_cols)),
        index=index,
        columns=columns,
    )

    # rows with first level in A1..A3 and third level C1 or C3
    result = df.loc[(slice("A1", "A3"), every, ["C1", "C3"]), :]
    matching = [
        (a, b, c, d)
        for a, b, c, d in df.index.values
        if a in ("A1", "A2", "A3") and c in wanted_c
    ]
    expected = df.loc[matching]
    tm.assert_frame_equal(result, expected)

    result = df.loc[idx["A1":"A3", :, ["C1", "C3"]], :]
    tm.assert_frame_equal(result, expected)

    # rows with third level C1 or C3, first level unrestricted
    result = df.loc[(every, every, ["C1", "C3"]), :]
    matching = [
        (a, b, c, d) for a, b, c, d in df.index.values if c in wanted_c
    ]
    expected = df.loc[matching]
    tm.assert_frame_equal(result, expected)

    result = df.loc[idx[:, :, ["C1", "C3"]], :]
    tm.assert_frame_equal(result, expected)

    # not sorted
    msg = ("MultiIndex slicing requires the index to be lexsorted: "
           r"slicing on levels \[1\], lexsort depth 1")
    with pytest.raises(UnsortedIndexError, match=msg):
        df.loc["A1", ("a", slice("foo"))]

    # GH 16734: not sorted, but no real slicing
    foo_of_a1 = df.loc["A1", (every, "foo")]
    tm.assert_frame_equal(foo_of_a1, df.loc["A1"].iloc[:, [0, 2]])

    df = df.sort_index(axis=1)

    # slicing (results are discarded; these only have to run without error)
    df.loc["A1", (every, "foo")]
    df.loc[(every, every, ["C1", "C3"]), (every, "foo")]

    # setitem
    df.loc(axis=0)[:, :, ["C1", "C3"]] = -10
def test_multiindex_unique():
    """Hashing a unique MultiIndex must give unique hashes.

    The tuples reuse the values 118 and 51 in different positions, so the
    entries differ only by which level a value sits in.
    """
    pairs = [(118, 472), (236, 118), (51, 204), (102, 51)]
    mi = MultiIndex.from_tuples(pairs)
    assert mi.is_unique is True

    hashed = hash_pandas_object(mi)
    assert hashed.is_unique is True
def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) s2 = s1.copy() assert s1.equals(s2) s1[1] = 99 assert not s1.equals(s2) # NaNs compare as equal s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3]) s2 = s1.copy() assert s1.equals(s2) s2[0] = 9.9 assert not s1.equals(s2) idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) s1 = Series([1, 2, np.nan], index=idx) s2 = s1.copy() assert s1.equals(s2) # Add object dtype column with nans index = np.random.random(10) df1 = DataFrame(np.random.random(10), index=index, columns=["floats"]) df1["text"] = "the sky is so blue. we could use more chocolate.".split( ) df1["start"] = date_range("2000-1-1", periods=10, freq="T") df1["end"] = date_range("2000-1-1", periods=10, freq="D") df1["diff"] = df1["end"] - df1["start"] df1["bool"] = np.arange(10) % 3 == 0 df1.loc[::2] = np.nan df2 = df1.copy() assert df1["text"].equals(df2["text"]) assert df1["start"].equals(df2["start"]) assert df1["end"].equals(df2["end"]) assert df1["diff"].equals(df2["diff"]) assert df1["bool"].equals(df2["bool"]) assert df1.equals(df2) assert not df1.equals(object) # different dtype different = df1.copy() different["floats"] = different["floats"].astype("float32") assert not df1.equals(different) # different index different_index = -index different = df2.set_index(different_index) assert not df1.equals(different) # different columns different = df2.copy() different.columns = df2.columns[::-1] assert not df1.equals(different) # DatetimeIndex index = pd.date_range("2000-1-1", periods=10, freq="T") df1 = df1.set_index(index) df2 = df1.copy() assert df1.equals(df2) # MultiIndex df3 = df1.set_index(["text"], append=True) df2 = df1.set_index(["text"], append=True) assert df3.equals(df2) df2 = df1.set_index(["floats"], append=True) assert not df3.equals(df2) # NaN in index df3 = df1.set_index(["floats"], append=True) df2 = df1.set_index(["floats"], append=True) assert df3.equals(df2) # GH 8437 a = pd.Series([False, np.nan]) b = 
pd.Series([False, np.nan]) c = pd.Series(index=range(2), dtype=object) d = c.copy() e = c.copy() f = c.copy() c[:-1] = d[:-1] = e[0] = f[0] = False assert a.equals(a) assert a.equals(b) assert a.equals(c) assert a.equals(d) assert a.equals(e) assert e.equals(f)