def create_data(): """ create the pickle/msgpack data """ data = { 'A': [0., 1., 2., 3., np.nan], 'B': [0, 1, 0, 1, 0], 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] } scalars = dict(timestamp=Timestamp('20130101')) if LooseVersion(pandas.__version__) >= '0.17.0': scalars['period'] = Period('2012', 'M') index = dict(int=Index(np.arange(10)), date=date_range('20130101', periods=10), period=period_range('2013-01-01', freq='M', periods=10)) mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), names=['first', 'second'])) series = dict(float=Series(data['A']), int=Series(data['B']), mixed=Series(data['E']), ts=Series(np.arange(10).astype(np.int64), index=date_range('20130101', periods=10)), mi=Series(np.arange(5).astype(np.float64), index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=['one', 'two'])), dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), cat=Series(Categorical(['foo', 'bar', 'baz']))) if LooseVersion(pandas.__version__) >= '0.17.0': series['period'] = Series([Period('2000Q1')] * 5)
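# The LooseVersion gate above is the standard idiom in these legacy-fixture
# generators: objects that older pandas cannot construct (Period scalars and
# Series here) are added only when the running pandas is new enough. A
# minimal, self-contained sketch of the same idiom (names and values are
# illustrative, not part of the fixture):
from distutils.version import LooseVersion

import pandas

payload = {'timestamp': pandas.Timestamp('20130101')}
if LooseVersion(pandas.__version__) >= LooseVersion('0.17.0'):
    # Period round-tripping is only exercised from 0.17.0 onwards
    payload['period'] = pandas.Period('2012-01', freq='M')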
def test_pivot_datetime_tz(self): dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'] dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00'] df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], 'dt1': dates1, 'dt2': dates2, 'value1': range(6), 'value2': [1, 2] * 3}) df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'], tz='US/Pacific', name='dt1') exp_col1 = Index(['value1', 'value1']) exp_col2 = Index(['a', 'b'], name='label') exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1']) tm.assert_frame_equal(result, expected) exp_col1 = Index(['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean']) exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2) exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4, tz='Asia/Tokyo', name='dt2') exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1], [2, 5, 1, 2, 2, 5, 1, 2]]), index=exp_idx, columns=exp_col) result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'], aggfunc=[np.sum, np.mean]) tm.assert_frame_equal(result, expected)
def test_format_sparse_display(): index = MultiIndex(levels=[[0, 1], [0, 1], [0, 1], [0]], labels=[[0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]]) result = index.format() assert result[3] == '1 0 0 0'
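# A compact illustration of the sparse display asserted above: with sparsify
# on (the default), MultiIndex.format() blanks a level's label whenever it
# repeats the row above, so a row only spells out the levels that changed.
# A sketch, assuming a pandas version where MultiIndex.format() still exists:
from pandas import MultiIndex

mi = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)])
rows = mi.format()
# rows[0] shows both levels, rows[1] blanks the repeated first level,
# and rows[2] shows both again because level 0 changed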
def test_multiindex_objects(): mi = MultiIndex(levels=[["b", "d", "a"], [1, 2, 3]], codes=[[0, 1, 0, 2], [2, 0, 0, 1]], names=["col1", "col2"]) recons = mi._sort_levels_monotonic() # These are equal. assert mi.equals(recons) assert Index(mi.values).equals(Index(recons.values)) # _hashed_values and hash_pandas_object(..., index=False) equivalency. expected = hash_pandas_object(mi, index=False).values result = mi._hashed_values tm.assert_numpy_array_equal(result, expected) expected = hash_pandas_object(recons, index=False).values result = recons._hashed_values tm.assert_numpy_array_equal(result, expected) expected = mi._hashed_values result = recons._hashed_values # Values should match, but in different order. tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))
def test_na_value_dict(self): data = """A,B,C foo,bar,NA bar,foo,foo foo,bar,NA bar,foo,foo""" df = read_csv(StringIO(data), na_values={'A': ['foo'], 'B': ['bar']}) expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'], 'B': [np.nan, 'foo', np.nan, 'foo'], 'C': [np.nan, 'foo', np.nan, 'foo']}) assert_frame_equal(df, expected) data = """\ a,b,c,d 0,NA,1,5 """ xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0]) xp.index.name = 'a' df = read_csv(StringIO(data), na_values={}, index_col=0) assert_frame_equal(df, xp) xp = DataFrame({'b': [np.nan], 'd': [5]}, MultiIndex.from_tuples([(0, 1)])) df = read_csv(StringIO(data), na_values={}, index_col=[0, 2]) assert_frame_equal(df, xp) xp = DataFrame({'b': [np.nan], 'd': [5]}, MultiIndex.from_tuples([(0, 1)])) df = read_csv(StringIO(data), na_values={}, index_col=['a', 'c']) assert_frame_equal(df, xp)
def test_from_product_empty(): # 0 levels with tm.assert_raises_regex( ValueError, "Must pass non-zero number of levels/labels"): MultiIndex.from_product([]) # 1 level result = MultiIndex.from_product([[]], names=['A']) expected = pd.Index([], name='A') tm.assert_index_equal(result.levels[0], expected) # 2 levels l1 = [[], ['foo', 'bar', 'baz'], []] l2 = [[], [], ['a', 'b', 'c']] names = ['A', 'B'] for first, second in zip(l1, l2): result = MultiIndex.from_product([first, second], names=names) expected = MultiIndex(levels=[first, second], labels=[[], []], names=names) tm.assert_index_equal(result, expected) # GH12258 names = ['A', 'B', 'C'] for N in range(4): lvl2 = lrange(N) result = MultiIndex.from_product([[], lvl2, []], names=names) expected = MultiIndex(levels=[[], lvl2, []], labels=[[], [], []], names=names) tm.assert_index_equal(result, expected)
def test_multiindex_objects(self): mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], labels=[[0, 1, 0, 2], [2, 0, 0, 1]], names=['col1', 'col2']) recons = mi._sort_levels_monotonic() # these are equal assert mi.equals(recons) assert Index(mi.values).equals(Index(recons.values)) # _hashed_values and hash_pandas_object(..., index=False) # equivalency expected = hash_pandas_object( mi, index=False).values result = mi._hashed_values tm.assert_numpy_array_equal(result, expected) expected = hash_pandas_object( recons, index=False).values result = recons._hashed_values tm.assert_numpy_array_equal(result, expected) expected = mi._hashed_values result = recons._hashed_values # values should match, but in different order tm.assert_numpy_array_equal(np.sort(result), np.sort(expected))
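# Context for the two hashing tests above: hash_pandas_object emits one
# uint64 per element and depends only on the values, not their positions,
# so an index and its reordered copy carry the same multiset of hashes.
# A small sketch of that invariant (assumes pandas >= 0.20 for pandas.util):
import pandas as pd
from pandas.util import hash_pandas_object

idx = pd.Index(['b', 'd', 'a'])
h = hash_pandas_object(idx, index=False).values
h_sorted = hash_pandas_object(idx.sort_values(), index=False).values
assert sorted(h) == sorted(h_sorted)  # same hashes once order is ignored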
def test_loc_getitem_array(self): # GH15434 # passing an array as a key with a MultiIndex index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) x = Series(index=index, data=range(9), dtype=np.float64) y = np.array([1, 3]) expected = Series( data=[0, 1, 2, 6, 7, 8], index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), dtype=np.float64) result = x.loc[y] tm.assert_series_equal(result, expected) # empty array: empty = np.array([]) expected = Series([], index=MultiIndex( levels=index.levels, labels=[[], []]), dtype=np.float64) result = x.loc[empty] tm.assert_series_equal(result, expected) # 0-dim array (scalar): scalar = np.int64(1) expected = Series( data=[0, 1, 2], index=['A', 'B', 'C'], dtype=np.float64) result = x.loc[scalar] tm.assert_series_equal(result, expected)
def test_loc_multiindex_incomplete(self): # GH 7399 # incomplete indexers s = Series(np.arange(15, dtype='int64'), MultiIndex.from_product([range(5), ['a', 'b', 'c']])) expected = s.loc[:, 'a':'c'] result = s.loc[0:4, 'a':'c'] tm.assert_series_equal(result, expected) result = s.loc[:4, 'a':'c'] tm.assert_series_equal(result, expected) result = s.loc[0:, 'a':'c'] tm.assert_series_equal(result, expected) # GH 7400 # multiindexer getitem with list of indexers skips wrong element s = Series(np.arange(15, dtype='int64'), MultiIndex.from_product([range(5), ['a', 'b', 'c']])) expected = s.iloc[[6, 7, 8, 12, 13, 14]] result = s.loc[2:4:2, 'a':'c'] tm.assert_series_equal(result, expected)
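# Standalone illustration of the "incomplete indexer" contract tested above:
# on a lexsorted MultiIndex, .loc slices are label-based and inclusive on
# every level, and an open bound is equivalent to spanning the whole level.
import numpy as np
import pandas as pd

s = pd.Series(np.arange(6),
              index=pd.MultiIndex.from_product([[0, 1], list('abc')]))
assert s.loc[0:1, 'a':'c'].equals(s.loc[:, 'a':'c'])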
def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list('AB'), dtype=np.int32) df.index = MultiIndex.from_tuples( [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] expected = DataFrame(rows, index=list('xyz'), dtype=np.int32) expected.columns = MultiIndex.from_tuples( [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected) # From a mixed type dataframe df['A'] = df['A'].astype(np.int16) df['B'] = df['B'].astype(np.float64) result = df.unstack(fill_value=-1) expected['A'] = expected['A'].astype(np.int16) expected['B'] = expected['B'].astype(np.float64) assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] expected = DataFrame(rows, index=list('xyz'), dtype=np.float64) expected.columns = MultiIndex.from_tuples( [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected)
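# Hedged mini-example of the upcast exercised in the last block above: an
# integer column cannot hold a float fill_value, so unstack falls back to
# float64 for the whole result.
import numpy as np
import pandas as pd

s = pd.Series([1, 2],
              index=pd.MultiIndex.from_tuples([('x', 'a'), ('y', 'b')]),
              dtype=np.int32)
out = s.unstack(fill_value=0.5)   # missing cells get 0.5
assert (out.dtypes == np.float64).all()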
def test_loc_getitem_series(self): # GH14730 # passing a series as a key with a MultiIndex index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) x = Series(index=index, data=range(9), dtype=np.float64) y = Series([1, 3]) expected = Series( data=[0, 1, 2, 6, 7, 8], index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), dtype=np.float64) result = x.loc[y] tm.assert_series_equal(result, expected) result = x.loc[[1, 3]] tm.assert_series_equal(result, expected) # GH15424 y1 = Series([1, 3], index=[1, 2]) result = x.loc[y1] tm.assert_series_equal(result, expected) empty = Series(data=[], dtype=np.float64) expected = Series([], index=MultiIndex( levels=index.levels, labels=[[], []]), dtype=np.float64) result = x.loc[empty] tm.assert_series_equal(result, expected)
def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, is_level1, expected_error): # GH 7866 # multi-index slicing with missing indexers idx = MultiIndex.from_product([['A', 'B', 'C'], ['foo', 'bar', 'baz']], names=['one', 'two']) s = Series(np.arange(9, dtype='int64'), index=idx).sort_index() if indexer == []: expected = s.iloc[[]] elif is_level1: expected = Series([0, 3, 6], index=MultiIndex.from_product( [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() else: exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], names=['one', 'two']) expected = Series(np.arange(3, dtype='int64'), index=exp_idx).sort_index() if expected_error is not None: with pytest.raises(KeyError, match=expected_error): s.loc[indexer] else: result = s.loc[indexer] tm.assert_series_equal(result, expected)
def test_reset_index(self): df = tm.makeDataFrame()[:5] ser = df.stack() ser.index.names = ['hash', 'category'] ser.name = 'value' df = ser.reset_index() assert 'value' in df df = ser.reset_index(name='value2') assert 'value2' in df # check inplace s = ser.reset_index(drop=True) s2 = ser s2.reset_index(drop=True, inplace=True) tm.assert_series_equal(s, s2) # level index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) s = Series(np.random.randn(6), index=index) rs = s.reset_index(level=1) assert len(rs.columns) == 2 rs = s.reset_index(level=[0, 2], drop=True) tm.assert_index_equal(rs.index, Index(index.get_level_values(1))) assert isinstance(rs, Series)
def test_loc_getitem_int_slice(self): # GH 3053 # loc should treat integer slices like label slices index = MultiIndex.from_tuples([t for t in itertools.product( [6, 7, 8], ['a', 'b'])]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] expected = df tm.assert_frame_equal(result, expected) index = MultiIndex.from_tuples([t for t in itertools.product( [10, 20, 30], ['a', 'b'])]) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] expected = df.iloc[2:] tm.assert_frame_equal(result, expected) # doc examples result = df.loc[10, :] expected = df.iloc[0:2] expected.index = ['a', 'b'] tm.assert_frame_equal(result, expected) result = df.loc[:, 10] expected = df[10] tm.assert_frame_equal(result, expected)
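# Companion example for the GH 3053 behavior above: on an integer-labeled
# level, .loc treats a slice as labels (both endpoints included), unlike
# positional iloc slicing.
import numpy as np
import pandas as pd

s = pd.Series(np.arange(6),
              index=pd.MultiIndex.from_product([[10, 20, 30], ['a', 'b']]))
assert len(s.loc[20:30]) == 4  # label slice: 20 and 30 both included
assert len(s.iloc[2:4]) == 2   # positional slice: stop excluded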
def test_loc_getitem_nested_indexer(self, indexer_type_1, indexer_type_2): # GH #19686 # .loc should work with nested indexers which can be # any list-like objects (see `pandas.api.types.is_list_like`) or slices def convert_nested_indexer(indexer_type, keys): if indexer_type == np.ndarray: return np.array(keys) if indexer_type == slice: return slice(*keys) return indexer_type(keys) a = [10, 20, 30] b = [1, 2, 3] index = MultiIndex.from_product([a, b]) df = DataFrame( np.arange(len(index), dtype='int64'), index=index, columns=['Data']) keys = ([10, 20], [2, 3]) types = (indexer_type_1, indexer_type_2) # check indexers with all the combinations of nested objects # of all the valid types indexer = tuple( convert_nested_indexer(indexer_type, k) for indexer_type, k in zip(types, keys)) result = df.loc[indexer, 'Data'] expected = Series( [1, 2, 4, 5], name='Data', index=MultiIndex.from_product(keys)) tm.assert_series_equal(result, expected)
def test_concat_with_group_keys(self): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randn(4, 4)) # axis=0 df = DataFrame(np.random.randn(3, 4)) df2 = DataFrame(np.random.randn(4, 4)) result = concat([df, df2], keys=[0, 1]) exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]]) expected = DataFrame(np.r_[df.values, df2.values], index=exp_index) tm.assert_frame_equal(result, expected) result = concat([df, df], keys=[0, 1]) exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) expected = DataFrame(np.r_[df.values, df.values], index=exp_index2) tm.assert_frame_equal(result, expected) # axis=1 df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randn(4, 4)) result = concat([df, df2], keys=[0, 1], axis=1) expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index) tm.assert_frame_equal(result, expected) result = concat([df, df], keys=[0, 1], axis=1) expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2) tm.assert_frame_equal(result, expected)
def test_append_mixed_dtypes(): # GH 13660 dti = date_range('2011-01-01', freq='M', periods=3, ) dti_tz = date_range('2011-01-01', freq='M', periods=3, tz='US/Eastern') pi = period_range('2011-01', freq='M', periods=3) mi = MultiIndex.from_arrays([[1, 2, 3], [1.1, np.nan, 3.3], ['a', 'b', 'c'], dti, dti_tz, pi]) assert mi.nlevels == 6 res = mi.append(mi) exp = MultiIndex.from_arrays([[1, 2, 3, 1, 2, 3], [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], ['a', 'b', 'c', 'a', 'b', 'c'], dti.append(dti), dti_tz.append(dti_tz), pi.append(pi)]) tm.assert_index_equal(res, exp) other = MultiIndex.from_arrays([['x', 'y', 'z'], ['x', 'y', 'z'], ['x', 'y', 'z'], ['x', 'y', 'z'], ['x', 'y', 'z'], ['x', 'y', 'z']]) res = mi.append(other) exp = MultiIndex.from_arrays([[1, 2, 3, 'x', 'y', 'z'], [1.1, np.nan, 3.3, 'x', 'y', 'z'], ['a', 'b', 'c', 'x', 'y', 'z'], dti.append(pd.Index(['x', 'y', 'z'])), dti_tz.append(pd.Index(['x', 'y', 'z'])), pi.append(pd.Index(['x', 'y', 'z']))]) tm.assert_index_equal(res, exp)
def test_apply_categorical_data(self): # GH 10138 for ordered in [True, False]: dense = Categorical(list('abc'), ordered=ordered) # 'b' is in the categories but not in the list missing = Categorical( list('aaa'), categories=['a', 'b'], ordered=ordered) values = np.arange(len(dense)) df = DataFrame({'missing': missing, 'dense': dense, 'values': values}) grouped = df.groupby(['missing', 'dense']) # missing category 'b' should still exist in the output index idx = MultiIndex.from_product( [Categorical(['a', 'b'], ordered=ordered), Categorical(['a', 'b', 'c'], ordered=ordered)], names=['missing', 'dense']) expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], index=idx, columns=['values']) assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) assert_frame_equal(grouped.mean(), expected) assert_frame_equal(grouped.agg(np.mean), expected) # but for transform we should still get back the original index idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], names=['missing', 'dense']) expected = Series(1, index=idx) assert_series_equal(grouped.apply(lambda x: 1), expected)
def test_set_index_names(self): df = pd.util.testing.makeDataFrame() df.index.name = 'name' assert df.set_index(df.index).index.names == ['name'] mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B']) mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values, names=['A', 'B', 'C', 'D']) df = df.set_index(['A', 'B']) assert df.set_index(df.index).index.names == ['A', 'B'] # Check that set_index isn't converting a MultiIndex into an Index assert isinstance(df.set_index(df.index).index, MultiIndex) # Check actual equality tm.assert_index_equal(df.set_index(df.index).index, mi) idx2 = df.index.rename(['C', 'D']) # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather # than a pair of tuples assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex) # Check equality tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)
def test_get_info_after_update(chunkstore_lib): df = DataFrame(data={'data': [1.1, 2.1, 3.1]}, index=MultiIndex.from_tuples([(dt(2016, 1, 1), 1), (dt(2016, 1, 2), 1), (dt(2016, 1, 3), 1)], names=['date', 'id']) ) chunkstore_lib.write('test_df', df, 'D') df2 = DataFrame(data={'data': [1.1, 1.1, 1.1]}, index=MultiIndex.from_tuples([(dt(2016, 1, 1), 2), (dt(2016, 1, 2), 2), (dt(2016, 1, 4), 1)], names=['date', 'id']) ) chunkstore_lib.update('test_df', df2) assert_frame_equal(chunkstore_lib.read('test_df'), pd.concat([df, df2]).sort_index()) info = {'rows': 6, 'dtype': [('date', '<M8[ns]'), ('id', '<i8'), ('data', '<f8')], 'chunk_count': 4, 'col_names': {u'index': [u'date', u'id'], u'index_tz': [None, None], u'columns': [u'data']}, 'type': u'df', 'size': 144} assert(chunkstore_lib.get_info('test_df') == info)
def test_mangles_multi_index(self): # See GH 18062 data = """A,A,A,B\none,one,one,two\n0,40,34,0.1""" df = self.read_csv(StringIO(data), header=[0, 1]) expected = DataFrame([[0, 40, 34, 0.1]], columns=MultiIndex.from_tuples( [('A', 'one'), ('A', 'one.1'), ('A', 'one.2'), ('B', 'two')])) tm.assert_frame_equal(df, expected) data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1""" df = self.read_csv(StringIO(data), header=[0, 1]) expected = DataFrame([[0, 40, 34, 0.1]], columns=MultiIndex.from_tuples( [('A', 'one'), ('A', 'one.1'), ('A', 'one.1.1'), ('B', 'two')])) tm.assert_frame_equal(df, expected) data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1""" df = self.read_csv(StringIO(data), header=[0, 1]) expected = DataFrame([[0, 40, 34, 0.1, 0.1]], columns=MultiIndex.from_tuples( [('A', 'one'), ('A', 'one.1'), ('A', 'one.1.1'), ('B', 'two'), ('B', 'two.1')])) tm.assert_frame_equal(df, expected)
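# The '.1'/'.1.1' suffixes asserted above come from the reader's duplicate-
# name mangling, applied tuple-by-tuple in a multi-row header; a minimal
# reproduction:
from io import StringIO

import pandas as pd

data = 'A,A\none,one\n1,2'
df = pd.read_csv(StringIO(data), header=[0, 1])
assert list(df.columns) == [('A', 'one'), ('A', 'one.1')]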
def test_join_multiindex(self): index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) df1 = DataFrame(data=np.random.randn(6), index=index1, columns=['var X']) df2 = DataFrame(data=np.random.randn(6), index=index2, columns=['var Y']) df1 = df1.sort_index(level=0) df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names df1 = df1.sort_index(level=1) df2 = df2.sort_index(level=1) joined = df1.join(df2, how='outer').sort_index(level=0) ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names
def test_groupby_as_index_apply(df): # GH #4648 and #3417 df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'], 'user_id': [1, 2, 1, 1, 3, 1], 'time': range(6)}) g_as = df.groupby('user_id', as_index=True) g_not_as = df.groupby('user_id', as_index=False) res_as = g_as.head(2).index res_not_as = g_not_as.head(2).index exp = Index([0, 1, 2, 4]) tm.assert_index_equal(res_as, exp) tm.assert_index_equal(res_not_as, exp) res_as_apply = g_as.apply(lambda x: x.head(2)).index res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), ( 2, 4)]) tp = [(1, 0), (1, 2), (2, 1), (3, 4)] exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None]) tm.assert_index_equal(res_as_apply, exp_as_apply) tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) ind = Index(list('abcde')) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) res = df.groupby(0, as_index=False).apply(lambda x: x).index tm.assert_index_equal(res, ind)
def test_set_index_names(self): df = pd.util.testing.makeDataFrame() df.index.name = 'name' self.assertEqual(df.set_index(df.index).index.names, ['name']) mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B']) mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values, names=['A', 'B', 'A', 'B']) df = df.set_index(['A', 'B']) self.assertEqual(df.set_index(df.index).index.names, ['A', 'B']) # Check that set_index isn't converting a MultiIndex into an Index self.assertTrue(isinstance(df.set_index(df.index).index, MultiIndex)) # Check actual equality tm.assert_index_equal(df.set_index(df.index).index, mi) # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather # than a pair of tuples self.assertTrue(isinstance(df.set_index( [df.index, df.index]).index, MultiIndex)) # Check equality tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2)
def test_hash_pandas_object(self): for obj in [Series([1, 2, 3]), Series([1.0, 1.5, 3.2]), Series([1.0, 1.5, np.nan]), Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), Series(['a', 'b', 'c']), Series(['a', np.nan, 'c']), Series(['a', None, 'c']), Series([True, False, True]), Series(), Index([1, 2, 3]), Index([True, False, True]), DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}), DataFrame(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeTimedeltaIndex(), tm.makePeriodIndex(), Series(tm.makePeriodIndex()), Series(pd.date_range('20130101', periods=3, tz='US/Eastern')), MultiIndex.from_product( [range(5), ['foo', 'bar', 'baz'], pd.date_range('20130101', periods=2)]), MultiIndex.from_product( [pd.CategoricalIndex(list('aabc')), range(3)])]: self.check_equal(obj) self.check_not_equal_with_index(obj)
def create_data(): """ create the pickle/msgpack data """ data = { 'A': [0., 1., 2., 3., np.nan], 'B': [0, 1, 0, 1, 0], 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': date_range('1/1/2009', periods=5), 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] } index = dict(int=Index(np.arange(10)), date=date_range('20130101', periods=10), period=period_range('2013-01-01', freq='M', periods=10)) mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), names=['first', 'second'])) series = dict(float=Series(data['A']), int=Series(data['B']), mixed=Series(data['E']), ts=TimeSeries(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)), mi=Series(np.arange(5).astype(np.float64), index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=['one', 'two'])), dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), cat=Series(Categorical(['foo', 'bar', 'baz'])), per=Series([Period('2000Q1')] * 5)) mixed_dup_df = DataFrame(data) mixed_dup_df.columns = list("ABCDA") frame = dict(float=DataFrame(dict(A=series['float'], B=series['float'] + 1)), int=DataFrame(dict(A=series['int'], B=series['int'] + 1)), mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])), mi=DataFrame(dict(A=np.arange(5).astype(np.float64), B=np.arange(5).astype(np.int64)), index=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'baz'], ['one', 'two', 'one', 'two', 'three']])), names=['first', 'second'])), dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), columns=['A', 'B', 'A']), cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']), B=np.arange(3).astype(np.int64))), mixed_dup=mixed_dup_df) mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int'])) mixed_dup_panel.items = ['ItemA', 'ItemA'] panel = dict(float=Panel(dict(ItemA=frame['float'], ItemB=frame['float'] + 1)), dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), items=['A', 'B', 'A']), mixed_dup=mixed_dup_panel) return dict(series=series, frame=frame, panel=panel, index=index, mi=mi, sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()), sp_frame=dict(float=_create_sp_frame()))
def setup_method(self, method): self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2)) self.frame_ints = DataFrame(np.random.randn(4, 4), index=lrange(0, 8, 2), columns=lrange(0, 12, 3)) self.series_uints = Series(np.random.rand(4), index=UInt64Index(lrange(0, 8, 2))) self.frame_uints = DataFrame(np.random.randn(4, 4), index=UInt64Index(lrange(0, 8, 2)), columns=UInt64Index(lrange(0, 12, 3))) self.series_floats = Series(np.random.rand(4), index=Float64Index(range(0, 8, 2))) self.frame_floats = DataFrame(np.random.randn(4, 4), index=Float64Index(range(0, 8, 2)), columns=Float64Index(range(0, 12, 3))) m_idces = [MultiIndex.from_product([[1, 2], [3, 4]]), MultiIndex.from_product([[5, 6], [7, 8]]), MultiIndex.from_product([[9, 10], [11, 12]])] self.series_multi = Series(np.random.rand(4), index=m_idces[0]) self.frame_multi = DataFrame(np.random.randn(4, 4), index=m_idces[0], columns=m_idces[1]) self.series_labels = Series(np.random.randn(4), index=list('abcd')) self.frame_labels = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) self.frame_mixed = DataFrame(np.random.randn(4, 4), index=[2, 4, 'null', 8]) self.series_ts = Series(np.random.randn(4), index=date_range('20130101', periods=4)) self.frame_ts = DataFrame(np.random.randn(4, 4), index=date_range('20130101', periods=4)) dates_rev = (date_range('20130101', periods=4) .sort_values(ascending=False)) self.series_ts_rev = Series(np.random.randn(4), index=dates_rev) self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev) self.frame_empty = DataFrame() self.series_empty = Series() # form agglomerates for o in self._objs: d = dict() for t in self._typs: d[t] = getattr(self, '%s_%s' % (o, t), None) setattr(self, o, d)
def test_sort_index_multiindex(self, level): # GH13496 # sort rows by specified level of multi-index mi = MultiIndex.from_tuples([ [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) expected_mi = MultiIndex.from_tuples([ [1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list('ABC')) expected = pd.DataFrame([ [5, 6], [3, 4], [1, 2]], index=expected_mi) result = df.sort_index(level=level) assert_frame_equal(result, expected) # sort_remaining=False expected_mi = MultiIndex.from_tuples([ [1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list('ABC')) expected = pd.DataFrame([ [5, 6], [1, 2], [3, 4]], index=expected_mi) result = df.sort_index(level=level, sort_remaining=False) assert_frame_equal(result, expected)
def __init__(self, node_name, parents=None, node_domain=None): super(CPT, self).__init__(node_name) # default to a boolean domain when none is given if node_domain is None or len(node_domain) == 0: self._domain = ['T', 'F'] else: self._domain = node_domain[:] self.m = 1 # rows: one per combination of parent values self.n = len(self._domain) # columns: one per value of this node's domain if parents is None or len(parents) == 0: self.rows = [self._name] self.cols = MultiIndex.from_product([self._domain]) else: parents_names = [] parents_domains = [] for parent in parents: parents_names.append(parent.name) parents_domains.append(parent.domain) self.m *= len(parent.domain) self.cols = MultiIndex.from_product([self._domain], names=[self._name]) self.rows = MultiIndex.from_product(parents_domains, names=parents_names) self._values = np.zeros((self.m, self.n)) self._table = DataFrame(self._values, index=self.rows, columns=self.cols)
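# A hedged usage sketch for the CPT class above; it assumes the (not shown)
# base class exposes .name and .domain on each node, and the node names are
# purely illustrative.
rain = CPT('rain')                  # no parents: a single prior row
sprinkler = CPT('sprinkler')
# one row per combination of parent values (2 x 2 = 4 here), one column
# per value of the child's own domain
wet_grass = CPT('wet_grass', parents=[rain, sprinkler])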
def test_join_multi_to_multi(self, join_type): # GH 20475 leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]], names=['abc', 'xy', 'num']) left = DataFrame({'v1': range(12)}, index=leftindex) rightindex = MultiIndex.from_product([list('abc'), list('xy')], names=['abc', 'xy']) right = DataFrame({'v2': [100 * i for i in range(1, 7)]}, index=rightindex) result = left.join(right, on=['abc', 'xy'], how=join_type) expected = (left.reset_index() .merge(right.reset_index(), on=['abc', 'xy'], how=join_type) .set_index(['abc', 'xy', 'num']) ) assert_frame_equal(expected, result) msg = (r'len\(left_on\) must equal the number of levels in the index' ' of "right"') with pytest.raises(ValueError, match=msg): left.join(right, on='xy', how=join_type) with pytest.raises(ValueError, match=msg): right.join(left, on=['abc', 'xy'], how=join_type)
def test_get_indexer_with_missing_value(index_arr, labels, expected): # issue 19132 idx = MultiIndex.from_arrays(index_arr) result = idx.get_indexer(labels) tm.assert_numpy_array_equal(result, expected)
def decode(obj): """ Decoder for deserializing numpy data types. """ typ = obj.get(u'typ') if typ is None: return obj elif typ == u'timestamp': freq = obj[u'freq'] if 'freq' in obj else obj[u'offset'] return Timestamp(obj[u'value'], tz=obj[u'tz'], freq=freq) elif typ == u'nat': return NaT elif typ == u'period': return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq']) elif typ == u'index': dtype = dtype_for(obj[u'dtype']) data = unconvert(obj[u'data'], dtype, obj.get(u'compress')) return Index(data, dtype=dtype, name=obj[u'name']) elif typ == u'range_index': return RangeIndex(obj[u'start'], obj[u'stop'], obj[u'step'], name=obj[u'name']) elif typ == u'multi_index': dtype = dtype_for(obj[u'dtype']) data = unconvert(obj[u'data'], dtype, obj.get(u'compress')) data = [tuple(x) for x in data] return MultiIndex.from_tuples(data, names=obj[u'names']) elif typ == u'period_index': data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) d = dict(name=obj[u'name'], freq=obj[u'freq']) freq = d.pop('freq', None) return PeriodIndex(PeriodArray(data, freq), **d) elif typ == u'datetime_index': data = unconvert(obj[u'data'], np.int64, obj.get(u'compress')) d = dict(name=obj[u'name'], freq=obj[u'freq']) result = DatetimeIndex(data, **d) tz = obj[u'tz'] # reverse tz conversion if tz is not None: result = result.tz_localize('UTC').tz_convert(tz) return result elif typ in (u'interval_index', 'interval_array'): return globals()[obj[u'klass']].from_arrays(obj[u'left'], obj[u'right'], obj[u'closed'], name=obj[u'name']) elif typ == u'category': from_codes = globals()[obj[u'klass']].from_codes return from_codes(codes=obj[u'codes'], categories=obj[u'categories'], ordered=obj[u'ordered']) elif typ == u'interval': return Interval(obj[u'left'], obj[u'right'], obj[u'closed']) elif typ == u'series': dtype = dtype_for(obj[u'dtype']) pd_dtype = pandas_dtype(dtype) index = obj[u'index'] result = Series(unconvert(obj[u'data'], dtype, obj[u'compress']), index=index, dtype=pd_dtype, name=obj[u'name']) return result elif typ == u'block_manager': axes = obj[u'axes'] def create_block(b): values = _safe_reshape(unconvert( b[u'values'], dtype_for(b[u'dtype']), b[u'compress']), b[u'shape']) # locs handles duplicate column names, and should be used instead # of items; see GH 9618 if u'locs' in b: placement = b[u'locs'] else: placement = axes[0].get_indexer(b[u'items']) if is_datetime64tz_dtype(b[u'dtype']): assert isinstance(values, np.ndarray), type(values) assert values.dtype == 'M8[ns]', values.dtype values = DatetimeArray(values, dtype=b[u'dtype']) return make_block(values=values, klass=getattr(internals, b[u'klass']), placement=placement, dtype=b[u'dtype']) blocks = [create_block(b) for b in obj[u'blocks']] return globals()[obj[u'klass']](BlockManager(blocks, axes)) elif typ == u'datetime': return parse(obj[u'data']) elif typ == u'datetime64': return np.datetime64(parse(obj[u'data'])) elif typ == u'date': return parse(obj[u'data']).date() elif typ == u'timedelta': return timedelta(*obj[u'data']) elif typ == u'timedelta64': return np.timedelta64(int(obj[u'data'])) # elif typ == 'sparse_series': # dtype = dtype_for(obj['dtype']) # return SparseSeries( # unconvert(obj['sp_values'], dtype, obj['compress']), # sparse_index=obj['sp_index'], index=obj['index'], # fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name']) # elif typ == 'sparse_dataframe': # return SparseDataFrame( # obj['data'], columns=obj['columns'], # default_fill_value=obj['default_fill_value'], # default_kind=obj['default_kind'] # ) # elif typ == 'sparse_panel': # return SparsePanel( # obj['data'], items=obj['items'], # default_fill_value=obj['default_fill_value'], # default_kind=obj['default_kind']) elif typ == u'block_index': return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'], obj[u'blengths']) elif typ == u'int_index': return globals()[obj[u'klass']](obj[u'length'], obj[u'indices']) elif typ == u'ndarray': return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']], obj.get(u'compress')).reshape(obj[u'shape']) elif typ == u'np_scalar': if obj.get(u'sub_typ') == u'np_complex': return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype']) else: dtype = dtype_for(obj[u'dtype']) try: return dtype(obj[u'data']) except (ValueError, TypeError): return dtype.type(obj[u'data']) elif typ == u'np_complex': return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j') elif isinstance(obj, (dict, list, set)): return obj else: return obj
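# decode() is the read side of pandas' old msgpack support: the encoder
# stores a 'typ' tag plus enough metadata to rebuild each object, and the
# decoder dispatches on that tag. A hedged round-trip sketch, assuming a
# pandas version (< 0.25) that still ships to_msgpack/read_msgpack:
import pandas as pd

idx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)], names=['k1', 'k2'])
s = pd.Series([1.0, 2.0], index=idx)
packed = s.to_msgpack()                 # bytes, written via encode()
roundtripped = pd.read_msgpack(packed)  # rebuilt via decode()
assert roundtripped.equals(s)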
def test_join_inner_multiindex(self): key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] key2 = [ "two", "one", "three", "one", "two", "one", "two", "two", "three", "one", ] data = np.random.randn(len(key1)) data = DataFrame({"key1": key1, "key2": key2, "data": data}) index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) to_join = DataFrame( np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"] ) joined = data.join(to_join, on=["key1", "key2"], how="inner") expected = merge( data, to_join.reset_index(), left_on=["key1", "key2"], right_on=["first", "second"], how="inner", sort=False, ) expected2 = merge( to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False, ) tm.assert_frame_equal(joined, expected2.reindex_like(joined)) expected = expected.drop(["first", "second"], axis=1) expected.index = joined.index assert joined.index.is_monotonic tm.assert_frame_equal(joined, expected)
def time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables)
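# The timing method above presupposes an asv-style setup that populates
# self.iterables; a hedged sketch of what such a benchmark class might
# look like (class name and sizes are illustrative):
from pandas import MultiIndex


class FromProduct(object):

    def setup(self):
        self.iterables = [range(1000), list('abcdefghij')]

    def time_multiindex_from_iterables(self):
        MultiIndex.from_product(self.iterables)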
def test_constructor_dict_of_tuples(self): data = {(1, 2): 3, (None, 5): 6} result = Series(data).sort_values() expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) tm.assert_series_equal(result, expected)
def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx): # issue 19132 idx = MultiIndex.from_arrays(index_arr) result = idx.slice_locs(start=start_idx, end=end_idx) assert result == expected
def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo): # issue 19132 idx = MultiIndex.from_arrays(index_arr) result = idx.get_slice_bound(target, side=algo, kind="loc") assert result == expected
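# The three parametrized tests above all follow GH 19132: label lookup has
# to keep working when np.nan is itself a label in the MultiIndex. One
# concrete case of the kind they parametrize over:
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_arrays([[1, np.nan, 2], [3, 4, 5]])
result = idx.get_indexer([(np.nan, 4)])
assert list(result) == [1]  # the nan-bearing tuple resolves like any label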
expected = DataFrame( [[1, 1, 1], [1, 1, 1], [1, 1, 1]], columns=MultiIndex.from_tuples([("a", ), ("b", ), ("c", )], names=[("A", "a")]), index=pd.Index([1, 2, 3], name=("B", "b")), ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "unstack_idx, expected_values, expected_index, expected_columns", [ ( ("A", "a"), [[1, 1], [1, 1], [1, 1], [1, 1]], MultiIndex.from_tuples([(1, 3), (1, 4), (2, 3), (2, 4)], names=["B", "C"]), MultiIndex.from_tuples([("a", ), ("b", )], names=[("A", "a")]), ), ( (("A", "a"), "B"), [[1, 1, 1, 1], [1, 1, 1, 1]], pd.Index([3, 4], name="C"), MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=[("A", "a"), "B"]), ), ], ) def test_unstack_mixed_type_name_in_multiindex(unstack_idx, expected_values, expected_index, expected_columns): # GH 19966
def create_fip(temporary_store=None, year=None): assert temporary_store is not None assert year is not None # fip: "fichier d'imposition des personnes" (individual tax return file) """ Creates a 'fipDat' table containing all these 'fip individuals' """ # Some individuals are declared as 'personne à charge' (pac) on 'tax forms' # but are not present in the erf or eec tables. # We add them to ensure consistency between concepts. year_specific_by_generic = year_specific_by_generic_data_frame_name(year) erfs_survey_collection = SurveyCollection.load( collection='erfs', config_files_directory=config_files_directory) survey = erfs_survey_collection.get_survey('erfs_{}'.format(year)) log.info(u"Démarrage de 03_fip") # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992') # when a child is invalid, he appears twice in anaisenf (example: F1990G1990 is a single invalid child born in 1990) erfFoyVar = ['declar', 'anaisenf'] foyer = survey.get_values(table=year_specific_by_generic["foyer"], variables=erfFoyVar) foyer.replace({'anaisenf': {'NA': np.nan}}, inplace=True) log.info(u"Etape 1 : on récupere les personnes à charge des foyers") log.info(u" 1.1 : Création des codes des enfants") foyer['anaisenf'] = foyer['anaisenf'].astype('string') nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5 log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max)) # Separating the string coding the pac of each "déclaration". # Creating a list containing the new variables. # Creating the multi_index for the columns multi_index_columns = [] assert int( nb_pac_max ) == nb_pac_max, "nb_pac_max = {} which is not an integer".format( nb_pac_max) nb_pac_max = int(nb_pac_max) for i in range(1, nb_pac_max + 1): pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')] multi_index_columns += pac_tuples_list columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable']) fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns=columns) for i in range(1, nb_pac_max + 1): # TODO: using values to deal with mismatching indexes fip[(i, 'declaration')] = foyer['declar'].values fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1:5 * i].values fip = fip.stack("pac_number") fip.reset_index(inplace=True) fip.drop(['level_0'], axis=1, inplace=True) log.info(u" 1.2 : elimination des foyers fiscaux sans pac") # Clearing missing values and changing data format fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy() fip = fip.sort(columns=['declaration', 'naia', 'type_pac']) fip.set_index(["declaration", "pac_number"], inplace=True) fip = fip.reset_index() fip.drop(['pac_number'], axis=1, inplace=True) assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \ "Certains types de PAC ne sont pas des cases connues" # control(fip, debug=True, verbose=True, verbose_columns=['naia']) log.info( u" 1.3 : on enlève les individus F pour lesquels il existe un individu G" ) type_FG = fip[fip.type_pac.isin( ['F', 'G'])].copy() # Filter so we only work on F & G type_FG['same_pair'] = type_FG.duplicated(subset=['declaration', 'naia'], take_last=True) type_FG['is_twin'] = type_FG.duplicated( subset=['declaration', 'naia', 'type_pac']) type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin'] # Note: we keep those whose declar/naia pairs differ, as well as twins, # then drop the others (both F and G) fip['to_keep'] = np.nan fip.update(type_FG) log.info(u" 1.4 : on enlève les H pour lesquels il y a un I") type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy() type_HI['same_pair'] = type_HI.duplicated(subset=['declaration', 'naia'], take_last=True) type_HI['is_twin'] = type_HI.duplicated( subset=['declaration', 'naia', 'type_pac']) type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values fip.update(type_HI) fip['to_keep'] = fip['to_keep'].fillna(True) log.info( u"{} F, G, H or I non redundant pac kept over {} potential candidates". format(fip['to_keep'].sum(), len(fip))) indivifip = fip[fip['to_keep']].copy() del indivifip['to_keep'], fip, type_FG, type_HI # control(indivifip, debug=True) log.info(u"Step 2 : matching indivifip with eec file") indivi = temporary_store['indivim_{}'.format(year)] pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy() assert indivifip.naia.notnull().all( ), "Il y a des valeurs manquantes de la variable naia" # For safety enforce pac.naia and indivifip.naia dtypes pac['naia'] = pac.naia.astype('int32') indivifip['naia'] = indivifip.naia.astype('int32') pac['key1'] = zip(pac.naia, pac['declar1'].str[:29]) pac['key2'] = zip(pac.naia, pac['declar2'].str[:29]) indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values) assert pac.naia.dtype == indivifip.naia.dtype, \ "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype) fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy() fip = fip[~(fip.key.isin(pac.key2.values))].copy() log.info(u" 2.1 new fip created") # We build a dataframe to link the pac to their type and noindiv tmp_pac1 = pac[['noindiv', 'key1']].copy() tmp_pac2 = pac[['noindiv', 'key2']].copy() tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy() pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner') log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1))) pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner') log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2))) log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum())) log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum())) del pac_ind1['key1'], pac_ind2['key2'] if len(pac_ind1.index) == 0: if len(pac_ind2.index) == 0: log.info( u"Warning : no link between pac and noindiv for both pacInd1&2" ) else: log.info(u"Warning : pacInd1 is an empty data frame") pacInd = pac_ind2 elif len(pac_ind2.index) == 0: log.info(u"Warning : pacInd2 is an empty data frame") pacInd = pac_ind1 else: pacInd = concat([pac_ind2, pac_ind1]) assert len(pac_ind1) + len(pac_ind2) == len(pacInd) log.info("{} null pac_ind2.type_pac".format( pac_ind2.type_pac.isnull().sum())) log.info("pacInd.type_pac.value_counts() \n {}".format( pacInd.type_pac.value_counts(dropna=False))) log.info(u" 2.2 : pacInd created") log.info(u"doublons noindiv, type_pac {}".format( pacInd.duplicated(['noindiv', 'type_pac']).sum())) log.info(u"doublons noindiv seulement {}".format( pacInd.duplicated('noindiv').sum())) log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum())) del pacInd["key"] pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy() # pacIndiv.reset_index(inplace=True) log.info("{}".format(pacIndiv.columns)) temporary_store['pacIndiv_{}'.format(year)] = pacIndiv log.info("{}".format(pacIndiv.type_pac.value_counts())) gc.collect() # We keep the fip in the menage of their parents because it is used to # build the famille. We should build an individual ident (menage) for the fip that are # older than 18 since they are not in their parents' menage according to the eec log.info("{}".format(indivi['declar1'].str[0:2].value_counts())) log.info("{}".format(indivi['declar1'].str[0:2].describe())) log.info("{}".format(indivi['declar1'].str[0:2].notnull().all())) log.info("{}".format(indivi.info())) selection = indivi['declar1'].str[0:2] != "" indivi['noidec'] = indivi.declar1[selection].str[0:2].astype( 'int32') # To be used later to set idfoy individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")] individec1 = individec1[[ "declar1", "noidec", "ident", "rga", "ztsai", "ztsao" ]].copy() individec1 = individec1.rename(columns={'declar1': 'declaration'}) fip1 = fip.merge(individec1, on='declaration') log.info(u" 2.3 : fip1 created") individec2 = indivi.loc[ (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"), ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy() individec2.rename(columns={'declar2': 'declaration'}, inplace=True) fip2 = fip.merge(individec2) log.info(u" 2.4 : fip2 created") fip1.duplicated().value_counts() fip2.duplicated().value_counts() fip = concat([fip1, fip2]) fip['persfip'] = 'pac' fip['year'] = year fip['year'] = fip['year'].astype( 'float') # BUG: no year column in the DataFrame fip['noi'] = 99 fip['noicon'] = None fip['noindiv'] = fip['declaration'].copy() fip['noiper'] = None fip['noimer'] = None fip['declar1'] = fip['declaration'].copy() fip['naim'] = 99 fip['lien'] = None fip['quelfic'] = 'FIP' fip['acteu'] = None fip['agepf'] = fip['year'] - fip.naia.astype('float') fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4 fip['stc'] = None fip['contra'] = None fip['titc'] = None fip['mrec'] = None fip['forter'] = None fip['rstg'] = None fip['retrai'] = None fip['cohab'] = None fip['sexe'] = None fip['persfip'] = "pac" fip['agepr'] = None fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5 # TODO: problem with actrec for fip children aged 16 to 20: we do not know whether they are students or employed # TODO: problem with the birth months of FIP children: see whether those values can be recovered (Alexis: clearly not) # Reassigning noi for fip children if they are more than one per foyer fiscal fip["noi"] = fip["noi"].astype("int64") fip["ident"] = fip["ident"].astype("int64") fip_tmp = fip[['noi', 'ident']] while any(fip.duplicated(subset=['noi', 'ident'])): fip_tmp = fip.loc[:, ['noi', 'ident']] dup = fip_tmp.duplicated() tmp = fip.loc[dup, 'noi'] log.info("{}".format(len(tmp))) fip.loc[dup, 'noi'] = tmp.astype('int64') - 1 fip['idfoy'] = 100 * fip['ident'] + fip['noidec'] fip['noindiv'] = 100 * fip['ident'] + fip['noi'] fip['type_pac'] = 0 fip['key'] = 0 log.info("Number of duplicated fip: {}".format( fip.duplicated('noindiv').value_counts())) temporary_store['fipDat_{}'.format(year)] = fip del fip, fip1, individec1, indivifip, indivi, pac log.info(u"fip sauvegardé")
def test_get_loc_level(): index = MultiIndex( levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], codes=[ np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array([0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0]), ], ) loc, new_index = index.get_loc_level((0, 1)) expected = slice(1, 2) exp_index = index[expected].droplevel(0).droplevel(0) assert loc == expected assert new_index.equals(exp_index) loc, new_index = index.get_loc_level((0, 1, 0)) expected = 1 assert loc == expected assert new_index is None with pytest.raises(KeyError, match=r"^\(2, 2\)$"): index.get_loc_level((2, 2)) # GH 22221: unused label with pytest.raises(KeyError, match=r"^2$"): index.drop(2).get_loc_level(2) # Unused label on unsorted level: with pytest.raises(KeyError, match=r"^2$"): index.drop(1, level=2).get_loc_level(2, level=2) index = MultiIndex( levels=[[2000], list(range(4))], codes=[np.array([0, 0, 0, 0]), np.array([0, 1, 2, 3])], ) result, new_index = index.get_loc_level((2000, slice(None, None))) expected = slice(None, None) assert result == expected assert new_index.equals(index.droplevel(0))
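# For reference, the shape of get_loc_level's return value: a positional
# locator plus whatever index remains once the matched levels are dropped.
import pandas as pd

mi = pd.MultiIndex.from_product([[0, 1], ['x', 'y']])
loc, rest = mi.get_loc_level(0)
# loc == slice(0, 2); rest is Index(['x', 'y']) -- level 0 was consumed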
import numpy as np import pytest from pandas import ( Index, MultiIndex, Series, ) import pandas._testing as tm @pytest.mark.parametrize( "arr, idx", [ ([1, 2, 3, 4], [0, 2, 1, 3]), ([1, np.nan, 3, np.nan], [0, 2, 1, 3]), ( [1, np.nan, 3, np.nan], MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c"), (3, "c")]), ), ], ) def test_equals(arr, idx): s1 = Series(arr, index=idx) s2 = s1.copy() assert s1.equals(s2) s1[1] = 9 assert not s1.equals(s2) @pytest.mark.parametrize( "val", [1, 1.1, 1 + 1j, True, "abc", [1, 2], (1, 2), {1, 2}, {
class TestDataFrameSortIndex: def test_sort_index_and_reconstruction_doc_example(self): # doc example df = DataFrame( {"value": [1, 2, 3, 4]}, index=MultiIndex(levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]), ) assert df.index.is_lexsorted() assert not df.index.is_monotonic # sort it expected = DataFrame( {"value": [2, 1, 4, 3]}, index=MultiIndex(levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]), ) result = df.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) # reconstruct result = df.sort_index().copy() result.index = result.index._sort_levels_monotonic() assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) def test_sort_index_non_existent_label_multiindex(self): # GH#12261 df = DataFrame(0, columns=[], index=MultiIndex.from_product([[], []])) df.loc["b", "2"] = 1 df.loc["a", "3"] = 1 result = df.sort_index().index.is_monotonic assert result is True def test_sort_index_reorder_on_ops(self): # GH#15687 df = DataFrame( np.random.randn(8, 2), index=MultiIndex.from_product( [["a", "b"], ["big", "small"], ["red", "blu"]], names=["letter", "size", "color"], ), columns=["near", "far"], ) df = df.sort_index() def my_func(group): group.index = ["newz", "newa"] return group result = df.groupby( level=["letter", "size"]).apply(my_func).sort_index() expected = MultiIndex.from_product( [["a", "b"], ["big", "small"], ["newa", "newz"]], names=["letter", "size", None], ) tm.assert_index_equal(result.index, expected) def test_sort_index_nan_multiindex(self): # GH#14784 # incorrect sorting w.r.t. nans tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] mi = MultiIndex.from_tuples(tuples) df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD")) s = Series(np.arange(4), index=mi) df2 = DataFrame({ "date": pd.DatetimeIndex([ "20121002", "20121007", "20130130", "20130202", "20130305", "20121002", "20121207", "20130130", "20130202", "20130305", "20130202", "20130305", ]), "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], "whole_cost": [ 1790, np.nan, 280, 259, np.nan, 623, 90, 312, np.nan, 301, 359, 801, ], "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12], }).set_index(["date", "user_id"]) # sorting frame, default nan position is last result = df.sort_index() expected = df.iloc[[3, 0, 2, 1], :] tm.assert_frame_equal(result, expected) # sorting frame, nan position last result = df.sort_index(na_position="last") expected = df.iloc[[3, 0, 2, 1], :] tm.assert_frame_equal(result, expected) # sorting frame, nan position first result = df.sort_index(na_position="first") expected = df.iloc[[1, 2, 3, 0], :] tm.assert_frame_equal(result, expected) # sorting frame with removed rows result = df2.dropna().sort_index() expected = df2.sort_index().dropna() tm.assert_frame_equal(result, expected) # sorting series, default nan position is last result = s.sort_index() expected = s.iloc[[3, 0, 2, 1]] tm.assert_series_equal(result, expected) # sorting series, nan position last result = s.sort_index(na_position="last") expected = s.iloc[[3, 0, 2, 1]] tm.assert_series_equal(result, expected) # sorting series, nan position first result = s.sort_index(na_position="first") expected = s.iloc[[1, 2, 3, 0]] tm.assert_series_equal(result, expected) def test_sort_index_nan(self): # GH#3917 # Test DataFrame with nan label df = DataFrame( { "A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5] }, index=[1, 2, 3, 4, 5, 6, np.nan], ) 
# NaN label, ascending=True, na_position='last' sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last") expected = DataFrame( { "A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5] }, index=[1, 2, 3, 4, 5, 6, np.nan], ) tm.assert_frame_equal(sorted_df, expected) # NaN label, ascending=True, na_position='first' sorted_df = df.sort_index(na_position="first") expected = DataFrame( { "A": [4, 1, 2, np.nan, 1, 6, 8], "B": [5, 9, np.nan, 5, 2, 5, 4] }, index=[np.nan, 1, 2, 3, 4, 5, 6], ) tm.assert_frame_equal(sorted_df, expected) # NaN label, ascending=False, na_position='last' sorted_df = df.sort_index(kind="quicksort", ascending=False) expected = DataFrame( { "A": [8, 6, 1, np.nan, 2, 1, 4], "B": [4, 5, 2, 5, np.nan, 9, 5] }, index=[6, 5, 4, 3, 2, 1, np.nan], ) tm.assert_frame_equal(sorted_df, expected) # NaN label, ascending=False, na_position='first' sorted_df = df.sort_index(kind="quicksort", ascending=False, na_position="first") expected = DataFrame( { "A": [4, 8, 6, 1, np.nan, 2, 1], "B": [5, 4, 5, 2, 5, np.nan, 9] }, index=[np.nan, 6, 5, 4, 3, 2, 1], ) tm.assert_frame_equal(sorted_df, expected) def test_sort_index_multi_index(self): # GH#25775, testing that sorting by index works with a multi-index. df = DataFrame({ "a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc") }) result = df.set_index(list("abc")).sort_index(level=list("ba")) expected = DataFrame({ "a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca") }) expected = expected.set_index(list("abc")) tm.assert_frame_equal(result, expected) def test_sort_index_inplace(self): frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"]) # axis=0 unordered = frame.loc[[3, 2, 4, 1]] a_id = id(unordered["A"]) df = unordered.copy() return_value = df.sort_index(inplace=True) assert return_value is None expected = frame tm.assert_frame_equal(df, expected) assert a_id != id(df["A"]) df = unordered.copy() return_value = df.sort_index(ascending=False, inplace=True) assert return_value is None expected = frame[::-1] tm.assert_frame_equal(df, expected) # axis=1 unordered = frame.loc[:, ["D", "B", "C", "A"]] df = unordered.copy() return_value = df.sort_index(axis=1, inplace=True) assert return_value is None expected = frame tm.assert_frame_equal(df, expected) df = unordered.copy() return_value = df.sort_index(axis=1, ascending=False, inplace=True) assert return_value is None expected = frame.iloc[:, ::-1] tm.assert_frame_equal(df, expected) def test_sort_index_different_sortorder(self): A = np.arange(20).repeat(5) B = np.tile(np.arange(5), 20) indexer = np.random.permutation(100) A = A.take(indexer) B = B.take(indexer) df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) expected = df.take(ex_indexer) # test with multiindex, too idf = df.set_index(["A", "B"]) result = idf.sort_index(ascending=[1, 0]) expected = idf.take(ex_indexer) tm.assert_frame_equal(result, expected) # also, Series! 
result = idf["C"].sort_index(ascending=[1, 0]) tm.assert_series_equal(result, expected["C"]) def test_sort_index_level(self): mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) df = DataFrame([[1, 2], [3, 4]], mi) result = df.sort_index(level="A", sort_remaining=False) expected = df tm.assert_frame_equal(result, expected) result = df.sort_index(level=["A", "B"], sort_remaining=False) expected = df tm.assert_frame_equal(result, expected) # Error thrown by sort_index when # first index is sorted last (GH#26053) result = df.sort_index(level=["C", "B", "A"]) expected = df.iloc[[1, 0]] tm.assert_frame_equal(result, expected) result = df.sort_index(level=["B", "C", "A"]) expected = df.iloc[[1, 0]] tm.assert_frame_equal(result, expected) result = df.sort_index(level=["C", "A"]) expected = df.iloc[[1, 0]] tm.assert_frame_equal(result, expected) def test_sort_index_categorical_index(self): df = DataFrame({ "A": np.arange(6, dtype="int64"), "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))), }).set_index("B") result = df.sort_index() expected = df.iloc[[4, 0, 1, 5, 2, 3]] tm.assert_frame_equal(result, expected) result = df.sort_index(ascending=False) expected = df.iloc[[2, 3, 0, 1, 5, 4]] tm.assert_frame_equal(result, expected) def test_sort_index(self): # GH#13496 frame = DataFrame( np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"], ) # axis=0 : sort rows by index labels unordered = frame.loc[[3, 2, 4, 1]] result = unordered.sort_index(axis=0) expected = frame tm.assert_frame_equal(result, expected) result = unordered.sort_index(ascending=False) expected = frame[::-1] tm.assert_frame_equal(result, expected) # axis=1 : sort columns by column names unordered = frame.iloc[:, [2, 1, 3, 0]] result = unordered.sort_index(axis=1) tm.assert_frame_equal(result, frame) result = unordered.sort_index(axis=1, ascending=False) expected = frame.iloc[:, ::-1] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("level", ["A", 0]) # GH#21052 def test_sort_index_multiindex(self, level): # GH#13496 # sort rows by specified level of multi-index mi = MultiIndex.from_tuples([[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC")) df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) expected_mi = MultiIndex.from_tuples([[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC")) expected = DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) result = df.sort_index(level=level) tm.assert_frame_equal(result, expected) # sort_remaining=False expected_mi = MultiIndex.from_tuples([[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC")) expected = DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) result = df.sort_index(level=level, sort_remaining=False) tm.assert_frame_equal(result, expected) def test_sort_index_intervalindex(self): # this is a de-facto sort via unstack # confirming that we sort in the order of the bins y = Series(np.random.randn(100)) x1 = Series(np.sign(np.random.randn(100))) x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3]) model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"]) result = model.groupby(["X1", "X2"], observed=True).mean().unstack() expected = IntervalIndex.from_tuples([(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right") result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize( "original_dict, sorted_dict, ascending, ignore_index, output_index", [ ({ "A": [1, 2, 3] }, { "A": [2, 3, 
1] }, False, True, [0, 1, 2]), ({ "A": [1, 2, 3] }, { "A": [1, 3, 2] }, True, True, [0, 1, 2]), ({ "A": [1, 2, 3] }, { "A": [2, 3, 1] }, False, False, [5, 3, 2]), ({ "A": [1, 2, 3] }, { "A": [1, 3, 2] }, True, False, [2, 3, 5]), ], ) def test_sort_index_ignore_index(self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index): # GH 30114 original_index = [2, 5, 3] df = DataFrame(original_dict, index=original_index) expected_df = DataFrame(sorted_dict, index=output_index) kwargs = { "ascending": ascending, "ignore_index": ignore_index, "inplace": inplace, } if inplace: result_df = df.copy() result_df.sort_index(**kwargs) else: result_df = df.sort_index(**kwargs) tm.assert_frame_equal(result_df, expected_df) tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index)) @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize( "original_dict, sorted_dict, ascending, ignore_index, output_index", [ ( { "M1": [1, 2], "M2": [3, 4] }, { "M1": [1, 2], "M2": [3, 4] }, True, True, [0, 1], ), ( { "M1": [1, 2], "M2": [3, 4] }, { "M1": [2, 1], "M2": [4, 3] }, False, True, [0, 1], ), ( { "M1": [1, 2], "M2": [3, 4] }, { "M1": [1, 2], "M2": [3, 4] }, True, False, MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")), ), ( { "M1": [1, 2], "M2": [3, 4] }, { "M1": [2, 1], "M2": [4, 3] }, False, False, MultiIndex.from_tuples([[3, 4], [2, 1]], names=list("AB")), ), ], ) def test_sort_index_ignore_index_multi_index(self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index): # GH 30114, this is to test ignore_index on a MultiIndex mi = MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")) df = DataFrame(original_dict, index=mi) expected_df = DataFrame(sorted_dict, index=output_index) kwargs = { "ascending": ascending, "ignore_index": ignore_index, "inplace": inplace, } if inplace: result_df = df.copy() result_df.sort_index(**kwargs) else: result_df = df.sort_index(**kwargs) tm.assert_frame_equal(result_df, expected_df) tm.assert_frame_equal(df, DataFrame(original_dict, index=mi)) def test_sort_index_categorical_multiindex(self): # GH#15058 df = DataFrame({ "a": range(6), "l1": pd.Categorical( ["a", "a", "b", "b", "c", "c"], categories=["c", "a", "b"], ordered=True, ), "l2": [0, 1, 0, 1, 0, 1], }) result = df.set_index(["l1", "l2"]).sort_index() expected = DataFrame( [4, 5, 0, 1, 2, 3], columns=["a"], index=MultiIndex( levels=[ pd.CategoricalIndex( ["c", "a", "b"], categories=["c", "a", "b"], ordered=True, name="l1", dtype="category", ), [0, 1], ], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], names=["l1", "l2"], ), ) tm.assert_frame_equal(result, expected) def test_sort_index_and_reconstruction(self): # GH#15622 # lexsortedness should be identical # across MultiIndex construction methods df = DataFrame([[1, 1], [2, 2]], index=list("ab")) expected = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], index=MultiIndex.from_tuples([(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]), ) assert expected.index.is_lexsorted() result = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], index=MultiIndex.from_product([[0.5, 0.8], list("ab")]), ) result = result.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) result = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], index=MultiIndex(levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]), ) result = result.sort_index() assert result.index.is_lexsorted() tm.assert_frame_equal(result, expected) concatted =
pd.concat([df, df], keys=[0.8, 0.5]) result = concatted.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) # GH#14015 df = DataFrame( [[1, 2], [6, 7]], columns=MultiIndex.from_tuples( [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")], names=["l1", "Date"], ), ) df.columns = df.columns.set_levels(pd.to_datetime( df.columns.levels[1]), level=1) assert not df.columns.is_lexsorted() assert not df.columns.is_monotonic result = df.sort_index(axis=1) assert result.columns.is_lexsorted() assert result.columns.is_monotonic result = df.sort_index(axis=1, level=1) assert result.columns.is_lexsorted() assert result.columns.is_monotonic # TODO: better name, de-duplicate with test_sort_index_level above def test_sort_index_level2(self): mi = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) frame = DataFrame( np.random.randn(10, 3), index=mi, columns=Index(["A", "B", "C"], name="exp"), ) df = frame.copy() df.index = np.arange(len(df)) # axis=1 # series a_sorted = frame["A"].sort_index(level=0) # preserve names assert a_sorted.index.names == frame.index.names # inplace rs = frame.copy() return_value = rs.sort_index(level=0, inplace=True) assert return_value is None tm.assert_frame_equal(rs, frame.sort_index(level=0)) def test_sort_index_level_large_cardinality(self): # GH#2684 (int64) index = MultiIndex.from_arrays([np.arange(4000)] * 3) df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) # it works! result = df.sort_index(level=0) assert result.index.lexsort_depth == 3 # GH#2684 (int32) index = MultiIndex.from_arrays([np.arange(4000)] * 3) df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) # it works! 
result = df.sort_index(level=0) assert (result.dtypes.values == df.dtypes.values).all() assert result.index.lexsort_depth == 3 def test_sort_index_level_by_name(self): mi = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) frame = DataFrame( np.random.randn(10, 3), index=mi, columns=Index(["A", "B", "C"], name="exp"), ) frame.index.names = ["first", "second"] result = frame.sort_index(level="second") expected = frame.sort_index(level=1) tm.assert_frame_equal(result, expected) def test_sort_index_level_mixed(self): mi = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=["first", "second"], ) frame = DataFrame( np.random.randn(10, 3), index=mi, columns=Index(["A", "B", "C"], name="exp"), ) sorted_before = frame.sort_index(level=1) df = frame.copy() df["foo"] = "bar" sorted_after = df.sort_index(level=1) tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1)) dft = frame.T sorted_before = dft.sort_index(level=1, axis=1) dft["foo", "three"] = "bar" sorted_after = dft.sort_index(level=1, axis=1) tm.assert_frame_equal( sorted_before.drop([("foo", "three")], axis=1), sorted_after.drop([("foo", "three")], axis=1), ) def test_sort_index_preserve_levels(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data result = frame.sort_index() assert result.index.names == frame.index.names @pytest.mark.parametrize( "gen,extra", [ ([1.0, 3.0, 2.0, 5.0], 4.0), ([1, 3, 2, 5], 4), ( [ Timestamp("20130101"), Timestamp("20130103"), Timestamp("20130102"), Timestamp("20130105"), ], Timestamp("20130104"), ), (["1one", "3one", "2one", "5one"], "4one"), ], ) def test_sort_index_multilevel_repr_8017(self, gen, extra): np.random.seed(0) data = np.random.randn(3, 4) columns = MultiIndex.from_tuples([("red", i) for i in gen]) df = DataFrame(data, index=list("def"), columns=columns) df2 = pd.concat( [ df, DataFrame( "world", index=list("def"), columns=MultiIndex.from_tuples([("red", extra)]), ), ], axis=1, ) # check that the repr is good # make sure that we have a correct sparsified repr # e.g. only 1 header of 'red' assert str(df2).splitlines()[0].split() == ["red"] # GH 8017 # sorting fails after columns added # construct single-dtype then sort result = df.copy().sort_index(axis=1) expected = df.iloc[:, [0, 2, 1, 3]] tm.assert_frame_equal(result, expected) result = df2.sort_index(axis=1) expected = df2.iloc[:, [0, 2, 1, 4, 3]] tm.assert_frame_equal(result, expected) # setitem then sort result = df.copy() result[("red", extra)] = "world" result = result.sort_index(axis=1) tm.assert_frame_equal(result, expected)
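# Illustrative sketch, not part of the original suite: the na_position
# handling asserted in test_sort_index_nan above, reduced to a minimal
# standalone example. Assumes only numpy and pandas; values are invented.
import numpy as np
import pandas as pd

s = pd.Series([10, 20, 30], index=[2.0, np.nan, 1.0])
print(s.sort_index())                     # NaN label sorts last by default
print(s.sort_index(na_position="first"))  # NaN label sorts first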
def test_binary_ops_align(self): # test aligning binary ops # GH 6681 index = MultiIndex.from_product( [list('abc'), ['one', 'two', 'three'], [1, 2, 3]], names=['first', 'second', 'third']) df = DataFrame(np.arange(27 * 3).reshape(27, 3), index=index, columns=['value1', 'value2', 'value3']).sort_index() idx = pd.IndexSlice for op in ['add', 'sub', 'mul', 'div', 'truediv']: opa = getattr(operator, op, None) if opa is None: continue x = Series([1.0, 10.0, 100.0], [1, 2, 3]) result = getattr(df, op)(x, level='third', axis=0) expected = pd.concat([ opa(df.loc[idx[:, :, i], :], v) for i, v in x.iteritems() ]).sort_index() assert_frame_equal(result, expected) x = Series([1.0, 10.0], ['two', 'three']) result = getattr(df, op)(x, level='second', axis=0) expected = (pd.concat([ opa(df.loc[idx[:, i], :], v) for i, v in x.iteritems() ]).reindex_like(df).sort_index()) assert_frame_equal(result, expected) # GH9463 (alignment level of dataframe with series) midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']]) df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx) s = pd.Series({'a': 1, 'b': 2}) df2 = df.copy() df2.columns.names = ['lvl0', 'lvl1'] s2 = s.copy() s2.index.name = 'lvl1' # different cases of integer/string level names: res1 = df.mul(s, axis=1, level=1) res2 = df.mul(s2, axis=1, level=1) res3 = df2.mul(s, axis=1, level=1) res4 = df2.mul(s2, axis=1, level=1) res5 = df2.mul(s, axis=1, level='lvl1') res6 = df2.mul(s2, axis=1, level='lvl1') exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'), columns=midx) for res in [res1, res2]: assert_frame_equal(res, exp) exp.columns.names = ['lvl0', 'lvl1'] for res in [res3, res4, res5, res6]: assert_frame_equal(res, exp)
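# Sketch of the level-aligned broadcasting exercised by the GH9463 cases
# above, assuming plain pandas; the labels are illustrative only.
import numpy as np
import pandas as pd

midx = pd.MultiIndex.from_product([["A", "B"], ["a", "b"]])
df = pd.DataFrame(np.ones((2, 4), dtype="int64"), columns=midx)
s = pd.Series({"a": 1, "b": 2})
# Multiply, matching s against the inner (level=1) column labels.
print(df.mul(s, axis=1, level=1))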
def test_tz_convert_and_localize(self, fn): l0 = date_range('20140701', periods=5, freq='D') l1 = date_range('20140701', periods=5, freq='D') int_idx = Index(range(5)) if fn == 'tz_convert': l0 = l0.tz_localize('UTC') l1 = l1.tz_localize('UTC') for idx in [l0, l1]: l0_expected = getattr(idx, fn)('US/Pacific') l1_expected = getattr(idx, fn)('US/Pacific') df1 = DataFrame(np.ones(5), index=l0) df1 = getattr(df1, fn)('US/Pacific') assert_index_equal(df1.index, l0_expected) # MultiIndex # GH7846 df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1])) df3 = getattr(df2, fn)('US/Pacific', level=0) assert not df3.index.levels[0].equals(l0) assert_index_equal(df3.index.levels[0], l0_expected) assert_index_equal(df3.index.levels[1], l1) assert not df3.index.levels[1].equals(l1_expected) df3 = getattr(df2, fn)('US/Pacific', level=1) assert_index_equal(df3.index.levels[0], l0) assert not df3.index.levels[0].equals(l0_expected) assert_index_equal(df3.index.levels[1], l1_expected) assert not df3.index.levels[1].equals(l1) df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) # TODO: untested df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa assert_index_equal(df3.index.levels[0], l0) assert not df3.index.levels[0].equals(l0_expected) assert_index_equal(df3.index.levels[1], l1_expected) assert not df3.index.levels[1].equals(l1) # Bad Inputs # Not DatetimeIndex / PeriodIndex with pytest.raises(TypeError, match='DatetimeIndex'): df = DataFrame(index=int_idx) df = getattr(df, fn)('US/Pacific') # Not DatetimeIndex / PeriodIndex with pytest.raises(TypeError, match='DatetimeIndex'): df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) df = getattr(df, fn)('US/Pacific', level=0) # Invalid level with pytest.raises(ValueError, match='not valid'): df = DataFrame(index=l0) df = getattr(df, fn)('US/Pacific', level=1)
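# Minimal sketch of the per-level tz handling tested above (GH7846),
# assuming plain pandas; dates and labels are invented for the example.
import numpy as np
import pandas as pd

dates = pd.date_range("20140701", periods=3, freq="D")
mi = pd.MultiIndex.from_arrays([dates, list(range(3))])
df = pd.DataFrame(np.ones(3), index=mi)
# Localize only level 0; the integer level 1 is left untouched.
localized = df.tz_localize("US/Pacific", level=0)
print(localized.index.levels[0].tz)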
def test_stack_partial_multiIndex(self): # GH 8844 def _test_stack_with_multiindex(multiindex): df = DataFrame( np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), columns=multiindex, ) for level in (-1, 0, 1, [0, 1], [1, 0]): result = df.stack(level=level, dropna=False) if isinstance(level, int): # Stacking a single level should not make any all-NaN rows, # so df.stack(level=level, dropna=False) should be the same # as df.stack(level=level, dropna=True). expected = df.stack(level=level, dropna=True) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: tm.assert_frame_equal(result, expected) df.columns = MultiIndex.from_tuples(df.columns.to_numpy(), names=df.columns.names) expected = df.stack(level=level, dropna=False) if isinstance(expected, Series): tm.assert_series_equal(result, expected) else: tm.assert_frame_equal(result, expected) full_multiindex = MultiIndex.from_tuples( [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], names=["Upper", "Lower"], ) for multiindex_columns in ( [0, 1, 2, 3, 4], [0, 1, 2, 3], [0, 1, 2, 4], [0, 1, 2], [1, 2, 3], [2, 3, 4], [0, 1], [0, 2], [0, 3], [0], [2], [4], ): _test_stack_with_multiindex(full_multiindex[multiindex_columns]) if len(multiindex_columns) > 1: multiindex_columns.reverse() _test_stack_with_multiindex( full_multiindex[multiindex_columns]) df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]]) result = df.stack(dropna=False) expected = DataFrame( [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], index=MultiIndex( levels=[[0, 1], ["u", "x", "y", "z"]], codes=[[0, 0, 1, 1], [1, 3, 1, 3]], names=[None, "Lower"], ), columns=Index(["B", "C"], name="Upper"), dtype=df.dtypes[0], ) tm.assert_frame_equal(result, expected)
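# Sketch of partial stacking, assuming a pandas version where stack()
# still accepts dropna; column names are illustrative only.
import numpy as np
import pandas as pd

cols = pd.MultiIndex.from_tuples([("B", "x"), ("B", "z"), ("A", "y")],
                                 names=["Upper", "Lower"])
df = pd.DataFrame(np.arange(6).reshape(2, 3), columns=cols)
# Stacking level 0 moves "Upper" into the row index; missing
# (Upper, Lower) combinations show up as NaN when dropna=False.
print(df.stack(level=0, dropna=False))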
def test_construction_list_tuples_nan(self, na_value, vtype): # GH#18505 : valid tuples containing NaN values = [(1, "two"), (3.0, na_value)] result = Index(vtype(values)) expected = MultiIndex.from_tuples(values) tm.assert_index_equal(result, expected)
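# One-line restatement of the promotion tested above (sketch, plain
# pandas): a sequence of tuples passed to Index becomes a MultiIndex.
import numpy as np
import pandas as pd

idx = pd.Index([(1, "two"), (3.0, np.nan)])
print(type(idx))  # a MultiIndex, not an object-dtype Index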
def frame_random_data_integer_multi_index(): levels = [[0, 1], [0, 1, 2]] codes = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] index = MultiIndex(levels=levels, codes=codes) return DataFrame(np.random.randn(6, 2), index=index)
def test_unstack_nan_index(self): # GH7466 def cast(val): val_str = "" if val != val else val return f"{val_str:1}" def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notna().values.nonzero() for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(map(cast, right)) assert left == right df = DataFrame({ "jim": ["a", "b", np.nan, "d"], "joe": ["w", "x", "y", "z"], "jolie": ["a.w", "b.x", " .y", "d.z"], }) left = df.set_index(["jim", "joe"]).unstack()["jolie"] right = df.set_index(["joe", "jim"]).unstack()["jolie"].T tm.assert_frame_equal(left, right) for idx in itertools.permutations(df.columns[:2]): mi = df.set_index(list(idx)) for lev in range(2): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == len(df) verify(udf["jolie"]) df = DataFrame({ "1st": ["d"] * 3 + [np.nan] * 5 + ["a"] * 2 + ["c"] * 3 + ["e"] * 2 + ["b"] * 5, "2nd": ["y"] * 2 + ["w"] * 3 + [np.nan] * 3 + ["z"] * 4 + [np.nan] * 3 + ["x"] * 3 + [np.nan] * 2, "3rd": [ 67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, 50, 62, 59, 76, 52, 14, 53, 60, 51, ], }) df["4th"], df["5th"] = ( df.apply(lambda r: ".".join(map(cast, r)), axis=1), df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1), ) for idx in itertools.permutations(["1st", "2nd", "3rd"]): mi = df.set_index(list(idx)) for lev in range(3): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == 2 * len(df) for col in ["4th", "5th"]: verify(udf[col]) # GH7403 df = pd.DataFrame({ "A": list("aaaabbbb"), "B": range(8), "C": range(8) }) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [ [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7], ] vals = list(map(list, zip(*vals))) idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B") cols = MultiIndex(levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = DataFrame({ "A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8) }) df.iloc[2, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] cols = MultiIndex(levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = pd.DataFrame({ "A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8) }) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] cols = MultiIndex(levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH7401 df = pd.DataFrame({ "A": list("aaaaabbbbb"), "B": (date_range("2012-01-01", periods=5).tolist() * 2), "C": np.arange(10), }) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack() vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) idx = Index(["a", "b"], name="A") cols = MultiIndex( levels=[["C"], date_range("2012-01-01", periods=5)], codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], names=[None, "B"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH4862 vals = [ ["Hg", np.nan, np.nan, 680585148], ["U", 0.0, np.nan, 680585148], ["Pb", 7.07e-06, np.nan, 680585148], ["Sn", 
2.3614e-05, 0.0133, 680607017], ["Ag", 0.0, 0.0133, 680607017], ["Hg", -0.00015, 0.0133, 680607017], ] df = DataFrame( vals, columns=["agent", "change", "dosage", "s_id"], index=[17263, 17264, 17265, 17266, 17267, 17268], ) left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack() vals = [ [np.nan, np.nan, 7.07e-06, np.nan, 0.0], [0.0, -0.00015, np.nan, 2.3614e-05, np.nan], ] idx = MultiIndex( levels=[[680585148, 680607017], [0.0133]], codes=[[0, 1], [-1, 0]], names=["s_id", "dosage"], ) cols = MultiIndex( levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]], codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], names=[None, "agent"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"]) tm.assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls df = DataFrame({ "1st": [1, 2, 1, 2, 1, 2], "2nd": pd.date_range("2014-02-01", periods=6, freq="D"), "jim": 100 + np.arange(6), "joe": (np.random.randn(6) * 10).round(2), }) df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02") df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"]) assert left.notna().values.sum() == 2 * len(df) for col in ["jim", "joe"]: for _, r in df.iterrows(): key = r["1st"], (col, r["2nd"], r["3rd"]) assert r[col] == left.loc[key]
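# Compact sketch of the NaN-label behavior checked above (GH7466 family),
# assuming plain pandas: a NaN in an index level becomes its own row
# label after unstacking the other level. Data mirrors the GH7403 case.
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2,
                   "C": range(8)})
df.iloc[2, 1] = np.nan
print(df.set_index(["A", "B"]).unstack(0))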
def test_loader_given_multiple_columns(self): class Loader1DataSet1(DataSet): col1 = Column(float) col2 = Column(float32) class Loader1DataSet2(DataSet): col1 = Column(float32) col2 = Column(float32) class Loader2DataSet(DataSet): col1 = Column(float32) col2 = Column(float32) constants1 = {Loader1DataSet1.col1: 1, Loader1DataSet1.col2: 2, Loader1DataSet2.col1: 3, Loader1DataSet2.col2: 4} loader1 = RecordingPrecomputedLoader(constants=constants1, dates=self.dates, sids=self.assets) constants2 = {Loader2DataSet.col1: 5, Loader2DataSet.col2: 6} loader2 = RecordingPrecomputedLoader(constants=constants2, dates=self.dates, sids=self.assets) engine = SimplePipelineEngine( lambda column: loader2 if column.dataset == Loader2DataSet else loader1, self.dates, self.asset_finder, ) pipe_col1 = RollingSumSum(inputs=[Loader1DataSet1.col1, Loader1DataSet2.col1, Loader2DataSet.col1], window_length=2) pipe_col2 = RollingSumSum(inputs=[Loader1DataSet1.col2, Loader1DataSet2.col2, Loader2DataSet.col2], window_length=3) pipe_col3 = RollingSumSum(inputs=[Loader2DataSet.col1], window_length=3) columns = OrderedDict([ ('pipe_col1', pipe_col1), ('pipe_col2', pipe_col2), ('pipe_col3', pipe_col3), ]) result = engine.run_pipeline( Pipeline(columns=columns), self.dates[2], # index is >= the largest window length - 1 self.dates[-1] ) min_window = min(pip_col.window_length for pip_col in itervalues(columns)) col_to_val = ChainMap(constants1, constants2) vals = {name: (sum(col_to_val[col] for col in pipe_col.inputs) * pipe_col.window_length) for name, pipe_col in iteritems(columns)} index = MultiIndex.from_product([self.dates[2:], self.assets]) def expected_for_col(col): val = vals[col] offset = columns[col].window_length - min_window return concatenate( [ full(offset * index.levshape[1], nan), full( (index.levshape[0] - offset) * index.levshape[1], val, float, ) ], ) expected = DataFrame( data={col: expected_for_col(col) for col in vals}, index=index, columns=columns, ) assert_frame_equal(result, expected) self.assertEqual(set(loader1.load_calls), {ColumnArgs.sorted_by_ds(Loader1DataSet1.col1, Loader1DataSet2.col1), ColumnArgs.sorted_by_ds(Loader1DataSet1.col2, Loader1DataSet2.col2)}) self.assertEqual(set(loader2.load_calls), {ColumnArgs.sorted_by_ds(Loader2DataSet.col1, Loader2DataSet.col2)})
def test_missing_key_raises_keyerror2(self): # GH#21168 KeyError, not "IndexingError: Too many indexers" ser = Series(-1, index=MultiIndex.from_product([[0, 1]] * 2)) with pytest.raises(KeyError, match=r"\(0, 3\)"): ser.loc[0, 3]
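# Direct reproduction of the contract above (sketch): a missing label
# pair raises KeyError rather than an indexing-depth error.
import pandas as pd

ser = pd.Series(-1, index=pd.MultiIndex.from_product([[0, 1]] * 2))
try:
    ser.loc[0, 3]
except KeyError as err:
    print("missing label pair reported as KeyError:", err)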
def test_per_axis_per_level_getitem(self): # GH6134 # example test case ix = MultiIndex.from_product( [_mklbl("A", 5), _mklbl("B", 7), _mklbl("C", 4), _mklbl("D", 2)]) df = DataFrame(np.arange(len(ix.to_numpy())), index=ix) result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] expected = df.loc[[ tuple([a, b, c, d]) for a, b, c, d in df.index.values if (a == "A1" or a == "A2" or a == "A3") and ( c == "C1" or c == "C3") ]] tm.assert_frame_equal(result, expected) expected = df.loc[[ tuple([a, b, c, d]) for a, b, c, d in df.index.values if (a == "A1" or a == "A2" or a == "A3") and ( c == "C1" or c == "C2" or c == "C3") ]] result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :] tm.assert_frame_equal(result, expected) # test multi-index slicing with per axis and per index controls index = MultiIndex.from_tuples([("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"]) columns = MultiIndex.from_tuples( [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"], ) df = DataFrame(np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns) df = df.sort_index(axis=0).sort_index(axis=1) # identity result = df.loc[(slice(None), slice(None)), :] tm.assert_frame_equal(result, df) result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))] tm.assert_frame_equal(result, df) result = df.loc[:, (slice(None), slice(None))] tm.assert_frame_equal(result, df) # index result = df.loc[(slice(None), [1]), :] expected = df.iloc[[0, 3]] tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), 1), :] expected = df.iloc[[0, 3]] tm.assert_frame_equal(result, expected) # columns result = df.loc[:, (slice(None), ["foo"])] expected = df.iloc[:, [1, 3]] tm.assert_frame_equal(result, expected) # both result = df.loc[(slice(None), 1), (slice(None), ["foo"])] expected = df.iloc[[0, 3], [1, 3]] tm.assert_frame_equal(result, expected) result = df.loc["A", "a"] expected = DataFrame( dict(bar=[1, 5, 9], foo=[0, 4, 8]), index=Index([1, 2, 3], name="two"), columns=Index(["bar", "foo"], name="lvl1"), ) tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), [1, 2]), :] expected = df.iloc[[0, 1, 3]] tm.assert_frame_equal(result, expected) # multi-level series s = Series(np.arange(len(ix.to_numpy())), index=ix) result = s.loc["A1":"A3", :, ["C1", "C3"]] expected = s.loc[[ tuple([a, b, c, d]) for a, b, c, d in s.index.values if (a == "A1" or a == "A2" or a == "A3") and ( c == "C1" or c == "C3") ]] tm.assert_series_equal(result, expected) # boolean indexers result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] expected = df.iloc[[2, 3]] tm.assert_frame_equal(result, expected) with pytest.raises(ValueError): df.loc[(slice(None), np.array([True, False])), :] # ambiguous notation # this is interpreted as slicing on both axes (GH #16396) result = df.loc[slice(None), [1]] expected = df.iloc[:, []] tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), [1]), :] expected = df.iloc[[0, 3]] tm.assert_frame_equal(result, expected) # not lexsorted assert df.index.lexsort_depth == 2 df = df.sort_index(level=1, axis=0) assert df.index.lexsort_depth == 0 msg = ("MultiIndex slicing requires the index to be " r"lexsorted: slicing on levels \[1\], lexsort depth 0") with pytest.raises(UnsortedIndexError, match=msg): df.loc[(slice(None), slice("bar")), :] # GH 16734: not sorted, but no real slicing result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] tm.assert_frame_equal(result, df.iloc[[1, 3], :])
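# The same per-level slicing spelled with pd.IndexSlice, which is usually
# more readable than tuples of slice(None) (sketch; labels are invented).
# Label-based slicing needs a lexsorted index, hence the sort_index().
import numpy as np
import pandas as pd

idx = pd.IndexSlice
mi = pd.MultiIndex.from_product([["A1", "A2", "A3"], ["C1", "C2"]])
df = pd.DataFrame(np.arange(12).reshape(6, 2), index=mi).sort_index()
print(df.loc[idx["A1":"A2", ["C1"]], :])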
def single_level_multiindex(): """single level MultiIndex""" return MultiIndex(levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"])
"a\n04.15.2016", dict(parse_dates=True, index_col=0), DataFrame(index=DatetimeIndex(["2016-04-15"], name="a")), ), ( "a,b\n04.15.2016,09.16.2013", dict(parse_dates=["a", "b"]), DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"]), ), ( "a,b\n04.15.2016,09.16.2013", dict(parse_dates=True, index_col=[0, 1]), DataFrame(index=MultiIndex.from_tuples([(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"])), ), ], ) def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): # see gh-14066 parser = all_parsers result = parser.read_csv(StringIO(data), thousands=".", **kwargs) tm.assert_frame_equal(result, expected) def test_parse_date_time_multi_level_column_name(all_parsers): data = """\ D,T,A,B
def test_per_axis_per_level_setitem(self): # test index maker idx = pd.IndexSlice # test multi-index slicing with per axis and per index controls index = MultiIndex.from_tuples([("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"]) columns = MultiIndex.from_tuples( [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"], ) df_orig = DataFrame(np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns) df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) # identity df = df_orig.copy() df.loc[(slice(None), slice(None)), :] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc(axis=0)[:, :] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[:, (slice(None), slice(None))] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 tm.assert_frame_equal(df, expected) # index df = df_orig.copy() df.loc[(slice(None), [1]), :] = 100 expected = df_orig.copy() expected.iloc[[0, 3]] = 100 tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[(slice(None), 1), :] = 100 expected = df_orig.copy() expected.iloc[[0, 3]] = 100 tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc(axis=0)[:, 1] = 100 expected = df_orig.copy() expected.iloc[[0, 3]] = 100 tm.assert_frame_equal(df, expected) # columns df = df_orig.copy() df.loc[:, (slice(None), ["foo"])] = 100 expected = df_orig.copy() expected.iloc[:, [1, 3]] = 100 tm.assert_frame_equal(df, expected) # both df = df_orig.copy() df.loc[(slice(None), 1), (slice(None), ["foo"])] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[idx[:, 1], idx[:, ["foo"]]] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc["A", "a"] = 100 expected = df_orig.copy() expected.iloc[0:3, 0:2] = 100 tm.assert_frame_equal(df, expected) # setting with a list-like df = df_orig.copy() df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array([[100, 100], [100, 100]], dtype="int64") expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 tm.assert_frame_equal(df, expected) # not enough values df = df_orig.copy() with pytest.raises(ValueError): df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array([[100], [100, 100]], dtype="int64") with pytest.raises(ValueError): df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array([100, 100, 100, 100], dtype="int64") # with an alignable rhs df = df_orig.copy() df.loc[(slice(None), 1), (slice(None), ["foo"])] = (df.loc[(slice(None), 1), (slice(None), ["foo"])] * 5) expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[(slice(None), 1), (slice(None), ["foo"])] *= df.loc[(slice(None), 1), (slice(None), ["foo"])] expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] tm.assert_frame_equal(df, expected) rhs = df_orig.loc[(slice(None), 1), (slice(None), ["foo"])].copy() rhs.loc[:, ("c", "bah")] = 10 df = df_orig.copy() df.loc[(slice(None), 1), (slice(None), ["foo"])] *= rhs expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] 
tm.assert_frame_equal(df, expected)
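# Sketch of the assignment patterns above in isolation, assuming plain
# pandas: select rows by a value in the second index level and write
# through .loc. Index labels and column names are invented.
import numpy as np
import pandas as pd

idx = pd.IndexSlice
mi = pd.MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 1)],
                               names=["one", "two"])
df = pd.DataFrame(np.zeros((3, 2)), index=mi, columns=["x", "y"])
df.loc[idx[:, 1], "x"] = 100  # hits ("A", 1) and ("B", 1)
print(df)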
) result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes expected = Series( [np.dtype("datetime64[ns]"), object, object, np.int64, object], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "index", [ pd.CategoricalIndex(list("abc")), pd.interval_range(0, 3), pd.period_range("2020", periods=3, freq="D"), MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]), ], ) def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) result = df.groupby("group").apply(lambda x: x) tm.assert_frame_equal(result, df) @pytest.mark.parametrize( "function, expected_values", [ (lambda x: x.index.to_list(), [[0, 1], [2, 3]]), (lambda x: set(x.index.to_list()), [{0, 1}, {2, 3}]), (lambda x: tuple(x.index.to_list()), [(0, 1), (2, 3)]),
def test_multiindex_slicers_non_unique(self): # GH 7106 # non-unique mi index support df = (DataFrame( dict( A=["foo", "foo", "foo", "foo"], B=["a", "a", "a", "a"], C=[1, 2, 1, 3], D=[1, 2, 3, 4], )).set_index(["A", "B", "C"]).sort_index()) assert not df.index.is_unique expected = (DataFrame( dict(A=["foo", "foo"], B=["a", "a"], C=[1, 1], D=[1, 3])).set_index(["A", "B", "C"]).sort_index()) result = df.loc[(slice(None), slice(None), 1), :] tm.assert_frame_equal(result, expected) # this is equivalent of an xs expression result = df.xs(1, level=2, drop_level=False) tm.assert_frame_equal(result, expected) df = (DataFrame( dict( A=["foo", "foo", "foo", "foo"], B=["a", "a", "a", "a"], C=[1, 2, 1, 2], D=[1, 2, 3, 4], )).set_index(["A", "B", "C"]).sort_index()) assert not df.index.is_unique expected = (DataFrame( dict(A=["foo", "foo"], B=["a", "a"], C=[1, 1], D=[1, 3])).set_index(["A", "B", "C"]).sort_index()) result = df.loc[(slice(None), slice(None), 1), :] assert not result.index.is_unique tm.assert_frame_equal(result, expected) # GH12896 # numpy-implementation dependent bug ints = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, 17, 18, 19, 200000, 200000, ] n = len(ints) idx = MultiIndex.from_arrays([["a"] * n, ints]) result = Series([1] * n, index=idx) result = result.sort_index() result = result.loc[(slice(None), slice(100000))] expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() tm.assert_series_equal(result, expected)
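# Sketch of the equivalence asserted above: on a non-unique MultiIndex,
# a .loc slice on one level matches a drop_level=False cross-section.
import pandas as pd

df = pd.DataFrame({"A": ["foo"] * 4, "B": ["a"] * 4,
                   "C": [1, 2, 1, 3], "D": [1, 2, 3, 4]})
df = df.set_index(["A", "B", "C"]).sort_index()
left = df.loc[(slice(None), slice(None), 1), :]
right = df.xs(1, level=2, drop_level=False)
print(left.equals(right))  # True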
def test_from_product_invalid_input(invalid_input): msg = (r"Input must be a list / sequence of iterables|" "Input must be list-like") with pytest.raises(TypeError, match=msg): MultiIndex.from_product(iterables=invalid_input)
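# Sketch of the failure mode above: from_product expects a sequence of
# iterables, so a scalar is rejected with a TypeError.
import pandas as pd

try:
    pd.MultiIndex.from_product(iterables=0)
except TypeError as err:
    print(err)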
def test_column_dups_operations(self): def check(result, expected=None): if expected is not None: assert_frame_equal(result, expected) result.dtypes str(result) # assignment # GH 3687 arr = np.random.randn(3, 2) idx = lrange(2) df = DataFrame(arr, columns=['A', 'A']) df.columns = idx expected = DataFrame(arr, columns=idx) check(df, expected) idx = date_range('20130101', periods=4, freq='Q-NOV') df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['a', 'a', 'a', 'a']) df.columns = idx expected = DataFrame( [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) check(df, expected) # insert df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=['foo', 'bar', 'foo', 'hello']) df['string'] = 'bah' expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'], [2, 1, 3, 5, 'bah']], columns=['foo', 'bar', 'foo', 'hello', 'string']) check(df, expected) with tm.assert_raises_regex(ValueError, 'Length of value'): df.insert(0, 'AnotherColumn', range(len(df.index) - 1)) # insert same dtype df['foo2'] = 3 expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3], [2, 1, 3, 5, 'bah', 3]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # set (non-dup) df['foo2'] = 4 expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4], [2, 1, 3, 5, 'bah', 4]], columns=['foo', 'bar', 'foo', 'hello', 'string', 'foo2']) check(df, expected) df['foo2'] = 3 # delete (non dup) del df['bar'] expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3], [2, 3, 5, 'bah', 3]], columns=['foo', 'foo', 'hello', 'string', 'foo2']) check(df, expected) # try to delete again (it's not consolidated) del df['hello'] expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # consolidate df = df._consolidate() expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], [2, 3, 'bah', 3]], columns=['foo', 'foo', 'string', 'foo2']) check(df, expected) # insert df.insert(2, 'new_col', 5.) expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3], [2, 3, 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'string', 'foo2']) check(df, expected) # insert a dup tm.assert_raises_regex(ValueError, 'cannot insert', df.insert, 2, 'new_col', 4.) df.insert(2, 'new_col', 4., allow_duplicates=True) expected = DataFrame([[1, 1, 4., 5., 'bah', 3], [1, 2, 4., 5., 'bah', 3], [2, 3, 4., 5., 'bah', 3]], columns=['foo', 'foo', 'new_col', 'new_col', 'string', 'foo2']) check(df, expected) # delete (dup) del df['foo'] expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3], [4., 5., 'bah', 3]], columns=['new_col', 'new_col', 'string', 'foo2']) assert_frame_equal(df, expected) # dup across dtypes df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]], columns=['foo', 'bar', 'foo', 'hello']) check(df) df['foo2'] = 7.
expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.], [2, 1, 3., 5, 7.]], columns=['foo', 'bar', 'foo', 'hello', 'foo2']) check(df, expected) result = df['foo'] expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]], columns=['foo', 'foo']) check(result, expected) # multiple replacements df['foo'] = 'string' expected = DataFrame([['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.], ['string', 1, 'string', 5, 7.]], columns=['foo', 'bar', 'foo', 'hello', 'foo2']) check(df, expected) del df['foo'] expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[ 'bar', 'hello', 'foo2']) check(df, expected) # values df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x']) result = df.values expected = np.array([[1, 2.5], [3, 4.5]]) assert (result == expected).all().all() # rename, GH 4403 df4 = DataFrame( {'RT': [0.0454], 'TClose': [22.02], 'TExg': [0.0422]}, index=MultiIndex.from_tuples([(600809, 20130331)], names=['STK_ID', 'RPT_Date'])) df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331], 'STK_ID': [600809] * 3, 'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')], 'TClose': [38.05, 41.66, 30.01]}, index=MultiIndex.from_tuples( [(600809, 20120930), (600809, 20121231), (600809, 20130331)], names=['STK_ID', 'RPT_Date'])) k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True) result = k.rename( columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'}) str(result) result.dtypes expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809, u('饡驦'), 30.01]], columns=['RT', 'TClose', 'TExg', 'RPT_Date', 'STK_ID', 'STK_Name', 'QT_Close']) .set_index(['STK_ID', 'RPT_Date'], drop=False)) assert_frame_equal(result, expected) # reindex is invalid! df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'a', 'a']) pytest.raises(ValueError, df.reindex, columns=['bar']) pytest.raises(ValueError, df.reindex, columns=['bar', 'foo']) # drop df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=['bar', 'a', 'a']) result = df.drop(['a'], axis=1) expected = DataFrame([[1], [1], [1]], columns=['bar']) check(result, expected) result = df.drop('a', axis=1) check(result, expected) # describe df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=['bar', 'a', 'a'], dtype='float64') result = df.describe() s = df.iloc[:, 0].describe() expected = pd.concat([s, s, s], keys=df.columns, axis=1) check(result, expected) # check column dups with index equal and not equal to df's index df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], columns=['A', 'B', 'A']) for index in [df.index, pd.Index(list('edcba'))]: this_df = df.copy() expected_ser = pd.Series(index.values, index=this_df.index) expected_df = DataFrame({'A': expected_ser, 'B': this_df['B'], 'A': expected_ser}, columns=['A', 'B', 'A']) this_df['A'] = index check(this_df, expected_df) # operations for op in ['__add__', '__mul__', '__sub__', '__truediv__']: df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10))) expected = getattr(df, op)(df) expected.columns = ['A', 'A'] df.columns = ['A', 'A'] result = getattr(df, op)(df) check(result, expected) # multiple assignments that change dtypes # the location indexer is a slice # GH 6120 df = DataFrame(np.random.randn(5, 2), columns=['that', 'that']) expected = DataFrame(1.0, index=range(5), columns=['that', 'that']) df['that'] = 1.0 check(df, expected) df = DataFrame(np.random.rand(5, 2), columns=['that', 'that']) expected = DataFrame(1, index=range(5), columns=['that', 'that']) df['that'] = 1 check(df, expected)
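# Closing sketch of the duplicate-column semantics exercised above,
# assuming plain pandas: selection returns every column carrying the
# label, and assignment writes through all of them.
import pandas as pd

df = pd.DataFrame([[1, 2, 3]], columns=["foo", "bar", "foo"])
print(df["foo"])       # a 2-column DataFrame, not a Series
df["foo"] = "string"   # replaces both "foo" columns
print(df)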