def setUp(self):
    """Create the frames shared by the tests.

    Sets ``data`` and ``dates``, then ``frame`` (built from ``data``),
    ``iframe`` (same data with ``default_kind='integer'``), ``zframe`` and
    ``fill_frame`` (the values of ``frame`` with NaN replaced by 0 and 2
    respectively, that number also passed as ``default_fill_value``), and
    ``empty`` (constructed with no arguments).
    """
    self.data = {
        'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
        'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
        'C': np.arange(10),
        'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
    }
    self.dates = DateRange('1/1/2011', periods=10)

    self.frame = SparseDataFrame(self.data, index=self.dates)
    self.iframe = SparseDataFrame(self.data, index=self.dates,
                                  default_kind='integer')

    def _nan_replaced(fill):
        # Work on a copy so the values of self.frame are left untouched.
        dense = self.frame.values.copy()
        dense[np.isnan(dense)] = fill
        return SparseDataFrame(dense, columns=['A', 'B', 'C', 'D'],
                               default_fill_value=fill,
                               index=self.dates)

    self.zframe = _nan_replaced(0)
    self.fill_frame = _nan_replaced(2)

    self.empty = SparseDataFrame()
def test_as_matrix(self):
    """Check the shape returned by ``as_matrix`` for frames without data.

    Uses ``assertEqual`` rather than ``assert_(a == b)``: ``assert_`` is a
    deprecated unittest alias, and ``assertEqual`` reports both shapes when
    the check fails.
    """
    # the ``empty`` fixture: expected to have neither rows nor columns
    empty = self.empty.as_matrix()
    self.assertEqual(empty.shape, (0, 0))

    # index given, no columns
    no_cols = SparseDataFrame(index=np.arange(10))
    mat = no_cols.as_matrix()
    self.assertEqual(mat.shape, (10, 0))

    # columns given, no index
    no_index = SparseDataFrame(columns=np.arange(10))
    mat = no_index.as_matrix()
    self.assertEqual(mat.shape, (0, 10))
def get_empty_Frame(data, sparse):
    """Return a frame with no columns whose index matches ``data``.

    Parameters
    ----------
    data : Series or sized object
        A Series contributes its own index; anything else gets
        ``np.arange(len(data))``.
    sparse : bool
        If true a SparseDataFrame is built, otherwise a DataFrame.
    """
    index = data.index if isinstance(data, Series) else np.arange(len(data))
    frame_type = SparseDataFrame if sparse else DataFrame
    return frame_type(index=index)
def setUp(self):
    """Create the frames shared by the tests.

    ``frame`` and ``iframe`` are built from ``data`` (the latter with
    ``default_kind="integer"``); ``zframe`` and ``fill_frame`` reuse the
    values of ``frame`` with NaN replaced by 0 and 2, passing the same
    number as ``default_fill_value``; ``empty`` takes no arguments.
    """
    column_a = [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6]
    column_b = [0, 1, 2, nan, nan, nan, 3, 4, 5, 6]
    column_c = np.arange(10)
    column_d = [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]
    self.data = {"A": column_a, "B": column_b, "C": column_c, "D": column_d}

    self.dates = DateRange("1/1/2011", periods=10)
    self.frame = SparseDataFrame(self.data, index=self.dates)
    self.iframe = SparseDataFrame(self.data, index=self.dates, default_kind="integer")

    # Same construction twice, differing only in the replacement value.
    for attr, fill in (("zframe", 0), ("fill_frame", 2)):
        values = self.frame.values.copy()
        values[np.isnan(values)] = fill
        filled = SparseDataFrame(
            values,
            columns=["A", "B", "C", "D"],
            default_fill_value=fill,
            index=self.dates,
        )
        setattr(self, attr, filled)

    self.empty = SparseDataFrame()
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False, drop_first=False):
    """Build 0/1 indicator columns (uint8) for the levels found in ``data``.

    Parameters
    ----------
    data : Series or sized iterable
        Values to encode; a Series also supplies the result index.
    prefix : object or None
        When not None, each column is named ``'%s%s%s' % (prefix,
        prefix_sep, level)``; otherwise the levels themselves are used.
    prefix_sep : str
        Separator placed between ``prefix`` and the level.
    dummy_na : bool
        Append a NaN level and route missing codes (-1) to it.
    sparse : bool
        Return a SparseDataFrame instead of a DataFrame.
    drop_first : bool
        Drop the column of the first level.

    Returns
    -------
    DataFrame or SparseDataFrame
        One column per retained level; a frame without columns when there
        is nothing to encode.
    """
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    def _no_columns(data, sparse):
        labels = data.index if isinstance(data, Series) else np.arange(len(data))
        if sparse:
            return SparseDataFrame(index=labels)
        return DataFrame(index=labels)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return _no_columns(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return _no_columns(data, sparse)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]

    index = data.index if isinstance(data, Series) else None

    if not sparse:
        # Row i of the identity matrix is the indicator vector of level i.
        dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)

    N = len(data)
    # One list of row positions per indicator column.
    sp_indices = [[] for _ in range(len(dummy_cols))]
    for ndx, code in enumerate(codes):
        # Blank entries if not dummy_na and code == -1, #GH4446
        if code != -1:
            sp_indices[code].append(ndx)

    if drop_first:
        # remove first categorical level to avoid perfect collinearity
        # GH12042
        sp_indices = sp_indices[1:]
        dummy_cols = dummy_cols[1:]

    sparse_series = {}
    for col, ixs in zip(dummy_cols, sp_indices):
        ones = np.ones(len(ixs), dtype=np.uint8)
        sarr = SparseArray(ones, sparse_index=IntIndex(N, ixs),
                           fill_value=0, dtype=np.uint8)
        sparse_series[col] = SparseSeries(data=sarr, index=index)

    return SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
                           dtype=np.uint8)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False):
    """Build 0/1 indicator columns for the categories found in ``data``.

    Parameters
    ----------
    data : Series or sized iterable
        Values to encode; a Series also supplies the result index.
    prefix : object or None
        When not None, each column is named ``'%s%s%s' % (prefix,
        prefix_sep, level)``; otherwise the levels themselves are used.
    prefix_sep : str
        Separator placed between ``prefix`` and the level.
    dummy_na : bool
        Append a NaN level and route missing codes (-1) to it.
    sparse : bool
        Return a SparseDataFrame instead of a DataFrame.

    Returns
    -------
    DataFrame or SparseDataFrame
        One column per level; a frame without columns when ``data`` has no
        categories and ``dummy_na`` is false.
    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data), ordered=True)
    levels = cat.categories

    # if all NaN
    if not dummy_na and len(levels) == 0:
        labels = data.index if isinstance(data, Series) else np.arange(len(data))
        if sparse:
            return SparseDataFrame(index=labels)
        return DataFrame(index=labels)

    codes = cat.codes.copy()
    if dummy_na:
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]

    index = data.index if isinstance(data, Series) else None

    if not sparse:
        # Row i of the identity matrix is the indicator vector of level i.
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)

    N = len(data)
    # One list of row positions per indicator column.
    sp_indices = [[] for _ in range(len(dummy_cols))]
    for ndx, code in enumerate(codes):
        # Blank entries if not dummy_na and code == -1, #GH4446
        if code != -1:
            sp_indices[code].append(ndx)

    sparse_series = {}
    for col, ixs in zip(dummy_cols, sp_indices):
        ones = np.ones(len(ixs))
        sarr = SparseArray(ones, sparse_index=IntIndex(N, ixs), fill_value=0)
        sparse_series[col] = SparseSeries(data=sarr, index=index)

    return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)
def time_sparse_frame_constructor(self):
    # Construct a SparseDataFrame from labels only (100 columns, 1000 rows);
    # the result is discarded.
    column_labels = np.arange(100)
    row_labels = np.arange(1000)
    SparseDataFrame(columns=column_labels, index=row_labels)
def time_sparse_series_to_frame(self):
    # Construct a SparseDataFrame from ``self.series``; the result is
    # discarded.
    source = self.series
    SparseDataFrame(source)
# Test suite for SparseDataFrame. Most checks either compare a sparse result
# with the same operation on ``to_dense()`` output, or run one check function
# over the four fixtures (frame, iframe, zframe, fill_frame) via ``_check_all``.
class TestSparseDataFrame(TestCase, test_frame.SafeForSparse):

    klass = SparseDataFrame

    def setUp(self):
        # Fixtures: ``frame`` (from ``data``), ``iframe`` (same data with
        # default_kind='integer'), ``zframe`` / ``fill_frame`` (values of
        # ``frame`` with NaN replaced by 0 / 2 and that number passed as
        # default_fill_value), and ``empty`` (no arguments).
        self.data = {'A' : [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
                     'B' : [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
                     'C' : np.arange(10),
                     'D' : [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}

        self.dates = DateRange('1/1/2011', periods=10)

        self.frame = SparseDataFrame(self.data, index=self.dates)
        self.iframe = SparseDataFrame(self.data, index=self.dates,
                                      default_kind='integer')

        values = self.frame.values.copy()
        values[np.isnan(values)] = 0

        self.zframe = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'],
                                      default_fill_value=0,
                                      index=self.dates)

        values = self.frame.values.copy()
        values[np.isnan(values)] = 2

        self.fill_frame = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'],
                                          default_fill_value=2,
                                          index=self.dates)

        self.empty = SparseDataFrame()

    def test_as_matrix(self):
        # Shape of as_matrix() for frames with no index and/or no columns.
        empty = self.empty.as_matrix()
        self.assert_(empty.shape == (0, 0))

        no_cols = SparseDataFrame(index=np.arange(10))
        mat = no_cols.as_matrix()
        self.assert_(mat.shape == (10, 0))

        no_index = SparseDataFrame(columns=np.arange(10))
        mat = no_index.as_matrix()
        self.assert_(mat.shape == (0, 10))

    def test_copy(self):
        # The copy is an equal SparseDataFrame that shares the same index
        # object as the original.
        cp = self.frame.copy()
        self.assert_(isinstance(cp, SparseDataFrame))
        assert_sp_frame_equal(cp, self.frame)
        self.assert_(cp.index is self.frame.index)

    def test_constructor(self):
        # Columns come back as SparseSeries; also covers construction from a
        # nested dict and from the frame's ``_series`` with a shorter index.
        for col, series in self.frame.iteritems():
            self.assert_(isinstance(series, SparseSeries))

        self.assert_(isinstance(self.iframe['A'].sp_index, IntIndex))

        # constructed zframe from matrix above
        self.assertEquals(self.zframe['A'].fill_value, 0)
        assert_almost_equal([0, 0, 0, 0, 1, 2, 3, 4, 5, 6],
                            self.zframe['A'].values)

        # construct from nested dict
        data = {}
        for c, s in self.frame.iteritems():
            data[c] = s.to_dict()

        sdf = SparseDataFrame(data)
        assert_sp_frame_equal(sdf, self.frame)

        # TODO: test data is copied from inputs

        # init dict with different index
        idx = self.frame.index[:5]
        cons = SparseDataFrame(self.frame._series, index=idx,
                               columns=self.frame.columns,
                               default_fill_value=self.frame.default_fill_value,
                               default_kind=self.frame.default_kind)
        reindexed = self.frame.reindex(idx)
        assert_sp_frame_equal(cons, reindexed)

    def test_constructor_ndarray(self):
        # Construction from arrays / lists, plus length-mismatch errors.
        # no index or columns
        # (only checks that construction does not raise; ``sp`` is overwritten)
        sp = SparseDataFrame(self.frame.values)

        # 1d
        sp = SparseDataFrame(self.data['A'], index=self.dates,
                             columns=['A'])
        assert_sp_frame_equal(sp, self.frame.reindex(columns=['A']))

        # wrong length index / columns
        self.assertRaises(Exception, SparseDataFrame, self.frame.values,
                          index=self.frame.index[:-1])
        self.assertRaises(Exception, SparseDataFrame, self.frame.values,
                          columns=self.frame.columns[:-1])

    def test_constructor_empty(self):
        # A frame built with no arguments has empty index and columns.
        sp = SparseDataFrame()
        self.assert_(len(sp.index) == 0)
        self.assert_(len(sp.columns) == 0)

    def test_constructor_dataframe(self):
        # Round trip: to_dense() then back through the constructor.
        dense = self.frame.to_dense()
        sp = SparseDataFrame(dense)
        assert_sp_frame_equal(sp, self.frame)

    def test_array_interface(self):
        # np.sqrt applied to the sparse frame matches np.sqrt on the dense one.
        res = np.sqrt(self.frame)
        dres = np.sqrt(self.frame.to_dense())
        assert_frame_equal(res.to_dense(), dres)

    def test_pickle(self):
        # Pickle round trip (highest protocol) for every fixture.
        def _test_roundtrip(frame):
            pickled = pickle.dumps(frame, protocol=pickle.HIGHEST_PROTOCOL)
            unpickled = pickle.loads(pickled)
            assert_sp_frame_equal(frame, unpickled)

        self._check_all(_test_roundtrip)

    def test_dense_to_sparse(self):
        # DataFrame.to_sparse(): default call, kind='integer', and fill_value=0.
        df = DataFrame({'A' : [nan, nan, nan, 1, 2],
                        'B' : [1, 2, nan, nan, nan]})
        sdf = df.to_sparse()
        self.assert_(isinstance(sdf, SparseDataFrame))
        self.assert_(np.isnan(sdf.default_fill_value))
        self.assert_(isinstance(sdf['A'].sp_index, BlockIndex))
        testing.assert_frame_equal(sdf.to_dense(), df)

        sdf = df.to_sparse(kind='integer')
        self.assert_(isinstance(sdf['A'].sp_index, IntIndex))

        df = DataFrame({'A' : [0, 0, 0, 1, 2],
                        'B' : [1, 2, 0, 0, 0]}, dtype=float)
        sdf = df.to_sparse(fill_value=0)
        self.assertEquals(sdf.default_fill_value, 0)
        testing.assert_frame_equal(sdf.to_dense(), df)

    def test_sparse_to_dense(self):
        # Placeholder: no assertions.
        pass

    def test_sparse_series_ops(self):
        # Runs _check_frame_ops over every fixture.
        self._check_all(self._check_frame_ops)

    def _check_frame_ops(self, frame):
        # For each arithmetic operator, compare the sparse result with the
        # dense result converted back to sparse using the frame's fill value.
        fill = frame.default_fill_value

        def _compare_to_dense(a, b, da, db, op):
            # a/b are the operands under test, da/db the operands used for
            # the dense reference computation.
            sparse_result = op(a, b)
            dense_result = op(da, db)

            dense_result = dense_result.to_sparse(fill_value=fill)
            assert_sp_frame_equal(sparse_result, dense_result,
                                  exact_indices=False)

            # Mixed case: first operand as given, second operand dense.
            if isinstance(a, DataFrame) and isinstance(db, DataFrame):
                mixed_result = op(a, db)
                self.assert_(isinstance(mixed_result, SparseDataFrame))
                assert_sp_frame_equal(mixed_result, sparse_result,
                                      exact_indices=False)

        opnames = ['add', 'sub', 'mul', 'truediv', 'floordiv']
        ops = [getattr(operator, name) for name in opnames]

        fidx = frame.index

        # time series operations

        series = [frame['A'], frame['B'],
                  frame['C'], frame['D'],
                  frame['A'].reindex(fidx[:7]),
                  frame['A'].reindex(fidx[::2]),
                  SparseSeries([], index=[])]

        for op in ops:
            _compare_to_dense(frame, frame[::2], frame.to_dense(),
                              frame[::2].to_dense(), op)
            for s in series:
                _compare_to_dense(frame, s, frame.to_dense(),
                                  s.to_dense(), op)
                _compare_to_dense(s, frame, s.to_dense(),
                                  frame.to_dense(), op)

        # cross-sectional operations
        series = [frame.xs(fidx[0]),
                  frame.xs(fidx[3]),
                  frame.xs(fidx[5]),
                  frame.xs(fidx[7]),
                  frame.xs(fidx[5])[:2]]

        for op in ops:
            for s in series:
                _compare_to_dense(frame, s, frame.to_dense(),
                                  s, op)
                _compare_to_dense(s, frame, s,
                                  frame.to_dense(), op)

    def test_op_corners(self):
        # Addition involving the empty frame: empty + empty is falsy, and
        # frame + empty (either order) equals frame * NaN.
        empty = self.empty + self.empty
        self.assert_(not empty)

        foo = self.frame + self.empty
        assert_sp_frame_equal(foo, self.frame * np.nan)

        foo = self.empty + self.frame
        assert_sp_frame_equal(foo, self.frame * np.nan)

    def test_scalar_ops(self):
        # Placeholder: no assertions.
        pass

    def test_getitem(self):
        # Placeholder: no assertions.
        pass

    def test_fancy_index_misc(self):
        # .ix slicing along each axis matches the equivalent reindex.
        # axis = 0
        sliced = self.frame.ix[-2:, :]
        expected = self.frame.reindex(index=self.frame.index[-2:])
        assert_sp_frame_equal(sliced, expected)

        # axis = 1
        sliced = self.frame.ix[:, -2:]
        expected = self.frame.reindex(columns=self.frame.columns[-2:])
        assert_sp_frame_equal(sliced, expected)

    def test_getitem_overload(self):
        # __getitem__ with a slice and with a boolean indexer.
        # slicing
        sl = self.frame[:20]
        assert_sp_frame_equal(sl, self.frame.reindex(self.frame.index[:20]))

        # boolean indexing
        d = self.frame.index[5]
        indexer = self.frame.index > d

        subindex = self.frame.index[indexer]
        subframe = self.frame[indexer]

        self.assert_(np.array_equal(subindex, subframe.index))
        # an indexer one element short must raise
        self.assertRaises(Exception, self.frame.__getitem__, indexer[:-1])

    def test_setitem(self):
        # Column assignment from several value types, for every fixture.
        # Note: _check_frame adds columns to the fixture it is given.
        def _check_frame(frame):
            N = len(frame)

            # insert SparseSeries
            frame['E'] = frame['A']
            self.assert_(isinstance(frame['E'], SparseSeries))
            assert_sp_series_equal(frame['E'], frame['A'])

            # insert SparseSeries differently-indexed
            to_insert = frame['A'][::2]
            frame['E'] = to_insert
            assert_series_equal(frame['E'].to_dense(),
                                to_insert.to_dense().reindex(frame.index))

            # insert Series
            frame['F'] = frame['A'].to_dense()
            self.assert_(isinstance(frame['F'], SparseSeries))
            assert_sp_series_equal(frame['F'], frame['A'])

            # insert Series differently-indexed
            to_insert = frame['A'].to_dense()[::2]
            frame['G'] = to_insert
            assert_series_equal(frame['G'].to_dense(),
                                to_insert.reindex(frame.index))

            # insert ndarray
            frame['H'] = np.random.randn(N)
            self.assert_(isinstance(frame['H'], SparseSeries))

            # second half set to the fill value, so only the first N // 2
            # entries are expected in sp_values
            to_sparsify = np.random.randn(N)
            to_sparsify[N // 2:] = frame.default_fill_value
            frame['I'] = to_sparsify
            self.assertEquals(len(frame['I'].sp_values), N // 2)

            # insert ndarray wrong size
            self.assertRaises(Exception, frame.__setitem__, 'foo',
                              np.random.randn(N - 1))

            # scalar value
            frame['J'] = 5
            self.assertEquals(len(frame['J'].sp_values), N)
            self.assert_((frame['J'].sp_values == 5).all())

            frame['K'] = frame.default_fill_value
            self.assertEquals(len(frame['K'].sp_values), 0)

        self._check_all(_check_frame)

    def test_setitem_corner(self):
        # Assigning an existing column under a new label.
        self.frame['a'] = self.frame['B']
        assert_sp_series_equal(self.frame['a'], self.frame['B'])

    def test_delitem(self):
        # Deleting a column removes it and leaves the other columns equal
        # to what they were before.
        A = self.frame['A']
        C = self.frame['C']

        del self.frame['B']
        self.assert_('B' not in self.frame)
        assert_sp_series_equal(self.frame['A'], A)
        assert_sp_series_equal(self.frame['C'], C)

        del self.frame['D']
        self.assert_('D' not in self.frame)

        del self.frame['A']
        self.assert_('A' not in self.frame)

    def test_set_columns(self):
        # Same-length assignment works; a shorter set of columns raises.
        self.frame.columns = self.frame.columns
        self.assertRaises(Exception, setattr, self.frame, 'columns',
                          self.frame.columns[:-1])

    def test_set_index(self):
        # Same-length assignment works; a shorter index raises.
        self.frame.index = self.frame.index
        self.assertRaises(Exception, setattr, self.frame, 'index',
                          self.frame.index[:-1])

    def test_append(self):
        # Appending the two halves of the frame reproduces the frame.
        a = self.frame[:5]
        b = self.frame[5:]

        appended = a.append(b)
        assert_sp_frame_equal(appended, self.frame)

        # first piece restricted to the first three columns; only those
        # columns are compared afterwards
        a = self.frame.ix[:5, :3]
        b = self.frame.ix[5:]
        appended = a.append(b)
        assert_sp_frame_equal(appended.ix[:, :3], self.frame.ix[:, :3])

    def test_apply(self):
        # apply() with an elementwise function, an aggregation, and
        # broadcast=True; also the empty-frame case.
        applied = self.frame.apply(np.sqrt)
        self.assert_(isinstance(applied, SparseDataFrame))
        assert_almost_equal(applied.values, np.sqrt(self.frame.values))

        # fill_frame has fill value 2, so sqrt(2) is expected afterwards
        applied = self.fill_frame.apply(np.sqrt)
        self.assert_(applied['A'].fill_value == np.sqrt(2))

        # agg / broadcast
        applied = self.frame.apply(np.sum)
        assert_series_equal(applied,
                            self.frame.to_dense().apply(np.sum))

        broadcasted = self.frame.apply(np.sum, broadcast=True)
        self.assert_(isinstance(broadcasted, SparseDataFrame))
        assert_frame_equal(broadcasted.to_dense(),
                           self.frame.to_dense().apply(np.sum, broadcast=True))

        # applying to the empty frame returns the very same object
        self.assert_(self.empty.apply(np.sqrt) is self.empty)

    def test_applymap(self):
        # just test that it works
        result = self.frame.applymap(lambda x: x * 2)
        self.assert_(isinstance(result, SparseDataFrame))

    def test_astype(self):
        # astype(np.int64) is expected to raise.
        self.assertRaises(Exception, self.frame.astype, np.int64)

    def test_fillna(self):
        # fillna is expected to raise NotImplementedError.
        self.assertRaises(NotImplementedError, self.frame.fillna, 0)

    def test_rename(self):
        # just check this works
        # (results are not inspected)
        renamed = self.frame.rename(index=str)
        renamed = self.frame.rename(columns=lambda x: '%s%d' % (x, len(x)))

    def test_corr(self):
        # corr() matches the dense computation.
        res = self.frame.corr()
        assert_frame_equal(res, self.frame.to_dense().corr())

    def test_describe(self):
        # Only checks that describe() runs with an all-NaN column present;
        # ``desc`` is not inspected.
        self.frame['foo'] = np.nan
        desc = self.frame.describe()

    def test_join(self):
        # Joining disjoint column subsets rebuilds the frame; overlapping
        # column names ('B') raise.
        left = self.frame.ix[:, ['A', 'B']]
        right = self.frame.ix[:, ['C', 'D']]
        joined = left.join(right)
        assert_sp_frame_equal(joined, self.frame)

        right = self.frame.ix[:, ['B', 'D']]
        self.assertRaises(Exception, left.join, right)

    def test_reindex(self):
        # reindex() against the dense reference for each fixture, plus the
        # copy=False behaviour.
        def _check_frame(frame):
            index = frame.index
            sidx = index[::2]
            # NOTE(review): sidx2 is never used below.
            sidx2 = index[:5]

            sparse_result = frame.reindex(sidx)
            dense_result = frame.to_dense().reindex(sidx)
            assert_frame_equal(sparse_result.to_dense(), dense_result)

            assert_frame_equal(frame.reindex(list(sidx)).to_dense(),
                               dense_result)

            sparse_result2 = sparse_result.reindex(index)
            dense_result2 = dense_result.reindex(index)
            assert_frame_equal(sparse_result2.to_dense(), dense_result2)

            # propagate CORRECT fill value
            assert_almost_equal(sparse_result.default_fill_value,
                                frame.default_fill_value)
            assert_almost_equal(sparse_result['A'].fill_value,
                                frame['A'].fill_value)

            # length zero
            length_zero = frame.reindex([])
            self.assertEquals(len(length_zero), 0)
            self.assertEquals(len(length_zero.columns), len(frame.columns))
            self.assertEquals(len(length_zero['A']), 0)

            # frame being reindexed has length zero
            length_n = length_zero.reindex(index)
            self.assertEquals(len(length_n), len(frame))
            self.assertEquals(len(length_n.columns), len(frame.columns))
            self.assertEquals(len(length_n['A']), len(frame))

            # reindex columns
            reindexed = frame.reindex(columns=['A', 'B', 'Z'])
            self.assertEquals(len(reindexed.columns), 3)
            assert_almost_equal(reindexed['Z'].fill_value,
                                frame.default_fill_value)
            self.assert_(np.isnan(reindexed['Z'].sp_values).all())

        _check_frame(self.frame)
        _check_frame(self.iframe)
        _check_frame(self.zframe)
        _check_frame(self.fill_frame)

        # with copy=False
        # a column added to the result is expected to show up in self.frame
        reindexed = self.frame.reindex(self.frame.index, copy=False)
        reindexed['F'] = reindexed['A']
        self.assert_('F' in self.frame)

        # default call: the added column must not show up in self.frame
        reindexed = self.frame.reindex(self.frame.index)
        reindexed['G'] = reindexed['A']
        self.assert_('G' not in self.frame)

    def test_density(self):
        # 30 of the 40 entries below are not NaN, hence 0.75.
        df = SparseDataFrame({'A' : [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
                              'B' : [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
                              'C' : np.arange(10),
                              'D' : [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]})

        self.assertEquals(df.density, 0.75)

    def test_to_dense(self):
        # NOTE(review): the sparse frame itself (not a dense copy) is passed
        # to assert_frame_equal alongside its to_dense() result.
        def _check(frame):
            dense_dm = frame.to_dense()
            assert_frame_equal(frame, dense_dm)

        self._check_all(_check)

    def test_stack_sparse_frame(self):
        # spm.stack_sparse_frame(frame) must give the same values as
        # Panel.from_dict({'foo': frame}).to_long().
        def _check(frame):
            # NOTE(review): dense_frame is never used below.
            dense_frame = frame.to_dense()

            wp = Panel.from_dict({'foo' : frame})
            from_dense_lp = wp.to_long()

            from_sparse_lp = spm.stack_sparse_frame(frame)

            self.assert_(np.array_equal(from_dense_lp.values,
                                        from_sparse_lp.values))

        _check(self.frame)
        _check(self.iframe)

        # for now
        # (the two fixtures with a numeric fill value are expected to raise)
        self.assertRaises(Exception, _check, self.zframe)
        self.assertRaises(Exception, _check, self.fill_frame)

    def test_transpose(self):
        # Transposing twice gives back an equal frame, for every fixture.
        def _check(frame):
            transposed = frame.T
            untransposed = transposed.T
            assert_sp_frame_equal(frame, untransposed)

        self._check_all(_check)

    def test_shift(self):
        # shift(0) returns a new, equal object; other shifts are checked by
        # _dense_frame_compare (defined outside this class).
        def _check(frame):
            shifted = frame.shift(0)
            self.assert_(shifted is not frame)
            assert_sp_frame_equal(shifted, frame)

            f = lambda s: s.shift(1)
            _dense_frame_compare(frame, f)

            f = lambda s: s.shift(-2)
            _dense_frame_compare(frame, f)

            f = lambda s: s.shift(2, timeRule='WEEKDAY')
            _dense_frame_compare(frame, f)

            f = lambda s: s.shift(2, offset=datetools.bday)
            _dense_frame_compare(frame, f)

        self._check_all(_check)

    def test_count(self):
        # count() with the default argument and with 1 matches dense.
        result = self.frame.count()
        dense_result = self.frame.to_dense().count()
        assert_series_equal(result, dense_result)

        result = self.frame.count(1)
        dense_result = self.frame.to_dense().count(1)
        assert_series_equal(result, dense_result)

    def test_cumsum(self):
        # cumsum() returns a SparseDataFrame matching the dense cumsum.
        result = self.frame.cumsum()
        expected = self.frame.to_dense().cumsum()
        self.assert_(isinstance(result, SparseDataFrame))
        assert_frame_equal(result.to_dense(), expected)

    def _check_all(self, check_func):
        # Call check_func once per fixture, in a fixed order.
        check_func(self.frame)
        check_func(self.iframe)
        check_func(self.zframe)
        check_func(self.fill_frame)