def test_from_scipy_fillna(spmatrix):
    """``fillna`` on a scipy-built SparseDataFrame fills every NaN (GH 16112).

    ``spmatrix`` is called with a dense ndarray to build the sparse input.
    """
    dense = np.eye(3)
    dense[1:, 0] = np.nan

    try:
        sparse_matrix = spmatrix(dense)
        assert sparse_matrix.dtype == dense.dtype
    except (TypeError, AssertionError):
        # The conversion failed or changed the dtype, so this spmatrix
        # type / dtype pairing is treated as unsupported and the rest of
        # the test is not run.
        return

    filled = SparseDataFrame(sparse_matrix).fillna(-1.0)

    # Every NaN of the filled frame is expected to have become -1.0.
    expected_columns = {
        0: SparseSeries([1., -1, -1]),
        1: SparseSeries([np.nan, 1, np.nan]),
        2: SparseSeries([np.nan, np.nan, 1]),
    }
    expected = SparseDataFrame(expected_columns, default_fill_value=-1)

    # The fill_value must match what .fillna() was called with.  It is set
    # after construction (instead of building each SparseSeries with -1)
    # so that the "compressed" SparseArrays are obtained without having
    # to construct them by hand.
    for label in expected:
        expected[label].fill_value = -1

    tm.assert_sp_frame_equal(filled, expected)
def test_apply_keep_sparse_dtype():
    """``apply`` gives equal frames for a SparseDataFrame and its DataFrame wrap.

    Regression test for GH 23744.
    """
    values = np.array([[0, 1, 0],
                       [0, 0, 0],
                       [0, 0, 1]])
    sparse_frame = SparseDataFrame(
        values,
        columns=['b', 'a', 'c'],
        default_fill_value=1,
    )
    wrapped = DataFrame(sparse_frame)

    expected = sparse_frame.apply(np.exp)
    result = wrapped.apply(np.exp)

    tm.assert_frame_equal(expected, result)
def test_to_csv_sparse_dataframe(self, fill_value):
    """Round-trip a SparseDataFrame through CSV and compare (GH19384).

    The frame has a single column 'a' taken from the class attribute
    ``fill_values`` and uses ``fill_value`` as its default fill value.
    """
    column_values = type(self).fill_values
    original = SparseDataFrame({'a': column_values},
                               default_fill_value=fill_value)

    with tm.ensure_clean('sparse_df.csv') as csv_path:
        original.to_csv(csv_path, index=False)
        # Blank lines are kept when reading the file back.
        reloaded = read_csv(csv_path, skip_blank_lines=False)

        round_tripped = reloaded.to_sparse(fill_value=fill_value)
        tm.assert_sp_frame_equal(round_tripped, original)
def float_frame_fill0():
    """
    Fixture for sparse DataFrame of floats with DatetimeIndex

    Columns are ['A', 'B', 'C', 'D']; NaN entries of the source data are
    replaced by 0, which is also used as the default fill value
    """
    dense_values = SparseDataFrame(data).values
    nan_mask = np.isnan(dense_values)
    dense_values[nan_mask] = 0

    column_labels = ["A", "B", "C", "D"]
    return SparseDataFrame(
        dense_values,
        columns=column_labels,
        default_fill_value=0,
        index=dates,
    )
def test_where_with_numeric_data(data):
    """``SparseDataFrame.where`` on numeric data matches dense (GH 17386)."""
    threshold = 1.5

    sparse_frame = SparseDataFrame(data)
    result = sparse_frame.where(sparse_frame > threshold)

    # Build the reference result through the dense code path.
    dense_frame = DataFrame(data)
    dense_expected = dense_frame.where(dense_frame > threshold)
    sparse_expected = SparseDataFrame(dense_expected)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
def test_where_with_numeric_data_and_other(data, other):
    """``where`` with a replacement value matches dense (GH 17386).

    ``other`` is the replacement passed to ``where``; it is also used as
    the default fill value of the expected sparse frame.
    """
    threshold = 1.5

    sparse_frame = SparseDataFrame(data)
    result = sparse_frame.where(sparse_frame > threshold, other)

    # Build the reference result through the dense code path.
    dense_frame = DataFrame(data)
    dense_expected = dense_frame.where(dense_frame > threshold, other)
    sparse_expected = SparseDataFrame(dense_expected,
                                      default_fill_value=other)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
def test_quantile():
    """A scalar quantile of a SparseDataFrame matches dense (GH 17386)."""
    rows = [
        [1, 1],
        [2, 10],
        [3, 100],
        [np.nan, np.nan],
    ]
    level = 0.1

    result = SparseDataFrame(rows).quantile(level)

    # Build the reference result through the dense code path.
    dense_expected = DataFrame(rows).quantile(level)
    sparse_expected = SparseSeries(dense_expected)

    tm.assert_series_equal(result, dense_expected)
    tm.assert_sp_series_equal(result, sparse_expected)
def test_where_with_bool_data():
    """``SparseDataFrame.where`` on boolean data matches dense (GH 17386)."""
    rows = [
        [False, False],
        [True, True],
        [False, False],
    ]
    wanted = True

    sparse_frame = SparseDataFrame(rows)
    result = sparse_frame.where(sparse_frame == wanted)

    # Build the reference result through the dense code path.
    dense_frame = DataFrame(rows)
    dense_expected = dense_frame.where(dense_frame == wanted)
    sparse_expected = SparseDataFrame(dense_expected)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
def test_where_with_bool_data_and_other(other):
    """Boolean ``where`` with a replacement value matches dense (GH 17386).

    ``other`` is the replacement passed to ``where``; it is also used as
    the default fill value of the expected sparse frame.
    """
    rows = [
        [False, False],
        [True, True],
        [False, False],
    ]
    wanted = True

    sparse_frame = SparseDataFrame(rows)
    result = sparse_frame.where(sparse_frame == wanted, other)

    # Build the reference result through the dense code path.
    dense_frame = DataFrame(rows)
    dense_expected = dense_frame.where(dense_frame == wanted, other)
    sparse_expected = SparseDataFrame(dense_expected,
                                      default_fill_value=other)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
def test_quantile_multi():
    """A list of quantiles on a SparseDataFrame matches dense (GH 17386)."""
    rows = [
        [1, 1],
        [2, 10],
        [3, 100],
        [np.nan, np.nan],
    ]
    levels = [0.1, 0.5]

    result = SparseDataFrame(rows).quantile(levels)

    # Build the reference result through the dense code path.
    dense_expected = DataFrame(rows).quantile(levels)
    sparse_expected = SparseDataFrame(dense_expected)

    tm.assert_frame_equal(result, dense_expected)
    tm.assert_sp_frame_equal(result, sparse_expected)
def frame(dates):
    """Build a SparseDataFrame with columns 'A'-'D' indexed by ``dates``.

    Columns 'A', 'B' and 'D' each contain a run of NaN entries; column 'C'
    is ``np.arange(10)`` as float64.
    """
    nan = np.nan
    column_data = {
        'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
        'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
        'C': np.arange(10, dtype=np.float64),
        'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
    }
    return SparseDataFrame(column_data, index=dates)
def fill_frame(frame):
    """Rebuild ``frame`` with its NaN entries replaced by 2.

    The result is a SparseDataFrame with columns 'A'-'D', the index of
    ``frame`` and a default fill value of 2.
    """
    # Work on a copy of the values so the replacement is not done on the
    # array returned by ``frame.values``.
    filled = frame.values.copy()
    nan_mask = np.isnan(filled)
    filled[nan_mask] = 2

    return SparseDataFrame(
        filled,
        columns=['A', 'B', 'C', 'D'],
        default_fill_value=2,
        index=frame.index,
    )
def fill_frame(frame):
    """Rebuild ``frame`` with its NaN entries replaced by 2.

    The result is a SparseDataFrame with columns "A"-"D", the index of
    ``frame`` and a default fill value of 2.
    """
    # Work on a copy of the values so the replacement is not done on the
    # array returned by ``frame.values``.
    filled = frame.values.copy()
    nan_mask = np.isnan(filled)
    filled[nan_mask] = 2

    column_labels = ["A", "B", "C", "D"]
    return SparseDataFrame(
        filled,
        columns=column_labels,
        default_fill_value=2,
        index=frame.index,
    )
def test_from_scipy_correct_ordering(spmatrix):
    """A frame built from a scipy matrix equals one built from the ndarray.

    Regression test for GH 16179.  ``spmatrix`` is called with a dense
    ndarray to build the sparse input.
    """
    dense = np.arange(1, 5).reshape(2, 2)

    try:
        sparse_matrix = spmatrix(dense)
        assert sparse_matrix.dtype == dense.dtype
    except (TypeError, AssertionError):
        # The conversion failed or changed the dtype, so this spmatrix
        # type / dtype pairing is treated as unsupported and the rest of
        # the test is not run.
        return

    from_sparse = SparseDataFrame(sparse_matrix)
    from_dense = SparseDataFrame(dense)

    tm.assert_sp_frame_equal(from_sparse, from_dense)
    tm.assert_frame_equal(from_sparse.to_dense(), from_dense.to_dense())
def float_frame():
    """
    Fixture for sparse DataFrame of floats with DatetimeIndex

    Columns are ['A', 'B', 'C', 'D']; some entries are missing
    """
    # 'block' is spelled out here although it is already the default kind.
    sparse_frame = SparseDataFrame(
        data,
        index=dates,
        default_kind='block',
    )
    return sparse_frame
def float_frame_int_kind():
    """
    Fixture for sparse DataFrame of floats with DatetimeIndex

    Columns are ['A', 'B', 'C', 'D'] and the frame is built with
    default_kind='integer'. Some entries are missing.
    """
    sparse_frame = SparseDataFrame(
        data,
        index=dates,
        default_kind='integer',
    )
    return sparse_frame
def coo_to_df(triplets):
    """
    Build a SparseDataFrame from an iterable of (row, col, value) triplets.

    Values are grouped per column into a ``{col: {row: value}}`` mapping;
    a later triplet with the same (row, col) overwrites an earlier one.
    """
    by_column = defaultdict(dict)
    for row_label, col_label, value in triplets:
        column = by_column[col_label]
        column[row_label] = value
    return SparseDataFrame(by_column)
def float_frame_fill2_dense():
    """
    Fixture for dense DataFrame of floats with DatetimeIndex

    Columns are ['A', 'B', 'C', 'D']; NaN entries of the source data are
    replaced by 2
    """
    dense_values = SparseDataFrame(data).values
    nan_mask = np.isnan(dense_values)
    dense_values[nan_mask] = 2

    column_labels = ['A', 'B', 'C', 'D']
    return DataFrame(dense_values, columns=column_labels, index=dates)
def float_string_frame():
    """
    Fixture for sparse DataFrame of floats and strings with DatetimeIndex

    Columns are ['A', 'B', 'C', 'D', 'foo']; some entries are missing.
    The 'foo' column holds the string 'bar' in every row.
    """
    mixed_frame = SparseDataFrame(data, index=dates)

    # One 'bar' per index entry.
    labels = ['bar'] * len(dates)
    mixed_frame['foo'] = SparseArray(labels)
    return mixed_frame
def frame(dates):
    """Build a SparseDataFrame with columns "A"-"D" indexed by ``dates``.

    Columns "A", "B" and "D" each contain a run of NaN entries; column "C"
    is ``np.arange(10)`` as float64.
    """
    nan = np.nan
    column_data = {
        "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
        "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
        "C": np.arange(10, dtype=np.float64),
        "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
    }
    return SparseDataFrame(column_data, index=dates)
def _create_sp_frame():
    """Build a 10-row SparseDataFrame with columns 'A'-'D'.

    The index is ``bdate_range('1/1/2011', periods=10)``.  Columns 'A',
    'B' and 'D' each contain a run of NaN entries; 'C' is
    ``np.arange(10)`` cast to int64.
    """
    missing = np.nan
    index = bdate_range('1/1/2011', periods=10)

    column_data = {
        u'A': [missing, missing, missing, 0, 1, 2, 3, 4, 5, 6],
        u'B': [0, 1, 2, missing, missing, missing, 3, 4, 5, 6],
        u'C': np.arange(10).astype(np.int64),
        u'D': [0, 1, 2, 3, 4, 5, missing, missing, missing, missing],
    }
    return SparseDataFrame(column_data, index=index)
def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype):
    """Round-trip a scipy sparse matrix through SparseDataFrame (GH 4343).

    ``spmatrix`` is called with a dense ndarray to build the sparse input;
    ``index``, ``columns`` and ``fill_value`` are forwarded to the
    SparseDataFrame constructor; ``dtype`` is the dtype of the dense array
    and is also called to create the off-diagonal element.
    """
    # A single ndarray, and the sparse matrix made from it, feed both the
    # frame under test and the expected frame.
    dense = np.eye(3, dtype=dtype)
    # GH 16179: add one off-diagonal entry.
    dense[0, 1] = dtype(2)

    try:
        sparse_matrix = spmatrix(dense)
        assert sparse_matrix.dtype == dense.dtype
    except (TypeError, AssertionError):
        # The conversion failed or changed the dtype, so this spmatrix
        # type / dtype pairing is treated as unsupported and the rest of
        # the test is not run.
        return

    sdf = SparseDataFrame(
        sparse_matrix,
        index=index,
        columns=columns,
        default_fill_value=fill_value,
    )

    # Building the expected frame for every dtype / fill_value pairing is
    # awkward, so everything is compared as object dtype instead.
    object_values = dense.astype(object)
    object_values[dense == 0] = np.nan
    replacement = np.nan if fill_value is None else fill_value
    expected = SparseDataFrame(
        object_values, index=index, columns=columns
    ).fillna(replacement)

    # The frame, compared as objects, must equal the expected frame.
    sdf_as_object = sdf.astype(object)
    tm.assert_sp_frame_equal(sdf_as_object, expected)
    tm.assert_frame_equal(sdf_as_object.to_dense(), expected.to_dense())

    # Converting back to scipy must give the same stored entries.
    assert dict(sdf.to_coo().todok()) == dict(sparse_matrix.todok())

    # The dtype is expected to be preserved unless a missing or float
    # fill value is combined with a dtype that is neither object nor float.
    fill_is_floatlike = fill_value is None or is_float(fill_value)
    was_upcast = (
        fill_is_floatlike
        and not is_object_dtype(dtype)
        and not is_float_dtype(dtype)
    )
    if is_bool_dtype(dtype):
        res_dtype = bool
    elif was_upcast:
        res_dtype = float
    else:
        res_dtype = dtype
    tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)})
    assert sdf.to_coo().dtype == res_dtype

    # Adding a column of strings makes the COO output object dtype.
    sdf['strings'] = np.arange(len(sdf)).astype(str)
    assert sdf.to_coo().dtype == np.object_
def _create_sp_frame():
    """Build a 10-row SparseDataFrame with columns "A"-"D".

    The index is ``bdate_range("1/1/2011", periods=10)``.  Columns "A",
    "B" and "D" each contain a run of NaN entries; "C" is
    ``np.arange(10)`` cast to int64.
    """
    missing = np.nan
    index = bdate_range("1/1/2011", periods=10)

    column_data = {
        "A": [missing, missing, missing, 0, 1, 2, 3, 4, 5, 6],
        "B": [0, 1, 2, missing, missing, missing, 3, 4, 5, 6],
        "C": np.arange(10).astype(np.int64),
        "D": [0, 1, 2, 3, 4, 5, missing, missing, missing, missing],
    }
    return SparseDataFrame(column_data, index=index)
def _create_sp_frame():
    """Build a 10-row SparseDataFrame with columns 'A'-'D'.

    The index is ``bdate_range('1/1/2011', periods=10)``.  Columns 'A',
    'B' and 'D' each contain a run of NaN entries; 'C' is
    ``np.arange(10)`` cast to int64.  numpy and pandas are imported
    inside the function.
    """
    import numpy as np
    from pandas import bdate_range, SparseDataFrame

    missing = np.nan
    column_data = {
        'A': [missing, missing, missing, 0, 1, 2, 3, 4, 5, 6],
        'B': [0, 1, 2, missing, missing, missing, 3, 4, 5, 6],
        'C': np.arange(10).astype(np.int64),
        'D': [0, 1, 2, 3, 4, 5, missing, missing, missing, missing],
    }

    index = bdate_range('1/1/2011', periods=10)
    return SparseDataFrame(column_data, index=index)
def test_from_to_scipy_object(spmatrix, fill_value):
    """Round-trip an object-dtype scipy matrix through SparseDataFrame.

    Regression test for GH 4343.

    Parameters
    ----------
    spmatrix : callable
        Called with a dense ndarray to build the sparse input; compared
        against ``scipy.sparse.dok_matrix`` to decide on an early skip.
    fill_value : object
        Forwarded as ``default_fill_value``; also used to fill the
        expected frame (NaN is used instead when it is None).

    The test is skipped, rather than silently passing, when the sparse
    matrix cannot be built from an object array or does not keep the
    object dtype.
    """
    dtype = object
    columns = list('cd')
    index = list('ab')

    if (spmatrix is scipy.sparse.dok_matrix and
            LooseVersion(scipy.__version__) >= LooseVersion('0.19.0')):
        pytest.skip("dok_matrix from object does not work in SciPy >= 0.19")

    # A single ndarray, and the sparse matrix made from it, feed both the
    # frame under test and the expected frame.
    arr = np.eye(2, dtype=dtype)
    try:
        spm = spmatrix(arr)
    except TypeError:
        spm = None
    # Report the unsupported combination as a skip: a bare ``return``
    # would make the test show up as passed without checking anything,
    # and an ``assert`` used for control flow is stripped under ``-O``.
    if spm is None or spm.dtype != arr.dtype:
        pytest.skip("this spmatrix type does not support an object ndarray")

    sdf = SparseDataFrame(spm, index=index, columns=columns,
                          default_fill_value=fill_value)

    # Building the expected frame for every fill_value is awkward, so
    # everything is compared as object dtype instead.
    rarr = arr.astype(object)
    rarr[arr == 0] = np.nan
    expected = SparseDataFrame(rarr, index=index, columns=columns).fillna(
        fill_value if fill_value is not None else np.nan)

    # The frame, compared as objects, must equal the expected frame.
    sdf_obj = sdf.astype(object)
    tm.assert_sp_frame_equal(sdf_obj, expected)
    tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())

    # Converting back to scipy must give the same stored entries; warnings
    # raised during the conversion are recorded instead of shown.
    with catch_warnings(record=True):
        assert dict(sdf.to_coo().todok()) == dict(spm.todok())

    # The object dtype is expected to be preserved.
    res_dtype = object
    tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)})
    assert sdf.to_coo().dtype == res_dtype
def test_from_to_scipy_object(spmatrix, fill_value):
    """Round-trip an object-dtype scipy matrix through SparseDataFrame.

    Regression test for GH 4343.

    Parameters
    ----------
    spmatrix : callable
        Called with a dense ndarray to build the sparse input; compared
        against ``scipy.sparse.dok_matrix`` to decide on an early skip.
    fill_value : object
        Forwarded as ``default_fill_value``, used in the ``SparseDtype``
        the frame is cast to, and used to fill the expected frame (NaN is
        used instead when it is None).

    The test is skipped, rather than silently passing, when the sparse
    matrix cannot be built from an object array or does not keep the
    object dtype.
    """
    dtype = object
    columns = list('cd')
    index = list('ab')

    if (spmatrix is scipy.sparse.dok_matrix and
            LooseVersion(scipy.__version__) >= LooseVersion('0.19.0')):
        pytest.skip("dok_matrix from object does not work in SciPy >= 0.19")

    # A single ndarray, and the sparse matrix made from it, feed both the
    # frame under test and the expected frame.
    arr = np.eye(2, dtype=dtype)
    try:
        spm = spmatrix(arr)
    except TypeError:
        spm = None
    # Report the unsupported combination as a skip: a bare ``return``
    # would make the test show up as passed without checking anything,
    # and an ``assert`` used for control flow is stripped under ``-O``.
    if spm is None or spm.dtype != arr.dtype:
        pytest.skip("this spmatrix type does not support an object ndarray")

    sdf = SparseDataFrame(spm, index=index, columns=columns,
                          default_fill_value=fill_value)

    # Building the expected frame for every fill_value is awkward, so
    # everything is compared as object dtype instead.
    rarr = arr.astype(object)
    rarr[arr == 0] = np.nan
    expected = SparseDataFrame(rarr, index=index, columns=columns).fillna(
        fill_value if fill_value is not None else np.nan)

    # The frame, cast to a sparse object dtype, must equal the expected
    # frame.
    sdf_obj = sdf.astype(SparseDtype(object, fill_value))
    tm.assert_sp_frame_equal(sdf_obj, expected)
    tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())

    # Converting back to scipy must give the same stored entries.
    assert dict(sdf.to_coo().todok()) == dict(spm.todok())

    # The object dtype is expected to be preserved; each column dtype is
    # compared through its ``subtype``.
    res_dtype = object
    tm.assert_contains_all(sdf.dtypes.apply(lambda dtype: dtype.subtype),
                           {np.dtype(res_dtype)})
    assert sdf.to_coo().dtype == res_dtype
def time_from_dict(self):
    """Construct a SparseDataFrame from ``self.dict``; the result is discarded."""
    source = self.dict
    SparseDataFrame(source)
def time_from_scipy(self):
    """Construct a SparseDataFrame from ``self.sparse``; the result is discarded."""
    source = self.sparse
    SparseDataFrame(source)
def time_constructor(self):
    """Construct a SparseDataFrame using ``self.arr`` as both columns and index.

    No data is passed and the result is discarded.
    """
    SparseDataFrame(
        columns=self.arr,
        index=self.arr,
    )
def time_series_to_frame(self):
    """Construct a SparseDataFrame from ``self.series``; the result is discarded."""
    source = self.series
    SparseDataFrame(source)
def empty():
    """Return a SparseDataFrame constructed with no arguments."""
    empty_frame = SparseDataFrame()
    return empty_frame