def _make_index(length, indices, kind): if kind == 'block' or isinstance(kind, BlockIndex): locs, lens = splib.get_blocks(indices) index = BlockIndex(length, locs, lens) elif kind == 'integer' or isinstance(kind, IntIndex): index = IntIndex(length, indices) else: # pragma: no cover raise ValueError('must be block or integer type') return index
def test_constructor_spindex_dtype_scalar_broadcasts(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0
def from_spmatrix(cls, data, index=None, columns=None): """ Create a new DataFrame from a scipy sparse matrix. .. versionadded:: 0.25.0 Parameters ---------- data : scipy.sparse.spmatrix Must be convertible to csc format. index, columns : Index, optional Row and column labels to use for the resulting DataFrame. Defaults to a RangeIndex. Returns ------- DataFrame Each column of the DataFrame is stored as a :class:`arrays.SparseArray`. Examples -------- >>> import scipy.sparse >>> mat = scipy.sparse.eye(3) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 0 1.0 0.0 0.0 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ from pandas._libs.sparse import IntIndex from pandas import DataFrame data = data.tocsc() index, columns = cls._prep_index(data, index, columns) n_rows, n_columns = data.shape # We need to make sure indices are sorted, as we create # IntIndex with no input validation (i.e. check_integrity=False ). # Indices may already be sorted in scipy in which case this adds # a small overhead. data.sort_indices() indices = data.indices indptr = data.indptr array_data = data.data dtype = SparseDtype(array_data.dtype, 0) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) idx = IntIndex(n_rows, indices[sl], check_integrity=False) arr = SparseArray._simple_new(array_data[sl], idx, dtype) arrays.append(arr) return DataFrame._from_arrays(arrays, columns=columns, index=index, verify_integrity=False)
def make_sparse_index(length, indices, kind): if kind == "block" or isinstance(kind, BlockIndex): locs, lens = splib.get_blocks(indices) index = BlockIndex(length, locs, lens) elif kind == "integer" or isinstance(kind, IntIndex): index = IntIndex(length, indices) else: # pragma: no cover raise ValueError("must be block or integer type") return index
def test_setting_fill_value_updates(): arr = SparseArray([0.0, np.nan], fill_value=0) arr.fill_value = np.nan # use private constructor to get the index right # otherwise both nans would be un-stored. expected = SparseArray._simple_new( sparse_array=np.array([np.nan]), sparse_index=IntIndex(2, [1]), dtype=SparseDtype(float, np.nan), ) tm.assert_sp_array_equal(arr, expected)
def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) # TODO: actionable? # XXX: Behavior change: specifying SparseIndex no longer changes the # fill_value expected = SparseArray([0, 1, 2, 0], kind="integer") tm.assert_sp_array_equal(arr, expected) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray( data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=np.int64, fill_value=0, ) exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray( data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=None, fill_value=0, ) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0
def _concat_same_type( cls: Type[SparseArrayT], to_concat: Sequence[SparseArrayT] ) -> SparseArrayT: fill_value = to_concat[0].fill_value values = [] length = 0 if to_concat: sp_kind = to_concat[0].kind else: sp_kind = "integer" if sp_kind == "integer": indices = [] for arr in to_concat: idx = arr.sp_index.to_int_index().indices.copy() idx += length # TODO: wraparound length += arr.sp_index.length values.append(arr.sp_values) indices.append(idx) data = np.concatenate(values) indices = np.concatenate(indices) sp_index = IntIndex(length, indices) else: # when concatenating block indices, we don't claim that you'll # get an identical index as concatenating the values and then # creating a new index. We don't want to spend the time trying # to merge blocks across arrays in `to_concat`, so the resulting # BlockIndex may have more blocks. blengths = [] blocs = [] for arr in to_concat: idx = arr.sp_index.to_block_index() values.append(arr.sp_values) blocs.append(idx.blocs.copy() + length) blengths.append(idx.blengths) length += arr.sp_index.length data = np.concatenate(values) blocs = np.concatenate(blocs) blengths = np.concatenate(blengths) sp_index = BlockIndex(length, blocs, blengths) return cls(data, sparse_index=sp_index, fill_value=fill_value)
def test_constructor_spindex_dtype_scalar(self, sparse_index): # scalar input arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0
def test_astype(self): # float -> float arr = SparseArray([None, None, 0, 2]) result = arr.astype("Sparse[float32]") expected = SparseArray([None, None, 0, 2], dtype=np.dtype('float32')) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("float64", fill_value=0) result = arr.astype(dtype) expected = SparseArray._simple_new( np.array([0., 2.], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("int64", 0) result = arr.astype(dtype) expected = SparseArray._simple_new(np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype) tm.assert_sp_array_equal(result, expected) arr = SparseArray([0, np.nan, 0, 1], fill_value=0) with pytest.raises(ValueError, match='NA'): arr.astype('Sparse[i8]')
def from_spmatrix(cls, data): """ Create a SparseArray from a scipy.sparse matrix. .. versionadded:: 0.25.0 Parameters ---------- data : scipy.sparse.sp_matrix This should be a SciPy sparse matrix where the size of the second dimension is 1. In other words, a sparse matrix with a single column. Returns ------- SparseArray Examples -------- >>> import scipy.sparse >>> mat = scipy.sparse.coo_matrix((4, 1)) >>> pd.arrays.SparseArray.from_spmatrix(mat) [0.0, 0.0, 0.0, 0.0] Fill: 0.0 IntIndex Indices: array([], dtype=int32) """ length, ncol = data.shape if ncol != 1: raise ValueError(f"'data' must have a single column, not '{ncol}'") # our sparse index classes require that the positions be strictly # increasing. So we need to sort loc, and arr accordingly. arr = data.data idx, _ = data.nonzero() loc = np.argsort(idx) arr = arr.take(loc) idx.sort() zero = np.array(0, dtype=arr.dtype).item() dtype = SparseDtype(arr.dtype, zero) index = IntIndex(length, idx) return cls._simple_new(arr, index, dtype)
def _patch_from_spmatrix(cls, data): # -no-cov- length, ncol = data.shape if ncol != 1: raise ValueError("'data' must have a single column, not '{}'".format(ncol)) # our sparse index classes require that the positions be strictly # increasing. So we need to sort loc, and arr accordingly. arr = data.data #idx, _ = data.nonzero() idx = data.indices loc = np.argsort(idx) arr = arr.take(loc) idx.sort() zero = np.array(0, dtype=arr.dtype).item() dtype = pd.SparseDtype(arr.dtype, zero) index = IntIndex(length, idx) return cls._simple_new(arr, index, dtype)
def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan])) assert arr.dtype == np.float64 assert np.isnan(arr.fill_value) arr = SparseArray(data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=np.int64, fill_value=0) exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == np.int64 assert arr.fill_value == 0 arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == np.int64 assert arr.fill_value == 0 arr = SparseArray(data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=None, fill_value=0) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == np.int64 assert arr.fill_value == 0 # scalar input arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == np.int64 assert arr.fill_value == 0 arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == np.int64 assert arr.fill_value == 0
def _get_dummies_1d( data, prefix, prefix_sep="_", dummy_na: bool = False, sparse: bool = False, drop_first: bool = False, dtype: Dtype | None = None, ) -> DataFrame: from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = factorize_from_iterable(Series(data)) if dtype is None: dtype = np.dtype(np.uint8) # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]"; expected "Type[Any]" dtype = np.dtype(dtype) # type: ignore[arg-type] if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data) -> DataFrame: if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels]) index: Index | None if isinstance(data, Series): index = data.index else: index = None if sparse: fill_value: bool | float | int if is_integer_dtype(dtype): fill_value = 0 elif dtype == np.dtype(bool): fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices: list[list] = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray( np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype, ) sparse_series.append(Series(data=sarr, index=index, name=col)) return concat(sparse_series, axis=1, copy=False) else: # take on axis=1 + transpose to ensure ndarray layout is column-major dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _get_dummies_1d( data, prefix, prefix_sep="_", dummy_na=False, sparse=False, drop_first=False, dtype=None, ): from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level): fstr = "{prefix}{prefix_sep}{level}" return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) dummy_cols = [ _make_col_name(prefix, prefix_sep, level) for level in levels ] if isinstance(data, Series): index = data.index else: index = None if sparse: if is_integer_dtype(dtype): fill_value = 0 elif dtype == bool: fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray( np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype, ) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _create_sparse_df( data: Union[np.ndarray, spmatrix], index: Optional[pd.Index] = None, columns: Optional[Sequence[Any]] = None, fill_value: float = 0, ) -> pd.DataFrame: """ Create a new DataFrame from a scipy sparse matrix or numpy array. This is the original :mod:`pandas` implementation with 2 differences: - allow creation also from :class:`numpy.ndarray` - expose ``fill_values`` Parameters ---------- data Must be convertible to CSC format. index Row labels to use. columns Column labels to use. Returns ------- Each column of the DataFrame is stored as a :class:`arrays.SparseArray`. """ from pandas._libs.sparse import IntIndex from pandas.core.arrays.sparse.accessor import ( SparseArray, SparseDtype, SparseFrameAccessor, ) if not issparse(data): data = csc_matrix(data) sort_indices = False else: if TYPE_CHECKING: assert isinstance(data, spmatrix) data = data.tocsc() sort_indices = True data = data.tocsc() index, columns = SparseFrameAccessor._prep_index(data, index, columns) n_rows, n_columns = data.shape # We need to make sure indices are sorted, as we create # IntIndex with no input validation (i.e. check_integrity=False ). # Indices may already be sorted in scipy in which case this adds # a small overhead. if sort_indices: data.sort_indices() indices = data.indices indptr = data.indptr array_data = data.data dtype = SparseDtype(array_data.dtype, fill_value=fill_value) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) idx = IntIndex(n_rows, indices[sl], check_integrity=False) arr = SparseArray._simple_new(array_data[sl], idx, dtype) arrays.append(arr) return pd.DataFrame._from_arrays(arrays, columns=columns, index=index, verify_integrity=False)
class TestSparseArray(object): def setup_method(self, method): self.arr_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) self.arr = SparseArray(self.arr_data) self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) def test_constructor_dtype(self): arr = SparseArray([np.nan, 1, 2, np.nan]) assert arr.dtype == SparseDtype(np.float64, np.nan) assert arr.dtype.subtype == np.float64 assert np.isnan(arr.fill_value) arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0) assert arr.dtype == SparseDtype(np.float64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], dtype=np.float64) assert arr.dtype == SparseDtype(np.float64, np.nan) assert np.isnan(arr.fill_value) arr = SparseArray([0, 1, 2, 4], dtype=np.int64) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], dtype=None) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 def test_constructor_dtype_str(self): result = SparseArray([1, 2, 3], dtype='int') expected = SparseArray([1, 2, 3], dtype=int) tm.assert_sp_array_equal(result, expected) def test_constructor_sparse_dtype(self): result = SparseArray([1, 0, 0, 1], dtype=SparseDtype('int64', -1)) expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64) tm.assert_sp_array_equal(result, expected) assert result.sp_values.dtype == np.dtype('int64') def test_constructor_sparse_dtype_str(self): result = SparseArray([1, 0, 0, 1], dtype='Sparse[int32]') expected = SparseArray([1, 0, 0, 1], dtype=np.int32) tm.assert_sp_array_equal(result, expected) assert result.sp_values.dtype == np.dtype('int32') def test_constructor_object_dtype(self): # GH 11856 arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object) assert arr.dtype == SparseDtype(np.object) assert np.isnan(arr.fill_value) arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object, fill_value='A') assert arr.dtype == SparseDtype(np.object, 'A') assert arr.fill_value == 'A' # GH 17574 data = [False, 0, 100.0, 0.0] arr = SparseArray(data, dtype=np.object, fill_value=False) assert arr.dtype == SparseDtype(np.object, False) assert arr.fill_value is False arr_expected = np.array(data, dtype=np.object) it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) assert np.fromiter(it, dtype=np.bool).all() @pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int]) def test_constructor_na_dtype(self, dtype): with pytest.raises(ValueError, match="Cannot convert"): SparseArray([0, 1, np.nan], dtype=dtype) def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) # XXX: Behavior change: specifying SparseIndex no longer changes the # fill_value expected = SparseArray([0, 1, 2, 0], kind='integer') tm.assert_sp_array_equal(arr, expected) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=np.int64, fill_value=0) exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=None, fill_value=0) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 @pytest.mark.parametrize("sparse_index", [ None, IntIndex(1, [0]), ]) def test_constructor_spindex_dtype_scalar(self, sparse_index): # scalar input arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 def test_constructor_spindex_dtype_scalar_broadcasts(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 @pytest.mark.parametrize('data, fill_value', [ (np.array([1, 2]), 0), (np.array([1.0, 2.0]), np.nan), ([True, False], False), ([pd.Timestamp('2017-01-01')], pd.NaT), ]) def test_constructor_inferred_fill_value(self, data, fill_value): result = SparseArray(data).fill_value if pd.isna(fill_value): assert pd.isna(result) else: assert result == fill_value @pytest.mark.parametrize('scalar,dtype', [(False, SparseDtype(bool, False)), (0.0, SparseDtype('float64', 0)), (1, SparseDtype('int64', 1)), ('z', SparseDtype('object', 'z'))]) def test_scalar_with_index_infer_dtype(self, scalar, dtype): # GH 19163 arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == dtype assert exp.dtype == dtype @pytest.mark.parametrize("fill", [1, np.nan, 0]) def test_sparse_series_round_trip(self, kind, fill): # see gh-13999 arr = SparseArray([np.nan, 1, np.nan, 2, 3], kind=kind, fill_value=fill) res = SparseArray(SparseSeries(arr)) tm.assert_sp_array_equal(arr, res) arr = SparseArray([0, 0, 0, 1, 1, 2], dtype=np.int64, kind=kind, fill_value=fill) res = SparseArray(SparseSeries(arr), dtype=np.int64) tm.assert_sp_array_equal(arr, res) res = SparseArray(SparseSeries(arr)) tm.assert_sp_array_equal(arr, res) @pytest.mark.parametrize("fill", [True, False, np.nan]) def test_sparse_series_round_trip2(self, kind, fill): # see gh-13999 arr = SparseArray([True, False, True, True], dtype=np.bool, kind=kind, fill_value=fill) res = SparseArray(SparseSeries(arr)) tm.assert_sp_array_equal(arr, res) res = SparseArray(SparseSeries(arr)) tm.assert_sp_array_equal(arr, res) def test_get_item(self): assert np.isnan(self.arr[1]) assert self.arr[2] == 1 assert self.arr[7] == 5 assert self.zarr[0] == 0 assert self.zarr[2] == 1 assert self.zarr[7] == 5 errmsg = re.compile("bounds") with pytest.raises(IndexError, match=errmsg): self.arr[11] with pytest.raises(IndexError, match=errmsg): self.arr[-11] assert self.arr[-1] == self.arr[len(self.arr) - 1] def test_take_scalar_raises(self): msg = "'indices' must be an array, not a scalar '2'." with pytest.raises(ValueError, match=msg): self.arr.take(2) def test_take(self): exp = SparseArray(np.take(self.arr_data, [2, 3])) tm.assert_sp_array_equal(self.arr.take([2, 3]), exp) exp = SparseArray(np.take(self.arr_data, [0, 1, 2])) tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp) def test_take_fill_value(self): data = np.array([1, np.nan, 0, 3, 0]) sparse = SparseArray(data, fill_value=0) exp = SparseArray(np.take(data, [0]), fill_value=0) tm.assert_sp_array_equal(sparse.take([0]), exp) exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0) tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp) def test_take_negative(self): exp = SparseArray(np.take(self.arr_data, [-1])) tm.assert_sp_array_equal(self.arr.take([-1]), exp) exp = SparseArray(np.take(self.arr_data, [-4, -3, -2])) tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp) def test_bad_take(self): with pytest.raises(IndexError, match="bounds"): self.arr.take([11]) def test_take_filling(self): # similar tests as GH 12631 sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4]) result = sparse.take(np.array([1, 0, -1])) expected = SparseArray([np.nan, np.nan, 4]) tm.assert_sp_array_equal(result, expected) # XXX: test change: fill_value=True -> allow_fill=True result = sparse.take(np.array([1, 0, -1]), allow_fill=True) expected = SparseArray([np.nan, np.nan, np.nan]) tm.assert_sp_array_equal(result, expected) # allow_fill=False result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = SparseArray([np.nan, np.nan, 4]) tm.assert_sp_array_equal(result, expected) msg = "Invalid value in 'indices'" with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -2]), allow_fill=True) with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -5]), allow_fill=True) with pytest.raises(IndexError): sparse.take(np.array([1, -6])) with pytest.raises(IndexError): sparse.take(np.array([1, 5])) with pytest.raises(IndexError): sparse.take(np.array([1, 5]), allow_fill=True) def test_take_filling_fill_value(self): # same tests as GH 12631 sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0) result = sparse.take(np.array([1, 0, -1])) expected = SparseArray([0, np.nan, 4], fill_value=0) tm.assert_sp_array_equal(result, expected) # fill_value result = sparse.take(np.array([1, 0, -1]), allow_fill=True) # XXX: behavior change. # the old way of filling self.fill_value doesn't follow EA rules. # It's supposed to be self.dtype.na_value (nan in this case) expected = SparseArray([0, np.nan, np.nan], fill_value=0) tm.assert_sp_array_equal(result, expected) # allow_fill=False result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = SparseArray([0, np.nan, 4], fill_value=0) tm.assert_sp_array_equal(result, expected) msg = ("Invalid value in 'indices'.") with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -2]), allow_fill=True) with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -5]), allow_fill=True) with pytest.raises(IndexError): sparse.take(np.array([1, -6])) with pytest.raises(IndexError): sparse.take(np.array([1, 5])) with pytest.raises(IndexError): sparse.take(np.array([1, 5]), fill_value=True) def test_take_filling_all_nan(self): sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan]) # XXX: did the default kind from take change? result = sparse.take(np.array([1, 0, -1])) expected = SparseArray([np.nan, np.nan, np.nan], kind='block') tm.assert_sp_array_equal(result, expected) result = sparse.take(np.array([1, 0, -1]), fill_value=True) expected = SparseArray([np.nan, np.nan, np.nan], kind='block') tm.assert_sp_array_equal(result, expected) with pytest.raises(IndexError): sparse.take(np.array([1, -6])) with pytest.raises(IndexError): sparse.take(np.array([1, 5])) with pytest.raises(IndexError): sparse.take(np.array([1, 5]), fill_value=True) def test_set_item(self): def setitem(): self.arr[5] = 3 def setslice(): self.arr[1:5] = 2 with pytest.raises(TypeError, match="item assignment"): setitem() with pytest.raises(TypeError, match="item assignment"): setslice() def test_constructor_from_too_large_array(self): with pytest.raises(TypeError, match="expected dimension <= 1 data"): SparseArray(np.arange(10).reshape((2, 5))) def test_constructor_from_sparse(self): res = SparseArray(self.zarr) assert res.fill_value == 0 assert_almost_equal(res.sp_values, self.zarr.sp_values) def test_constructor_copy(self): cp = SparseArray(self.arr, copy=True) cp.sp_values[:3] = 0 assert not (self.arr.sp_values[:3] == 0).any() not_copy = SparseArray(self.arr) not_copy.sp_values[:3] = 0 assert (self.arr.sp_values[:3] == 0).all() def test_constructor_bool(self): # GH 10648 data = np.array([False, False, True, True, False, False]) arr = SparseArray(data, fill_value=False, dtype=bool) assert arr.dtype == SparseDtype(bool) tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True])) # Behavior change: np.asarray densifies. # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32)) for dense in [arr.to_dense(), arr.values]: assert dense.dtype == bool tm.assert_numpy_array_equal(dense, data) def test_constructor_bool_fill_value(self): arr = SparseArray([True, False, True], dtype=None) assert arr.dtype == SparseDtype(np.bool) assert not arr.fill_value arr = SparseArray([True, False, True], dtype=np.bool) assert arr.dtype == SparseDtype(np.bool) assert not arr.fill_value arr = SparseArray([True, False, True], dtype=np.bool, fill_value=True) assert arr.dtype == SparseDtype(np.bool, True) assert arr.fill_value def test_constructor_float32(self): # GH 10648 data = np.array([1., np.nan, 3], dtype=np.float32) arr = SparseArray(data, dtype=np.float32) assert arr.dtype == SparseDtype(np.float32) tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32)) # Behavior change: np.asarray densifies. # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([0, 2], dtype=np.int32)) for dense in [arr.to_dense(), arr.values]: assert dense.dtype == np.float32 tm.assert_numpy_array_equal(dense, data) def test_astype(self): # float -> float arr = SparseArray([None, None, 0, 2]) result = arr.astype("Sparse[float32]") expected = SparseArray([None, None, 0, 2], dtype=np.dtype('float32')) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("float64", fill_value=0) result = arr.astype(dtype) expected = SparseArray._simple_new( np.array([0., 2.], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("int64", 0) result = arr.astype(dtype) expected = SparseArray._simple_new(np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype) tm.assert_sp_array_equal(result, expected) arr = SparseArray([0, np.nan, 0, 1], fill_value=0) with pytest.raises(ValueError, match='NA'): arr.astype('Sparse[i8]') def test_astype_bool(self): a = pd.SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) result = a.astype(bool) expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0)) tm.assert_sp_array_equal(result, expected) # update fill value result = a.astype(SparseDtype(bool, False)) expected = SparseArray([True, False, False, True], dtype=SparseDtype(bool, False)) tm.assert_sp_array_equal(result, expected) def test_astype_all(self, any_real_dtype): vals = np.array([1, 2, 3]) arr = SparseArray(vals, fill_value=1) typ = np.dtype(any_real_dtype) res = arr.astype(typ) assert res.dtype == SparseDtype(typ, 1) assert res.sp_values.dtype == typ tm.assert_numpy_array_equal(np.asarray(res.values), vals.astype(typ)) def test_set_fill_value(self): arr = SparseArray([1., np.nan, 2.], fill_value=np.nan) arr.fill_value = 2 assert arr.fill_value == 2 arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64) arr.fill_value = 2 assert arr.fill_value == 2 # XXX: this seems fine? You can construct an integer # sparsearray with NaN fill value, why not update one? # coerces to int # msg = "unable to set fill_value 3\\.1 to int64 dtype" # with pytest.raises(ValueError, match=msg): arr.fill_value = 3.1 assert arr.fill_value == 3.1 # msg = "unable to set fill_value nan to int64 dtype" # with pytest.raises(ValueError, match=msg): arr.fill_value = np.nan assert np.isnan(arr.fill_value) arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) arr.fill_value = True assert arr.fill_value # coerces to bool # msg = "unable to set fill_value 0 to bool dtype" # with pytest.raises(ValueError, match=msg): arr.fill_value = 0 assert arr.fill_value == 0 # msg = "unable to set fill_value nan to bool dtype" # with pytest.raises(ValueError, match=msg): arr.fill_value = np.nan assert np.isnan(arr.fill_value) @pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]) def test_set_fill_invalid_non_scalar(self, val): arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) msg = "fill_value must be a scalar" with pytest.raises(ValueError, match=msg): arr.fill_value = val def test_copy_shallow(self): arr2 = self.arr.copy(deep=False) assert arr2.sp_values is self.arr.sp_values assert arr2.sp_index is self.arr.sp_index def test_values_asarray(self): assert_almost_equal(self.arr.values, self.arr_data) assert_almost_equal(self.arr.to_dense(), self.arr_data) @pytest.mark.parametrize('data,shape,dtype', [([0, 0, 0, 0, 0], (5, ), None), ([], (0, ), None), ([0], (1, ), None), (['A', 'A', np.nan, 'B'], (4, ), np.object)]) def test_shape(self, data, shape, dtype): # GH 21126 out = SparseArray(data, dtype=dtype) assert out.shape == shape @pytest.mark.parametrize("vals", [ [np.nan, np.nan, np.nan, np.nan, np.nan], [1, np.nan, np.nan, 3, np.nan], [1, np.nan, 0, 3, 0], ]) @pytest.mark.parametrize("method", ["to_dense", "get_values"]) @pytest.mark.parametrize("fill_value", [None, 0]) def test_dense_repr(self, vals, fill_value, method): vals = np.array(vals) arr = SparseArray(vals, fill_value=fill_value) dense_func = getattr(arr, method) res = dense_func() tm.assert_numpy_array_equal(res, vals) def test_getitem(self): def _checkit(i): assert_almost_equal(self.arr[i], self.arr.values[i]) for i in range(len(self.arr)): _checkit(i) _checkit(-i) def test_getitem_arraylike_mask(self): arr = SparseArray([0, 1, 2]) result = arr[[True, False, True]] expected = SparseArray([0, 2]) tm.assert_sp_array_equal(result, expected) def test_getslice(self): result = self.arr[:-3] exp = SparseArray(self.arr.values[:-3]) tm.assert_sp_array_equal(result, exp) result = self.arr[-4:] exp = SparseArray(self.arr.values[-4:]) tm.assert_sp_array_equal(result, exp) # two corner cases from Series result = self.arr[-12:] exp = SparseArray(self.arr) tm.assert_sp_array_equal(result, exp) result = self.arr[:-12] exp = SparseArray(self.arr.values[:0]) tm.assert_sp_array_equal(result, exp) def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) res = sparse[4:, ] exp = SparseArray(dense[4:, ]) tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) res = sparse[4:, ] exp = SparseArray(dense[4:, ], fill_value=0) tm.assert_sp_array_equal(res, exp) with pytest.raises(IndexError): sparse[4:, :] with pytest.raises(IndexError): # check numpy compat dense[4:, :] def test_boolean_slice_empty(self): arr = pd.SparseArray([0, 1, 2]) res = arr[[False, False, False]] assert res.dtype == arr.dtype @pytest.mark.parametrize( "op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) def test_binary_operators(self, op): op = getattr(operator, op) data1 = np.random.randn(20) data2 = np.random.randn(20) data1[::2] = np.nan data2[::3] = np.nan arr1 = SparseArray(data1) arr2 = SparseArray(data2) data1[::2] = 3 data2[::3] = 3 farr1 = SparseArray(data1, fill_value=3) farr2 = SparseArray(data2, fill_value=3) def _check_op(op, first, second): res = op(first, second) exp = SparseArray(op(first.values, second.values), fill_value=first.fill_value) assert isinstance(res, SparseArray) assert_almost_equal(res.values, exp.values) res2 = op(first, second.values) assert isinstance(res2, SparseArray) tm.assert_sp_array_equal(res, res2) res3 = op(first.values, second) assert isinstance(res3, SparseArray) tm.assert_sp_array_equal(res, res3) res4 = op(first, 4) assert isinstance(res4, SparseArray) # Ignore this if the actual op raises (e.g. pow). try: exp = op(first.values, 4) exp_fv = op(first.fill_value, 4) except ValueError: pass else: assert_almost_equal(res4.fill_value, exp_fv) assert_almost_equal(res4.values, exp) with np.errstate(all="ignore"): for first_arr, second_arr in [(arr1, arr2), (farr1, farr2)]: _check_op(op, first_arr, second_arr) def test_pickle(self): def _check_roundtrip(obj): unpickled = tm.round_trip_pickle(obj) tm.assert_sp_array_equal(unpickled, obj) _check_roundtrip(self.arr) _check_roundtrip(self.zarr) def test_generator_warnings(self): sp_arr = SparseArray([1, 2, 3]) with warnings.catch_warnings(record=True) as w: warnings.filterwarnings(action='always', category=DeprecationWarning) warnings.filterwarnings(action='always', category=PendingDeprecationWarning) for _ in sp_arr: pass assert len(w) == 0 def test_fillna(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) res = s.fillna(-1) exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(-1) exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0]) res = s.fillna(-1) exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0) res = s.fillna(-1) exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan]) res = s.fillna(-1) exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0) res = s.fillna(-1) exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) # float dtype's fill_value is np.nan, replaced by -1 s = SparseArray([0., 0., 0., 0.]) res = s.fillna(-1) exp = SparseArray([0., 0., 0., 0.], fill_value=-1) tm.assert_sp_array_equal(res, exp) # int dtype shouldn't have missing. No changes. s = SparseArray([0, 0, 0, 0]) assert s.dtype == SparseDtype(np.int64) assert s.fill_value == 0 res = s.fillna(-1) tm.assert_sp_array_equal(res, s) s = SparseArray([0, 0, 0, 0], fill_value=0) assert s.dtype == SparseDtype(np.int64) assert s.fill_value == 0 res = s.fillna(-1) exp = SparseArray([0, 0, 0, 0], fill_value=0) tm.assert_sp_array_equal(res, exp) # fill_value can be nan if there is no missing hole. # only fill_value will be changed s = SparseArray([0, 0, 0, 0], fill_value=np.nan) assert s.dtype == SparseDtype(np.int64, fill_value=np.nan) assert np.isnan(s.fill_value) res = s.fillna(-1) exp = SparseArray([0, 0, 0, 0], fill_value=-1) tm.assert_sp_array_equal(res, exp) def test_fillna_overlap(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) # filling with existing value doesn't replace existing value with # fill_value, i.e. existing 3 remains in sp_values res = s.fillna(3) exp = np.array([1, 3, 3, 3, 3], dtype=np.float64) tm.assert_numpy_array_equal(res.to_dense(), exp) s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(3) exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index, default_fill_value=0) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) if prefix is not None: dummy_strs = [ u'{prefix}{sep}{level}' if isinstance(v, text_type) else '{prefix}{sep}{level}' for v in levels ] dummy_cols = [ dummy_str.format(prefix=prefix, sep=prefix_sep, level=v) for dummy_str, v in zip(dummy_strs, levels) ] else: dummy_cols = levels if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] for ndx, code in enumerate(codes): if code == -1: # Blank entries if not dummy_na and code == -1, #GH4446 continue sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=np.uint8) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, dtype=np.uint8) return out else: dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _sparse_reindex(cls, inp, index=None, columns=None): if inp.ndim == 2: columns = inp.columns if columns is None else columns index_shape = len(index) if index is not None else len(inp) i_to_columns = dict() for i, col in enumerate(columns): if col in inp.dtypes: if index is None: i_to_columns[i] = inp[col] else: indexer = inp.index.reindex(index)[1] cond = indexer >= 0 available_indexer = indexer[cond] del indexer data = inp[col].iloc[available_indexer].to_numpy() ind = cond.nonzero()[0] spmatrix = sps.csc_matrix( (data, (ind, np.zeros_like(ind))), shape=(index_shape, 1), dtype=inp[col].dtype, ) # convert to SparseDtype(xxx, np.nan) # to ensure 0 in sparse_array not converted to np.nan if not _pd_sparse_miss_zero: sparse_array = pd.arrays.SparseArray.from_spmatrix( spmatrix) sparse_array = pd.arrays.SparseArray( sparse_array.sp_values, sparse_index=sparse_array.sp_index, fill_value=np.nan, dtype=pd.SparseDtype(sparse_array.dtype, np.nan), ) else: from pandas._libs.sparse import IntIndex sparse_array = pd.arrays.SparseArray( data, sparse_index=IntIndex(index_shape, ind), fill_value=np.nan, dtype=pd.SparseDtype(data.dtype, np.nan), ) series = pd.Series(sparse_array, index=index) i_to_columns[i] = series else: ind = index if index is not None else inp.index i_to_columns[i] = pd.DataFrame.sparse.from_spmatrix( sps.coo_matrix((index_shape, 1), dtype=np.float64), index=ind).iloc[:, 0] df = pd.DataFrame(i_to_columns) df.columns = columns return df else: indexer = inp.index.reindex(index)[1] cond = indexer >= 0 available_indexer = indexer[cond] del indexer data = inp.iloc[available_indexer].to_numpy() ind = cond.nonzero()[0] spmatrix = sps.csc_matrix( (data, (ind, np.zeros_like(ind))), shape=(len(index), 1), dtype=inp.dtype, ) sparse_array = pd.arrays.SparseArray.from_spmatrix(spmatrix) # convert to SparseDtype(xxx, np.nan) # to ensure 0 in sparse_array not converted to np.nan sparse_array = pd.arrays.SparseArray( sparse_array.sp_values, sparse_index=sparse_array.sp_index, fill_value=np.nan, dtype=pd.SparseDtype(sparse_array.dtype, np.nan), ) series = pd.Series(sparse_array, index=index, name=inp.name) return series
def __init__(self, data=None, index=None, sparse_index=None, kind='block', fill_value=None, name=None, dtype=None, copy=False, fastpath=False): # we are called internally, so short-circuit if fastpath: # data is an ndarray, index is defined if not isinstance(data, SingleBlockManager): data = SingleBlockManager(data, index, fastpath=True) if copy: data = data.copy() else: if data is None: data = [] if isinstance(data, Series) and name is None: name = data.name if isinstance(data, SparseArray): if index is not None: assert (len(index) == len(data)) sparse_index = data.sp_index if fill_value is None: fill_value = data.fill_value data = np.asarray(data) elif isinstance(data, SparseSeries): if index is None: index = data.index.view() if fill_value is None: fill_value = data.fill_value # extract the SingleBlockManager data = data._data elif isinstance(data, (Series, dict)): if index is None: index = data.index.view() data = Series(data) res = make_sparse(data, kind=kind, fill_value=fill_value) data, sparse_index, fill_value = res elif isinstance(data, (tuple, list, np.ndarray)): # array-like if sparse_index is None: res = make_sparse(data, kind=kind, fill_value=fill_value) data, sparse_index, fill_value = res else: assert (len(data) == sparse_index.npoints) elif isinstance(data, SingleBlockManager): if dtype is not None: data = data.astype(dtype) if index is None: index = data.index.view() else: data = data.reindex(index, copy=False) else: length = len(index) if data == fill_value or (isnull(data) and isnull(fill_value)): if kind == 'block': sparse_index = BlockIndex(length, [], []) else: sparse_index = IntIndex(length, []) data = np.array([]) else: if kind == 'block': locs, lens = ([0], [length]) if length else ([], []) sparse_index = BlockIndex(length, locs, lens) else: sparse_index = IntIndex(length, index) v = data data = np.empty(length) data.fill(v) if index is None: index = com._default_index(sparse_index.length) index = _ensure_index(index) # create/copy the manager if isinstance(data, SingleBlockManager): if copy: data = data.copy() else: # create a sparse array if not isinstance(data, SparseArray): data = SparseArray(data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype, copy=copy) data = SingleBlockManager(data, index) generic.NDFrame.__init__(self, data) self.index = index self.name = name
class TestConstructors: def test_constructor_dtype(self): arr = SparseArray([np.nan, 1, 2, np.nan]) assert arr.dtype == SparseDtype(np.float64, np.nan) assert arr.dtype.subtype == np.float64 assert np.isnan(arr.fill_value) arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0) assert arr.dtype == SparseDtype(np.float64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], dtype=np.float64) assert arr.dtype == SparseDtype(np.float64, np.nan) assert np.isnan(arr.fill_value) arr = SparseArray([0, 1, 2, 4], dtype=np.int64) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], dtype=None) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 def test_constructor_dtype_str(self): result = SparseArray([1, 2, 3], dtype="int") expected = SparseArray([1, 2, 3], dtype=int) tm.assert_sp_array_equal(result, expected) def test_constructor_sparse_dtype(self): result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1)) expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64) tm.assert_sp_array_equal(result, expected) assert result.sp_values.dtype == np.dtype("int64") def test_constructor_sparse_dtype_str(self): result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]") expected = SparseArray([1, 0, 0, 1], dtype=np.int32) tm.assert_sp_array_equal(result, expected) assert result.sp_values.dtype == np.dtype("int32") def test_constructor_object_dtype(self): # GH#11856 arr = SparseArray(["A", "A", np.nan, "B"], dtype=object) assert arr.dtype == SparseDtype(object) assert np.isnan(arr.fill_value) arr = SparseArray(["A", "A", np.nan, "B"], dtype=object, fill_value="A") assert arr.dtype == SparseDtype(object, "A") assert arr.fill_value == "A" # GH#17574 data = [False, 0, 100.0, 0.0] arr = SparseArray(data, dtype=object, fill_value=False) assert arr.dtype == SparseDtype(object, False) assert arr.fill_value is False arr_expected = np.array(data, dtype=object) it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) assert np.fromiter(it, dtype=np.bool_).all() @pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int]) def test_constructor_na_dtype(self, dtype): with pytest.raises(ValueError, match="Cannot convert"): SparseArray([0, 1, np.nan], dtype=dtype) def test_constructor_warns_when_losing_timezone(self): # GH#32501 warn when losing timezone information dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]")) with tm.assert_produces_warning(UserWarning): result = SparseArray(dti) tm.assert_sp_array_equal(result, expected) with tm.assert_produces_warning(UserWarning): result = SparseArray(pd.Series(dti)) tm.assert_sp_array_equal(result, expected) def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) # TODO: actionable? # XXX: Behavior change: specifying SparseIndex no longer changes the # fill_value expected = SparseArray([0, 1, 2, 0], kind="integer") tm.assert_sp_array_equal(arr, expected) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray( data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=np.int64, fill_value=0, ) exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray( data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=None, fill_value=0, ) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) def test_constructor_spindex_dtype_scalar(self, sparse_index): # scalar input arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 def test_constructor_spindex_dtype_scalar_broadcasts(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 @pytest.mark.parametrize( "data, fill_value", [ (np.array([1, 2]), 0), (np.array([1.0, 2.0]), np.nan), ([True, False], False), ([pd.Timestamp("2017-01-01")], pd.NaT), ], ) def test_constructor_inferred_fill_value(self, data, fill_value): result = SparseArray(data).fill_value if isna(fill_value): assert isna(result) else: assert result == fill_value @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) @pytest.mark.parametrize("size", [0, 10]) @td.skip_if_no_scipy def test_from_spmatrix(self, size, format): import scipy.sparse mat = scipy.sparse.random(size, 1, density=0.5, format=format) result = SparseArray.from_spmatrix(mat) result = np.asarray(result) expected = mat.toarray().ravel() tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) @td.skip_if_no_scipy def test_from_spmatrix_including_explicit_zero(self, format): import scipy.sparse mat = scipy.sparse.random(10, 1, density=0.5, format=format) mat.data[0] = 0 result = SparseArray.from_spmatrix(mat) result = np.asarray(result) expected = mat.toarray().ravel() tm.assert_numpy_array_equal(result, expected) @td.skip_if_no_scipy def test_from_spmatrix_raises(self): import scipy.sparse mat = scipy.sparse.eye(5, 4, format="csc") with pytest.raises(ValueError, match="not '4'"): SparseArray.from_spmatrix(mat) @pytest.mark.parametrize( "scalar,dtype", [ (False, SparseDtype(bool, False)), (0.0, SparseDtype("float64", 0)), (1, SparseDtype("int64", 1)), ("z", SparseDtype("object", "z")), ], ) def test_scalar_with_index_infer_dtype(self, scalar, dtype): # GH#19163 with tm.assert_produces_warning( FutureWarning, match="The index argument has been deprecated"): arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == dtype assert exp.dtype == dtype def test_constructor_from_too_large_array(self): with pytest.raises(TypeError, match="expected dimension <= 1 data"): SparseArray(np.arange(10).reshape((2, 5))) def test_constructor_from_sparse(self): zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) res = SparseArray(zarr) assert res.fill_value == 0 tm.assert_almost_equal(res.sp_values, zarr.sp_values) def test_constructor_copy(self): arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) arr = SparseArray(arr_data) cp = SparseArray(arr, copy=True) cp.sp_values[:3] = 0 assert not (arr.sp_values[:3] == 0).any() not_copy = SparseArray(arr) not_copy.sp_values[:3] = 0 assert (arr.sp_values[:3] == 0).all() def test_constructor_bool(self): # GH#10648 data = np.array([False, False, True, True, False, False]) arr = SparseArray(data, fill_value=False, dtype=bool) assert arr.dtype == SparseDtype(bool) tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True])) # Behavior change: np.asarray densifies. # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32)) dense = arr.to_dense() assert dense.dtype == bool tm.assert_numpy_array_equal(dense, data) def test_constructor_bool_fill_value(self): arr = SparseArray([True, False, True], dtype=None) assert arr.dtype == SparseDtype(np.bool_) assert not arr.fill_value arr = SparseArray([True, False, True], dtype=np.bool_) assert arr.dtype == SparseDtype(np.bool_) assert not arr.fill_value arr = SparseArray([True, False, True], dtype=np.bool_, fill_value=True) assert arr.dtype == SparseDtype(np.bool_, True) assert arr.fill_value def test_constructor_float32(self): # GH#10648 data = np.array([1.0, np.nan, 3], dtype=np.float32) arr = SparseArray(data, dtype=np.float32) assert arr.dtype == SparseDtype(np.float32) tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32)) # Behavior change: np.asarray densifies. # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([0, 2], dtype=np.int32)) dense = arr.to_dense() assert dense.dtype == np.float32 tm.assert_numpy_array_equal(dense, data)
class TestSparseArray: def setup_method(self, method): self.arr_data = np.array( [np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) self.arr = SparseArray(self.arr_data) self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) def test_constructor_dtype(self): arr = SparseArray([np.nan, 1, 2, np.nan]) assert arr.dtype == SparseDtype(np.float64, np.nan) assert arr.dtype.subtype == np.float64 assert np.isnan(arr.fill_value) arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0) assert arr.dtype == SparseDtype(np.float64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], dtype=np.float64) assert arr.dtype == SparseDtype(np.float64, np.nan) assert np.isnan(arr.fill_value) arr = SparseArray([0, 1, 2, 4], dtype=np.int64) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], dtype=None) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) assert arr.dtype == SparseDtype(np.int64, 0) assert arr.fill_value == 0 def test_constructor_dtype_str(self): result = SparseArray([1, 2, 3], dtype="int") expected = SparseArray([1, 2, 3], dtype=int) tm.assert_sp_array_equal(result, expected) def test_constructor_sparse_dtype(self): result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1)) expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64) tm.assert_sp_array_equal(result, expected) assert result.sp_values.dtype == np.dtype("int64") def test_constructor_sparse_dtype_str(self): result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]") expected = SparseArray([1, 0, 0, 1], dtype=np.int32) tm.assert_sp_array_equal(result, expected) assert result.sp_values.dtype == np.dtype("int32") def test_constructor_object_dtype(self): # GH 11856 arr = SparseArray(["A", "A", np.nan, "B"], dtype=np.object) assert arr.dtype == SparseDtype(np.object) assert np.isnan(arr.fill_value) arr = SparseArray(["A", "A", np.nan, "B"], dtype=np.object, fill_value="A") assert arr.dtype == SparseDtype(np.object, "A") assert arr.fill_value == "A" # GH 17574 data = [False, 0, 100.0, 0.0] arr = SparseArray(data, dtype=np.object, fill_value=False) assert arr.dtype == SparseDtype(np.object, False) assert arr.fill_value is False arr_expected = np.array(data, dtype=np.object) it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) assert np.fromiter(it, dtype=np.bool).all() @pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int]) def test_constructor_na_dtype(self, dtype): with pytest.raises(ValueError, match="Cannot convert"): SparseArray([0, 1, np.nan], dtype=dtype) def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) # XXX: Behavior change: specifying SparseIndex no longer changes the # fill_value expected = SparseArray([0, 1, 2, 0], kind="integer") tm.assert_sp_array_equal(arr, expected) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray( data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=np.int64, fill_value=0, ) exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray( data=[1, 2, 3], sparse_index=IntIndex(4, [1, 2, 3]), dtype=None, fill_value=0, ) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) def test_constructor_spindex_dtype_scalar(self, sparse_index): # scalar input arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 def test_constructor_spindex_dtype_scalar_broadcasts(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 @pytest.mark.parametrize( "data, fill_value", [ (np.array([1, 2]), 0), (np.array([1.0, 2.0]), np.nan), ([True, False], False), ([pd.Timestamp("2017-01-01")], pd.NaT), ], ) def test_constructor_inferred_fill_value(self, data, fill_value): result = SparseArray(data).fill_value if pd.isna(fill_value): assert pd.isna(result) else: assert result == fill_value @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) @pytest.mark.parametrize( "size", [ pytest.param( 0, marks=td.skip_if_np_lt("1.16", reason="NumPy-11383")), 10 ], ) @td.skip_if_no_scipy def test_from_spmatrix(self, size, format): import scipy.sparse mat = scipy.sparse.random(size, 1, density=0.5, format=format) result = SparseArray.from_spmatrix(mat) result = np.asarray(result) expected = mat.toarray().ravel() tm.assert_numpy_array_equal(result, expected) @td.skip_if_no_scipy def test_from_spmatrix_raises(self): import scipy.sparse mat = scipy.sparse.eye(5, 4, format="csc") with pytest.raises(ValueError, match="not '4'"): SparseArray.from_spmatrix(mat) @pytest.mark.parametrize( "scalar,dtype", [ (False, SparseDtype(bool, False)), (0.0, SparseDtype("float64", 0)), (1, SparseDtype("int64", 1)), ("z", SparseDtype("object", "z")), ], ) def test_scalar_with_index_infer_dtype(self, scalar, dtype): # GH 19163 arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) exp = SparseArray([scalar, scalar, scalar], fill_value=scalar) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == dtype assert exp.dtype == dtype def test_get_item(self): assert np.isnan(self.arr[1]) assert self.arr[2] == 1 assert self.arr[7] == 5 assert self.zarr[0] == 0 assert self.zarr[2] == 1 assert self.zarr[7] == 5 errmsg = re.compile("bounds") with pytest.raises(IndexError, match=errmsg): self.arr[11] with pytest.raises(IndexError, match=errmsg): self.arr[-11] assert self.arr[-1] == self.arr[len(self.arr) - 1] def test_take_scalar_raises(self): msg = "'indices' must be an array, not a scalar '2'." with pytest.raises(ValueError, match=msg): self.arr.take(2) def test_take(self): exp = SparseArray(np.take(self.arr_data, [2, 3])) tm.assert_sp_array_equal(self.arr.take([2, 3]), exp) exp = SparseArray(np.take(self.arr_data, [0, 1, 2])) tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp) def test_take_fill_value(self): data = np.array([1, np.nan, 0, 3, 0]) sparse = SparseArray(data, fill_value=0) exp = SparseArray(np.take(data, [0]), fill_value=0) tm.assert_sp_array_equal(sparse.take([0]), exp) exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0) tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp) def test_take_negative(self): exp = SparseArray(np.take(self.arr_data, [-1])) tm.assert_sp_array_equal(self.arr.take([-1]), exp) exp = SparseArray(np.take(self.arr_data, [-4, -3, -2])) tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp) @pytest.mark.parametrize("fill_value", [0, None, np.nan]) def test_shift_fill_value(self, fill_value): # GH #24128 sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0) res = sparse.shift(1, fill_value=fill_value) if isna(fill_value): fill_value = res.dtype.na_value exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0) tm.assert_sp_array_equal(res, exp) def test_bad_take(self): with pytest.raises(IndexError, match="bounds"): self.arr.take([11]) def test_take_filling(self): # similar tests as GH 12631 sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4]) result = sparse.take(np.array([1, 0, -1])) expected = SparseArray([np.nan, np.nan, 4]) tm.assert_sp_array_equal(result, expected) # XXX: test change: fill_value=True -> allow_fill=True result = sparse.take(np.array([1, 0, -1]), allow_fill=True) expected = SparseArray([np.nan, np.nan, np.nan]) tm.assert_sp_array_equal(result, expected) # allow_fill=False result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = SparseArray([np.nan, np.nan, 4]) tm.assert_sp_array_equal(result, expected) msg = "Invalid value in 'indices'" with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -2]), allow_fill=True) with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -5]), allow_fill=True) with pytest.raises(IndexError): sparse.take(np.array([1, -6])) with pytest.raises(IndexError): sparse.take(np.array([1, 5])) with pytest.raises(IndexError): sparse.take(np.array([1, 5]), allow_fill=True) def test_take_filling_fill_value(self): # same tests as GH 12631 sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0) result = sparse.take(np.array([1, 0, -1])) expected = SparseArray([0, np.nan, 4], fill_value=0) tm.assert_sp_array_equal(result, expected) # fill_value result = sparse.take(np.array([1, 0, -1]), allow_fill=True) # XXX: behavior change. # the old way of filling self.fill_value doesn't follow EA rules. # It's supposed to be self.dtype.na_value (nan in this case) expected = SparseArray([0, np.nan, np.nan], fill_value=0) tm.assert_sp_array_equal(result, expected) # allow_fill=False result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = SparseArray([0, np.nan, 4], fill_value=0) tm.assert_sp_array_equal(result, expected) msg = "Invalid value in 'indices'." with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -2]), allow_fill=True) with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -5]), allow_fill=True) with pytest.raises(IndexError): sparse.take(np.array([1, -6])) with pytest.raises(IndexError): sparse.take(np.array([1, 5])) with pytest.raises(IndexError): sparse.take(np.array([1, 5]), fill_value=True) def test_take_filling_all_nan(self): sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan]) # XXX: did the default kind from take change? result = sparse.take(np.array([1, 0, -1])) expected = SparseArray([np.nan, np.nan, np.nan], kind="block") tm.assert_sp_array_equal(result, expected) result = sparse.take(np.array([1, 0, -1]), fill_value=True) expected = SparseArray([np.nan, np.nan, np.nan], kind="block") tm.assert_sp_array_equal(result, expected) with pytest.raises(IndexError): sparse.take(np.array([1, -6])) with pytest.raises(IndexError): sparse.take(np.array([1, 5])) with pytest.raises(IndexError): sparse.take(np.array([1, 5]), fill_value=True) def test_set_item(self): def setitem(): self.arr[5] = 3 def setslice(): self.arr[1:5] = 2 with pytest.raises(TypeError, match="assignment via setitem"): setitem() with pytest.raises(TypeError, match="assignment via setitem"): setslice() def test_constructor_from_too_large_array(self): with pytest.raises(TypeError, match="expected dimension <= 1 data"): SparseArray(np.arange(10).reshape((2, 5))) def test_constructor_from_sparse(self): res = SparseArray(self.zarr) assert res.fill_value == 0 tm.assert_almost_equal(res.sp_values, self.zarr.sp_values) def test_constructor_copy(self): cp = SparseArray(self.arr, copy=True) cp.sp_values[:3] = 0 assert not (self.arr.sp_values[:3] == 0).any() not_copy = SparseArray(self.arr) not_copy.sp_values[:3] = 0 assert (self.arr.sp_values[:3] == 0).all() def test_constructor_bool(self): # GH 10648 data = np.array([False, False, True, True, False, False]) arr = SparseArray(data, fill_value=False, dtype=bool) assert arr.dtype == SparseDtype(bool) tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True])) # Behavior change: np.asarray densifies. # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32)) dense = arr.to_dense() assert dense.dtype == bool tm.assert_numpy_array_equal(dense, data) def test_constructor_bool_fill_value(self): arr = SparseArray([True, False, True], dtype=None) assert arr.dtype == SparseDtype(np.bool) assert not arr.fill_value arr = SparseArray([True, False, True], dtype=np.bool) assert arr.dtype == SparseDtype(np.bool) assert not arr.fill_value arr = SparseArray([True, False, True], dtype=np.bool, fill_value=True) assert arr.dtype == SparseDtype(np.bool, True) assert arr.fill_value def test_constructor_float32(self): # GH 10648 data = np.array([1.0, np.nan, 3], dtype=np.float32) arr = SparseArray(data, dtype=np.float32) assert arr.dtype == SparseDtype(np.float32) tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32)) # Behavior change: np.asarray densifies. # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([0, 2], dtype=np.int32)) dense = arr.to_dense() assert dense.dtype == np.float32 tm.assert_numpy_array_equal(dense, data) def test_astype(self): # float -> float arr = SparseArray([None, None, 0, 2]) result = arr.astype("Sparse[float32]") expected = SparseArray([None, None, 0, 2], dtype=np.dtype("float32")) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("float64", fill_value=0) result = arr.astype(dtype) expected = SparseArray._simple_new( np.array([0.0, 2.0], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("int64", 0) result = arr.astype(dtype) expected = SparseArray._simple_new(np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype) tm.assert_sp_array_equal(result, expected) arr = SparseArray([0, np.nan, 0, 1], fill_value=0) with pytest.raises(ValueError, match="NA"): arr.astype("Sparse[i8]") def test_astype_bool(self): a = pd.SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) result = a.astype(bool) expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0)) tm.assert_sp_array_equal(result, expected) # update fill value result = a.astype(SparseDtype(bool, False)) expected = SparseArray([True, False, False, True], dtype=SparseDtype(bool, False)) tm.assert_sp_array_equal(result, expected) def test_astype_all(self, any_real_dtype): vals = np.array([1, 2, 3]) arr = SparseArray(vals, fill_value=1) typ = np.dtype(any_real_dtype) res = arr.astype(typ) assert res.dtype == SparseDtype(typ, 1) assert res.sp_values.dtype == typ tm.assert_numpy_array_equal(np.asarray(res.to_dense()), vals.astype(typ)) @pytest.mark.parametrize( "array, dtype, expected", [ ( SparseArray([0, 1]), "float", SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)), ), (SparseArray([0, 1]), bool, SparseArray([False, True])), ( SparseArray([0, 1], fill_value=1), bool, SparseArray([False, True], dtype=SparseDtype(bool, True)), ), pytest.param( SparseArray([0, 1]), "datetime64[ns]", SparseArray( np.array([0, 1], dtype="datetime64[ns]"), dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")), ), marks=[pytest.mark.xfail(reason="NumPy-7619")], ), ( SparseArray([0, 1, 10]), str, SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), ), (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), ( SparseArray([0, 1, 0]), object, SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)), ), ], ) def test_astype_more(self, array, dtype, expected): result = array.astype(dtype) tm.assert_sp_array_equal(result, expected) def test_astype_nan_raises(self): arr = SparseArray([1.0, np.nan]) with pytest.raises(ValueError, match="Cannot convert non-finite"): arr.astype(int) def test_set_fill_value(self): arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan) arr.fill_value = 2 assert arr.fill_value == 2 arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64) arr.fill_value = 2 assert arr.fill_value == 2 # XXX: this seems fine? You can construct an integer # sparsearray with NaN fill value, why not update one? # coerces to int # msg = "unable to set fill_value 3\\.1 to int64 dtype" # with pytest.raises(ValueError, match=msg): arr.fill_value = 3.1 assert arr.fill_value == 3.1 # msg = "unable to set fill_value nan to int64 dtype" # with pytest.raises(ValueError, match=msg): arr.fill_value = np.nan assert np.isnan(arr.fill_value) arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) arr.fill_value = True assert arr.fill_value # coerces to bool # msg = "unable to set fill_value 0 to bool dtype" # with pytest.raises(ValueError, match=msg): arr.fill_value = 0 assert arr.fill_value == 0 # msg = "unable to set fill_value nan to bool dtype" # with pytest.raises(ValueError, match=msg): arr.fill_value = np.nan assert np.isnan(arr.fill_value) @pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]) def test_set_fill_invalid_non_scalar(self, val): arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) msg = "fill_value must be a scalar" with pytest.raises(ValueError, match=msg): arr.fill_value = val def test_copy(self): arr2 = self.arr.copy() assert arr2.sp_values is not self.arr.sp_values assert arr2.sp_index is self.arr.sp_index def test_values_asarray(self): tm.assert_almost_equal(self.arr.to_dense(), self.arr_data) @pytest.mark.parametrize( "data,shape,dtype", [ ([0, 0, 0, 0, 0], (5, ), None), ([], (0, ), None), ([0], (1, ), None), (["A", "A", np.nan, "B"], (4, ), np.object), ], ) def test_shape(self, data, shape, dtype): # GH 21126 out = SparseArray(data, dtype=dtype) assert out.shape == shape @pytest.mark.parametrize( "vals", [ [np.nan, np.nan, np.nan, np.nan, np.nan], [1, np.nan, np.nan, 3, np.nan], [1, np.nan, 0, 3, 0], ], ) @pytest.mark.parametrize("fill_value", [None, 0]) def test_dense_repr(self, vals, fill_value): vals = np.array(vals) arr = SparseArray(vals, fill_value=fill_value) res = arr.to_dense() tm.assert_numpy_array_equal(res, vals) with tm.assert_produces_warning(FutureWarning): res2 = arr.get_values() tm.assert_numpy_array_equal(res2, vals) def test_getitem(self): def _checkit(i): tm.assert_almost_equal(self.arr[i], self.arr.to_dense()[i]) for i in range(len(self.arr)): _checkit(i) _checkit(-i) def test_getitem_arraylike_mask(self): arr = SparseArray([0, 1, 2]) result = arr[[True, False, True]] expected = SparseArray([0, 2]) tm.assert_sp_array_equal(result, expected) def test_getslice(self): result = self.arr[:-3] exp = SparseArray(self.arr.to_dense()[:-3]) tm.assert_sp_array_equal(result, exp) result = self.arr[-4:] exp = SparseArray(self.arr.to_dense()[-4:]) tm.assert_sp_array_equal(result, exp) # two corner cases from Series result = self.arr[-12:] exp = SparseArray(self.arr) tm.assert_sp_array_equal(result, exp) result = self.arr[:-12] exp = SparseArray(self.arr.to_dense()[:0]) tm.assert_sp_array_equal(result, exp) def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) res = sparse[4:, ] # noqa: E231 exp = SparseArray(dense[4:, ]) # noqa: E231 tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) res = sparse[4:, ] # noqa: E231 exp = SparseArray(dense[4:, ], fill_value=0) # noqa: E231 tm.assert_sp_array_equal(res, exp) with pytest.raises(IndexError): sparse[4:, :] with pytest.raises(IndexError): # check numpy compat dense[4:, :] def test_boolean_slice_empty(self): arr = pd.SparseArray([0, 1, 2]) res = arr[[False, False, False]] assert res.dtype == arr.dtype @pytest.mark.parametrize( "op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) def test_binary_operators(self, op): op = getattr(operator, op) data1 = np.random.randn(20) data2 = np.random.randn(20) data1[::2] = np.nan data2[::3] = np.nan arr1 = SparseArray(data1) arr2 = SparseArray(data2) data1[::2] = 3 data2[::3] = 3 farr1 = SparseArray(data1, fill_value=3) farr2 = SparseArray(data2, fill_value=3) def _check_op(op, first, second): res = op(first, second) exp = SparseArray(op(first.to_dense(), second.to_dense()), fill_value=first.fill_value) assert isinstance(res, SparseArray) tm.assert_almost_equal(res.to_dense(), exp.to_dense()) res2 = op(first, second.to_dense()) assert isinstance(res2, SparseArray) tm.assert_sp_array_equal(res, res2) res3 = op(first.to_dense(), second) assert isinstance(res3, SparseArray) tm.assert_sp_array_equal(res, res3) res4 = op(first, 4) assert isinstance(res4, SparseArray) # Ignore this if the actual op raises (e.g. pow). try: exp = op(first.to_dense(), 4) exp_fv = op(first.fill_value, 4) except ValueError: pass else: tm.assert_almost_equal(res4.fill_value, exp_fv) tm.assert_almost_equal(res4.to_dense(), exp) with np.errstate(all="ignore"): for first_arr, second_arr in [(arr1, arr2), (farr1, farr2)]: _check_op(op, first_arr, second_arr) def test_pickle(self): def _check_roundtrip(obj): unpickled = tm.round_trip_pickle(obj) tm.assert_sp_array_equal(unpickled, obj) _check_roundtrip(self.arr) _check_roundtrip(self.zarr) def test_generator_warnings(self): sp_arr = SparseArray([1, 2, 3]) with warnings.catch_warnings(record=True) as w: warnings.filterwarnings(action="always", category=DeprecationWarning) warnings.filterwarnings(action="always", category=PendingDeprecationWarning) for _ in sp_arr: pass assert len(w) == 0 def test_fillna(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) res = s.fillna(-1) exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(-1) exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0]) res = s.fillna(-1) exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0) res = s.fillna(-1) exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan]) res = s.fillna(-1) exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0) res = s.fillna(-1) exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) # float dtype's fill_value is np.nan, replaced by -1 s = SparseArray([0.0, 0.0, 0.0, 0.0]) res = s.fillna(-1) exp = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1) tm.assert_sp_array_equal(res, exp) # int dtype shouldn't have missing. No changes. s = SparseArray([0, 0, 0, 0]) assert s.dtype == SparseDtype(np.int64) assert s.fill_value == 0 res = s.fillna(-1) tm.assert_sp_array_equal(res, s) s = SparseArray([0, 0, 0, 0], fill_value=0) assert s.dtype == SparseDtype(np.int64) assert s.fill_value == 0 res = s.fillna(-1) exp = SparseArray([0, 0, 0, 0], fill_value=0) tm.assert_sp_array_equal(res, exp) # fill_value can be nan if there is no missing hole. # only fill_value will be changed s = SparseArray([0, 0, 0, 0], fill_value=np.nan) assert s.dtype == SparseDtype(np.int64, fill_value=np.nan) assert np.isnan(s.fill_value) res = s.fillna(-1) exp = SparseArray([0, 0, 0, 0], fill_value=-1) tm.assert_sp_array_equal(res, exp) def test_fillna_overlap(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) # filling with existing value doesn't replace existing value with # fill_value, i.e. existing 3 remains in sp_values res = s.fillna(3) exp = np.array([1, 3, 3, 3, 3], dtype=np.float64) tm.assert_numpy_array_equal(res.to_dense(), exp) s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(3) exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) def test_nonzero(self): # Tests regression #21172. sa = pd.SparseArray( [float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) expected = np.array([2, 5, 9], dtype=np.int32) (result, ) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) sa = pd.SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) (result, ) = sa.nonzero() tm.assert_numpy_array_equal(expected, result)
def _concat_same_type(cls, to_concat): fill_values = [x.fill_value for x in to_concat] fill_value = fill_values[0] # np.nan isn't a singleton, so we may end up with multiple # NaNs here, so we ignore tha all NA case too. if not (len(set(fill_values)) == 1 or isna(fill_values).all()): warnings.warn( "Concatenating sparse arrays with multiple fill " f"values: '{fill_values}'. Picking the first and " "converting the rest.", PerformanceWarning, stacklevel=6, ) keep = to_concat[0] to_concat2 = [keep] for arr in to_concat[1:]: to_concat2.append(cls(np.asarray(arr), fill_value=fill_value)) to_concat = to_concat2 values = [] length = 0 if to_concat: sp_kind = to_concat[0].kind else: sp_kind = "integer" if sp_kind == "integer": indices = [] for arr in to_concat: idx = arr.sp_index.to_int_index().indices.copy() idx += length # TODO: wraparound length += arr.sp_index.length values.append(arr.sp_values) indices.append(idx) data = np.concatenate(values) indices = np.concatenate(indices) sp_index = IntIndex(length, indices) else: # when concatenating block indices, we don't claim that you'll # get an identical index as concating the values and then # creating a new index. We don't want to spend the time trying # to merge blocks across arrays in `to_concat`, so the resulting # BlockIndex may have more blocs. blengths = [] blocs = [] for arr in to_concat: idx = arr.sp_index.to_block_index() values.append(arr.sp_values) blocs.append(idx.blocs.copy() + length) blengths.append(idx.blengths) length += arr.sp_index.length data = np.concatenate(values) blocs = np.concatenate(blocs) blengths = np.concatenate(blengths) sp_index = BlockIndex(length, blocs, blengths) return cls(data, sparse_index=sp_index, fill_value=fill_value)
def __init__(self, data=None, index=None, sparse_index=None, kind='block', fill_value=None, name=None, dtype=None, copy=False, fastpath=False): # we are called internally, so short-circuit if fastpath: # data is an ndarray, index is defined if not isinstance(data, SingleBlockManager): data = SingleBlockManager(data, index, fastpath=True) if copy: data = data.copy() else: if data is None: data = [] if isinstance(data, Series) and name is None: name = data.name if isinstance(data, SparseArray): if index is not None: assert (len(index) == len(data)) sparse_index = data.sp_index if fill_value is None: fill_value = data.fill_value data = np.asarray(data) elif isinstance(data, SparseSeries): if index is None: index = data.index.view() if fill_value is None: fill_value = data.fill_value # extract the SingleBlockManager data = data._data elif isinstance(data, (Series, dict)): data = Series(data, index=index) index = data.index.view() res = make_sparse(data, kind=kind, fill_value=fill_value) data, sparse_index, fill_value = res elif isinstance(data, (tuple, list, np.ndarray)): # array-like if sparse_index is None: res = make_sparse(data, kind=kind, fill_value=fill_value) data, sparse_index, fill_value = res else: assert (len(data) == sparse_index.npoints) elif isinstance(data, SingleBlockManager): if dtype is not None: data = data.astype(dtype) if index is None: index = data.index.view() elif not data.index.equals(index) or copy: # pragma: no cover # GH#19275 SingleBlockManager input should only be called # internally raise AssertionError('Cannot pass both SingleBlockManager ' '`data` argument and a different ' '`index` argument. `copy` must ' 'be False.') else: length = len(index) if data == fill_value or (isna(data) and isna(fill_value)): if kind == 'block': sparse_index = BlockIndex(length, [], []) else: sparse_index = IntIndex(length, []) data = np.array([]) else: if kind == 'block': locs, lens = ([0], [length]) if length else ([], []) sparse_index = BlockIndex(length, locs, lens) else: sparse_index = IntIndex(length, index) v = data data = np.empty(length) data.fill(v) if index is None: index = ibase.default_index(sparse_index.length) index = ensure_index(index) # create/copy the manager if isinstance(data, SingleBlockManager): if copy: data = data.copy() else: # create a sparse array if not isinstance(data, SparseArray): data = SparseArray(data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype, copy=copy) data = SingleBlockManager(data, index) generic.NDFrame.__init__(self, data) self.index = index self.name = name