def fillna(self, value=None, method=None, limit=None): """ Fill missing values with `value`. Parameters ---------- value : scalar, optional method : str, optional .. warning:: Using 'method' will result in high memory use, as all `fill_value` methods will be converted to an in-memory ndarray limit : int, optional Returns ------- SparseArray Notes ----- When `value` is specified, the result's ``fill_value`` depends on ``self.fill_value``. The goal is to maintain low-memory use. If ``self.fill_value`` is NA, the result dtype will be ``SparseDtype(self.dtype, fill_value=value)``. This will preserve amount of memory used before and after filling. When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. """ if (method is None and value is None) or ( method is not None and value is not None ): raise ValueError("Must specify one of 'method' or 'value'.") elif method is not None: msg = "fillna with 'method' requires high memory usage." warnings.warn(msg, PerformanceWarning) filled = interpolate_2d(np.asarray(self), method=method, limit=limit) return type(self)(filled, fill_value=self.fill_value) else: new_values = np.where(isna(self.sp_values), value, self.sp_values) if self._null_fill_value: # This is essentially just updating the dtype. new_dtype = SparseDtype(self.dtype.subtype, fill_value=value) else: new_dtype = self.dtype return self._simple_new(new_values, self._sparse_index, new_dtype)
def from_spmatrix(cls, data, index=None, columns=None): """ Create a new DataFrame from a scipy sparse matrix. .. versionadded:: 0.25.0 Parameters ---------- data : scipy.sparse.spmatrix Must be convertible to csc format. index, columns : Index, optional Row and column labels to use for the resulting DataFrame. Defaults to a RangeIndex. Returns ------- DataFrame Each column of the DataFrame is stored as a :class:`arrays.SparseArray`. Examples -------- >>> import scipy.sparse >>> mat = scipy.sparse.eye(3) >>> pd.DataFrame.sparse.from_spmatrix(mat) 0 1 2 0 1.0 0.0 0.0 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ from pandas import DataFrame from pandas._libs.sparse import IntIndex data = data.tocsc() index, columns = cls._prep_index(data, index, columns) n_rows, n_columns = data.shape # We need to make sure indices are sorted, as we create # IntIndex with no input validation (i.e. check_integrity=False ). # Indices may already be sorted in scipy in which case this adds # a small overhead. data.sort_indices() indices = data.indices indptr = data.indptr array_data = data.data dtype = SparseDtype(array_data.dtype, 0) arrays = [] for i in range(n_columns): sl = slice(indptr[i], indptr[i + 1]) idx = IntIndex(n_rows, indices[sl], check_integrity=False) arr = SparseArray._simple_new(array_data[sl], idx, dtype) arrays.append(arr) return DataFrame._from_arrays( arrays, columns=columns, index=index, verify_integrity=False )
def __setstate__(self, state): """Necessary for making this object picklable""" if isinstance(state, tuple): # Compat for pandas < 0.24.0 nd_state, (fill_value, sp_index) = state sparse_values = np.array([]) sparse_values.__setstate__(nd_state) self._sparse_values = sparse_values self._sparse_index = sp_index self._dtype = SparseDtype(sparse_values.dtype, fill_value) else: self.__dict__.update(state)
def from_spmatrix(cls, data): """ Create a SparseArray from a scipy.sparse matrix. .. versionadded:: 0.25.0 Parameters ---------- data : scipy.sparse.sp_matrix This should be a SciPy sparse matrix where the size of the second dimension is 1. In other words, a sparse matrix with a single column. Returns ------- SparseArray Examples -------- >>> import scipy.sparse >>> mat = scipy.sparse.coo_matrix((4, 1)) >>> pd.arrays.SparseArray.from_spmatrix(mat) [0.0, 0.0, 0.0, 0.0] Fill: 0.0 IntIndex Indices: array([], dtype=int32) """ length, ncol = data.shape if ncol != 1: raise ValueError(f"'data' must have a single column, not '{ncol}'") # our sparse index classes require that the positions be strictly # increasing. So we need to sort loc, and arr accordingly. arr = data.data idx, _ = data.nonzero() loc = np.argsort(idx) arr = arr.take(loc) idx.sort() zero = np.array(0, dtype=arr.dtype).item() dtype = SparseDtype(arr.dtype, zero) index = IntIndex(length, idx) return cls._simple_new(arr, index, dtype)
def _sparse_array_op(left: "SparseArray", right: "SparseArray", op: Callable, name: str) -> Any: """ Perform a binary operation between two arrays. Parameters ---------- left : Union[SparseArray, ndarray] right : Union[SparseArray, ndarray] op : Callable The binary operation to perform name str Name of the callable. Returns ------- SparseArray """ if name.startswith("__"): # For lookups in _libs.sparse we need non-dunder op name name = name[2:-2] # dtype used to find corresponding sparse method ltype = left.dtype.subtype rtype = right.dtype.subtype if not is_dtype_equal(ltype, rtype): subtype = find_common_type([ltype, rtype]) ltype = SparseDtype(subtype, left.fill_value) rtype = SparseDtype(subtype, right.fill_value) # TODO(GH-23092): pass copy=False. Need to fix astype_nansafe left = left.astype(ltype) right = right.astype(rtype) dtype = ltype.subtype else: dtype = ltype # dtype the result must have result_dtype = None if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: with np.errstate(all="ignore"): result = op(left.to_dense(), right.to_dense()) fill = op(_get_fill(left), _get_fill(right)) if left.sp_index.ngaps == 0: index = left.sp_index else: index = right.sp_index elif left.sp_index.equals(right.sp_index): with np.errstate(all="ignore"): result = op(left.sp_values, right.sp_values) fill = op(_get_fill(left), _get_fill(right)) index = left.sp_index else: if name[0] == "r": left, right = right, left name = name[1:] if name in ("and", "or", "xor") and dtype == "bool": opname = f"sparse_{name}_uint8" # to make template simple, cast here left_sp_values = left.sp_values.view(np.uint8) right_sp_values = right.sp_values.view(np.uint8) result_dtype = bool else: opname = f"sparse_{name}_{dtype}" left_sp_values = left.sp_values right_sp_values = right.sp_values sparse_op = getattr(splib, opname) with np.errstate(all="ignore"): result, index, fill = sparse_op( left_sp_values, left.sp_index, left.fill_value, right_sp_values, right.sp_index, right.fill_value, ) if result_dtype is None: result_dtype = result.dtype return _wrap_result(name, result, index, fill, dtype=result_dtype)
def isna(self): # If null fill value, we want SparseDtype[bool, true] # to preserve the same memory usage. dtype = SparseDtype(bool, self._null_fill_value) return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
def fill_value(self, value): self._dtype = SparseDtype(self.dtype.subtype, value)
def __init__( self, data, sparse_index=None, index=None, fill_value=None, kind="integer", dtype=None, copy=False, ): if fill_value is None and isinstance(dtype, SparseDtype): fill_value = dtype.fill_value if isinstance(data, type(self)): # disable normal inference on dtype, sparse_index, & fill_value if sparse_index is None: sparse_index = data.sp_index if fill_value is None: fill_value = data.fill_value if dtype is None: dtype = data.dtype # TODO: make kind=None, and use data.kind? data = data.sp_values # Handle use-provided dtype if isinstance(dtype, str): # Two options: dtype='int', regular numpy dtype # or dtype='Sparse[int]', a sparse dtype try: dtype = SparseDtype.construct_from_string(dtype) except TypeError: dtype = pandas_dtype(dtype) if isinstance(dtype, SparseDtype): if fill_value is None: fill_value = dtype.fill_value dtype = dtype.subtype if index is not None and not is_scalar(data): raise Exception("must only pass scalars with an index") if is_scalar(data): if index is not None and data is None: data = np.nan if index is not None: npoints = len(index) elif sparse_index is None: npoints = 1 else: npoints = sparse_index.length dtype = infer_dtype_from_scalar(data)[0] data = construct_1d_arraylike_from_scalar(data, npoints, dtype) if dtype is not None: dtype = pandas_dtype(dtype) # TODO: disentangle the fill_value dtype inference from # dtype inference if data is None: # TODO: What should the empty dtype be? Object or float? data = np.array([], dtype=dtype) if not is_array_like(data): try: # probably shared code in sanitize_series data = sanitize_array(data, index=None) except ValueError: # NumPy may raise a ValueError on data like [1, []] # we retry with object dtype here. if dtype is None: dtype = object data = np.atleast_1d(np.asarray(data, dtype=dtype)) else: raise if copy: # TODO: avoid double copy when dtype forces cast. data = data.copy() if fill_value is None: fill_value_dtype = data.dtype if dtype is None else dtype if fill_value_dtype is None: fill_value = np.nan else: fill_value = na_value_for_dtype(fill_value_dtype) if isinstance(data, type(self)) and sparse_index is None: sparse_index = data._sparse_index sparse_values = np.asarray(data.sp_values, dtype=dtype) elif sparse_index is None: data = extract_array(data, extract_numpy=True) if not isinstance(data, np.ndarray): # EA if is_datetime64tz_dtype(data.dtype): warnings.warn( f"Creating SparseArray from {data.dtype} data " "loses timezone information. Cast to object before " "sparse to retain timezone information.", UserWarning, stacklevel=2, ) data = np.asarray(data, dtype="datetime64[ns]") data = np.asarray(data) sparse_values, sparse_index, fill_value = make_sparse( data, kind=kind, fill_value=fill_value, dtype=dtype) else: sparse_values = np.asarray(data, dtype=dtype) if len(sparse_values) != sparse_index.npoints: raise AssertionError( f"Non array-like type {type(sparse_values)} must " "have the same length as the index") self._sparse_index = sparse_index self._sparse_values = sparse_values self._dtype = SparseDtype(sparse_values.dtype, fill_value)
def _unary_method(self, op) -> "SparseArray": fill_value = op(np.array(self.fill_value)).item() values = op(self.sp_values) dtype = SparseDtype(values.dtype, fill_value) return type(self)._simple_new(values, self.sp_index, dtype)