def _concat_sparse(to_concat, axis=0, typs=None):
    """
    Concatenate a mix of sparse and dense arrays into one sparse result.

    Parameters
    ----------
    to_concat : array of arrays
        Must hold at least one SparseArray (the first sparse fill value is
        read by position). Non-sparse members are squeezed and wrapped in a
        SparseArray using that fill value.
    axis : axis to provide concatenation
        Not used by this implementation.
    typs : set of to_concat dtypes
        Not used by this implementation.

    Returns
    -------
    The result of ``SparseArray._concat_same_type`` on the converted inputs.

    Raises
    ------
    ValueError
        If the SparseArrays in ``to_concat`` carry more than one distinct
        fill value.
    """
    from pandas.core.sparse.array import SparseArray

    sparse_fills = [arr.fill_value for arr in to_concat
                    if isinstance(arr, SparseArray)]

    if len(set(sparse_fills)) > 1:
        raise ValueError("Cannot concatenate SparseArrays with different "
                         "fill values")

    common_fill = sparse_fills[0]

    # TODO: Fix join unit generation so we aren't passed this.
    as_sparse = []
    for arr in to_concat:
        if not isinstance(arr, SparseArray):
            arr = SparseArray(arr.squeeze(), fill_value=common_fill)
        as_sparse.append(arr)

    return SparseArray._concat_same_type(as_sparse)
def set_value(self, label, value, takeable=False):
    """
    Set a single value at the passed label, updating this object in place.

    The data is densified, the value is set on the dense object, and the
    result is converted back into a SparseArray that replaces this
    object's data and index.

    Parameters
    ----------
    label : object
        Partial indexing with MultiIndex not allowed
    value : object
        Scalar value
    takeable : interpret the index as indexers, default False

    Notes
    -----
    It is not particularly efficient (everything is densified and then
    re-sparsified) but is provided for API compatibility with Series.
    Nothing is returned; ``self._data`` and ``self._index`` are replaced.
    """
    dense = self.to_dense()

    # if the label doesn't exist, the dense setter may give back a new
    # object (and possibly a changed index); use it when one is returned
    updated = dense.set_value(label, value, takeable=takeable)
    if updated is not None:
        dense = updated
    index = dense.index

    sparse_values = SparseArray(dense, fill_value=self.fill_value,
                                kind=self.kind)
    self._data = SingleBlockManager(sparse_values, index)
    self._index = index
def _consolidate_inplace(self):
    """
    Merge every chunk into one SparseArray and keep it as the only chunk.

    The chunks' ``sp_values`` are concatenated, their ``sp_index`` objects
    are combined with ``_concat_sparse_indexes``, and the resulting array
    (using ``self.fill_value``) replaces ``self._chunks``.
    """
    sp_values = [chunk.sp_values for chunk in self._chunks]
    merged_values = np.concatenate(sp_values)

    sp_indexes = [chunk.sp_index for chunk in self._chunks]
    merged_index = _concat_sparse_indexes(sp_indexes)

    merged = SparseArray(merged_values, sparse_index=merged_index,
                         fill_value=self.fill_value)
    self._chunks = [merged]
def _unpickle_series_compat(self, state):
    """
    Rebuild this object from a two-part pickle ``state``.

    Parameters
    ----------
    state : tuple
        ``(nd_state, own_state)``. ``nd_state`` is handed to
        ``np.ndarray.__setstate__`` (its items 1 and 2 are used as the
        shape and dtype of the array being restored). ``own_state`` holds
        ``index, fill_value, sp_index`` and, optionally, ``name`` as a
        fourth item.
    """
    ndarray_state, series_state = state

    # recreate the ndarray
    raw = np.empty(ndarray_state[1], dtype=ndarray_state[2])
    np.ndarray.__setstate__(raw, ndarray_state)

    index, fill_value, sp_index = series_state[:3]
    # the name is only present when a fourth item was pickled
    name = series_state[3] if len(series_state) > 3 else None

    # create a sparse array
    if isinstance(raw, SparseArray):
        sparse_values = raw
    else:
        sparse_values = SparseArray(raw, sparse_index=sp_index,
                                    fill_value=fill_value, copy=False)

    # recreate
    manager = SingleBlockManager(sparse_values, index, fastpath=True)
    generic.NDFrame.__init__(self, manager)

    self._set_axis(0, index)
    self.name = name
def as_sparse_array(self, kind=None, fill_value=None, copy=False):
    """
    Return this object's values as a SparseArray; no copy by default.

    Parameters
    ----------
    kind : optional
        Falls back to ``self.kind`` when None.
    fill_value : optional
        Falls back to ``self.fill_value`` when None.
    copy : bool, default False
        Forwarded to the SparseArray constructor.
    """
    fill_value = self.fill_value if fill_value is None else fill_value
    kind = self.kind if kind is None else kind

    return SparseArray(self.values, sparse_index=self.sp_index,
                       fill_value=fill_value, kind=kind, copy=copy)
def take(self, indices, axis=0, convert=True, *args, **kwargs):
    """
    Take the elements at ``indices`` from both the values and the index.

    ``convert`` is passed through ``nv.validate_take_with_convert``; when
    the validated value is falsy a FutureWarning about the parameter's
    deprecation is issued. ``axis`` is not used in this body.

    Returns
    -------
    A new object built by ``self._constructor`` and finalized from self.
    """
    convert = nv.validate_take_with_convert(convert, args, kwargs)

    if not convert:
        warnings.warn("The 'convert' parameter is deprecated "
                      "and will be removed in a future version.",
                      FutureWarning, stacklevel=2)

    taken_values = SparseArray.take(self.values, indices)
    taken_index = self.index.take(indices)

    result = self._constructor(taken_values, index=taken_index)
    return result.__finalize__(self)
def _set_value(self, label, value, takeable=False):
    """
    Set one value at ``label`` by round-tripping through a dense object.

    The data is densified, ``_set_value`` is called on the dense object,
    and the outcome is re-wrapped as a SparseArray (same ``fill_value`` and
    ``kind``) which replaces ``self._data`` and ``self._index``.
    Nothing is returned.
    """
    as_dense = self.to_dense()

    # if the label doesn't exist, the dense setter may give back a new
    # object (and possibly a changed index); use it when one is returned
    maybe_new = as_dense._set_value(label, value, takeable=takeable)
    if maybe_new is not None:
        as_dense = maybe_new
    result_index = as_dense.index

    rebuilt = SparseArray(as_dense, fill_value=self.fill_value,
                          kind=self.kind)
    self._data = SingleBlockManager(rebuilt, result_index)
    self._index = result_index
def _set_values(self, key, value):
    """
    Assign ``value`` at ``key`` and rebuild the sparse data in place.

    A Series ``key`` is replaced by its ``.values`` before indexing.
    """
    # This might be inefficient: the sparse array is recreated instead of
    # individual elements being set. The passed slice/boolean lives in
    # dense space and there is no conversion to a sparse indexer here.
    if isinstance(key, Series):
        key = key.values

    dense = self.values.to_dense()
    dense[key] = _index.convert_scalar(dense, value)

    rebuilt = SparseArray(dense, fill_value=self.fill_value,
                          kind=self.kind)
    self._data = SingleBlockManager(rebuilt, self.index)
def take(self, indices, axis=0, convert=True, *args, **kwargs):
    """
    Sparse-compatible version of ndarray.take

    ``convert`` is validated through ``nv.validate_take_with_convert``;
    ``axis`` is not used in this body.

    Returns
    -------
    taken : object built by ``self._constructor`` from the taken values
        and the taken index, finalized from self
    """
    convert = nv.validate_take_with_convert(convert, args, kwargs)

    taken_values = SparseArray.take(self.values, indices)
    taken_index = self.index.take(indices)

    result = self._constructor(taken_values, index=taken_index)
    return result.__finalize__(self)
def append(self, value):
    """
    Append element or array-like chunk of data to the SparseList

    A scalar is wrapped in a one-element list first. The new chunk is built
    with ``self.fill_value`` and the list is flagged as not consolidated.

    Parameters
    ----------
    value: scalar or array-like
    """
    chunk_data = [value] if is_scalar(value) else value

    self._chunks.append(SparseArray(chunk_data, fill_value=self.fill_value))
    self._consolidated = False
def sparse_reindex(self, new_index):
    """
    Conform sparse values to new SparseIndex

    Parameters
    ----------
    new_index : {BlockIndex, IntIndex}

    Returns
    -------
    reindexed : SparseSeries

    Raises
    ------
    TypeError
        If ``new_index`` is not a ``splib.SparseIndex``.
    """
    if not isinstance(new_index, splib.SparseIndex):
        raise TypeError("new index must be a SparseIndex")

    current = self.values
    int_index = current.sp_index.to_int_index()
    # the stored values are cast to float64 before being reindexed
    reindexed = int_index.reindex(current.sp_values.astype('float64'),
                                  current.fill_value, new_index)

    sparse_values = SparseArray(reindexed, sparse_index=new_index,
                                fill_value=self.values.fill_value)
    result = self._constructor(sparse_values, index=self.index)
    return result.__finalize__(self)
def wrapper(self, other):
    """
    Apply the enclosing scope's ``op`` between this sparse array and
    ``other``.

    ``other`` may be an ndarray of the same length (wrapped in a
    SparseArray when it is not one already) or a scalar. ``op`` and
    ``name`` are taken from the enclosing scope.

    Raises
    ------
    AssertionError
        If ``other`` is an ndarray whose length differs from ``self``.
    TypeError
        If ``other`` is neither an ndarray nor a scalar.
    """
    from pandas.core.sparse.array import (
        SparseArray, _sparse_array_op, _wrap_result, _get_fill)

    if isinstance(other, np.ndarray):
        own_len, other_len = len(self), len(other)
        if own_len != other_len:
            raise AssertionError("length mismatch: {self} vs. {other}"
                                 .format(self=own_len, other=other_len))
        if not isinstance(other, SparseArray):
            dtype = getattr(other, 'dtype', None)
            other = SparseArray(other, fill_value=self.fill_value,
                                dtype=dtype)
        return _sparse_array_op(self, other, op, name)

    if is_scalar(other):
        # silence numpy floating-point warnings while applying the op
        with np.errstate(all='ignore'):
            fill = op(_get_fill(self), np.asarray(other))
            result = op(self.sp_values, other)

        return _wrap_result(name, result, self.sp_index, fill)

    raise TypeError('operation with {other} not supported'
                    .format(other=type(other)))  # pragma: no cover
def __init__(self, data=None, index=None, sparse_index=None, kind='block',
             fill_value=None, name=None, dtype=None, copy=False,
             fastpath=False):
    """
    Normalize ``data`` and ``index``, wrap the data in a SparseArray and
    delegate to the parent constructor.

    Handling of ``data``:

    * None -> empty list
    * SingleBlockManager -> its index and the values of its first block
    * Series / SparseSeries -> index, dtype and name default to the
      series' own, and the series is reindexed to the chosen index
    * Mapping -> converted via ``Series()._init_dict``
    * scalar with an index -> broadcast to ``len(index)`` with ``np.full``

    Anything else is passed to SparseArray unchanged.
    """
    # TODO: Most of this should be refactored and shared with Series
    # 1. BlockManager -> array
    # 2. Series.index, Series.name, index, name reconciliation
    # 3. Implicit reindexing
    # 4. Implicit broadcasting
    # 5. Dict construction
    if data is None:
        data = []
    elif isinstance(data, SingleBlockManager):
        index = data.index
        data = data.blocks[0].values
    elif isinstance(data, (ABCSeries, ABCSparseSeries)):
        if index is None:
            index = data.index
        if dtype is None:
            dtype = data.dtype
        if name is None:
            name = data.name

        if index is not None:
            data = data.reindex(index)
    elif isinstance(data, compat.Mapping):
        data, index = Series()._init_dict(data, index=index)
    elif is_scalar(data) and index is not None:
        data = np.full(len(index), fill_value=data)

    sparse_values = SparseArray(data, sparse_index=sparse_index, kind=kind,
                                dtype=dtype, fill_value=fill_value,
                                copy=copy)
    # the SparseArray above already honoured ``copy``; don't copy again
    super(SparseSeries, self).__init__(sparse_values, index=index,
                                       name=name, copy=False,
                                       fastpath=fastpath)
def isnotnull(self):
    """
    Return an object of the same kind flagging non-null entries.

    ``notnull`` is applied to the stored sparse values and, separately, to
    the fill value; the sparse index is reused as-is.
    """
    stored_mask = notnull(self.values.sp_values)
    mask_array = SparseArray(stored_mask,
                             sparse_index=self.values.sp_index,
                             fill_value=notnull(self.fill_value))
    result = self._constructor(mask_array, index=self.index)
    return result.__finalize__(self)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False, drop_first=False, dtype=None):
    """
    Build the dummy/indicator frame for a single 1-d input.

    Parameters
    ----------
    data : Series or other iterable
        Factorized through ``Series(data)``.
    prefix : object or None
        When not None, column names are ``prefix + prefix_sep + level``.
    prefix_sep : str, default '_'
    dummy_na : bool, default False
        Add a trailing NaN level that collects the missing entries.
    sparse : bool, default False
        Return a SparseDataFrame instead of a DataFrame.
    drop_first : bool, default False
        Drop the first level's column.
    dtype : dtype, default np.uint8
        dtype of the indicator values.

    Raises
    ------
    ValueError
        If ``dtype`` is an object dtype.
    """
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    dtype = np.dtype(np.uint8 if dtype is None else dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if sparse:
            return SparseDataFrame(index=index, default_fill_value=0)
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:

        # PY2 embedded unicode, gh-22084
        def _make_col_name(prefix, prefix_sep, level):
            fstr = '{prefix}{prefix_sep}{level}'
            parts = (prefix, prefix_sep, level)
            if PY2 and any(isinstance(part, text_type) for part in parts):
                fstr = u(fstr)
            return fstr.format(prefix=prefix, prefix_sep=prefix_sep,
                               level=level)

        dummy_cols = [_make_col_name(prefix, prefix_sep, level)
                      for level in levels]

    index = data.index if isinstance(data, Series) else None

    if not sparse:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)

    N = len(data)
    sp_indices = [[] for _ in range(len(dummy_cols))]

    # collect, per level, the row positions whose code is not missing
    mask = codes != -1
    codes = codes[mask]
    n_idx = np.arange(N)[mask]
    for ndx, code in zip(n_idx, codes):
        sp_indices[code].append(ndx)

    if drop_first:
        # remove first categorical level to avoid perfect collinearity
        # GH12042
        sp_indices = sp_indices[1:]
        dummy_cols = dummy_cols[1:]

    sparse_series = {}
    for col, ixs in zip(dummy_cols, sp_indices):
        sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
                           sparse_index=IntIndex(N, ixs), fill_value=0,
                           dtype=dtype)
        sparse_series[col] = SparseSeries(data=sarr, index=index)

    return SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
                           default_fill_value=0, dtype=dtype)
def create_block(typestr, placement, item_shape=None, num_offset=0):
    """
    Build a block whose values are generated from ``typestr``.

    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    ``item_shape`` defaults to ``(N, )`` where ``N`` is defined outside
    this function. ``num_offset`` shifts the generated values for the
    numeric, complex, object and sparse kinds.

    Raises
    ------
    ValueError
        If ``typestr`` is not one of the supported strings.
    """
    placement = BlockPlacement(placement)
    num_items = len(placement)

    shape = (num_items, ) + ((N, ) if item_shape is None else item_shape)
    mat = get_numeric_mat(shape)

    real_kinds = ('float', 'f8', 'f4', 'f2', 'int', 'i8', 'i4', 'i2', 'i1',
                  'uint', 'u8', 'u4', 'u2', 'u1')

    if typestr in real_kinds:
        values = mat.astype(typestr) + num_offset
    elif typestr in ('complex', 'c16', 'c8'):
        values = 1.j * (mat.astype(typestr) + num_offset)
    elif typestr in ('object', 'string', 'O'):
        labels = ['A%d' % i for i in mat.ravel() + num_offset]
        values = np.reshape(labels, shape)
    elif typestr in ('b', 'bool'):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ('datetime', 'dt', 'M8[ns]'):
        values = (mat * 1e9).astype('M8[ns]')
    elif typestr.startswith('M8[ns'):
        # datetime with tz
        m = re.search(r'M8\[ns,\s*(\w+\/?\w*)\]', typestr)
        assert m is not None, "incompatible typestr -> {0}".format(typestr)
        tz = m.group(1)
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
    elif typestr in ('timedelta', 'td', 'm8[ns]'):
        values = (mat * 1).astype('m8[ns]')
    elif typestr == 'category':
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr == 'category2':
        values = Categorical(
            ['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'c', 'd'])
    elif typestr in ('sparse', 'sparse_na'):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])

        fill_value = np.nan if typestr.endswith('_na') else 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5,
             fill_value, 6],
            fill_value=fill_value)

        # shift the stored (non-fill) values in place through a view
        arr = values.sp_values.view()
        arr += (num_offset - 1)
    else:
        raise ValueError('Unsupported typestr: "%s"' % typestr)

    return make_block(values, placement=placement, ndim=len(shape))
def __init__(self, data=None, index=None, sparse_index=None, kind='block',
             fill_value=None, name=None, dtype=None, copy=False,
             fastpath=False):
    """
    Build the object from ``data``, storing it as a SparseArray wrapped in
    a SingleBlockManager.

    Parameters
    ----------
    data : None, SparseArray, SparseSeries, Series, dict, tuple, list,
           ndarray, SingleBlockManager, or any other value
        None is treated as an empty list. A value matching none of the
        listed types goes through the last branch, which compares it to
        ``fill_value`` and broadcasts it to ``len(index)``.
    index : optional
        When still None after the type dispatch, a default index of
        ``sparse_index.length`` is generated.
    sparse_index : optional
        Only honoured for tuple/list/ndarray data, where ``len(data)`` must
        equal ``sparse_index.npoints``; other branches overwrite it.
    kind : default 'block'
        Passed to ``make_sparse``; in the last branch 'block' selects a
        BlockIndex and anything else an IntIndex.
    fill_value : optional
        Defaults to the fill value of SparseArray/SparseSeries data.
    name : optional
        Defaults to ``data.name`` when data is a Series.
    dtype : optional
    copy : bool, default False
    fastpath : bool, default False
        Internal short-circuit: ``data`` is wrapped in a SingleBlockManager
        (unless it already is one) and all other processing is skipped.

    Raises
    ------
    AssertionError
        If a SingleBlockManager is passed together with an index that does
        not equal its own, or together with ``copy=True``.
    """

    # we are called internally, so short-circuit
    if fastpath:

        # data is an ndarray, index is defined

        if not isinstance(data, SingleBlockManager):
            data = SingleBlockManager(data, index, fastpath=True)
        if copy:
            data = data.copy()

    else:

        if data is None:
            data = []

        if isinstance(data, Series) and name is None:
            name = data.name

        if isinstance(data, SparseArray):
            if index is not None:
                assert (len(index) == len(data))
            sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value

            data = np.asarray(data)

        elif isinstance(data, SparseSeries):
            if index is None:
                index = data.index.view()
            if fill_value is None:
                fill_value = data.fill_value
            # extract the SingleBlockManager
            data = data._data

        elif isinstance(data, (Series, dict)):
            # convert first so that dict input gets an index as well
            data = Series(data, index=index)
            index = data.index.view()

            res = make_sparse(data, kind=kind, fill_value=fill_value)
            data, sparse_index, fill_value = res

        elif isinstance(data, (tuple, list, np.ndarray)):
            # array-like
            if sparse_index is None:
                res = make_sparse(data, kind=kind, fill_value=fill_value)
                data, sparse_index, fill_value = res
            else:
                # caller supplied only the stored points for sparse_index
                assert (len(data) == sparse_index.npoints)

        elif isinstance(data, SingleBlockManager):
            if dtype is not None:
                data = data.astype(dtype)
            if index is None:
                index = data.index.view()
            elif not data.index.equals(index) or copy:  # pragma: no cover
                # GH#19275 SingleBlockManager input should only be called
                # internally
                raise AssertionError('Cannot pass both SingleBlockManager '
                                     '`data` argument and a different '
                                     '`index` argument. `copy` must '
                                     'be False.')

        else:
            # any other value: broadcast it over the index
            length = len(index)

            if data == fill_value or (isna(data) and isna(fill_value)):
                # value equals the fill value: nothing needs storing
                if kind == 'block':
                    sparse_index = BlockIndex(length, [], [])
                else:
                    sparse_index = IntIndex(length, [])
                data = np.array([])

            else:
                # every position holds the value
                if kind == 'block':
                    locs, lens = ([0], [length]) if length else ([], [])
                    sparse_index = BlockIndex(length, locs, lens)
                else:
                    sparse_index = IntIndex(length, index)
                v = data
                data = np.empty(length)
                data.fill(v)

        if index is None:
            index = ibase.default_index(sparse_index.length)
        index = ensure_index(index)

        # create/copy the manager
        if isinstance(data, SingleBlockManager):

            if copy:
                data = data.copy()
        else:

            # create a sparse array
            if not isinstance(data, SparseArray):
                data = SparseArray(data, sparse_index=sparse_index,
                                   fill_value=fill_value, dtype=dtype,
                                   copy=copy)

            data = SingleBlockManager(data, index)

    generic.NDFrame.__init__(self, data)

    self.index = index
    self.name = name
def __init__(self, data=None, index=None, sparse_index=None, kind='block',
             fill_value=None, name=None, dtype=None, copy=False,
             fastpath=False):
    """
    Build the object from ``data``, storing it as a SparseArray wrapped in
    a SingleBlockManager.

    Parameters
    ----------
    data : None, SparseArray, SparseSeries, Series, dict, tuple, list,
           ndarray, SingleBlockManager, or any other value
        None is treated as an empty list. A value matching none of the
        listed types goes through the last branch, which compares it to
        ``fill_value`` and broadcasts it to ``len(index)``.
    index : optional
        When still None after the type dispatch, a default index of
        ``sparse_index.length`` is generated. A SingleBlockManager passed
        with an index is reindexed to it.
    sparse_index : optional
        Only honoured for tuple/list/ndarray data, where ``len(data)`` must
        equal ``sparse_index.npoints``; other branches overwrite it.
    kind : default 'block'
        Passed to ``make_sparse``; in the last branch 'block' selects a
        BlockIndex and anything else an IntIndex.
    fill_value : optional
        Defaults to the fill value of SparseArray/SparseSeries data.
    name : optional
        Defaults to ``data.name`` when data is a Series.
    dtype : optional
    copy : bool, default False
    fastpath : bool, default False
        Internal short-circuit: ``data`` is wrapped in a SingleBlockManager
        (unless it already is one) and all other processing is skipped.
    """

    # we are called internally, so short-circuit
    if fastpath:

        # data is an ndarray, index is defined

        if not isinstance(data, SingleBlockManager):
            data = SingleBlockManager(data, index, fastpath=True)
        if copy:
            data = data.copy()

    else:

        if data is None:
            data = []

        if isinstance(data, Series) and name is None:
            name = data.name

        if isinstance(data, SparseArray):
            if index is not None:
                assert (len(index) == len(data))
            sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value

            data = np.asarray(data)

        elif isinstance(data, SparseSeries):
            if index is None:
                index = data.index.view()
            if fill_value is None:
                fill_value = data.fill_value
            # extract the SingleBlockManager
            data = data._data

        elif isinstance(data, (Series, dict)):
            # Convert to a Series before touching ``.index``: a plain dict
            # has no such attribute, so reading it first raised
            # AttributeError whenever ``index`` was omitted.
            data = Series(data)
            if index is None:
                index = data.index.view()

            res = make_sparse(data, kind=kind, fill_value=fill_value)
            data, sparse_index, fill_value = res

        elif isinstance(data, (tuple, list, np.ndarray)):
            # array-like
            if sparse_index is None:
                res = make_sparse(data, kind=kind, fill_value=fill_value)
                data, sparse_index, fill_value = res
            else:
                # caller supplied only the stored points for sparse_index
                assert (len(data) == sparse_index.npoints)

        elif isinstance(data, SingleBlockManager):
            if dtype is not None:
                data = data.astype(dtype)
            if index is None:
                index = data.index.view()
            else:
                data = data.reindex(index, copy=False)

        else:
            # any other value: broadcast it over the index
            length = len(index)

            if data == fill_value or (isnull(data) and isnull(fill_value)):
                # value equals the fill value: nothing needs storing
                if kind == 'block':
                    sparse_index = BlockIndex(length, [], [])
                else:
                    sparse_index = IntIndex(length, [])
                data = np.array([])

            else:
                # every position holds the value
                if kind == 'block':
                    locs, lens = ([0], [length]) if length else ([], [])
                    sparse_index = BlockIndex(length, locs, lens)
                else:
                    sparse_index = IntIndex(length, index)
                v = data
                data = np.empty(length)
                data.fill(v)

        if index is None:
            index = com._default_index(sparse_index.length)
        index = _ensure_index(index)

        # create/copy the manager
        if isinstance(data, SingleBlockManager):

            if copy:
                data = data.copy()
        else:

            # create a sparse array
            if not isinstance(data, SparseArray):
                data = SparseArray(data, sparse_index=sparse_index,
                                   fill_value=fill_value, dtype=dtype,
                                   copy=copy)

            data = SingleBlockManager(data, index)

    generic.NDFrame.__init__(self, data)

    self.index = index
    self.name = name
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False, drop_first=False):
    """
    Build the uint8 dummy/indicator frame for a single 1-d input.

    Parameters
    ----------
    data : Series or other iterable
        Factorized through ``Series(data)``.
    prefix : object or None
        When not None, column names are ``prefix + prefix_sep + level``.
    prefix_sep : str, default '_'
    dummy_na : bool, default False
        Add a trailing NaN level that collects the missing entries.
    sparse : bool, default False
        Return a SparseDataFrame instead of a DataFrame.
    drop_first : bool, default False
        Drop the first level's column.
    """
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if sparse:
            return SparseDataFrame(index=index, default_fill_value=0)
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        # a unicode template is used for levels that are text_type
        dummy_cols = []
        for level in levels:
            if isinstance(level, text_type):
                template = u'{prefix}{sep}{level}'
            else:
                template = '{prefix}{sep}{level}'
            dummy_cols.append(template.format(prefix=prefix,
                                              sep=prefix_sep,
                                              level=level))

    index = data.index if isinstance(data, Series) else None

    if not sparse:
        dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes,
                                                                axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)

    N = len(data)
    sp_indices = [[] for _ in range(len(dummy_cols))]
    for position, code in enumerate(codes):
        # Blank entries if not dummy_na and code == -1, #GH4446
        if code != -1:
            sp_indices[code].append(position)

    if drop_first:
        # remove first categorical level to avoid perfect collinearity
        # GH12042
        sp_indices = sp_indices[1:]
        dummy_cols = dummy_cols[1:]

    sparse_series = {}
    for col, ixs in zip(dummy_cols, sp_indices):
        sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
                           sparse_index=IntIndex(N, ixs), fill_value=0,
                           dtype=np.uint8)
        sparse_series[col] = SparseSeries(data=sarr, index=index)

    return SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
                           default_fill_value=0, dtype=np.uint8)
def _concat_sparse(to_concat, axis=0, typs=None):
    """
    Concatenate sparse/dense arrays, each of which has a single dtype.

    When all inputs share one dtype kind and one fill value, their sparse
    values and indexes are stitched together directly. Otherwise every
    input is densified and concatenated; the result is sparsified again
    only if the kinds are limited to 'sparse', 'f' and 'i', and is cast to
    object otherwise.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes
        Computed with ``get_dtype_kinds`` when None.

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    from pandas.core.sparse.array import SparseArray, _make_index

    def convert_sparse(x, axis):
        # coerce to native type
        dense = x.get_values() if isinstance(x, SparseArray) else x
        dense = dense.ravel()
        return np.atleast_2d(dense) if axis > 0 else dense

    if typs is None:
        typs = get_dtype_kinds(to_concat)

    if len(typs) == 1:
        # concat input as it is if all inputs are sparse
        # and have the same fill_value
        distinct_fills = set(c.fill_value for c in to_concat)
        if len(distinct_fills) == 1:
            value_parts = [c.sp_values for c in to_concat]
            int_indexes = [c.sp_index.to_int_index() for c in to_concat]

            # shift each chunk's positions by the length seen so far
            shifted = []
            total_length = 0
            for int_index in int_indexes:
                shifted.append(int_index.indices + total_length)
                total_length += int_index.length

            merged_values = np.concatenate(value_parts)
            merged_indices = np.concatenate(shifted)

            first = to_concat[0]
            sp_index = _make_index(total_length, merged_indices,
                                   kind=first.sp_index)
            return SparseArray(merged_values, sparse_index=sp_index,
                               fill_value=first.fill_value)

    # input may be sparse / dense mixed and may have different fill_value
    # input must contain sparse at least 1
    sparses = [c for c in to_concat if is_sparse(c)]
    sparse_fills = [c.fill_value for c in sparses]
    sparse_indexes = [c.sp_index for c in sparses]

    # densify and regular concat
    dense_parts = [convert_sparse(x, axis) for x in to_concat]
    result = np.concatenate(dense_parts, axis=axis)

    if len(typs - {'sparse', 'f', 'i'}):
        # coerce to object if needed
        return result.astype('object')

    # sparsify if inputs are sparse and dense numerics
    # first sparse input's fill_value and SparseIndex is used
    return SparseArray(result.ravel(), fill_value=sparse_fills[0],
                       kind=sparse_indexes[0])