Esempio n. 1
0
def _concat_sparse(to_concat, axis=0, typs=None):
    """
    provide concatenation of an sparse/dense array of arrays each of which is a
    single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    from pandas.core.arrays import SparseArray

    fill_values = [
        x.fill_value for x in to_concat if isinstance(x, SparseArray)
    ]
    fill_value = fill_values[0]

    # TODO: Fix join unit generation so we aren't passed this.
    to_concat = [
        x if isinstance(x, SparseArray) else SparseArray(x.squeeze(),
                                                         fill_value=fill_value)
        for x in to_concat
    ]

    return SparseArray._concat_same_type(to_concat)
Esempio n. 2
0
    def test_setitem_with_unaligned_sparse_value(self):
        df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
        sp_series = Series(SparseArray([0, 0, 1]), index=[2, 1, 0])

        df["new_column"] = sp_series
        expected = Series(SparseArray([1, 0, 0]), name="new_column")
        tm.assert_series_equal(df["new_column"], expected)
Esempio n. 3
0
    def test_abs_operator(self):
        arr = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
        res = abs(arr)
        exp = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
        tm.assert_sp_array_equal(exp, res)

        arr = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
        res = abs(arr)
        exp = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8)
        tm.assert_sp_array_equal(exp, res)
Esempio n. 4
0
    def test_invert_operator(self):
        arr = SparseArray([False, True, False, True], fill_value=False, dtype=np.bool8)
        exp = SparseArray(
            np.invert([False, True, False, True]), fill_value=True, dtype=np.bool8
        )
        res = ~arr
        tm.assert_sp_array_equal(exp, res)

        arr = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
        res = ~arr
        exp = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)
        tm.assert_sp_array_equal(exp, res)
Esempio n. 5
0
def test_invert(fill_value):
    arr = np.array([True, False, False, True])
    sparray = SparseArray(arr, fill_value=fill_value)
    result = ~sparray
    expected = SparseArray(~arr, fill_value=not fill_value)
    tm.assert_sp_array_equal(result, expected)

    result = ~pd.Series(sparray)
    expected = pd.Series(expected)
    tm.assert_series_equal(result, expected)

    result = ~pd.DataFrame({"A": sparray})
    expected = pd.DataFrame({"A": expected})
    tm.assert_frame_equal(result, expected)
Esempio n. 6
0
 def notna(self):
     arr = SparseArray(
         notna(self.values.sp_values),
         sparse_index=self.values.sp_index,
         fill_value=notna(self.fill_value),
     )
     return self._constructor(arr, index=self.index).__finalize__(self)
Esempio n. 7
0
    def _unpickle_series_compat(self, state):

        nd_state, own_state = state

        # recreate the ndarray
        data = np.empty(nd_state[1], dtype=nd_state[2])
        np.ndarray.__setstate__(data, nd_state)

        index, fill_value, sp_index = own_state[:3]
        name = None
        if len(own_state) > 3:
            name = own_state[3]

        # create a sparse array
        if not isinstance(data, SparseArray):
            data = SparseArray(data,
                               sparse_index=sp_index,
                               fill_value=fill_value,
                               copy=False)

        # recreate
        data = SingleBlockManager(data, index, fastpath=True)
        generic.NDFrame.__init__(self, data)

        self._set_axis(0, index)
        self.name = name
Esempio n. 8
0
def test_concat_sparse():
    # GH 23557
    a = Series(SparseArray([0, 1, 2]))
    expected = DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype(
        pd.SparseDtype(np.int64, 0))
    result = pd.concat([a, a], axis=1)
    tm.assert_frame_equal(result, expected)
Esempio n. 9
0
def _concat_sparse(to_concat, axis=0, typs=None):
    """
    provide concatenation of an sparse/dense array of arrays each of which is a
    single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    from pandas.core.arrays import SparseArray

    fill_values = [x.fill_value for x in to_concat
                   if isinstance(x, SparseArray)]
    fill_value = fill_values[0]

    # TODO: Fix join unit generation so we aren't passed this.
    to_concat = [x if isinstance(x, SparseArray)
                 else SparseArray(x.squeeze(), fill_value=fill_value)
                 for x in to_concat]

    return SparseArray._concat_same_type(to_concat)
Esempio n. 10
0
    def test_setitem_with_sparse_value(self):
        # GH#8131
        df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
        sp_array = SparseArray([0, 0, 1])
        df["new_column"] = sp_array

        expected = Series(sp_array, name="new_column")
        tm.assert_series_equal(df["new_column"], expected)
Esempio n. 11
0
    def __init__(self, *args, **kwargs):
        """
        A pandas DataFrame wrapper for one of Table's numpy arrays:
            - sets index values corresponding to Orange's global row indices
              e.g. ['_o1', '_o2'] (allows Orange to handle selection)
            - remembers the array's role in the Table (attribute, class var, meta)
            - keeps the Variable objects, and uses them in back-to-table conversion,
              should a column name match a variable's name
            - stores weight values (legacy)

        Parameters
        ----------
        table : Table
        orange_role : Role, (default=Role.Attribute)
            When converting back to an orange table, the DataFrame will
            convert to the right role (attrs, class vars, or metas)
        """
        if len(args) <= 0 or not isinstance(args[0], Table):
            super().__init__(*args, **kwargs)
            return
        table = args[0]
        if 'orange_role' in kwargs:
            role = kwargs.pop('orange_role')
        elif len(args) >= 2:
            role = args[1]
        else:
            role = Role.Attribute

        if role == Role.Attribute:
            data = table.X
            vars_ = table.domain.attributes
        elif role == Role.ClassAttribute:
            data = table.Y
            vars_ = table.domain.class_vars
        else:  # if role == Role.Meta:
            data = table.metas
            vars_ = table.domain.metas

        index = ['_o' + str(id_) for id_ in table.ids]
        varsdict = {var._name: var for var in vars_}
        columns = varsdict.keys()

        if sp.issparse(data):
            data = data.asformat('csc')
            sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
            data = dict(enumerate(sparrays))
            super().__init__(data, index=index, **kwargs)
            self.columns = columns
            # a hack to keep Orange df _metadata in sparse->dense conversion
            self.sparse.to_dense = self.__patch_constructor(self.sparse.to_dense)
        else:
            super().__init__(data=data, index=index, columns=columns, **kwargs)

        self.orange_role = role
        self.orange_variables = varsdict
        self.orange_weights = (dict(zip(index, table.W))
                               if table.W.size > 0 else {})
        self.orange_attributes = table.attributes
Esempio n. 12
0
    def __init__(
        self,
        data=None,
        index=None,
        sparse_index=None,
        kind="block",
        fill_value=None,
        name=None,
        dtype=None,
        copy=False,
        fastpath=False,
    ):
        warnings.warn(depr_msg, FutureWarning, stacklevel=2)
        # TODO: Most of this should be refactored and shared with Series
        # 1. BlockManager -> array
        # 2. Series.index, Series.name, index, name reconciliation
        # 3. Implicit reindexing
        # 4. Implicit broadcasting
        # 5. Dict construction
        if data is None:
            data = []
        elif isinstance(data, SingleBlockManager):
            index = data.index
            data = data.blocks[0].values
        elif isinstance(data, (ABCSeries, ABCSparseSeries)):
            index = data.index if index is None else index
            dtype = data.dtype if dtype is None else dtype
            name = data.name if name is None else name

            if index is not None:
                data = data.reindex(index)

        elif isinstance(data, abc.Mapping):
            data, index = Series()._init_dict(data, index=index)

        elif is_scalar(data) and index is not None:
            data = np.full(len(index), fill_value=data)

        if isinstance(data, SingleBlockManager):
            # SparseArray doesn't accept SingleBlockManager
            index = data.index
            data = data.blocks[0].values

        super().__init__(
            SparseArray(
                data,
                sparse_index=sparse_index,
                kind=kind,
                dtype=dtype,
                fill_value=fill_value,
                copy=copy,
            ),
            index=index,
            name=name,
            copy=False,
            fastpath=fastpath,
        )
Esempio n. 13
0
    def as_sparse_array(self, kind=None, fill_value=None, copy=False):
        """ return my self as a sparse array, do not copy by default """

        if fill_value is None:
            fill_value = self.fill_value
        if kind is None:
            kind = self.kind
        return SparseArray(self.values, sparse_index=self.sp_index,
                           fill_value=fill_value, kind=kind, copy=copy)
Esempio n. 14
0
    def _set_value(self, label, value, takeable=False):
        values = self.to_dense()

        # if the label doesn't exist, we will create a new object here
        # and possibly change the index
        new_values = values._set_value(label, value, takeable=takeable)
        if new_values is not None:
            values = new_values
        new_index = values.index
        values = SparseArray(values, fill_value=self.fill_value, kind=self.kind)
        self._data = SingleBlockManager(values, new_index)
        self._index = new_index
Esempio n. 15
0
    def _set_values(self, key, value):

        # this might be inefficient as we have to recreate the sparse array
        # rather than setting individual elements, but have to convert
        # the passed slice/boolean that's in dense space into a sparse indexer
        # not sure how to do that!
        if isinstance(key, Series):
            key = key.values

        values = self.values.to_dense()
        values[key] = libindex.convert_scalar(values, value)
        values = SparseArray(values, fill_value=self.fill_value, kind=self.kind)
        self._data = SingleBlockManager(values, self.index)
Esempio n. 16
0
    def test_getitem_sparse_column_return_type_and_dtype(self):
        # https://github.com/pandas-dev/pandas/issues/23559
        data = SparseArray([0, 1])
        df = DataFrame({"A": data})
        expected = Series(data, name="A")
        result = df["A"]
        tm.assert_series_equal(result, expected)

        # Also check iloc and loc while we're here
        result = df.iloc[:, 0]
        tm.assert_series_equal(result, expected)

        result = df.loc[:, "A"]
        tm.assert_series_equal(result, expected)
Esempio n. 17
0
    def __init__(self,
                 data=None,
                 index=None,
                 sparse_index=None,
                 kind='block',
                 fill_value=None,
                 name=None,
                 dtype=None,
                 copy=False,
                 fastpath=False):
        # TODO: Most of this should be refactored and shared with Series
        # 1. BlockManager -> array
        # 2. Series.index, Series.name, index, name reconciliation
        # 3. Implicit reindexing
        # 4. Implicit broadcasting
        # 5. Dict construction
        if data is None:
            data = []
        elif isinstance(data, SingleBlockManager):
            index = data.index
            data = data.blocks[0].values
        elif isinstance(data, (ABCSeries, ABCSparseSeries)):
            index = data.index if index is None else index
            dtype = data.dtype if dtype is None else dtype
            name = data.name if name is None else name

            if index is not None:
                data = data.reindex(index)

        elif isinstance(data, compat.Mapping):
            data, index = Series()._init_dict(data, index=index)

        elif is_scalar(data) and index is not None:
            data = np.full(len(index), fill_value=data)

        super(SparseSeries,
              self).__init__(SparseArray(data,
                                         sparse_index=sparse_index,
                                         kind=kind,
                                         dtype=dtype,
                                         fill_value=fill_value,
                                         copy=copy),
                             index=index,
                             name=name,
                             copy=False,
                             fastpath=fastpath)
Esempio n. 18
0
    def sparse_reindex(self, new_index):
        """
        Conform sparse values to new SparseIndex

        Parameters
        ----------
        new_index : {BlockIndex, IntIndex}

        Returns
        -------
        reindexed : SparseSeries
        """
        if not isinstance(new_index, splib.SparseIndex):
            raise TypeError("new index must be a SparseIndex")
        values = self.values
        values = values.sp_index.to_int_index().reindex(
            values.sp_values.astype("float64"), values.fill_value, new_index)
        values = SparseArray(values,
                             sparse_index=new_index,
                             fill_value=self.values.fill_value)
        return self._constructor(values, index=self.index).__finalize__(self)
Esempio n. 19
0
    def _set_value(self, label, value, takeable=False):
        """
        Quickly set single value at passed label. If label is not contained, a
        new object is created with the label placed at the end of the result
        index

        .. deprecated:: 0.21.0

        Please use .at[] or .iat[] accessors.

        Parameters
        ----------
        label : object
            Partial indexing with MultiIndex not allowed
        value : object
            Scalar value
        takeable : interpret the index as indexers, default False

        Notes
        -----
        This method *always* returns a new object. It is not particularly
        efficient but is provided for API compatibility with Series

        Returns
        -------
        series : SparseSeries
        """
        values = self.to_dense()

        # if the label doesn't exist, we will create a new object here
        # and possibly change the index
        new_values = values._set_value(label, value, takeable=takeable)
        if new_values is not None:
            values = new_values
        new_index = values.index
        values = SparseArray(values,
                             fill_value=self.fill_value,
                             kind=self.kind)
        self._data = SingleBlockManager(values, new_index)
        self._index = new_index
Esempio n. 20
0
def test_unary_op(op, fill_value):
    arr = np.array([0, 1, np.nan, 2])
    sparray = SparseArray(arr, fill_value=fill_value)
    result = op(sparray)
    expected = SparseArray(op(arr), fill_value=op(fill_value))
    tm.assert_sp_array_equal(result, expected)
Esempio n. 21
0
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.dtype(np.uint8)
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: bool | float | int
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
Esempio n. 22
0
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:

        # PY2 embedded unicode, gh-22084
        def _make_col_name(prefix, prefix_sep, level):
            fstr = "{prefix}{prefix_sep}{level}"
            return fstr.format(prefix=prefix,
                               prefix_sep=prefix_sep,
                               level=level)

        dummy_cols = [
            _make_col_name(prefix, prefix_sep, level) for level in levels
        ]

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
Esempio n. 23
0
def create_block(typestr,
                 placement,
                 item_shape=None,
                 num_offset=0,
                 maker=new_block):
    """
    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    """
    placement = BlockPlacement(placement)
    num_items = len(placement)

    if item_shape is None:
        item_shape = (N, )

    shape = (num_items, ) + item_shape

    mat = get_numeric_mat(shape)

    if typestr in (
            "float",
            "f8",
            "f4",
            "f2",
            "int",
            "i8",
            "i4",
            "i2",
            "i1",
            "uint",
            "u8",
            "u4",
            "u2",
            "u1",
    ):
        values = mat.astype(typestr) + num_offset
    elif typestr in ("complex", "c16", "c8"):
        values = 1.0j * (mat.astype(typestr) + num_offset)
    elif typestr in ("object", "string", "O"):
        values = np.reshape([f"A{i:d}" for i in mat.ravel() + num_offset],
                            shape)
    elif typestr in ("b", "bool"):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ("datetime", "dt", "M8[ns]"):
        values = (mat * 1e9).astype("M8[ns]")
    elif typestr.startswith("M8[ns"):
        # datetime with tz
        m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr)
        assert m is not None, f"incompatible typestr -> {typestr}"
        tz = m.groups()[0]
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)._data
    elif typestr in ("timedelta", "td", "m8[ns]"):
        values = (mat * 1).astype("m8[ns]")
    elif typestr in ("category", ):
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr in ("category2", ):
        values = Categorical(
            ["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"])
    elif typestr in ("sparse", "sparse_na"):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])
        if typestr.endswith("_na"):
            fill_value = np.nan
        else:
            fill_value = 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6],
            fill_value=fill_value,
        )
        arr = values.sp_values.view()
        arr += num_offset - 1
    else:
        raise ValueError(f'Unsupported typestr: "{typestr}"')

    return maker(values, placement=placement, ndim=len(shape))
Esempio n. 24
0
    if is_datetime64_dtype(any_numpy_dtype):
        assert isinstance(result, DatetimeArray)
    elif is_timedelta64_dtype(any_numpy_dtype):
        assert isinstance(result, TimedeltaArray)
    else:
        assert isinstance(result, PandasArray)


@pytest.mark.parametrize(
    "array, attr",
    [
        (pd.Categorical(["a", "b"]), "_codes"),
        (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"),
        (pd.core.arrays.integer_array([0, np.nan]), "_data"),
        (IntervalArray.from_breaks([0, 1]), "_left"),
        (SparseArray([0, 1]), "_sparse_values"),
        (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"),
        # tz-aware Datetime
        (
            DatetimeArray(
                np.array(["2000-01-01T12:00:00", "2000-01-02T12:00:00"],
                         dtype="M8[ns]"),
                dtype=DatetimeTZDtype(tz="US/Central"),
            ),
            "_data",
        ),
    ],
)
def test_array(array, attr, index_or_series):
    box = index_or_series
    if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: