Example #1
def _concatenate_join_units(
    join_units: list[JoinUnit], concat_axis: int, copy: bool
) -> ArrayLike:
    """
    Concatenate values from several join units along selected axis.
    """
    if concat_axis == 0 and len(join_units) > 1:
        # Concatenating join units along ax0 is handled in _merge_blocks.
        raise AssertionError("Concatenating join units along axis0")

    empty_dtype = _get_empty_dtype(join_units)

    has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
    upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)

    to_concat = [
        ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
        for ju in join_units
    ]

    if len(to_concat) == 1:
        # Only one block, nothing to concatenate.
        concat_values = to_concat[0]
        if copy:
            if isinstance(concat_values, np.ndarray):
                # non-reindexed (=not yet copied) arrays are made into a view
                # in JoinUnit.get_reindexed_values
                if concat_values.base is not None:
                    concat_values = concat_values.copy()
            else:
                concat_values = concat_values.copy()

    elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
        # TODO(EA2D): special case not needed if all EAs used HybridBlocks
        # NB: we are still assuming here that Hybrid blocks have shape (1, N)
        # concatting with at least one EA means we are concatting a single column
        # the non-EA values are 2D arrays with shape (1, n)

        # error: No overload variant of "__getitem__" of "ExtensionArray" matches
        # argument type "Tuple[int, slice]"
        to_concat = [
            t
            if is_1d_only_ea_dtype(t.dtype)
            else t[0, :]  # type: ignore[call-overload]
            for t in to_concat
        ]
        concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
        concat_values = ensure_block_shape(concat_values, 2)

    else:
        concat_values = concat_compat(to_concat, axis=concat_axis)

    return concat_values
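All of the snippets on this page branch on is_1d_only_ea_dtype, which separates extension dtypes that can only back a single 1D column (e.g. Categorical) from dtypes that support 2D block storage. A minimal standalone check, assuming a pandas version (roughly 1.3+) that exposes the helper in pandas.core.dtypes.common:

import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_1d_only_ea_dtype

# Categorical is an extension dtype restricted to 1D storage
print(is_1d_only_ea_dtype(pd.CategoricalDtype()))           # True
# tz-aware datetimes are extension dtypes but can live in 2D blocks
print(is_1d_only_ea_dtype(pd.DatetimeTZDtype(tz="UTC")))    # False
# plain numpy dtypes are not extension dtypes at all
print(is_1d_only_ea_dtype(np.dtype("float64")))             # False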
Example #2
def unstack(obj, level, fill_value=None):

    if isinstance(level, (tuple, list)):
        if len(level) != 1:
            # _unstack_multiple only handles MultiIndexes,
            # and isn't needed for a single level
            return _unstack_multiple(obj, level, fill_value=fill_value)
        else:
            level = level[0]

    # Prioritize integer interpretation (GH #21677):
    if not is_integer(level) and not level == "__placeholder__":
        level = obj.index._get_level_number(level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value)
        else:
            return obj.T.stack(dropna=False)
    elif not isinstance(obj.index, MultiIndex):
        # GH 36113
        # Give a nicer error message when unstacking a Series whose
        # index is not a MultiIndex.
        raise ValueError(
            f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
        )
    else:
        if is_1d_only_ea_dtype(obj.dtype):
            return _unstack_extension_series(obj, level, fill_value)
        unstacker = _Unstacker(
            obj.index, level=level, constructor=obj._constructor_expanddim
        )
        return unstacker.get_result(
            obj._values, value_columns=None, fill_value=fill_value
        )
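From the public API this function is reached through Series.unstack and DataFrame.unstack. A short illustration of both the normal path and the GH 36113 error branch (outputs shown as comments reflect typical behavior):

import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"])
ser = pd.Series([10, 20, 30, 40], index=idx)

# level may be a name or an integer; names resolve via _get_level_number
print(ser.unstack(level="inner"))
# inner   1   2
# outer
# a      10  20
# b      30  40

# a Series with a flat index hits the GH 36113 branch
pd.Series([1, 2]).unstack()  # ValueError: index must be a MultiIndex to unstack, ...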
Example #3
def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
    """
    Construct an np.ndarray or ExtensionArray of the given dtype and shape
    holding all-NA values.
    """
    if is_datetime64tz_dtype(dtype):
        # NaT here is analogous to dtype.na_value below
        i8values = np.full(shape, NaT.value)
        return DatetimeArray(i8values, dtype=dtype)

    elif is_1d_only_ea_dtype(dtype):
        dtype = cast(ExtensionDtype, dtype)
        cls = dtype.construct_array_type()

        missing_arr = cls._from_sequence([], dtype=dtype)
        nrows = shape[-1]
        taker = -1 * np.ones((nrows, ), dtype=np.intp)
        return missing_arr.take(taker,
                                allow_fill=True,
                                fill_value=dtype.na_value)
    elif isinstance(dtype, ExtensionDtype):
        # TODO: no tests get here, a handful would if we disabled
        #  the dt64tz special-case above (which is faster)
        cls = dtype.construct_array_type()
        missing_arr = cls._empty(shape=shape, dtype=dtype)
        missing_arr[:] = dtype.na_value
        return missing_arr
    else:
        # NB: we should never get here with dtype integer or bool;
        #  if we did, the missing_arr.fill would cast to gibberish
        missing_arr = np.empty(shape, dtype=dtype)
        fill_value = _dtype_to_na_value(dtype, has_none_blocks=False)
        missing_arr.fill(fill_value)
        return missing_arr
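The 1d-only branch builds its all-NA array by taking from an empty array with an all -1 indexer, which ExtensionArray.take fills with fill_value. The same trick works in isolation; this sketch uses the private _from_sequence constructor exactly as the excerpt does:

import numpy as np
import pandas as pd

dtype = pd.CategoricalDtype(["x", "y"])
cls = dtype.construct_array_type()      # -> Categorical

empty = cls._from_sequence([], dtype=dtype)
taker = -1 * np.ones(3, dtype=np.intp)  # every position is a fill slot
na_arr = empty.take(taker, allow_fill=True, fill_value=dtype.na_value)
print(na_arr)  # [NaN, NaN, NaN], Categories (2, object): ['x', 'y']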
Example #4
    def cython_operation(
        self,
        *,
        values: ArrayLike,
        axis: int,
        min_count: int = -1,
        comp_ids: np.ndarray,
        ngroups: int,
        **kwargs,
    ) -> ArrayLike:
        """
        Call our cython function, with appropriate pre- and post- processing.
        """
        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            assert axis == 1, axis
        elif not is_1d_only_ea_dtype(values.dtype):
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 0

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        self._disallow_invalid_ops(dtype, is_numeric)

        if not isinstance(values, np.ndarray):
            # i.e. ExtensionArray
            return self._ea_wrap_cython_operation(
                values,
                min_count=min_count,
                ngroups=ngroups,
                comp_ids=comp_ids,
                **kwargs,
            )

        return self._cython_op_ndim_compat(
            values,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=comp_ids,
            mask=None,
            **kwargs,
        )
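The dtype check above routes 1D ExtensionArrays to _ea_wrap_cython_operation, which unwraps them to numpy values, runs the cython kernel, and re-wraps the result. From the public API, a groupby aggregation on a masked-integer column exercises that path (exactly which internals run depends on the pandas version):

import pandas as pd

df = pd.DataFrame({
    "key": ["a", "a", "b"],
    "val": pd.array([1, 2, 3], dtype="Int64"),  # 1d-only extension dtype
})
print(df.groupby("key")["val"].sum())
# key
# a    3
# b    3
# Name: val, dtype: Int64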
Example #5
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values,
                      (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 an EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        shape = values.shape
        flat = values.ravel()

        # GH#40110 see similar check inside sanitize_array
        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

        values = sanitize_array(flat,
                                None,
                                dtype=dtype,
                                copy=copy_on_sanitize,
                                raise_cast_failure=rcf)

        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0],
                               values.shape[1],
                               index=index,
                               columns=columns)

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":

        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i]))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelikes
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [
                ensure_block_shape(dval, 2) for dval in maybe_datetime
            ]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp)
            block_values = [nb]
    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp)
        block_values = [nb]

    if len(columns) == 0:
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index],
                                            verify_integrity=False)
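The GH#19157 branch means that a 2D ndarray combined with a 1d-only extension dtype is split column-by-column into separate EAs. That is observable from the DataFrame constructor:

import numpy as np
import pandas as pd

arr = np.array([[1, 2], [3, 4]])
df = pd.DataFrame(arr, dtype="Int64")  # 2D ndarray + 1d-only EA dtype
print(df.dtypes)
# 0    Int64
# 1    Int64
# dtype: object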
Example #6
    def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
        if upcasted_na is None:
            # No upcasting is necessary
            fill_value = self.block.fill_value
            values = self.block.get_values()
        else:
            fill_value = upcasted_na

            if self.is_valid_na_for(empty_dtype):
                blk_dtype = getattr(self.block, "dtype", None)

                if blk_dtype == np.dtype("object"):
                    # we want to avoid filling with np.nan if we are
                    # using None; we already know that we are all
                    # nulls
                    values = self.block.values.ravel(order="K")
                    if len(values) and values[0] is None:
                        fill_value = None

                if is_datetime64tz_dtype(empty_dtype):
                    i8values = np.full(self.shape, fill_value.value)
                    return DatetimeArray(i8values, dtype=empty_dtype)

                elif is_extension_array_dtype(blk_dtype):
                    pass

                elif is_1d_only_ea_dtype(empty_dtype):
                    empty_dtype = cast(ExtensionDtype, empty_dtype)
                    cls = empty_dtype.construct_array_type()

                    missing_arr = cls._from_sequence([], dtype=empty_dtype)
                    ncols, nrows = self.shape
                    assert ncols == 1, ncols
                    empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                    return missing_arr.take(
                        empty_arr, allow_fill=True, fill_value=fill_value
                    )
                else:
                    # NB: we should never get here with empty_dtype integer or bool;
                    #  if we did, the missing_arr.fill would cast to gibberish
                    empty_dtype = cast(np.dtype, empty_dtype)

                    missing_arr = np.empty(self.shape, dtype=empty_dtype)
                    missing_arr.fill(fill_value)
                    return missing_arr

            if (not self.indexers) and (not self.block._can_consolidate):
                # preserve these for validation in concat_compat
                return self.block.values

            if self.block.is_bool:
                # External code requested filling/upcasting, bool values must
                # be upcasted to object to avoid being upcasted to numeric.
                values = self.block.astype(np.object_).values
            else:
                # No dtype upcasting is done here, it will be performed during
                # concatenation itself.
                values = self.block.values

        if not self.indexers:
            # If there's no indexing to be done, we want to signal outside
            # code that this array must be copied explicitly.  This is done
            # by returning a view and checking `retval.base`.
            values = values.view()

        else:
            for ax, indexer in self.indexers.items():
                values = algos.take_nd(values, indexer, axis=ax)

        return values
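The closing view trick relies on numpy's base attribute: a fresh view has base set to the parent array, so downstream code (see _concatenate_join_units in Example #1) can recognize a not-yet-copied array and copy it explicitly. Standalone:

import numpy as np

arr = np.arange(4)
view = arr.view()
print(view.base is arr)     # True -> caller knows an explicit copy is needed

copied = view.copy()
print(copied.base is None)  # True -> already an owned buffer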
Example #7
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype):
        # GH#19157

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 an EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))

        return arrays_to_mgr(values,
                             columns,
                             index,
                             columns,
                             dtype=dtype,
                             typ=typ)

    if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        shape = values.shape
        flat = values.ravel()

        if not is_integer_dtype(dtype):
            # TODO: skipping integer_dtype is needed to keep the tests passing,
            #  not clear it is correct
            # Note: we really only need _try_cast, but keeping to exposed funcs
            values = sanitize_array(flat,
                                    None,
                                    dtype=dtype,
                                    copy=copy,
                                    raise_cast_failure=True)
        else:
            try:
                values = construct_1d_ndarray_preserving_na(flat,
                                                            dtype=dtype,
                                                            copy=False)
            except IntCastingNaNError:
                # following Series, we ignore the dtype and retain floating
                # values instead of casting nans to meaningless ints
                pass

        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0],
                               values.shape[1],
                               index=index,
                               columns=columns)

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":

        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i].copy()))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i].copy() for i in range(values.shape[1])]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelikes
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):

        if values.ndim == 2 and values.shape[0] != 1:
            # transpose and separate blocks

            dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values]
            dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals]

            # TODO: What about re-joining object columns?
            block_values = [
                new_block(dvals_list[n], placement=n, ndim=2)
                for n in range(len(dvals_list))
            ]

        else:
            datelike_vals = maybe_infer_to_datetimelike(values)
            nb = new_block(datelike_vals,
                           placement=slice(len(columns)),
                           ndim=2)
            block_values = [nb]
    else:
        nb = new_block(values, placement=slice(len(columns)), ndim=2)
        block_values = [nb]

    if len(columns) == 0:
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index])
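This older snapshot still shows the per-column maybe_infer_to_datetimelike pass: object columns holding datetime scalars get converted, other object columns are left untouched. The effect at the constructor level (typical behavior; exact inference rules vary across pandas versions):

import numpy as np
import pandas as pd

data = np.array(
    [[pd.Timestamp("2021-01-01"), 1],
     [pd.Timestamp("2021-01-02"), 2]],
    dtype=object,
)
df = pd.DataFrame(data)
print(df.dtypes)
# 0    datetime64[ns]
# 1            object
# dtype: object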
Example #8
    def get_reindexed_values(self, empty_dtype: DtypeObj,
                             upcasted_na) -> ArrayLike:
        values: ArrayLike

        if upcasted_na is None and not self.is_na:
            # No upcasting is necessary
            fill_value = self.block.fill_value
            values = self.block.get_values()
        else:
            fill_value = upcasted_na

            if self.is_na:

                if is_datetime64tz_dtype(empty_dtype):
                    i8values = np.full(self.shape, fill_value.value)
                    return DatetimeArray(i8values, dtype=empty_dtype)

                elif is_1d_only_ea_dtype(empty_dtype):
                    empty_dtype = cast(ExtensionDtype, empty_dtype)
                    cls = empty_dtype.construct_array_type()

                    missing_arr = cls._from_sequence([], dtype=empty_dtype)
                    ncols, nrows = self.shape
                    assert ncols == 1, ncols
                    empty_arr = -1 * np.ones((nrows, ), dtype=np.intp)
                    return missing_arr.take(empty_arr,
                                            allow_fill=True,
                                            fill_value=fill_value)
                elif isinstance(empty_dtype, ExtensionDtype):
                    # TODO: no tests get here, a handful would if we disabled
                    #  the dt64tz special-case above (which is faster)
                    cls = empty_dtype.construct_array_type()
                    missing_arr = cls._empty(shape=self.shape,
                                             dtype=empty_dtype)
                    missing_arr[:] = fill_value
                    return missing_arr
                else:
                    # NB: we should never get here with empty_dtype integer or bool;
                    #  if we did, the missing_arr.fill would cast to gibberish
                    missing_arr = np.empty(self.shape, dtype=empty_dtype)
                    missing_arr.fill(fill_value)
                    return missing_arr

            if (not self.indexers) and (not self.block._can_consolidate):
                # preserve these for validation in concat_compat
                return self.block.values

            if self.block.is_bool:
                # External code requested filling/upcasting, bool values must
                # be upcasted to object to avoid being upcasted to numeric.
                values = self.block.astype(np.object_).values
            else:
                # No dtype upcasting is done here, it will be performed during
                # concatenation itself.
                values = self.block.values

        if not self.indexers:
            # If there's no indexing to be done, we want to signal outside
            # code that this array must be copied explicitly.  This is done
            # by returning a view and checking `retval.base`.
            values = values.view()

        else:
            for ax, indexer in self.indexers.items():
                values = algos.take_nd(values, indexer, axis=ax)

        return values
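The is_bool branch exists because filling bool data with NA must not go through a numeric upcast: bool blocks are cast to object first. The same rule is visible when concatenating a bool column with a frame that lacks it, so the column has to be NA-filled:

import pandas as pd

df1 = pd.DataFrame({"flag": [True, False]})
df2 = pd.DataFrame({"other": [1.5]})  # no "flag" column -> NA fill on concat

out = pd.concat([df1, df2])
print(out["flag"].dtype)  # object: True/False/NaN rather than 1.0/0.0/NaN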