Exemple #1
0
def mgr_to_mgr(mgr, typ: str):
    """
    Return ``mgr`` as the Manager flavour named by ``typ``.

    When ``mgr`` already has the requested type it is handed back as-is
    (no copy). When a conversion is required, no promise is made about
    whether the underlying data ends up copied.

    Parameters
    ----------
    mgr : Manager
        The manager to convert.
    typ : {"block", "array"}
        Name of the target manager type.

    Returns
    -------
    Manager

    Raises
    ------
    ValueError
        If ``typ`` is neither "block" nor "array".
    """
    if typ == "block":
        if isinstance(mgr, BlockManager):
            return mgr
        # axes[0] doubles as both the array names and the columns argument
        items, rows = mgr.axes[0], mgr.axes[1]
        return arrays_to_mgr(mgr.arrays, items, rows, items, typ="block")

    if typ == "array":
        if isinstance(mgr, ArrayManager):
            return mgr
        n_items = len(mgr.axes[0])
        column_arrays = [mgr.iget_values(pos).copy() for pos in range(n_items)]
        # ArrayManager takes its axes in the opposite order to ``mgr.axes``
        return ArrayManager(column_arrays, [mgr.axes[1], mgr.axes[0]])

    raise ValueError(
        f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
Exemple #2
0
def mgr_to_mgr(mgr, typ: str, copy: bool = True):
    """
    Return ``mgr`` as the Manager flavour named by ``typ``.

    When ``mgr`` already has the requested type it is handed back as-is
    (no copy). When a conversion is required, a copy is not guaranteed.

    Parameters
    ----------
    mgr : Manager
        The manager to convert; both 2D and non-2D managers are handled.
    typ : {"block", "array"}
        Name of the target manager type.
    copy : bool, default True
        Only consulted when converting towards the "array" flavour, where
        it decides whether the extracted arrays are copied.

    Returns
    -------
    Manager

    Raises
    ------
    ValueError
        If ``typ`` is neither "block" nor "array".
    """
    if typ == "block":
        if isinstance(mgr, BlockManager):
            return mgr
        if mgr.ndim == 2:
            return arrays_to_mgr(
                mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block"
            )
        return SingleBlockManager.from_array(mgr.arrays[0], mgr.index)

    if typ == "array":
        if isinstance(mgr, ArrayManager):
            return mgr
        if mgr.ndim == 2:
            n_items = len(mgr.axes[0])
            columns_1d = [mgr.iget_values(pos) for pos in range(n_items)]
            if copy:
                columns_1d = [values.copy() for values in columns_1d]
            # ArrayManager takes its axes in the opposite order to ``mgr.axes``
            return ArrayManager(columns_1d, [mgr.axes[1], mgr.axes[0]])
        values_1d = mgr.internal_values()
        if copy:
            values_1d = values_1d.copy()
        return SingleArrayManager([values_1d], [mgr.index])

    raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
Exemple #3
0
def arrays_to_mgr(
    arrays,
    columns: Index,
    index,
    *,
    dtype: DtypeObj | None = None,
    verify_integrity: bool = True,
    typ: str | None = None,
    consolidate: bool = True,
) -> Manager:
    """
    Build a Manager out of a collection of per-column arrays.

    Parameters
    ----------
    arrays : sequence of array-likes
        One entry per column.
    columns : Index
        Column labels; must have the same length as ``arrays``.
    index : Index or None
        Row labels. ``None`` is only handled when ``verify_integrity`` is
        True, in which case the index is derived from ``arrays``.
    dtype : DtypeObj or None, default None
        Forwarded to ``_homogenize`` when ``verify_integrity`` is True.
    verify_integrity : bool, default True
        If True, homogenize the inputs against the index. If False, the
        inputs are only unwrapped and validated.
    typ : {"block", "array"}
        Which manager flavour to construct.
    consolidate : bool, default True
        Forwarded to the block-manager constructor; unused for "array".

    Returns
    -------
    Manager

    Raises
    ------
    ValueError
        If an array fails validation (``verify_integrity=False`` path), if
        ``len(arrays) != len(columns)``, or if ``typ`` is not recognised.
    """
    if verify_integrity:
        # derive the index from the data only when the caller gave none
        index = _extract_index(arrays) if index is None else ensure_index(index)

        # no forced copy: the data ends up in an ndarray anyway
        arrays = _homogenize(arrays, index, dtype)
        # after _homogenize every element is expected to be a 1-dimensional
        # np.ndarray / ExtensionArray (never a PandasArray) whose length
        # equals len(index)
    else:
        index = ensure_index(index)
        arrays = [extract_array(x, extract_numpy=True) for x in arrays]

        # Reached via DataFrame._from_arrays; validation happens here instead
        n_rows = len(index)
        for arr in arrays:
            acceptable = (isinstance(arr, (np.ndarray, ExtensionArray))
                          and arr.ndim == 1 and len(arr) == n_rows)
            if not acceptable:
                raise ValueError(
                    "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
                    "with length matching len(index)")

    columns = ensure_index(columns)
    if len(columns) != len(arrays):
        raise ValueError("len(arrays) must match len(columns)")

    if typ == "block":
        # the block manager receives its axes as (columns, index)
        return create_block_manager_from_column_arrays(
            arrays, [columns, index], consolidate=consolidate)
    if typ == "array":
        return ArrayManager(arrays, [index, columns])
    raise ValueError(
        f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
Exemple #4
0
def concatenate_array_managers(mgrs_indexers, axes: List[Index],
                               concat_axis: int, copy: bool) -> Manager:
    """
    Concatenate array managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
        Each manager is first reindexed along every axis listed in its
        indexer mapping.
    axes : list of Index
        Axes of the result.
    concat_axis : int
        1 concatenates along the rows (the j-th arrays of all managers are
        joined); 0 concatenates along the columns (all arrays are collected
        into a single manager).
    copy : bool
        If True, the result must not share arrays with the inputs. This is
        only acted on for ``concat_axis == 0``, where arrays would otherwise
        be passed through as-is.

    Returns
    -------
    ArrayManager
    """
    # reindex all arrays
    mgrs = []
    for mgr, indexers in mgrs_indexers:
        rows_reindexed = False
        for ax, indexer in indexers.items():
            mgr = mgr.reindex_indexer(axes[ax],
                                      indexer,
                                      axis=ax,
                                      allow_dups=True)
            if ax == 1 and indexer is not None:
                rows_reindexed = True
        if copy and concat_axis == 0 and not rows_reindexed:
            # Honour ``copy``: column-wise concatenation just collects the
            # existing arrays, so without this the result would alias the
            # inputs. Assumes that reindexing along axis 1 with a real
            # indexer already produced fresh arrays -- TODO confirm.
            mgr = mgr.copy()
        mgrs.append(mgr)

    if concat_axis == 1:
        # concatting along the rows -> concat the reindexed arrays
        # (presumably concat_compat always allocates new arrays, so no
        # explicit copy is made on this path)
        # TODO(ArrayManager) doesn't yet preserve the correct dtype
        arrays = [
            concat_compat([mgr.arrays[j] for mgr in mgrs])
            for j in range(len(mgrs[0].arrays))
        ]
    else:
        # concatting along the columns -> combine reindexed arrays in a single manager
        assert concat_axis == 0
        arrays = list(
            itertools.chain.from_iterable(mgr.arrays for mgr in mgrs))

    # ArrayManager takes its axes in the opposite order to ``axes``
    return ArrayManager(arrays, [axes[1], axes[0]],
                        do_integrity_check=False)
Exemple #5
0
def arrays_to_mgr(
    arrays,
    arr_names,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    verify_integrity: bool = True,
    typ: str | None = None,
    consolidate: bool = True,
) -> Manager:
    """
    Build a Manager out of a collection of named arrays.

    Parameters
    ----------
    arrays : sequence of array-likes
    arr_names : Index-like
        Labels of the individual arrays; only forwarded on the "block" path.
    index : Index or None
        Row labels. ``None`` is only handled when ``verify_integrity`` is
        True, in which case the index is derived from ``arrays``.
    columns : Index-like
        Column labels of the result.
    dtype : DtypeObj or None, default None
        Forwarded to ``_homogenize`` when ``verify_integrity`` is True.
    verify_integrity : bool, default True
        If True, homogenize ``arrays`` against the index.
    typ : {"block", "array"}
        Which manager flavour to construct.
    consolidate : bool, default True
        Forwarded to the block-manager constructor; unused for "array".

    Returns
    -------
    Manager

    Raises
    ------
    ValueError
        If ``typ`` is not recognised.
    """
    arr_names = ensure_index(arr_names)

    if verify_integrity:
        # derive the index from the data only when the caller gave none
        index = _extract_index(arrays) if index is None else ensure_index(index)

        # no forced copy: the data ends up in an ndarray anyway
        arrays = _homogenize(arrays, index, dtype)
    else:
        index = ensure_index(index)

    columns = ensure_index(columns)

    if typ == "block":
        # the block manager receives its axes as (columns, index)
        return create_block_manager_from_arrays(
            arrays, arr_names, [columns, index], consolidate=consolidate)

    if typ == "array":
        if len(columns) != len(arrays):
            # a length mismatch is only tolerated when no arrays were given:
            # each column then gets an empty object-dtype placeholder
            assert len(arrays) == 0
            arrays = [np.array([], dtype=object) for _ in range(len(columns))]
        return ArrayManager(arrays, [index, columns])

    raise ValueError(
        f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
Exemple #6
0
def _concatenate_array_managers(mgrs_indexers, axes: list[Index],
                                concat_axis: int, copy: bool) -> Manager:
    """
    Combine several array managers into a single one.

    Parameters
    ----------
    mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
        Each manager is reindexed along every axis in its indexer mapping
        before being combined.
    axes : list of Index
        Axes of the result.
    concat_axis : int
        1 joins the j-th arrays of all managers; 0 collects all arrays of
        all managers into one manager.
    copy : bool
        Only acted on for ``concat_axis == 0``, and only for managers that
        were not reindexed along axis 1 with an actual indexer.

    Returns
    -------
    ArrayManager
    """
    reindexed = []
    for mgr, indexers in mgrs_indexers:
        # a non-None indexer on axis 1 is treated as having produced a copy
        axis1_made_copy = any(
            ax == 1 and indexer is not None for ax, indexer in indexers.items()
        )
        for ax, indexer in indexers.items():
            mgr = mgr.reindex_indexer(
                axes[ax],
                indexer,
                axis=ax,
                allow_dups=True,
                use_na_proxy=True,
            )
        needs_explicit_copy = copy and concat_axis == 0 and not axis1_made_copy
        if needs_explicit_copy:
            # for concat_axis 1 we will always get a copy through concat_arrays
            mgr = mgr.copy()
        reindexed.append(mgr)

    if concat_axis == 1:
        # row-wise: join the j-th array of every manager
        # TODO(ArrayManager) doesn't yet preserve the correct dtype
        n_arrays = len(reindexed[0].arrays)
        arrays = [
            concat_arrays([m.arrays[j] for m in reindexed])
            for j in range(n_arrays)
        ]
    else:
        # column-wise: gather every array into one flat list
        assert concat_axis == 0
        arrays = list(itertools.chain.from_iterable(m.arrays for m in reindexed))

    # ArrayManager takes its axes in the opposite order to ``axes``
    return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
Exemple #7
0
def arrays_to_mgr(
    arrays,
    arr_names,
    index,
    columns,
    dtype: Optional[DtypeObj] = None,
    verify_integrity: bool = True,
    typ: Optional[str] = None,
):
    """
    Build a Manager out of a collection of named arrays.

    Parameters
    ----------
    arrays : sequence of array-likes
    arr_names : Index-like
        Labels of the individual arrays; only forwarded on the "block" path.
    index : Index or None
        Row labels. ``None`` is only handled when ``verify_integrity`` is
        True, in which case the index is derived from ``arrays``.
    columns : Index-like
        Column labels of the result.
    dtype : DtypeObj, optional
        Forwarded to ``_homogenize`` when ``verify_integrity`` is True.
    verify_integrity : bool, default True
        If True, homogenize ``arrays`` against the index.
    typ : {"block", "array"}
        Which manager flavour to construct.

    Returns
    -------
    Manager

    Raises
    ------
    ValueError
        If ``typ`` is not recognised.
    """
    arr_names = ensure_index(arr_names)

    if not verify_integrity:
        # trusted input: just normalise the labels
        columns = ensure_index(columns)
        index = ensure_index(index)
    else:
        # derive the index from the data only when the caller gave none
        index = extract_index(arrays) if index is None else ensure_index(index)

        # no forced copy: the data ends up in an ndarray anyway
        arrays = _homogenize(arrays, index, dtype)

        columns = ensure_index(columns)

    if typ == "block":
        # the block manager receives its axes as (columns, index)
        return create_block_manager_from_arrays(
            arrays, arr_names, [columns, index]
        )
    if typ == "array":
        return ArrayManager(arrays, [index, columns])
    raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
Exemple #8
0
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    """
    Build a Manager from a single array-like input.

    Parameters
    ----------
    values : ndarray, list, Series, Index or ExtensionArray
        Data to wrap. A Series input additionally supplies defaults for
        ``columns`` (from its name) and ``index``.
    index : Index-like or None
        Row labels. For a Series input, None means "use the Series' index",
        otherwise the Series is reindexed to ``index``. The value is later
        passed through ``_get_axes``.
    columns : Index-like or None
        Column labels; passed through ``_get_axes`` together with ``index``.
    dtype : DtypeObj or None
        Requested dtype. When given and different from the dtype of the
        prepared values, the data is cast via ``sanitize_array``.
    copy : bool
        Whether the input data should be copied. For ``typ == "array"`` the
        copy is deferred until the data has been split into 1D arrays.
    typ : str
        "array" returns an ArrayManager. Any other value reaches the
        block-based construction at the end of the function; ``typ`` is not
        validated here. On the 1D-only extension dtype path ``typ`` is
        forwarded unchanged to ``arrays_to_mgr``.

    Returns
    -------
    Manager
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        # a named Series provides the single column label when none was given
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        # an empty Series with explicit non-empty columns is replaced by an
        # empty (0, 1) object array
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157
        # 1D-only extension dtypes: build a list of 1D arrays and delegate
        # the construction to arrays_to_mgr

        if isinstance(values,
                      (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            # default to positional column labels
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            # turn the 1D data into a single column
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # cast on a flattened view of the data, then restore the 2D shape
        shape = values.shape
        flat = values.ravel()

        # GH#40110 see similar check inside sanitize_array
        # cast failures are not raised when going from float values to an
        # integer dtype
        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

        values = sanitize_array(flat,
                                None,
                                dtype=dtype,
                                copy=copy_on_sanitize,
                                raise_cast_failure=rcf)

        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0],
                               values.shape[1],
                               index=index,
                               columns=columns)

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        # ArrayManager path: one 1D array per column of ``values``

        if issubclass(values.dtype.type, str):
            # string-dtype data is converted to object dtype
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # no dtype requested: try datetime-like inference per column
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i]))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            # this is the copy that was deferred via copy_on_sanitize above
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # NOTE(review): the block path works on the transposed data; this looks
    # like a (n_columns, n_rows) layout matching the [columns, index] axes
    # passed below -- confirm
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        # iterating the transposed 2D array yields one entry per column
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            # at least one column was converted: one block per column
            dvals_list = [
                ensure_block_shape(dval, 2) for dval in maybe_datetime
            ]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            # nothing was inferred: keep everything in a single block
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp)
            block_values = [nb]
    else:
        # single block spanning all columns
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp)
        block_values = [nb]

    if len(columns) == 0:
        # no columns -> the manager is built without any blocks
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index],
                                            verify_integrity=False)
Exemple #9
0
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    """
    Build a Manager from a single array-like input.

    Parameters
    ----------
    values : ndarray, list, Series, Index or ExtensionArray
        Data to wrap. A Series input additionally supplies defaults for
        ``columns`` (from its name) and ``index``.
    index : Index-like or None
        Row labels. For a Series input, None means "use the Series' index",
        otherwise the Series is reindexed to ``index``. The value is later
        passed through ``_get_axes``.
    columns : Index-like or None
        Column labels; passed through ``_get_axes`` together with ``index``.
    dtype : DtypeObj or None
        Requested dtype. When given and different from the dtype of the
        prepared values, a cast is attempted.
    copy : bool
        Forwarded to the array preparation / casting helpers. Note that on
        the ``typ == "array"`` path every column is copied regardless of
        this flag.
    typ : str
        "array" returns an ArrayManager. Any other value reaches the
        block-based construction at the end of the function; ``typ`` is not
        validated here. On the extension-dtype path ``typ`` is forwarded
        unchanged to ``arrays_to_mgr``.

    Returns
    -------
    Manager
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        # a named Series provides the single column label when none was given
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        # an empty Series with explicit non-empty columns is replaced by an
        # empty (0, 1) object array
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype):
        # GH#19157
        # extension dtype involved: build a list of 1D arrays and delegate
        # the construction to arrays_to_mgr

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            # default to positional column labels
            columns = Index(range(len(values)))

        # ``columns`` is passed twice: once as the second positional
        # argument and once as the fourth
        return arrays_to_mgr(values,
                             columns,
                             index,
                             columns,
                             dtype=dtype,
                             typ=typ)

    if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            # turn the 1D data into a single column
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # cast on a flattened view of the data, then restore the 2D shape
        shape = values.shape
        flat = values.ravel()

        if not is_integer_dtype(dtype):
            # TODO: skipping integer_dtype is needed to keep the tests passing,
            #  not clear it is correct
            # Note: we really only need _try_cast, but keeping to exposed funcs
            values = sanitize_array(flat,
                                    None,
                                    dtype=dtype,
                                    copy=copy,
                                    raise_cast_failure=True)
        else:
            try:
                values = construct_1d_ndarray_preserving_na(flat,
                                                            dtype=dtype,
                                                            copy=False)
            except IntCastingNaNError:
                # following Series, we ignore the dtype and retain floating
                # values instead of casting nans to meaningless ints
                pass

        # when the integer cast was skipped above, ``values`` is still the
        # original 2D array and this reshape leaves its shape unchanged
        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0],
                               values.shape[1],
                               index=index,
                               columns=columns)

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        # ArrayManager path: one copied 1D array per column of ``values``

        if issubclass(values.dtype.type, str):
            # string-dtype data is converted to object dtype
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # no dtype requested: try datetime-like inference per column
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i].copy()))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i].copy() for i in range(values.shape[1])]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # NOTE(review): the block path works on the transposed data; this looks
    # like a (n_columns, n_rows) layout matching the [columns, index] axes
    # passed below -- confirm
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):

        if values.ndim == 2 and values.shape[0] != 1:
            # transpose and separate blocks

            # each row of the transposed array is inferred on its own and
            # becomes a separate block
            dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values]
            dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals]

            # TODO: What about re-joining object columns?
            block_values = [
                new_block(dvals_list[n], placement=n, ndim=2)
                for n in range(len(dvals_list))
            ]

        else:
            # single row (or non-2D) case: infer on the whole array at once
            datelike_vals = maybe_infer_to_datetimelike(values)
            nb = new_block(datelike_vals,
                           placement=slice(len(columns)),
                           ndim=2)
            block_values = [nb]
    else:
        # single block spanning all columns
        nb = new_block(values, placement=slice(len(columns)), ndim=2)
        block_values = [nb]

    if len(columns) == 0:
        # no columns -> the manager is built without any blocks
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index])
Exemple #10
0
def concatenate_block_managers(mgrs_indexers, axes: List[Index],
                               concat_axis: int, copy: bool) -> Manager:
    """
    Combine several managers into a single one.

    If the first manager is an ArrayManager and ``concat_axis`` is 0 or 1,
    an ArrayManager is built directly from the managers' arrays (indexers
    are ignored on that path). Otherwise a block-wise concatenation plan is
    computed and a BlockManager is returned.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
        Axes of the result.
    concat_axis : int
    copy : bool
        On the block path, decides between copying and viewing the values
        of a block that is passed through alone, and is forwarded to
        ``_concatenate_join_units``.

    Returns
    -------
    BlockManager
        Or an ArrayManager on the ArrayManager fast path described above.
    """
    if isinstance(mgrs_indexers[0][0], ArrayManager):

        if concat_axis == 1:
            # TODO for now only fastpath without indexers
            managers = [pair[0] for pair in mgrs_indexers]
            n_arrays = len(managers[0].arrays)
            arrays = [
                concat_compat([m.arrays[j] for m in managers], axis=0)
                for j in range(n_arrays)
            ]
            return ArrayManager(arrays, [axes[1], axes[0]])
        if concat_axis == 0:
            managers = [pair[0] for pair in mgrs_indexers]
            arrays = list(
                itertools.chain.from_iterable(m.arrays for m in managers))
            return ArrayManager(arrays, [axes[1], axes[0]])
        # any other concat_axis falls through to the block-based path

    per_mgr_plans = [
        _get_mgr_concatenation_plan(mgr, indexers)
        for mgr, indexers in mgrs_indexers
    ]
    combined_plan = _combine_concat_plans(per_mgr_plans, concat_axis)

    new_blocks = []
    for placement, join_units in combined_plan:

        if len(join_units) == 1 and not join_units[0].indexers:
            # a lone unit without indexers: reuse its block, copied or viewed
            source_blk = join_units[0].block
            values = source_blk.values.copy() if copy else source_blk.values.view()
            new_blk = source_blk.make_block_same_class(values,
                                                       placement=placement)

        elif _is_uniform_join_units(join_units):
            template_blk = join_units[0].block
            unit_values = [ju.block.values for ju in join_units]

            if template_blk.is_extension:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(unit_values)
                if not isinstance(values, ExtensionArray):
                    values = values.reshape(1, len(values))
            else:
                # _is_uniform_join_units ensures a single dtype, so
                #  np.concatenate can be used, which is more performant
                #  than concat_compat
                values = np.concatenate(unit_values,
                                        axis=template_blk.ndim - 1)

            if template_blk.values.dtype == values.dtype:
                # Fast-path
                new_blk = template_blk.make_block_same_class(
                    values, placement=placement)
            else:
                new_blk = make_block(values,
                                     placement=placement,
                                     ndim=template_blk.ndim)

        else:
            merged_values = _concatenate_join_units(join_units,
                                                    concat_axis,
                                                    copy=copy)
            new_blk = make_block(
                merged_values,
                placement=placement,
                ndim=len(axes),
            )

        new_blocks.append(new_blk)

    return BlockManager(new_blocks, axes)