Ejemplo n.º 1
0
def _try_cast(
    arr: list | np.ndarray,
    dtype: DtypeObj | None,
    copy: bool,
    raise_cast_failure: bool,
) -> ArrayLike:
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        Only consulted by the final (numeric) cast.  If True, a ValueError
        or TypeError raised by that cast propagates.  If False, a
        FutureWarning is emitted and an object-dtype array is returned.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError, TypeError
        Re-raised from the final cast when ``raise_cast_failure`` is True.
    """
    is_ndarray = isinstance(arr, np.ndarray)

    if dtype is None:
        # perf shortcut as this is the most common case
        if is_ndarray:
            arr = cast(np.ndarray, arr)
            if arr.dtype != object:
                return sanitize_to_nanoseconds(arr, copy=copy)

            out = maybe_infer_to_datetimelike(arr)
            if out is arr and copy:
                out = out.copy()
            return out

        else:
            # i.e. list
            # np.asarray instead of np.array(..., copy=False): converting a
            #  list always allocates, and NumPy >= 2 raises ValueError for
            #  copy=False whenever a copy cannot be avoided.
            varr = np.asarray(arr)
            # filter out cases that we _dont_ want to go through
            #  maybe_infer_to_datetimelike
            if varr.dtype != object or varr.size == 0:
                return varr
            return maybe_infer_to_datetimelike(varr)

    elif isinstance(dtype, ExtensionDtype):
        # create an extension array from its dtype
        if isinstance(dtype, DatetimeTZDtype):
            # We can't go through _from_sequence because it handles dt64naive
            #  data differently; _from_sequence treats naive as wall times,
            #  while maybe_cast_to_datetime treats it as UTC
            #  see test_maybe_promote_any_numpy_dtype_with_datetimetz

            return maybe_cast_to_datetime(arr, dtype)
            # TODO: copy?

        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    elif is_object_dtype(dtype):
        if not is_ndarray:
            subarr = construct_1d_object_array_from_listlike(arr)
            return subarr
        return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)

    elif dtype.kind == "U":
        # TODO: test cases with arr.dtype.kind in ["m", "M"]
        return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)

    elif dtype.kind in ["m", "M"]:
        return maybe_cast_to_datetime(arr, dtype)

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats

            subarr = maybe_cast_to_integer_array(arr, dtype)
        elif copy:
            # 4 tests fail if we move this to a try/except/else; see
            #  test_constructor_compound_dtypes, test_constructor_cast_failure
            #  test_constructor_dict_cast2, test_loc_setitem_dtype
            subarr = np.array(arr, dtype=dtype, copy=True)
        else:
            # "copy only if needed" must be spelled np.asarray so that it
            #  keeps working on NumPy >= 2 (where copy=False means "never").
            subarr = np.asarray(arr, dtype=dtype)

    except (ValueError, TypeError):
        if raise_cast_failure:
            raise
        else:
            # we only get here with raise_cast_failure False, which means
            #  called via the DataFrame constructor
            # GH#24435
            warnings.warn(
                f"Could not cast to {dtype}, falling back to object. This "
                "behavior is deprecated. In a future version, when a dtype is "
                "passed to 'DataFrame', either all columns will be cast to that "
                "dtype, or a TypeError will be raised.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            if copy:
                subarr = np.array(arr, dtype=object, copy=True)
            else:
                # same NumPy >= 2 consideration as above
                subarr = np.asarray(arr, dtype=object)
    return subarr
Ejemplo n.º 2
0
def init_ndarray(values, index, columns, dtype=None, copy=False):
    """
    Turn array-like input into the manager object used by DataFrame.

    Parameters
    ----------
    values : ndarray, list, Series or Index
    index, columns : axis labels or None
        Missing labels are filled in by ``_get_axes``.
    dtype : dtype or None
        When given and different from the prepared array's dtype, the array
        is cast with ``astype``.
    copy : bool, default False
        Forwarded to ``prep_ndarray``; categorical input is copied directly.

    Returns
    -------
    The result of ``arrays_to_mgr`` (categorical / extension input) or of
    ``create_block_manager_from_blocks`` (everything else).

    Raises
    ------
    ValueError
        If the cast to ``dtype`` fails; the original error text is embedded.
    """
    if isinstance(values, ABCSeries):
        # a named Series provides the column label when none was passed
        if columns is None and values.name is not None:
            columns = [values.name]

        if index is not None:
            values = values.reindex(index)
        else:
            index = values.index

        # zero len case (GH #2234)
        if len(values) == 0 and columns is not None and len(columns) > 0:
            values = np.empty((0, 1), dtype=object)

    # we could have a categorical type passed or coerced to 'category'
    # recast this to an arrays_to_mgr
    values_dtype = getattr(values, 'dtype', None)
    if is_categorical_dtype(values_dtype) or is_categorical_dtype(dtype):
        if hasattr(values, 'dtype'):
            if copy:
                values = values.copy()
        else:
            # plain list-like: prepare it, then flatten to one column
            values = prep_ndarray(values, copy=copy).ravel()

        index, columns = _get_axes(len(values), 1, index, columns)
        return arrays_to_mgr([values], columns, index, columns,
                             dtype=dtype)

    if is_datetime64tz_dtype(values) or is_extension_array_dtype(values):
        # GH#19157
        if columns is None:
            columns = [0]
        return arrays_to_mgr([values], columns, index, columns,
                             dtype=dtype)

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        try:
            values = values.astype(dtype)
        except Exception as orig:
            message = ("failed to cast to '{dtype}' (Exception "
                       "was: {orig})").format(dtype=dtype, orig=orig)
            raise_with_traceback(ValueError(message))

    index, columns = _get_axes(*values.shape, index=index, columns=columns)
    values = values.T

    # without an explicit dtype, give object data a chance to become
    # datetimelike; inference runs on the entire block at once
    wants_inference = dtype is None and is_object_dtype(values)
    if wants_inference:
        values = maybe_infer_to_datetimelike(values)

    return create_block_manager_from_blocks([values], [columns, index])
Ejemplo n.º 3
0
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    """
    Build a manager from array-like input; used in DataFrame.__init__.

    Parameters
    ----------
    values : ndarray, list, Series, Index or ExtensionArray
    index, columns : axis labels or None
    dtype : np.dtype, ExtensionDtype or None
        Target dtype; when it differs from the prepared values they are
        cast through ``sanitize_array``.
    copy : bool
        Whether to copy the data.  For ``typ == "array"`` the copy is
        postponed until the values have been split into 1D arrays.
    typ : str
        ``"array"`` produces an ``ArrayManager``; any other value goes
        through ``create_block_manager_from_blocks``.

    Returns
    -------
    Manager
    """
    if isinstance(values, ABCSeries):
        # a named Series provides the column label when none was passed
        if columns is None and values.name is not None:
            columns = Index([values.name])

        if index is not None:
            values = values.reindex(index)
        else:
            index = values.index

        # zero len case (GH #2234)
        if len(values) == 0 and columns is not None and len(columns) > 0:
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = copy if typ != "array" else False

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype):
        # GH#19157
        is_2d_arraylike = (
            isinstance(values, (np.ndarray, ExtensionArray))
            and values.ndim > 1
        )
        if is_2d_arraylike:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            n_cols = values.shape[1]
            values = [
                values[:, col]  # type: ignore[call-overload]
                for col in range(n_cols)
            ]
        else:
            values = [values]

        if columns is not None:
            columns = ensure_index(columns)
        else:
            columns = Index(range(len(values)))

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)
    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        shape = values.shape
        flat = values.ravel()

        # GH#40110 see similar check inside sanitize_array
        rcf = not is_integer_dtype(dtype) or values.dtype.kind != "f"

        casted = sanitize_array(
            flat,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            raise_cast_failure=rcf,
        )
        values = casted.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # infer column by column
            arrays = []
            for i in range(values.shape[1]):
                inferred = maybe_infer_to_datetimelike(values[:, i])
                arrays.append(ensure_wrapped_if_datetimelike(inferred))
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    block_values = None
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [
                ensure_block_shape(dval, 2) for dval in maybe_datetime
            ]
            block_values = [
                new_block_2d(dval, placement=BlockPlacement(n))
                for n, dval in enumerate(dvals_list)
            ]

    if block_values is None:
        # nothing was split per column: keep everything in one 2D block
        bp = BlockPlacement(slice(len(columns)))
        block_values = [new_block_2d(values, placement=bp)]

    if not len(columns):
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
Ejemplo n.º 4
0
def sanitize_array(
    data,
    index: Index | None,
    dtype: DtypeObj | None = None,
    copy: bool = False,
    raise_cast_failure: bool = True,
    *,
    allow_2d: bool = False,
) -> ArrayLike:
    """
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.

    Parameters
    ----------
    data : Any
        Masked arrays, zero-dim arrays, ranges, scalars, ndarrays,
        ExtensionArrays and other iterables are each handled below.
    index : Index or None, default None
        Required when ``data`` is not list-like; its length then determines
        the length of the result.
    dtype : np.dtype, ExtensionDtype, or None, default None
    copy : bool, default False
    raise_cast_failure : bool, default True
    allow_2d : bool, default False
        If False, raise if we have a 2D Arraylike.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError
        If ``data`` is not list-like and ``index`` is None.
    TypeError
        If ``data`` is a set or frozenset (unordered).

    Notes
    -----
    raise_cast_failure=False is only intended to be used when called from the
    DataFrame constructor, as the dtype keyword there may be interpreted as only
    applying to a subset of columns, see GH#24435.
    """
    if isinstance(data, ma.MaskedArray):
        data = sanitize_masked_array(data)

    if isinstance(dtype, PandasDtype):
        # Avoid ending up with a PandasArray
        dtype = dtype.numpy_dtype

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    if isinstance(data, np.ndarray) and data.ndim == 0:
        if dtype is None:
            dtype = data.dtype
        data = lib.item_from_zerodim(data)
    elif isinstance(data, range):
        # GH#16804
        data = range_to_ndarray(data)
        copy = False

    if not is_list_like(data):
        if index is None:
            raise ValueError(
                "index must be specified when data is not list-like")
        data = construct_1d_arraylike_from_scalar(data, len(index), dtype)
        return data

    # GH#846
    if isinstance(data, np.ndarray):
        if isinstance(data, np.matrix):
            data = data.A

        if dtype is not None and is_float_dtype(
                data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except IntCastingNaNError:
                warnings.warn(
                    "In a future version, passing float-dtype values containing NaN "
                    "and an integer dtype will raise IntCastingNaNError "
                    "(subclass of ValueError) instead of silently ignoring the "
                    "passed dtype. To retain the old behavior, call Series(arr) or "
                    "DataFrame(arr) without passing a dtype.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
                subarr = np.array(data, copy=copy)
            except ValueError:
                if not raise_cast_failure:
                    # i.e. called via DataFrame constructor
                    warnings.warn(
                        "In a future version, passing float-dtype values and an "
                        "integer dtype to DataFrame will retain floating dtype "
                        "if they cannot be cast losslessly (matching Series behavior). "
                        "To retain the old behavior, use DataFrame(data).astype(dtype)",
                        FutureWarning,
                        stacklevel=find_stack_level(),
                    )
                    # GH#40110 until the deprecation is enforced, we _dont_
                    #  ignore the dtype for DataFrame, and _do_ cast even though
                    #  it is lossy.
                    dtype = cast(np.dtype, dtype)
                    # A float -> int cast always allocates a new array, so
                    #  ask for the copy explicitly: passing copy=False here
                    #  raises ValueError on NumPy >= 2, where copy=False
                    #  means "never copy" rather than "copy if needed".
                    return np.array(data, dtype=dtype, copy=True)
                subarr = np.array(data, copy=copy)
        else:
            # we will try to copy by-definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()

    else:
        if isinstance(data, (set, frozenset)):
            # Raise only for unordered sets, e.g., not for dict_keys
            raise TypeError(f"'{type(data).__name__}' type is unordered")

        # materialize e.g. generators, convert e.g. tuples, abc.ValueView
        if hasattr(data, "__array__"):
            # e.g. dask array GH#38645
            data = np.asarray(data)
        else:
            data = list(data)

        if dtype is not None or len(data) == 0:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)
            if subarr.dtype == object:
                subarr = cast(np.ndarray, subarr)
                subarr = maybe_infer_to_datetimelike(subarr)

    subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)

    if isinstance(subarr, np.ndarray):
        # at this point we should have dtype be None or subarr.dtype == dtype
        dtype = cast(np.dtype, dtype)
        subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)

    return subarr
Ejemplo n.º 5
0
def init_ndarray(values, index, columns, dtype=None, copy=False):
    """
    Turn array-like input into the manager object used by DataFrame.

    Parameters
    ----------
    values : ndarray, list, Series or Index
    index, columns : axis labels or None
        Missing labels are filled in by ``_get_axes``.
    dtype : dtype or None
        When given and different from the prepared array's dtype, the array
        is cast with ``astype``.
    copy : bool, default False
        Forwarded to ``prep_ndarray``; categorical input is copied directly.

    Returns
    -------
    The result of ``arrays_to_mgr`` (categorical / extension input) or of
    ``create_block_manager_from_blocks`` (everything else).

    Raises
    ------
    ValueError
        If the cast to ``dtype`` fails; the original error text is embedded.
    """
    if isinstance(values, ABCSeries):
        # a named Series provides the column label when none was passed
        if columns is None and values.name is not None:
            columns = [values.name]

        if index is not None:
            values = values.reindex(index)
        else:
            index = values.index

        # zero len case (GH #2234)
        if len(values) == 0 and columns is not None and len(columns) > 0:
            values = np.empty((0, 1), dtype=object)

    # we could have a categorical type passed or coerced to 'category'
    # recast this to an arrays_to_mgr
    values_dtype = getattr(values, "dtype", None)
    if is_categorical_dtype(values_dtype) or is_categorical_dtype(dtype):
        if hasattr(values, "dtype"):
            if copy:
                values = values.copy()
        else:
            # plain list-like: prepare it, then flatten to one column
            values = prep_ndarray(values, copy=copy).ravel()

        index, columns = _get_axes(len(values), 1, index, columns)
        return arrays_to_mgr([values], columns, index, columns, dtype=dtype)

    if is_extension_array_dtype(values):
        # GH#19157
        if columns is None:
            columns = [0]
        return arrays_to_mgr([values], columns, index, columns, dtype=dtype)

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        try:
            values = values.astype(dtype)
        except Exception as orig:
            message = ("failed to cast to '{dtype}' (Exception "
                       "was: {orig})").format(dtype=dtype, orig=orig)
            raise_with_traceback(ValueError(message))

    index, columns = _get_axes(*values.shape, index=index, columns=columns)
    values = values.T

    # with an explicit dtype, or for non-object data, no inference is
    # attempted and the transposed array is used as the only block
    if dtype is not None or not is_object_dtype(values):
        return create_block_manager_from_blocks([values], [columns, index])

    # object data without a dtype: look for embedded datetimelikes
    if values.ndim == 2 and values.shape[0] != 1:
        # infer each row of the transposed array separately so that every
        # one can end up in its own block
        inferred_rows = [maybe_infer_to_datetimelike(row) for row in values]
        inferred_rows = [
            row.reshape(1, -1) if isinstance(row, np.ndarray) else row
            for row in inferred_rows
        ]

        from pandas.core.internals.blocks import make_block

        # TODO: What about re-joining object columns?
        block_values = [
            make_block(row, placement=[pos])
            for pos, row in enumerate(inferred_rows)
        ]
    else:
        block_values = [maybe_infer_to_datetimelike(values)]

    return create_block_manager_from_blocks(block_values, [columns, index])
Ejemplo n.º 6
0
def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj],
                   copy: bool, typ: str) -> Manager:
    """
    Build a manager from array-like input; used in DataFrame.__init__.

    Parameters
    ----------
    values : ndarray, list, Series, Index or ExtensionArray
    index, columns : axis labels or None
    dtype : np.dtype, ExtensionDtype or None
        Target dtype; the prepared values are cast when it differs.
    copy : bool
        Forwarded to ``_prep_ndarray`` and ``sanitize_array``.
    typ : str
        Forwarded to ``arrays_to_mgr`` on the extension-dtype path.

    Returns
    -------
    Manager

    Raises
    ------
    ValueError
        If casting to an integer ``dtype`` fails; the original error text
        is embedded in the message.
    """
    if isinstance(values, ABCSeries):
        # a named Series provides the column label when none was passed
        if columns is None and values.name is not None:
            columns = Index([values.name])

        if index is not None:
            values = values.reindex(index)
        else:
            index = values.index

        # zero len case (GH #2234)
        if len(values) == 0 and columns is not None and len(columns) > 0:
            values = np.empty((0, 1), dtype=object)

    if is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
        # GH#19157
        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            arrays = [values[:, n] for n in range(values.shape[1])]
        else:
            arrays = [values]

        if columns is None:
            columns = Index(range(len(arrays)))

        return arrays_to_mgr(
            arrays, columns, index, columns, dtype=dtype, typ=typ
        )

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        shape = values.shape
        flat = values.ravel()

        if is_integer_dtype(dtype):
            # TODO: skipping sanitize_array for integer dtypes is needed to
            #  keep the tests passing, not clear it is correct
            try:
                values = construct_1d_ndarray_preserving_na(
                    flat, dtype=dtype, copy=False
                )
            except Exception as err:
                # e.g. ValueError when trying to cast object dtype to float64
                msg = f"failed to cast to '{dtype}' (Exception was: {err})"
                raise ValueError(msg) from err
        else:
            # Note: we really only need _try_cast, but keeping to exposed funcs
            values = sanitize_array(
                flat, None, dtype=dtype, copy=copy, raise_cast_failure=True
            )
        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    n_rows, n_cols = values.shape[0], values.shape[1]
    index, columns = _get_axes(n_rows, n_cols, index=index, columns=columns)
    values = values.T

    # with an explicit dtype, or for non-object data, no inference is
    # attempted and the transposed array is used as the only block
    if dtype is not None or not is_object_dtype(values.dtype):
        block_values = [maybe_squeeze_dt64tz(values)]
        return create_block_manager_from_blocks(
            block_values, [columns, index]
        )

    # object data without a dtype: look for embedded datetimelikes
    if values.ndim == 2 and values.shape[0] != 1:
        # infer each row of the transposed array separately so that every
        # one can end up in its own block
        per_row = [maybe_infer_to_datetimelike(row) for row in values]
        per_row = [ensure_block_shape(item, 2) for item in per_row]

        # TODO: What about re-joining object columns?
        per_row = [maybe_squeeze_dt64tz(item) for item in per_row]
        block_values = [
            new_block(item, placement=pos, ndim=2)
            for pos, item in enumerate(per_row)
        ]
    else:
        inferred = maybe_infer_to_datetimelike(values)
        block_values = [maybe_squeeze_dt64tz(inferred)]

    return create_block_manager_from_blocks(block_values, [columns, index])
Ejemplo n.º 7
0
def init_ndarray(values, index, columns, dtype: Optional[DtypeObj],
                 copy: bool):
    """
    Turn array-like input into the manager object used by DataFrame.

    Parameters
    ----------
    values : ndarray, list, Series or Index
    index, columns : axis labels or None
        Missing labels are filled in by ``_get_axes``.
    dtype : np.dtype, ExtensionDtype or None
        Target dtype; the prepared values are cast when it differs.
    copy : bool
        Forwarded to ``_prep_ndarray``.

    Returns
    -------
    The result of ``arrays_to_mgr`` (extension input or dtype) or of
    ``create_block_manager_from_blocks`` (everything else).

    Raises
    ------
    ValueError
        If the cast to ``dtype`` fails; the original error text is embedded.
    """
    if isinstance(values, ABCSeries):
        # a named Series provides the column label when none was passed
        if columns is None and values.name is not None:
            columns = [values.name]

        if index is not None:
            values = values.reindex(index)
        else:
            index = values.index

        # zero len case (GH #2234)
        if len(values) == 0 and columns is not None and len(columns) > 0:
            values = np.empty((0, 1), dtype=object)

    if is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
        # GH#19157
        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            arrays = [values[:, n] for n in range(values.shape[1])]
        else:
            arrays = [values]

        if columns is None:
            columns = Index(range(len(arrays)))

        return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        try:
            # cast on the flattened data, then restore the 2D shape
            flat = values.ravel()
            casted = construct_1d_ndarray_preserving_na(
                flat, dtype=dtype, copy=False
            )
            values = casted.reshape(values.shape)
        except Exception as orig:
            # e.g. ValueError when trying to cast object dtype to float64
            msg = f"failed to cast to '{dtype}' (Exception was: {orig})"
            raise ValueError(msg) from orig

    # _prep_ndarray ensures that values.ndim == 2 at this point
    n_rows, n_cols = values.shape[0], values.shape[1]
    index, columns = _get_axes(n_rows, n_cols, index=index, columns=columns)
    values = values.T

    # with an explicit dtype, or for non-object data, no inference is
    # attempted and the transposed array is used as the only block
    if dtype is not None or not is_object_dtype(values.dtype):
        return create_block_manager_from_blocks([values], [columns, index])

    # object data without a dtype: look for embedded datetimelikes
    if values.ndim == 2 and values.shape[0] != 1:
        # infer each row of the transposed array separately so that every
        # one can end up in its own block
        inferred_rows = [maybe_infer_to_datetimelike(row) for row in values]
        inferred_rows = [
            row.reshape(1, -1) if isinstance(row, np.ndarray) else row
            for row in inferred_rows
        ]

        from pandas.core.internals.blocks import make_block

        # TODO: What about re-joining object columns?
        block_values = [
            make_block(row, placement=[pos], ndim=2)
            for pos, row in enumerate(inferred_rows)
        ]
    else:
        block_values = [maybe_infer_to_datetimelike(values)]

    return create_block_manager_from_blocks(block_values, [columns, index])
Ejemplo n.º 8
0
def _try_cast(
    arr: list | np.ndarray,
    dtype: DtypeObj | None,
    copy: bool,
    raise_cast_failure: bool,
) -> ArrayLike:
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        Only consulted by the final cast.  If True, a ValueError or
        TypeError raised there propagates.  If False, an object-dtype
        array is returned instead, except for the mismatched-datetimelike
        errors noted below, which always propagate.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    OutOfBoundsDatetime
        Always propagated.
    ValueError, TypeError
        Propagated when ``raise_cast_failure`` is True, or when the message
        contains "Cannot cast" or "cannot be converted to timedelta64".
    """
    is_ndarray = isinstance(arr, np.ndarray)

    if dtype is None:
        # perf shortcut as this is the most common case
        if is_ndarray:
            arr = cast(np.ndarray, arr)
            if arr.dtype != object:
                return sanitize_to_nanoseconds(arr, copy=copy)

            out = maybe_infer_to_datetimelike(arr)
            if out is arr and copy:
                out = out.copy()
            return out

        else:
            # i.e. list
            # np.asarray instead of np.array(..., copy=False): converting a
            #  list always allocates, and NumPy >= 2 raises ValueError for
            #  copy=False whenever a copy cannot be avoided.
            varr = np.asarray(arr)
            # filter out cases that we _dont_ want to go through
            #  maybe_infer_to_datetimelike
            if varr.dtype != object or varr.size == 0:
                return varr
            return maybe_infer_to_datetimelike(varr)

    elif isinstance(dtype, ExtensionDtype):
        # create an extension array from its dtype
        if isinstance(dtype, DatetimeTZDtype):
            # We can't go through _from_sequence because it handles dt64naive
            #  data differently; _from_sequence treats naive as wall times,
            #  while maybe_cast_to_datetime treats it as UTC
            #  see test_maybe_promote_any_numpy_dtype_with_datetimetz

            # error: Incompatible return value type (got "Union[ExtensionArray,
            # ndarray, List[Any]]", expected "Union[ExtensionArray, ndarray]")
            return maybe_cast_to_datetime(arr, dtype)  # type: ignore[return-value]
            # TODO: copy?

        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    elif is_object_dtype(dtype):
        if not is_ndarray:
            subarr = construct_1d_object_array_from_listlike(arr)
            return subarr
        return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)

    # NOTE: every ``dtype is None`` path has returned above, so dtype is
    #  known to be set from here on.
    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats; the call is made only
            #  for that validation, its result is intentionally unused

            maybe_cast_to_integer_array(arr, dtype)
            subarr = arr
        else:
            subarr = maybe_cast_to_datetime(arr, dtype)
            if dtype.kind == "M":
                return subarr

        if not isinstance(subarr, ABCExtensionArray):
            # 4 tests fail if we move this to a try/except/else; see
            #  test_constructor_compound_dtypes, test_constructor_cast_failure
            #  test_constructor_dict_cast2, test_loc_setitem_dtype
            subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy)

    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        raise
    except (ValueError, TypeError) as err:
        if raise_cast_failure:
            raise
        elif "Cannot cast" in str(err) or "cannot be converted to timedelta64" in str(
            err
        ):
            # via _disallow_mismatched_datetimelike
            raise
        elif copy:
            subarr = np.array(arr, dtype=object, copy=True)
        else:
            # "copy only if needed" must be spelled np.asarray so that it
            #  keeps working on NumPy >= 2 (where copy=False means "never").
            subarr = np.asarray(arr, dtype=object)
    return subarr
Ejemplo n.º 9
0
def sanitize_array(
    data,
    index: Index | None,
    dtype: DtypeObj | None = None,
    copy: bool = False,
    raise_cast_failure: bool = True,
    *,
    allow_2d: bool = False,
) -> ArrayLike:
    """
    Turn arbitrary input into an ndarray or ExtensionArray.

    The data is optionally copied and, when ``dtype`` is given, coerced to
    that dtype.

    Parameters
    ----------
    data : Any
        Masked arrays, ranges, zero-dimensional ndarrays, scalars, ndarrays,
        ExtensionArrays and other list-likes each take their own path below.
    index : Index or None, default None
        Required when ``data`` is not list-like; its length sets the length
        of the array built from the scalar.
    dtype : np.dtype, ExtensionDtype, or None, default None
    copy : bool, default False
    raise_cast_failure : bool, default True
        Forwarded to ``_try_cast``.
    allow_2d : bool, default False
        If False, raise if we have a 2D Arraylike.

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError
        If ``data`` is not list-like and ``index`` is None.
    TypeError
        If ``data`` is a ``set`` or ``frozenset``.

    Notes
    -----
    raise_cast_failure=False is only intended to be used when called from the
    DataFrame constructor, as the dtype keyword there may be interpreted as
    only applying to a subset of columns, see GH#24435.
    """
    if isinstance(data, ma.MaskedArray):
        data = sanitize_masked_array(data)

    # unwrap down to an ndarray or ExtensionArray; never keep a PandasArray
    data = extract_array(data, extract_numpy=True)

    if isinstance(data, range):
        # GH#16804
        data = range_to_ndarray(data)
        copy = False
    elif isinstance(data, np.ndarray) and data.ndim == 0:
        # zero-dim array: unbox to a scalar, remembering its dtype if the
        #  caller did not request one
        if dtype is None:
            dtype = data.dtype
        data = lib.item_from_zerodim(data)

    if not is_list_like(data):
        # scalar input: broadcast it to the length of the index
        if index is None:
            raise ValueError("index must be specified when data is not list-like")
        return construct_1d_arraylike_from_scalar(data, len(index), dtype)

    # GH#846
    if isinstance(data, np.ndarray):
        float_to_int = (
            dtype is not None
            and is_float_dtype(data.dtype)
            and is_integer_dtype(dtype)
        )
        if not float_to_int:
            # we will try to copy by-definition here
            result = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            # possibility of nan -> garbage; on failure keep the float data
            try:
                result = _try_cast(data, dtype, copy, True)
            except ValueError:
                result = np.array(data, copy=copy)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        if dtype is not None:
            return data.astype(dtype, copy=copy)
        return data.copy() if copy else data

    else:
        if isinstance(data, (set, frozenset)):
            # Raise only for unordered sets, e.g., not for dict_keys
            raise TypeError(f"'{type(data).__name__}' type is unordered")

        # materialize e.g. generators, convert e.g. tuples, abc.ValueView
        # TODO: non-standard array-likes we can convert to ndarray more efficiently?
        data = list(data)

        if dtype is None and len(data) != 0:
            # TODO: copy?
            result = maybe_convert_platform(data)
            if result.dtype == object:
                # Argument 1 to "maybe_infer_to_datetimelike" has incompatible
                # type "Union[ExtensionArray, ndarray]"; expected "ndarray"
                result = maybe_infer_to_datetimelike(result)  # type: ignore[arg-type]
        else:
            result = _try_cast(data, dtype, copy, raise_cast_failure)

    result = _sanitize_ndim(result, data, dtype, index, allow_2d=allow_2d)

    if isinstance(result.dtype, ExtensionDtype) or isinstance(dtype, ExtensionDtype):
        # extension dtypes skip the string/object post-processing below
        return result

    result = _sanitize_str_dtypes(result, data, dtype, copy)

    wants_object_or_str = is_object_dtype(dtype) or is_string_dtype(dtype)
    if is_object_dtype(result.dtype) and not wants_object_or_str:
        # object result that was not explicitly requested: upgrade
        #  interval/period contents to their dedicated array types
        if lib.infer_dtype(result, skipna=False) in {"interval", "period"}:
            result = extract_array(array(result), extract_numpy=True)

    return result
Ejemplo n.º 10
0
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    """
    Build a Manager from array-like ``values`` and optional axis labels.

    Parameters
    ----------
    values : ndarray, list, Series, Index or ExtensionArray
    index : Index or None
        Row labels. For a Series input with ``index=None`` the Series' own
        index is used; otherwise missing labels are resolved by ``_get_axes``.
    columns : Index or None
        Column labels; resolved like ``index`` when missing.
    dtype : np.dtype, ExtensionDtype or None
        When given and different from the dtype of the prepared values, the
        values are cast to it (integer targets use a separate path, see below).
    copy : bool
        Forwarded to ``_prep_ndarray`` / ``sanitize_array``; also triggers an
        explicit copy of non-1D-only extension arrays.
    typ : str
        ``"array"`` returns an ``ArrayManager``; any other value goes through
        ``create_block_manager_from_blocks``. It is also forwarded to
        ``arrays_to_mgr`` on the extension-dtype path.

    Returns
    -------
    Manager
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            # a named Series supplies the single column label
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            # conform the Series to the explicitly requested index
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # dtype of the input if it has one (lists do not), else None
    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype):
        # GH#19157

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            # default to positional column labels, one per array
            columns = Index(range(len(values)))

        # each element of ``values`` becomes one column
        return arrays_to_mgr(values,
                             columns,
                             index,
                             columns,
                             dtype=dtype,
                             typ=typ)

    if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            # promote to a single-column 2D layout
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # cast on a flattened 1D version, then restore the original shape
        shape = values.shape
        flat = values.ravel()

        if not is_integer_dtype(dtype):
            # TODO: skipping integer_dtype is needed to keep the tests passing,
            #  not clear it is correct
            # Note: we really only need _try_cast, but keeping to exposed funcs
            values = sanitize_array(flat,
                                    None,
                                    dtype=dtype,
                                    copy=copy,
                                    raise_cast_failure=True)
        else:
            try:
                values = construct_1d_ndarray_preserving_na(flat,
                                                            dtype=dtype,
                                                            copy=False)
            except IntCastingNaNError:
                # following Series, we ignore the dtype and retain floating
                # values instead of casting nans to meaningless ints
                pass

        # when the integer cast was skipped above, ``values`` is still the
        #  uncast 2D array and this reshape leaves its shape unchanged
        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0],
                               values.shape[1],
                               index=index,
                               columns=columns)

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        # ArrayManager path: build one 1D array per column

        if issubclass(values.dtype.type, str):
            # numpy str dtype is stored as object instead
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # no dtype requested: try datetimelike inference column by column
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i].copy()))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i].copy() for i in range(values.shape[1])]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # block path: transpose so the first axis runs over columns, matching the
    #  [columns, index] axes order passed to the block manager below
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):

        if values.ndim == 2 and values.shape[0] != 1:
            # transpose and separate blocks

            # each row of the transposed values is one column of the frame
            dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values]
            dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals]

            # TODO: What about re-joining object columns?
            # one block per column, placed at that column's position
            block_values = [
                new_block(dvals_list[n], placement=n, ndim=2)
                for n in range(len(dvals_list))
            ]

        else:
            # single-column (or non-2D) case: infer on the whole array and
            #  keep it as one block spanning all columns
            datelike_vals = maybe_infer_to_datetimelike(values)
            nb = new_block(datelike_vals,
                           placement=slice(len(columns)),
                           ndim=2)
            block_values = [nb]
    else:
        # dtype given or non-object values: a single block for all columns
        nb = new_block(values, placement=slice(len(columns)), ndim=2)
        block_values = [nb]

    if len(columns) == 0:
        # no columns means no blocks at all
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index])
Ejemplo n.º 11
0
def init_ndarray(values, index, columns, dtype=None, copy=False):
    """
    Build a block manager from array-like ``values``.

    Parameters
    ----------
    values : ndarray, list, Series or Index
    index : axis labels or None
        For a Series input with ``index=None`` the Series' own index is used.
    columns : axis labels or None
        For a named Series input with ``columns=None`` the Series' name
        becomes the single column label.
    dtype : dtype or None, default None
        When given and different from the dtype of the prepared values, the
        values are cast with ``astype``.
    copy : bool, default False
        Forwarded to ``prep_ndarray``; categorical input that already has a
        dtype is copied directly.

    Returns
    -------
    The result of ``arrays_to_mgr`` (categorical, datetime64tz and extension
    inputs) or of ``create_block_manager_from_blocks`` (everything else).

    Raises
    ------
    ValueError
        If casting the prepared values to ``dtype`` fails; the message
        embeds the original exception.
    """
    # input must be a ndarray, list, Series, index

    if isinstance(values, ABCSeries):
        # a named Series supplies the single column label
        if columns is None and values.name is not None:
            columns = [values.name]

        if index is not None:
            values = values.reindex(index)
        else:
            index = values.index

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # we could have a categorical type passed or coerced to 'category'
    # recast this to an arrays_to_mgr
    values_dtype = getattr(values, 'dtype', None)
    if is_categorical_dtype(values_dtype) or is_categorical_dtype(dtype):

        if not hasattr(values, 'dtype'):
            # plain list-likes: go through prep_ndarray, then flatten to 1D
            values = prep_ndarray(values, copy=copy).ravel()
        elif copy:
            values = values.copy()

        index, columns = _get_axes(len(values), 1, index, columns)
        return arrays_to_mgr([values], columns, index, columns,
                             dtype=dtype)

    if is_datetime64tz_dtype(values) or is_extension_array_dtype(values):
        # GH#19157
        if columns is None:
            columns = [0]
        return arrays_to_mgr([values], columns, index, columns,
                             dtype=dtype)

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        try:
            values = values.astype(dtype)
        except Exception as exc:
            # re-raise any casting failure as a ValueError, keeping the
            #  original traceback
            wrapped = ValueError("failed to cast to '{dtype}' (Exception "
                                 "was: {orig})".format(dtype=dtype,
                                                       orig=exc))
            raise_with_traceback(wrapped)

    index, columns = _get_axes(*values.shape, index=index, columns=columns)
    # transpose to match the [columns, index] axes order passed below
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values):
        values = maybe_infer_to_datetimelike(values)

    return create_block_manager_from_blocks([values], [columns, index])