Example #1
def _ensure_datetimelike_to_i8(other, to_utc=False):
    """
    Helper for coercing an input scalar or array to i8.

    Parameters
    ----------
    other : 1d array
    to_utc : bool, default False
        If True, convert the values to UTC before extracting the i8 values
        If False, extract the i8 values directly.

    Returns
    -------
    i8 1d array
    """
    from pandas import Index
    from pandas.core.arrays import PeriodArray

    if lib.is_scalar(other) and isna(other):
        # A missing scalar is represented by the integer NaT sentinel.
        return iNaT
    elif isinstance(other, (PeriodArray, ABCIndexClass)):
        # convert tz if needed
        if getattr(other, 'tz', None) is not None:
            if to_utc:
                other = other.tz_convert('UTC')
            else:
                other = other.tz_localize(None)
    else:
        try:
            return np.array(other, copy=False).view('i8')
        except TypeError:
            # period array cannot be coerced to int
            other = Index(other)
    return other.asi8
Example #2
def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    # mask is Union[ExtensionArray, ndarray]
    values = values[~mask]

    if len(values) == 0:
        # Nothing left after dropping the masked entries: answer with
        # ``na_value`` in the same shape as ``q``.
        if lib.is_scalar(q):
            return na_value
        else:
            return np.array([na_value] * len(q), dtype=values.dtype)

    return np.percentile(values, q, interpolation=interpolation)
Example #3
    def _evaluate_compare(self, other, op):
        """
        Compare ``self`` with ``other`` on their integer (i8) values.

        Positions where either operand is missing are overwritten after the
        comparison: with ``False`` when the result is boolean, otherwise
        with ``iNaT``.

        Parameters
        ----------
        other : scalar, list-like, or object of the same type as ``self``
            Anything that is not already ``type(self)`` is wrapped in a list
            if needed and passed to ``type(self)(...)``.
        op : callable
            Binary operator applied to the two ``asi8`` arrays.

        Returns
        -------
        The array produced by ``op``, with missing positions filled.
        """
        # Called by comparison methods when comparing datetimelike
        # with datetimelike

        if not isinstance(other, type(self)):
            # coerce to a similar object
            if not is_list_like(other):
                # scalar
                other = [other]
            elif lib.is_scalar(lib.item_from_zerodim(other)):
                # ndarray scalar
                other = [other.item()]
            other = type(self)(other)

        # compare
        result = op(self.asi8, other.asi8)

        # technically we could support bool dtyped Index
        # for now just return the indexing array directly
        mask = (self._isnan) | (other._isnan)

        filler = iNaT
        if is_bool_dtype(result):
            filler = False

        result[mask] = filler
        return result
Example #4
    def __getitem__(self, item):
        """
        Index into the backing ndarray.

        An indexer of the same type as ``self`` is unwrapped to its own
        ndarray first. Scalar results are returned as-is; anything else is
        re-wrapped in ``type(self)``.
        """
        indexer = item._ndarray if isinstance(item, type(self)) else item

        picked = self._ndarray[indexer]
        if lib.is_scalar(picked):
            return picked
        return type(self)(picked)
Example #5
    def __setitem__(self, key, value):
        """
        Assign ``value`` at ``key`` in the backing ndarray.

        If combining ``value`` with the current data needs a different dtype
        (per ``np.result_type``), the data is safely cast to it, the value is
        written into the cast copy, and ``_dtype`` / ``_ndarray`` are
        replaced. Otherwise the value is written into the existing ndarray
        in place.

        Parameters
        ----------
        key : scalar, slice or list-like indexer
        value : scalar or array-like
        """
        from pandas.core.internals.arrays import extract_array

        value = extract_array(value, extract_numpy=True)

        if not lib.is_scalar(key) and is_list_like(key):
            key = np.asarray(key)

        if not lib.is_scalar(value):
            value = np.asarray(value)

        values = self._ndarray
        t = np.result_type(value, values)
        if t != self._ndarray.dtype:
            # Upcast path: write into a cast copy, then swap it in.
            values = values.astype(t, casting='safe')
            values[key] = value
            self._dtype = PandasDtype(t)
            self._ndarray = values
        else:
            # Same dtype: without this branch the assignment would be
            # silently dropped.
            self._ndarray[key] = value
Example #6
    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        """
        Apply a NumPy ufunc to the unwrapped ndarray(s).

        Returns ``NotImplemented`` when any input or ``out`` argument is not
        one of ``self._HANDLED_TYPES`` or a ``PandasArray``. Array-like
        results are re-boxed in ``type(self)``; scalar results are returned
        unboxed, and ``method == 'at'`` returns ``None``.
        """
        # Lightly modified version of
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/\
        # numpy.lib.mixins.NDArrayOperatorsMixin.html
        # The primary modification is not boxing scalar return values
        # in PandasArray, since pandas' ExtensionArrays are 1-d.
        out = kwargs.get('out', ())
        for x in inputs + out:
            # Only support operations with instances of _HANDLED_TYPES.
            # Use PandasArray instead of type(self) for isinstance to
            # allow subclasses that don't override __array_ufunc__ to
            # handle PandasArray objects.
            if not isinstance(x, self._HANDLED_TYPES + (PandasArray,)):
                return NotImplemented

        # Defer to the implementation of the ufunc on unwrapped values.
        inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x
                       for x in inputs)
        if out:
            kwargs['out'] = tuple(
                x._ndarray if isinstance(x, PandasArray) else x
                for x in out)
        result = getattr(ufunc, method)(*inputs, **kwargs)

        if type(result) is tuple and len(result):
            # multiple return values
            if not lib.is_scalar(result[0]):
                # re-box array-like results
                return tuple(type(self)(x) for x in result)
            else:
                # but not scalar reductions
                return result
        elif method == 'at':
            # no return value
            return None
        else:
            # one return value
            if not lib.is_scalar(result):
                # re-box array-like results, but not scalar reductions
                result = type(self)(result)
            return result
Example #7
    def __getitem__(self, key):
        """
        This getitem defers to the underlying array, which by-definition can
        only handle list-likes, slices, and integer scalars

        Integer keys return a single boxed element. Other keys return a new
        object built with ``_simple_new``; ``freq`` is carried over for
        period dtypes, slices and ``Ellipsis`` (scaled by the slice step when
        one is given) and reset to ``None`` otherwise.

        Raises
        ------
        IndexError
            If ``key`` is a scalar that is not an integer.
        """

        is_int = lib.is_integer(key)
        if lib.is_scalar(key) and not is_int:
            raise IndexError("only integers, slices (`:`), ellipsis (`...`), "
                             "numpy.newaxis (`None`) and integer or boolean "
                             "arrays are valid indices")

        getitem = self._data.__getitem__
        if is_int:
            val = getitem(key)
            return self._box_func(val)

        if com.is_bool_indexer(key):
            key = np.asarray(key, dtype=bool)
            if key.all():
                # An all-True mask selects everything: use a full slice.
                key = slice(0, None, None)
            else:
                key = lib.maybe_booleans_to_slice(key.view(np.uint8))

        attribs = self._get_attributes_dict()

        is_period = is_period_dtype(self)
        if is_period:
            freq = self.freq
        else:
            freq = None
            if isinstance(key, slice):
                if self.freq is not None and key.step is not None:
                    freq = key.step * self.freq
                else:
                    freq = self.freq
            elif key is Ellipsis:
                # GH#21282 indexing with Ellipsis is similar to a full slice,
                #  should preserve `freq` attribute
                freq = self.freq

        attribs['freq'] = freq

        result = getitem(key)
        if result.ndim > 1:
            # To support MPL which performs slicing with 2 dim
            # even though it only has 1 dim by definition
            if is_period:
                return self._simple_new(result, **attribs)
            return result

        return self._simple_new(result, **attribs)
Example #8
    def wrapper(self, other):
        # Comparison wrapper: dispatches on the type of `other` and returns
        # the comparison result with missing positions set to `nat_result`.
        # NOTE(review): `opname`, `op` and `nat_result` are not defined in
        #  this block; presumably they are closure variables of an enclosing
        #  factory -- confirm against the full file.
        # NOTE(review): this excerpt does not parse as written. Several lines
        #  appear to have been lost (flagged individually below); restore
        #  them from the upstream source rather than guessing.
        meth = getattr(dtl.DatetimeLikeArrayMixin, opname)

        if isinstance(other, (datetime, np.datetime64, compat.string_types)):
            if isinstance(other, (datetime, np.datetime64)):
                # GH#18435 strings get a pass from tzawareness compat
                # NOTE(review): the body of this `if` is missing; the comment
                #  suggests a tz-awareness compatibility check -- confirm.

                # NOTE(review): a `try:` introducing the next statement
                #  appears to be missing (there is a dangling `except`).
                other = _to_m8(other, tz=self.tz)
            except ValueError:
                # string that cannot be parsed to Timestamp
                return ops.invalid_comparison(self, other, op)

            result = meth(self, other)
            # NOTE(review): the body of the following `if` is missing.
            if isna(other):
        elif lib.is_scalar(other):
            return ops.invalid_comparison(self, other, op)
            # NOTE(review): an `else:` appears to be missing here; as written
            #  everything below, up to the `o_mask` handling, is unreachable
            #  after the `return` above.
            if isinstance(other, list):
                # FIXME: This can break for object-dtype with mixed types
                other = type(self)(other)
            elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)):
                # Following Timestamp convention, __eq__ is all-False
                # and __ne__ is all True, others raise TypeError.
                return ops.invalid_comparison(self, other, op)

            if is_object_dtype(other):
                # Object dtype: compare elementwise on object arrays.
                result = op(self.astype('O'), np.array(other))
            # NOTE(review): the condition below is truncated (second operand
            #  of `or` and the closing `):` are missing).
            elif not (is_datetime64_dtype(other) or
                # e.g. is_timedelta64_dtype(other)
                return ops.invalid_comparison(self, other, op)
                # NOTE(review): an `else:` appears to be missing before the
                #  next statement.
                result = meth(self, np.asarray(other))

            result = com.values_from_object(result)

            # Make sure to pass an array to result[...]; indexing with
            # Series breaks with older version of numpy
            o_mask = np.array(isna(other))
            if o_mask.any():
                result[o_mask] = nat_result

        # Positions where `self` itself is missing also get `nat_result`.
        if self.hasnans:
            result[self._isnan] = nat_result

        return result
Example #9
    def __truediv__(self, other):
        """
        Divide this timedelta array by ``other``.

        Parameters
        ----------
        other : timedelta-like scalar, numeric scalar, or array-like

        Returns
        -------
        ``NotImplemented`` for Series/DataFrame/Index operands (so that they
        handle the operation); a float64 array of NaN when ``other`` is NaT;
        the raw ``self._data / other`` for timedelta-like scalars,
        timedelta64 arrays; an ndarray for object arrays; otherwise a new
        ``type(self)``.

        Raises
        ------
        ValueError
            If ``other`` is array-like with a different length.
        """
        # timedelta / X is well-defined for timedelta-like or numeric X
        other = lib.item_from_zerodim(other)

        if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
            return NotImplemented

        if isinstance(other, (timedelta, np.timedelta64, Tick)):
            other = Timedelta(other)
            if other is NaT:
                # specifically timedelta64-NaT
                # Fill explicitly: np.empty alone would hand back
                # uninitialised memory.
                return np.full(self.shape, np.nan, dtype=np.float64)

            # otherwise, dispatch to Timedelta implementation
            return self._data / other

        elif lib.is_scalar(other):
            # assume it is numeric
            result = self._data / other
            freq = None
            if self.freq is not None:
                # Tick division is not implemented, so operate on Timedelta
                freq = self.freq.delta / other
            return type(self)(result, freq=freq)

        if not hasattr(other, "dtype"):
            # e.g. list, tuple
            other = np.array(other)

        if len(other) != len(self):
            raise ValueError("Cannot divide vectors with unequal lengths")

        elif is_timedelta64_dtype(other):
            # let numpy handle it
            return self._data / other

        elif is_object_dtype(other):
            # Note: we do not do type inference on the result, so either
            #  an object array or numeric-dtyped (if numpy does inference)
            #  will be returned.  GH#23829
            result = [self[n] / other[n] for n in range(len(self))]
            result = np.array(result)
            return result

        else:
            result = self._data / other
            return type(self)(result)
Example #10
    def __rtruediv__(self, other):
        """
        Compute ``other / self`` for timedelta-like ``other``.

        Returns
        -------
        ``NotImplemented`` for Series/DataFrame/Index operands; a float64
        array of NaN when ``other`` is NaT; otherwise the result of dividing
        ``other`` by the underlying data.

        Raises
        ------
        TypeError
            If ``other`` is a non-timedelta scalar, or an array whose dtype
            is neither timedelta64 nor object.
        ValueError
            If ``other`` is array-like with a different length.
        """
        # X / timedelta is defined only for timedelta-like X
        other = lib.item_from_zerodim(other)

        if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
            return NotImplemented

        if isinstance(other, (timedelta, np.timedelta64, Tick)):
            other = Timedelta(other)
            if other is NaT:
                # specifically timedelta64-NaT
                # Fill explicitly: np.empty alone would hand back
                # uninitialised memory.
                return np.full(self.shape, np.nan, dtype=np.float64)

            # otherwise, dispatch to Timedelta implementation
            return other / self._data

        elif lib.is_scalar(other):
            raise TypeError("Cannot divide {typ} by {cls}"
                            .format(typ=type(other).__name__,
                                    cls=type(self).__name__))

        if not hasattr(other, "dtype"):
            # e.g. list, tuple
            other = np.array(other)

        if len(other) != len(self):
            raise ValueError("Cannot divide vectors with unequal lengths")

        elif is_timedelta64_dtype(other):
            # let numpy handle it
            return other / self._data

        elif is_object_dtype(other):
            # Note: unlike in __truediv__, we do not _need_ to do type
            #  inference on the result.  It does not raise, a numeric array
            #  is returned.  GH#23829
            result = [other[n] / self[n] for n in range(len(self))]
            return np.array(result)

        else:
            raise TypeError("Cannot divide {dtype} data by {cls}"
                            .format(dtype=other.dtype,
                                    cls=type(self).__name__))
Example #11
    def __rtruediv__(self, other):
        """
        Compute ``other / self`` for timedelta-like ``other``.

        Returns
        -------
        A float64 array of NaN when ``other`` is NaT; otherwise the result
        of dividing ``other`` by the underlying ndarray.

        Raises
        ------
        TypeError
            If ``other`` is a scalar that is not one of
            ``self._recognized_scalars``, or an array whose dtype is neither
            timedelta64 nor object.
        ValueError
            If ``other`` is array-like with a different length.
        """
        # X / timedelta is defined only for timedelta-like X
        if isinstance(other, self._recognized_scalars):
            other = Timedelta(other)
            if other is NaT:
                # specifically timedelta64-NaT
                # Fill explicitly: np.empty alone would hand back
                # uninitialised memory.
                return np.full(self.shape, np.nan, dtype=np.float64)

            # otherwise, dispatch to Timedelta implementation
            return other / self._ndarray

        elif lib.is_scalar(other):
            raise TypeError(
                f"Cannot divide {type(other).__name__} by {type(self).__name__}"
            )

        if not hasattr(other, "dtype"):
            # e.g. list, tuple
            other = np.array(other)

        if len(other) != len(self):
            raise ValueError("Cannot divide vectors with unequal lengths")

        elif is_timedelta64_dtype(other.dtype):
            # let numpy handle it
            return other / self._ndarray

        elif is_object_dtype(other.dtype):
            # Note: unlike in __truediv__, we do not _need_ to do type
            #  inference on the result.  It does not raise, a numeric array
            #  is returned.  GH#23829
            result = [other[n] / self[n] for n in range(len(self))]
            return np.array(result)

        else:
            raise TypeError(
                f"Cannot divide {other.dtype} data by {type(self).__name__}"
            )
Example #12
def masked_rec_array_to_mgr(data, index, columns, dtype: Optional[DtypeObj],
                            copy: bool):
    """
    Extract from a masked rec array and create the manager.

    Masked entries of each field are replaced by that field's fill value
    (upcasting the column where needed) before the manager is built.

    Parameters
    ----------
    data : masked record array
    index : index-like or None
        When None, taken from ``get_names_from_index`` or, failing that, a
        default index of ``len(data)``.
    columns : index-like or None
        When None, the column labels extracted from ``data`` are used.
    dtype : dtype or None
    copy : bool
        If True, return a copy of the manager.
    """
    # essentially process a record array then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        # TODO: numpy docs suggest fv must be scalar, but could it be
        #  non-scalar for object dtype?
        assert lib.is_scalar(fv), fv
        mask = ma.getmaskarray(data[col])
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        # Collect every column, filled or not; without this the manager
        # below would be built from an empty list.
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr
Example #13
        def logical_method(self, other):
            """
            Apply a Kleene-logic ``or`` / ``and`` / ``xor`` with ``other``.

            Returns ``NotImplemented`` for DataFrame/Series/Index operands.

            Raises
            ------
            NotImplementedError
                If a list-like ``other`` has more than one dimension.
            TypeError
                If a scalar ``other`` is neither ``pandas.NA`` nor a bool.
            ValueError
                If a non-scalar ``other`` has a different length.
            """
            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
                # Rely on pandas to unbox and dispatch to us.
                return NotImplemented

            assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
            other = lib.item_from_zerodim(other)
            other_is_booleanarray = isinstance(other, BooleanArray)
            other_is_scalar = lib.is_scalar(other)
            mask = None

            if other_is_booleanarray:
                other, mask = other._data, other._mask
            elif is_list_like(other):
                other = np.asarray(other, dtype="bool")
                if other.ndim > 1:
                    raise NotImplementedError(
                        "can only perform ops with 1-d structures"
                    )
                other, mask = coerce_to_array(other, copy=False)
            elif isinstance(other, np.bool_):
                other = other.item()

            if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)):
                raise TypeError(
                    "'other' should be pandas.NA or a bool. "
                    f"Got {type(other).__name__} instead."
                )

            if not other_is_scalar and len(self) != len(other):
                raise ValueError("Lengths must match to compare")

            # The assert above guarantees exactly one of these branches runs.
            if op.__name__ in {"or_", "ror_"}:
                result, mask = ops.kleene_or(self._data, other, self._mask, mask)
            elif op.__name__ in {"and_", "rand_"}:
                result, mask = ops.kleene_and(self._data, other, self._mask, mask)
            elif op.__name__ in {"xor", "rxor"}:
                result, mask = ops.kleene_xor(self._data, other, self._mask, mask)

            return BooleanArray(result, mask)
Example #14
def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
    """
    isna check that excludes incompatible dtypes

    Parameters
    ----------
    obj : object
    dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype

    Returns
    -------
    bool
        False unless ``obj`` is a missing scalar; otherwise whether that
        particular missing value is acceptable for ``dtype``.
    """
    if not lib.is_scalar(obj) or not isna(obj):
        return False
    elif dtype.kind == "M":
        if isinstance(dtype, np.dtype):
            # i.e. not tzaware
            return not isinstance(obj, (np.timedelta64, Decimal))
        # we have to rule out tznaive dt64("NaT")
        return not isinstance(obj, (np.timedelta64, np.datetime64, Decimal))
    elif dtype.kind == "m":
        return not isinstance(obj, (np.datetime64, Decimal))
    elif dtype.kind in ["i", "u", "f", "c"]:
        # Numeric
        return obj is not NaT and not isinstance(
            obj, (np.datetime64, np.timedelta64))

    elif dtype == np.dtype("object"):
        # This is needed for Categorical, but is kind of weird
        return True

    elif isinstance(dtype, PeriodDtype):
        return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))

    elif isinstance(dtype, IntervalDtype):
        return lib.is_float(obj) or obj is None or obj is libmissing.NA

    # fallback, default to allowing NaN, None, NA, NaT
    return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
Example #15
    def __getitem__(
        self: NDArrayBackedExtensionArrayT,
        key: PositionalIndexer2D,
    ) -> NDArrayBackedExtensionArrayT | Any:
        """
        Index into the backing ndarray.

        Integer keys take a fast path. A scalar result is boxed with
        ``_box_func``; any other result is wrapped with
        ``_from_backing_data``.
        """
        if lib.is_integer(key):
            # fast-path
            picked = self._ndarray[key]
            if self.ndim != 1:
                return self._from_backing_data(picked)
            return self._box_func(picked)

        # error: Incompatible types in assignment (expression has type "ExtensionArray",
        # variable has type "Union[int, slice, ndarray]")
        key = extract_array(key, extract_numpy=True)  # type: ignore[assignment]
        key = check_array_indexer(self, key)

        picked = self._ndarray[key]
        if not lib.is_scalar(picked):
            return self._from_backing_data(picked)
        return self._box_func(picked)
Example #16
def _sanitize_str_dtypes(result: np.ndarray, data, dtype: Optional[DtypeObj],
                         copy: bool) -> np.ndarray:
    """
    Ensure we have a dtype that is supported by pandas.

    When ``result`` has a NumPy string dtype and ``data`` is not a scalar,
    ``result`` is rebuilt from ``data`` as an object-dtype array; otherwise
    ``result`` is returned unchanged.

    Parameters
    ----------
    result : np.ndarray
    data : scalar or array-like
    dtype : dtype or None
    copy : bool
        Passed to ``np.array`` when building the object array.

    Returns
    -------
    np.ndarray
    """

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(result.dtype.type, str):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, result has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                # np.asarray is the "copy only if needed" spelling that works
                # on both NumPy 1.x and 2.x.
                # error: Argument "dtype" to "array" has incompatible type
                # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any],
                # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
                # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
                data = np.asarray(data, dtype=dtype)  # type: ignore[arg-type]
            result = np.array(data, dtype=object, copy=copy)
    return result
Example #17
def na_logical_op(x: np.ndarray, y, op):
    """
    Apply the logical operator ``op`` to ``x`` and ``y``.

    ``op(x, y)`` is tried first. If that raises ``TypeError`` the operation
    is redone elementwise on object arrays (array ``y``) or against a scalar
    (scalar ``y``, coerced to ``bool`` unless it is missing).

    Returns
    -------
    The result reshaped to ``x.shape``.

    Raises
    ------
    TypeError
        If the scalar fallback also fails.
    """
    try:
        # For exposition, write:
        #  yarr = isinstance(y, np.ndarray)
        #  yint = is_integer(y) or (yarr and y.dtype.kind == "i")
        #  ybool = is_bool(y) or (yarr and y.dtype.kind == "b")
        #  xint = x.dtype.kind == "i"
        #  xbool = x.dtype.kind == "b"
        # Then Cases where this goes through without raising include:
        #  (xint or xbool) and (yint or bool)
        result = op(x, y)
    except TypeError:
        if isinstance(y, np.ndarray):
            # bool-bool dtype operations should be OK, should not get here
            assert not (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype))
            x = ensure_object(x)
            y = ensure_object(y)
            result = libops.vec_binop(x.ravel(), y.ravel(), op)
        else:
            # let null fall thru
            assert lib.is_scalar(y)
            if not isna(y):
                y = bool(y)
            try:
                result = libops.scalar_binop(x, y, op)
            # NOTE(review): the exception tuple was truncated in the source
            #  and is reconstructed here -- confirm against upstream.
            except (
                TypeError,
                ValueError,
                AttributeError,
                OverflowError,
                NotImplementedError,
            ) as err:
                typ = type(y).__name__
                raise TypeError(
                    f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array "
                    f"and scalar of type [{typ}]"
                ) from err

    return result.reshape(x.shape)
Example #18
def is_valid_nat_for_dtype(obj, dtype):
    """
    isna check that excludes incompatible dtypes

    Parameters
    ----------
    obj : object
    dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype

    Returns
    -------
    bool
        False unless ``obj`` is a missing scalar; otherwise whether that
        particular missing value is acceptable for ``dtype``.
    """
    if not lib.is_scalar(obj) or not isna(obj):
        return False
    if dtype.kind == "M":
        return not isinstance(obj, np.timedelta64)
    if dtype.kind == "m":
        return not isinstance(obj, np.datetime64)

    # must be PeriodDType
    return not isinstance(obj, (np.datetime64, np.timedelta64))
Example #19
def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if not lib.is_scalar(mask) and mask.any():
        if ndim == 1:
            return _nanpercentile_1d(values, mask, q, na_value,
                                     interpolation=interpolation)
        else:
            # for nonconsolidatable blocks mask is 1D, but values 2D
            if mask.ndim < values.ndim:
                mask = mask.reshape(values.shape)
            if axis == 0:
                values = values.T
                mask = mask.T
            # One 1-d percentile per row, each with its own mask row.
            result = [_nanpercentile_1d(val, m, q, na_value,
                                        interpolation=interpolation)
                      for (val, m) in zip(values, mask)]
            result = np.asarray(result, dtype=values.dtype).T
            return result
    else:
        # Nothing is masked: plain np.percentile is sufficient.
        return np.percentile(values, q, axis=axis, interpolation=interpolation)
Example #21
def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    # mask is Union[ExtensionArray, ndarray]
    if values.dtype.kind == "m":
        # need to cast to integer to avoid rounding errors in numpy
        result = _nanpercentile_1d(values.view("i8"), mask, q, na_value,
                                   interpolation)

        # Note: we have to do `astype` and not view because in general we
        #  have float result at this point, not i8
        return result.astype(values.dtype)

    values = values[~mask]

    if len(values) == 0:
        # Nothing left after dropping the masked entries: answer with
        # ``na_value`` in the same shape as ``q``.
        if lib.is_scalar(q):
            return na_value
        else:
            return np.array([na_value] * len(q), dtype=values.dtype)

    return np.percentile(values, q, interpolation=interpolation)
Example #22
    def __getitem__(
            self: NDArrayBackedExtensionArrayT, key: int | slice | np.ndarray
    ) -> NDArrayBackedExtensionArrayT | Any:
        """
        Index into the backing ndarray.

        Integer keys take a fast path. A scalar result is boxed with
        ``_box_func``; any other result is wrapped with
        ``_from_backing_data``.
        """
        if lib.is_integer(key):
            # fast-path
            picked = self._ndarray[key]
            if self.ndim != 1:
                return self._from_backing_data(picked)
            return self._box_func(picked)

        # error: Value of type variable "AnyArrayLike" of "extract_array" cannot be
        # "Union[int, slice, ndarray]"
        # error: Incompatible types in assignment (expression has type "ExtensionArray",
        # variable has type "Union[int, slice, ndarray]")
        key = extract_array(  # type: ignore[type-var,assignment]
            key, extract_numpy=True)
        key = check_array_indexer(self, key)

        picked = self._ndarray[key]
        if not lib.is_scalar(picked):
            return self._from_backing_data(picked)
        return self._box_func(picked)
Example #23
    def _reconstruct(result):
        # Turn a raw ufunc result back into a pandas object where possible.
        # NOTE(review): `self`, `method` and `alignable` are not defined in
        #  this block; presumably they are closure variables of an enclosing
        #  function -- confirm against the full file.
        # NOTE(review): this excerpt does not parse as written; the two
        #  `self._constructor(result,` calls are truncated and an `else:`
        #  appears to be missing between them. Restore from upstream rather
        #  than guessing the missing arguments.
        if lib.is_scalar(result):
            # Scalars (e.g. reductions) are returned untouched.
            return result

        if result.ndim != self.ndim:
            # Dimensionality changed, so the result is returned as-is
            # (or rejected for "outer" on non-2D objects).
            if method == "outer":
                if self.ndim == 2:
                    # we already deprecated for Series
                    msg = ("outer method for ufunc {} is not implemented on "
                           "pandas objects. Returning an ndarray, but in the "
                           "future this will raise a 'NotImplementedError'. "
                           "Consider explicitly converting the DataFrame "
                           "to an array with '.to_numpy()' first.")
                    # NOTE(review): `msg` is built but never used here; a
                    #  warning call was presumably lost -- confirm.
                    return result
                raise NotImplementedError
            return result
        if isinstance(result, BlockManager):
            # we went through BlockManager.apply e.g. np.sqrt
            result = self._constructor(result,
            # we converted an array, lost our axes
            result = self._constructor(result,
        # TODO: When we support multiple values in __finalize__, this
        # should pass alignable to `__finalize__` instead of self.
        # Then `np.add(a, b)` would consider attrs from both a and b
        # when a and b are NDFrames.
        if len(alignable) == 1:
            result = result.__finalize__(self)
        return result
Example #24
    def _logical_method(self, other, op):
        """
        Apply a Kleene-logic ``or`` / ``and`` / ``xor`` with ``other``.

        Raises
        ------
        NotImplementedError
            If a list-like ``other`` has more than one dimension.
        TypeError
            If a scalar ``other`` is neither ``pandas.NA`` nor a bool.
        ValueError
            If a non-scalar ``other`` has a different length.
        """

        assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
        other_is_booleanarray = isinstance(other, BooleanArray)
        other_is_scalar = lib.is_scalar(other)
        mask = None

        if other_is_booleanarray:
            other, mask = other._data, other._mask
        elif is_list_like(other):
            other = np.asarray(other, dtype="bool")
            if other.ndim > 1:
                raise NotImplementedError(
                    "can only perform ops with 1-d structures")
            other, mask = coerce_to_array(other, copy=False)
        elif isinstance(other, np.bool_):
            other = other.item()

        if other_is_scalar and other is not libmissing.NA and not lib.is_bool(
                other):
            raise TypeError("'other' should be pandas.NA or a bool. "
                            f"Got {type(other).__name__} instead.")

        if not other_is_scalar and len(self) != len(other):
            raise ValueError("Lengths must match to compare")

        # The assert above guarantees exactly one of these branches runs.
        if op.__name__ in {"or_", "ror_"}:
            result, mask = ops.kleene_or(self._data, other, self._mask, mask)
        elif op.__name__ in {"and_", "rand_"}:
            result, mask = ops.kleene_and(self._data, other, self._mask, mask)
        elif op.__name__ in {"xor", "rxor"}:
            result, mask = ops.kleene_xor(self._data, other, self._mask, mask)

        # error: Argument 2 to "BooleanArray" has incompatible type "Optional[Any]";
        # expected "ndarray"
        return BooleanArray(result, mask)  # type: ignore[arg-type]
Example #25
def sanitize_array(data, index, dtype=None, copy=False,
                   raise_cast_failure=False):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.

    NOTE(review): the signature was truncated in the source. The parameter
    names are the ones the body uses; the defaults are reconstructed and
    should be confirmed against upstream.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)

            # possibility of nan -> garbage
            if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
                try:
                    subarr = _try_cast(data, True, dtype, copy, True)
                except ValueError:
                    if copy:
                        subarr = data.copy()
            else:
                subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)
        elif isinstance(data, Index):
            # don't coerce Index types
            # e.g. indexes can have different conversions (so don't fast path
            # them)
            # GH#6140
            subarr = sanitize_index(data, index, copy=copy)
        else:

            # we will try to copy by-definition here
            subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)

    elif isinstance(data, ExtensionArray):
        if isinstance(data, ABCPandasArray):
            # We don't want to let people put our PandasArray wrapper
            # (the output of Series/Index.array), into a Series. So
            # we explicitly unwrap it here.
            subarr = data.to_numpy()
        else:
            subarr = data

        # everything else in this block must also handle ndarray's,
        # because we've unwrapped PandasArray into an ndarray.

        if dtype is not None:
            subarr = data.astype(dtype)

        # NOTE(review): with both ``dtype`` and ``copy`` set, this overwrites
        #  the cast result with an un-cast copy of ``data`` -- looks
        #  unintended; left as in the source pending confirmation.
        if copy:
            subarr = data.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            try:
                subarr = _try_cast(data, False, dtype, copy,
                                   raise_cast_failure)
            except Exception:
                if raise_cast_failure:  # pragma: no cover
                    raise
                # Best-effort fallback: keep the data as objects and let
                # pandas infer a better dtype.
                subarr = np.array(data, dtype=object, copy=copy)
                subarr = lib.maybe_convert_objects(subarr)

        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        start, stop, step = get_range_parameters(data)
        arr = np.arange(start, stop, step, dtype='int64')
        subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, 'ndim', 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception('Data must be 1-dimensional')
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(subarr.dtype.type, compat.string_types):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, subarr has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                data = np.array(data, dtype=dtype, copy=False)
            subarr = np.array(data, dtype=object, copy=copy)

    if is_object_dtype(subarr.dtype) and dtype != 'object':
        inferred = lib.infer_dtype(subarr, skipna=False)
        if inferred == 'period':
            try:
                subarr = period_array(subarr)
            except IncompatibleFrequency:
                # Mixed frequencies: keep the object array as it is.
                pass

    return subarr
Example #26
Example #27
def dispatch_to_series(left, right, func, str_rep=None, axis=None):
    """
    Evaluate the frame operation func(left, right) by evaluating
    column-by-column, dispatching to the Series implementation.

    Parameters
    ----------
    left : DataFrame
    right : scalar, DataFrame or Series
    func : arithmetic or comparison operator
    str_rep : str or None, default None
        Forwarded to ``get_array_op`` and ``expressions.evaluate``.
    axis : {None, 0, 1, "index", "columns"}
        Only the value ``"columns"`` is treated specially: a Series
        ``right`` is then matched element-by-element against the columns
        of ``left``.

    Returns
    -------
    object
        For a scalar ``right``, ``type(left)`` built from the block-wise
        result; otherwise whatever ``expressions.evaluate`` returns for
        the selected column-wise operation.

    Raises
    ------
    NotImplementedError
        If ``right`` is neither a scalar, a DataFrame nor a Series.
    """
    # Note: we use iloc to access columns for compat with cases
    #       with non-unique columns.
    import pandas.core.computation.expressions as expressions

    right = lib.item_from_zerodim(right)
    if lib.is_scalar(right) or np.ndim(right) == 0:

        # Get the appropriate array-op to apply to each block's values.
        array_op = get_array_op(func, str_rep=str_rep)
        bm = left._data.apply(array_op, right=right)
        return type(left)(bm)

    elif isinstance(right, ABCDataFrame):
        assert right._indexed_same(left)

        def column_op(a, b):
            return {
                i: func(a.iloc[:, i], b.iloc[:, i])
                for i in range(len(a.columns))
            }

    elif isinstance(right, ABCSeries) and axis == "columns":
        # We only get here if called via _combine_series_frame,
        # in which case we specifically want to operate row-by-row
        assert right.index.equals(left.columns)

        if right.dtype == "timedelta64[ns]":
            # ensure we treat NaT values as the correct dtype
            # Note: we do not do this unconditionally as it may be lossy or
            #  expensive for EA dtypes.
            right = np.asarray(right)

            def column_op(a, b):
                # b is an ndarray here, so plain positional indexing is used
                return {
                    i: func(a.iloc[:, i], b[i])
                    for i in range(len(a.columns))
                }

        else:

            def column_op(a, b):
                return {
                    i: func(a.iloc[:, i], b.iloc[i])
                    for i in range(len(a.columns))
                }

    elif isinstance(right, ABCSeries):
        assert right.index.equals(left.index)  # Handle other cases later

        def column_op(a, b):
            return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}

    else:
        # Remaining cases have less-obvious dispatch rules
        raise NotImplementedError(right)

    new_data = expressions.evaluate(column_op, str_rep, left, right)
    return new_data
Example #28
    def __init__(
        # Build the block manager for this sparse frame from `data` and hand it
        # to generic.NDFrame.__init__.
        #
        # NOTE(review): the parameter list is missing from this excerpt. The body
        # reads data, index, columns, default_kind, default_fill_value and dtype,
        # so those are presumably the parameters -- confirm against the full
        # source.
        if not is_scalar(default_fill_value):
            raise ValueError("'default_fill_value' must be a scalar")

        # The class is deprecated: every construction emits a FutureWarning.
        warnings.warn(depr_msg, FutureWarning, stacklevel=2)
        # pick up the defaults from the Sparse structures
        if isinstance(data, SparseDataFrame):
            # Arguments not supplied by the caller are inherited from the
            # frame being wrapped.
            if index is None:
                index = data.index
            if columns is None:
                columns = data.columns
            if default_fill_value is None:
                default_fill_value = data.default_fill_value
            if default_kind is None:
                default_kind = data.default_kind
        elif isinstance(data, (SparseSeries, SparseArray)):
            if index is None:
                index = data.index
            if default_fill_value is None:
                default_fill_value = data.fill_value
            if columns is None and hasattr(data, "name"):
                columns = [data.name]
            if columns is None:
                raise Exception("cannot pass a series w/o a name or columns")
            # Wrap the single series/array as a one-column mapping so the dict
            # branch below handles it.
            data = {columns[0]: data}

        # Final fallbacks when neither the caller nor `data` supplied a value.
        if default_fill_value is None:
            default_fill_value = np.nan
        if default_kind is None:
            default_kind = "block"

        self._default_kind = default_kind
        self._default_fill_value = default_fill_value

        # Dispatch on the type of `data` to build the manager.
        # NOTE(review): several calls below end on a trailing comma without a
        # closing parenthesis; their remaining arguments appear to be missing
        # from this excerpt.
        if is_scipy_sparse(data):
            mgr = self._init_spmatrix(data,
        elif isinstance(data, dict):
            mgr = self._init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, index, columns, dtype=dtype)
        elif isinstance(data, SparseDataFrame):
            mgr = self._init_mgr(data._data,
                                 dict(index=index, columns=columns),
        elif isinstance(data, DataFrame):
            mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
        elif isinstance(data, Series):
            mgr = self._init_dict(data.to_frame(),
        elif isinstance(data, BlockManager):
            mgr = self._init_mgr(data,
                                 axes=dict(index=index, columns=columns),
        elif data is None:
            data = DataFrame()

            # NOTE(review): the two assignments under each `is None` check
            # below look like the arms of an if/else whose `else:` line is
            # missing -- confirm.
            if index is None:
                index = Index([])
                index = ensure_index(index)

            if columns is None:
                columns = Index([])
                for c in columns:
                    data[c] = SparseArray(
            mgr = to_manager(data, columns, index)
            if dtype is not None:
                mgr = mgr.astype(dtype)
            # NOTE(review): presumably this error belongs to a final `else:`
            # branch for unsupported `data` types -- confirm.
            msg = ('SparseDataFrame called with unknown type "{data_type}" '
                   "for data argument")
            raise TypeError(msg.format(data_type=type(data).__name__))

        generic.NDFrame.__init__(self, mgr)
Example #29
def dispatch_to_series(left, right, func, str_rep=None, axis=None):
    """
    Evaluate the frame operation func(left, right) by evaluating
    column-by-column, dispatching to the Series implementation.

    Parameters
    ----------
    left : DataFrame
    right : scalar, DataFrame or Series
    func : arithmetic or comparison operator
    str_rep : str or None, default None
        Forwarded to ``expressions.evaluate``.
    axis : {None, 0, 1, "index", "columns"}
        Only the value ``"columns"`` is treated specially: a Series
        ``right`` is then matched element-by-element against the columns
        of ``left``.

    Returns
    -------
    object
        ``left._constructor`` applied to the column-wise results, indexed
        like ``left`` and with ``left.columns`` pinned afterwards.

    Raises
    ------
    NotImplementedError
        If ``right`` is neither a scalar, a DataFrame nor a Series.
    """
    # Note: we use iloc to access columns for compat with cases
    #       with non-unique columns.
    import pandas.core.computation.expressions as expressions

    right = lib.item_from_zerodim(right)
    if lib.is_scalar(right) or np.ndim(right) == 0:

        def column_op(a, b):
            return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}

    elif isinstance(right, ABCDataFrame):
        assert right._indexed_same(left)

        def column_op(a, b):
            return {
                i: func(a.iloc[:, i], b.iloc[:, i])
                for i in range(len(a.columns))
            }

    elif isinstance(right, ABCSeries) and axis == "columns":
        # We only get here if called via left._combine_match_columns,
        # in which case we specifically want to operate row-by-row
        assert right.index.equals(left.columns)

        def column_op(a, b):
            return {
                i: func(a.iloc[:, i], b.iloc[i])
                for i in range(len(a.columns))
            }

    elif isinstance(right, ABCSeries):
        assert right.index.equals(left.index)  # Handle other cases later

        def column_op(a, b):
            return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}

    else:
        # Remaining cases have less-obvious dispatch rules
        raise NotImplementedError(right)

    new_data = expressions.evaluate(column_op, str_rep, left, right)

    result = left._constructor(new_data, index=left.index, copy=False)
    # Pin columns instead of passing to constructor for compat with
    # non-unique columns case
    result.columns = left.columns
    return result
def dispatch_to_series(left, right, func, axis=None):
    # NOTE(review): the text down to the `axis` line reads like a docstring
    # whose delimiters and section headers are missing from this excerpt.
    Evaluate the frame operation func(left, right) by evaluating
    column-by-column, dispatching to the Series implementation.

    left : DataFrame
    right : scalar or DataFrame
    func : arithmetic or comparison operator
    axis : {None, 0, 1, "index", "columns"}

    # Get the appropriate array-op to apply to each column/block's values.
    array_op = get_array_op(func)

    # Unwrap zero-dimensional input first so it takes the scalar path.
    right = lib.item_from_zerodim(right)
    if lib.is_scalar(right) or np.ndim(right) == 0:
        # Scalar: apply the op through the block manager and rebuild the frame.
        bm = left._mgr.apply(array_op, right=right)
        return type(left)(bm)

    elif isinstance(right, ABCDataFrame):
        assert left.index.equals(right.index)
        assert left.columns.equals(right.columns)
        # TODO: The previous assertion `assert right._indexed_same(left)`
        #  fails in cases with empty columns reached via
        #  _frame_arith_method_with_reindex

        bm = left._mgr.operate_blockwise(right._mgr, array_op)
        return type(left)(bm)

    elif isinstance(right, ABCSeries) and axis == 1:
        # axis=1 means we want to operate row-by-row
        assert right.index.equals(left.columns)

        if right.dtype == "timedelta64[ns]":
            # ensure we treat NaT values as the correct dtype
            # Note: we do not do this unconditionally as it may be lossy or
            #  expensive for EA dtypes.
            right = np.asarray(right)
            # NOTE(review): the next three lines contradict the ndarray
            # conversion above, so an `else:` line is presumably missing
            # here -- confirm.
            right = right._values
            # maybe_align_as_frame ensures we do not have an ndarray here
            assert not isinstance(right, np.ndarray)

        # Pair each column array of `left` with the matching element of
        # `right`.
        # NOTE(review): the closing bracket of this list is not visible.
        arrays = [
            array_op(l, r) for l, r in zip(left._iter_column_arrays(), right)

    elif isinstance(right, ABCSeries):
        assert right.index.equals(left.index)  # Handle other cases later
        right = right._values

        # The same right-hand values are applied to every column of `left`.
        arrays = [array_op(l, right) for l in left._iter_column_arrays()]

        # NOTE(review): presumably a final `else:` branch; as written the
        # raise would follow the list comprehension unconditionally.
        # Remaining cases have less-obvious dispatch rules
        raise NotImplementedError(right)

    # NOTE(review): the remaining arguments of this call are missing from the
    # excerpt.
    return type(left)._from_arrays(arrays,
Example #31
    def _validate_setitem_value(self, value):
        value = extract_array(value, extract_numpy=True)

        if not lib.is_scalar(value):
            value = np.asarray(value, dtype=self._ndarray.dtype)
        return value
Example #32
def sanitize_array(data, index, dtype=None, copy=False,
    # NOTE(review): the end of the signature is missing from this excerpt. The
    # body reads `raise_cast_failure`, so that is presumably the remaining
    # parameter -- confirm. The two lines below read like a docstring that has
    # lost its delimiters.
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            # Upcast so the fill value fits, then overwrite the masked slots.
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
            # NOTE(review): presumably the `else:` arm (nothing masked) --
            # confirm.
            data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)

            # possibility of nan -> garbage
            if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
                    # NOTE(review): the indentation and the `except` below
                    # imply a missing `try:`; the call itself is truncated.
                    subarr = _try_cast(data, True, dtype, copy,
                except ValueError:
                    if copy:
                        subarr = data.copy()
                subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)
        elif isinstance(data, Index):
            # don't coerce Index types
            # e.g. indexes can have different conversions (so don't fast path
            # them)
            # GH#6140
            subarr = sanitize_index(data, index, copy=copy)

            # we will try to copy by definition here
            subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)

    elif isinstance(data, ExtensionArray):
        if isinstance(data, ABCPandasArray):
            # We don't want to let people put our PandasArray wrapper
            # (the output of Series/Index.array), into a Series. So
            # we explicitly unwrap it here.
            subarr = data.to_numpy()
            # NOTE(review): presumably the `else:` arm -- as written this
            # overwrites the unwrapped array. Confirm.
            subarr = data

        # everything else in this block must also handle ndarray's,
        # because we've unwrapped PandasArray into an ndarray.

        if dtype is not None:
            subarr = data.astype(dtype)

        if copy:
            subarr = data.copy()
        # Extension arrays leave here and skip the ndim handling below.
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
                # NOTE(review): a `try:` line and the end of this call appear
                # to be missing.
                subarr = _try_cast(data, False, dtype, copy,
            except Exception:
                if raise_cast_failure:  # pragma: no cover
                # NOTE(review): the body of the `if` above is not visible.
                subarr = np.array(data, dtype=object, copy=copy)
                subarr = lib.maybe_convert_objects(subarr)

            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        start, stop, step = get_range_parameters(data)
        arr = np.arange(start, stop, step, dtype='int64')
        subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
        # NOTE(review): presumably the final `else:` arm for all other inputs
        # -- confirm.
        subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, 'ndim', 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            # Repeat the scalar to the length of the index.
            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

            # NOTE(review): as written this return is reached on the path
            # with an index, after the array has just been built; presumably
            # it belongs to a missing `else:` -- confirm.
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception('Data must be 1-dimensional')
            # NOTE(review): unreachable after the raise as written;
            # presumably the `else:` arm for non-ndarray input.
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(subarr.dtype.type, compat.string_types):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, subarr has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                data = np.array(data, dtype=dtype, copy=False)
            subarr = np.array(data, dtype=object, copy=copy)

    if is_object_dtype(subarr.dtype) and dtype != 'object':
        inferred = lib.infer_dtype(subarr, skipna=False)
        if inferred == 'period':
                # NOTE(review): a `try:` line and the body of the `except`
                # are missing from this excerpt.
                subarr = period_array(subarr)
            except IncompatibleFrequency:

    return subarr
def sanitize_array(
    # NOTE(review): the body reads a `data` argument that is not declared in
    # this excerpt; presumably it is the first parameter -- confirm.
    index: Optional["Index"],
    dtype: Optional[DtypeObj] = None,
    copy: bool = False,
    raise_cast_failure: bool = False,
) -> ArrayLike:
    # NOTE(review): the next two lines read like a docstring that has lost
    # its delimiters.
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            # Upcast so the fill value fits, then overwrite the masked slots.
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
            # NOTE(review): presumably the `else:` arm (nothing masked) --
            # confirm.
            data = data.copy()

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(
                data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
                # NOTE(review): the indentation and the `except` below imply
                # a missing `try:` line.
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                if copy:
                    subarr = data.copy()
                    # NOTE(review): presumably the `else:` arm of `if copy`.
                    subarr = np.array(data, copy=False)
            # we will try to copy by definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        # Extension arrays leave here and skip the ndim handling below.
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
            # NOTE(review): presumably the `else:` arm (no dtype requested).
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype="int64")
        subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
    elif isinstance(data, abc.Set):
        raise TypeError("Set type is unordered")
    elif lib.is_scalar(data) and index is not None and dtype is not None:
        data = maybe_cast_to_datetime(data, dtype)
        if not lib.is_scalar(data):
            data = data[0]
        # Repeat the scalar to the length of the index.
        subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype)
        # NOTE(review): presumably the final `else:` arm for all other inputs
        # -- confirm.
        subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, "ndim", 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

            # NOTE(review): as written this return is reached on the path
            # with an index, after the array has just been built; presumably
            # it belongs to a missing `else:` -- confirm.
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception("Data must be 1-dimensional")
            # NOTE(review): unreachable after the raise as written;
            # presumably the `else:` arm for non-ndarray input.
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    # The string/object post-processing below is skipped when either the
    # result or the requested dtype is an extension dtype.
    if not (is_extension_array_dtype(subarr.dtype)
            or is_extension_array_dtype(dtype)):
        # This is to prevent mixed-type Series getting all casted to
        # NumPy string type, e.g. NaN --> '-1#IND'.
        if issubclass(subarr.dtype.type, str):
            # GH#16605
            # If not empty convert the data to dtype
            # GH#19853: If data is a scalar, subarr has already the result
            if not lib.is_scalar(data):
                if not np.all(isna(data)):
                    data = np.array(data, dtype=dtype, copy=False)
                subarr = np.array(data, dtype=object, copy=copy)

        if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype):
            # Object arrays inferred as interval or period values are passed
            # to `array` for conversion.
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred in {"interval", "period"}:
                subarr = array(subarr)

    return subarr
Example #34
    def __truediv__(self, other):
        # timedelta / X is well-defined for timedelta-like or numeric X

        if isinstance(other, self._recognized_scalars):
            other = Timedelta(other)
            # mypy assumes that __new__ returns an instance of the class
            # github.com/python/mypy/issues/1020
            if cast("Timedelta | NaTType", other) is NaT:
                # specifically timedelta64-NaT
                result = np.empty(self.shape, dtype=np.float64)
                return result

            # otherwise, dispatch to Timedelta implementation
            return self._ndarray / other

        elif lib.is_scalar(other):
            # assume it is numeric
            result = self._ndarray / other
            freq = None
            if self.freq is not None:
                # Tick division is not implemented, so operate on Timedelta
                freq = self.freq.delta / other
                freq = to_offset(freq)
            return type(self)._simple_new(result, dtype=result.dtype, freq=freq)

        if not hasattr(other, "dtype"):
            # e.g. list, tuple
            other = np.array(other)

        if len(other) != len(self):
            raise ValueError("Cannot divide vectors with unequal lengths")

        elif is_timedelta64_dtype(other.dtype):
            # let numpy handle it
            return self._ndarray / other

        elif is_object_dtype(other.dtype):
            # We operate on raveled arrays to avoid problems in inference
            #  on NaT
            # TODO: tests with non-nano
            srav = self.ravel()
            orav = other.ravel()
            result_list = [srav[n] / orav[n] for n in range(len(srav))]
            result = np.array(result_list).reshape(self.shape)

            # We need to do dtype inference in order to keep DataFrame ops
            #  behavior consistent with Series behavior
            inferred = lib.infer_dtype(result, skipna=False)
            if inferred == "timedelta":
                flat = result.ravel()
                result = type(self)._from_sequence(flat).reshape(result.shape)
            elif inferred == "floating":
                result = result.astype(float)
            elif inferred == "datetime":
                # GH#39750 this occurs when result is all-NaT, in which case
                #  we want to interpret these NaTs as td64.
                #  We construct an all-td64NaT result.
                # error: Incompatible types in assignment (expression has type
                # "TimedeltaArray", variable has type "ndarray[Any,
                # dtype[floating[_64Bit]]]")
                result = self * np.nan  # type: ignore[assignment]

            return result

            result = self._ndarray / other
            return type(self)._simple_new(result, dtype=result.dtype)
Example #35
def array(data,         # type: Sequence[object]
          dtype=None,   # type: Optional[Union[str, np.dtype, ExtensionDtype]]
          copy=True,    # type: bool
    # type: (...) -> ExtensionArray
    # NOTE(review): the closing `):` of the signature, the docstring
    # delimiters and the docstring section headers are missing from this
    # excerpt; the text down to the local imports reads as the docstring body.
    Create an array.

    .. versionadded:: 0.24.0

    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =====================================
        scalar type                    Array Type
        =============================  =====================================
        * :class:`pandas.Interval`     :class:`pandas.IntervalArray`
        * :class:`pandas.Period`       :class:`pandas.arrays.PeriodArray`
        * :class:`datetime.datetime`   :class:`pandas.arrays.DatetimeArray`
        * :class:`datetime.timedelta`  :class:`pandas.arrays.TimedeltaArray`
        =============================  =====================================

        For all other cases, NumPy's usual inference rules will be used.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

        The newly created array.

        When `data` is not 1-dimensional.

    See Also
    numpy.array : Construct a NumPy array.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.

    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.PandasArray` backed by a NumPy

    >>> pd.array(['a', 'b'], dtype=str)
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a  NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    ['a', 'b']
    Length: 2, dtype: str32

    Or use the dedicated constructor for the array you're expecting, and
    wrap that in a PandasArray

    >>> pd.array(np.array(['a', 'b'], dtype='<U1'))
    ['a', 'b']
    Length: 2, dtype: str32

    If a dtype is not specified, `data` is passed through to
    :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned.

    >>> pd.array([1, 2])
    [1, 2]
    Length: 2, dtype: int64

    Or the NumPy dtype can be specified

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    [1, 2]
    Length: 2, dtype: int32

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    Because omitting the `dtype` passes the data through to NumPy,
    a mixture of valid integers and NA will return a floating-point
    NumPy array.

    >>> pd.array([1, 2, np.nan])
    [1.0,  2.0, nan]
    Length: 3, dtype: float64

    To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
    the dtype:

    >>> pd.array([1, 2, np.nan], dtype='Int64')
    [1, 2, NaN]
    Length: 3, dtype: Int64

    Pandas will infer an ExtensionArray for some types of data:

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    # NOTE(review): this import list is cut off. The body also uses
    # DatetimeArrayMixin and TimedeltaArrayMixin, which are presumably
    # imported here as well -- confirm.
    from pandas.core.arrays import (
        period_array, ExtensionArray, IntervalArray, PandasArray,
    from pandas.core.internals.arrays import extract_array

    # Scalars are rejected up front with a ValueError.
    if lib.is_scalar(data):
        msg = (
            "Cannot pass scalar '{}' to 'pandas.array'."
        raise ValueError(msg.format(data))

    data = extract_array(data, extract_numpy=True)

    # With no explicit dtype, an ExtensionArray input keeps its own dtype.
    if dtype is None and isinstance(data, ExtensionArray):
        dtype = data.dtype

    # this returns None for not-found dtypes.
    if isinstance(dtype, compat.string_types):
        dtype = registry.find(dtype) or dtype

    # Extension dtypes build the result through their own array class.
    if is_extension_array_dtype(dtype):
        cls = dtype.construct_array_type()
        return cls._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        # No dtype given: infer from the values and try the matching
        # specialised array type.
        # NOTE(review): the `except` clauses below have no visible `try:` or
        # handler body; per the surviving comments the intent is to fall
        # through to the PandasArray result at the end.
        inferred_dtype = lib.infer_dtype(data)
        if inferred_dtype == 'period':
                return period_array(data, copy=copy)
            except tslibs.IncompatibleFrequency:
                # We may have a mixture of frequencies.
                # We choose to return an ndarray, rather than raising.
        elif inferred_dtype == 'interval':
                return IntervalArray(data, copy=copy)
            except ValueError:
                # We may have a mixture of `closed` here.
                # We choose to return an ndarray, rather than raising.

        elif inferred_dtype.startswith('datetime'):
            # datetime, datetime64
                return DatetimeArrayMixin._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to PandasArray

        elif inferred_dtype.startswith('timedelta'):
            # timedelta, timedelta64
            return TimedeltaArrayMixin._from_sequence(data, copy=copy)

        # TODO(BooleanArray): handle this type

    # Default: wrap the data in a PandasArray.
    result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)
    return result
Example #36
def array(data: Sequence[object],
          dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None,
          copy: bool = True,
          ) -> ABCExtensionArray:
    """
    Create an array.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =====================================
        Scalar Type                    Array Type
        ============================== =====================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        ============================== =====================================

        For all other cases, NumPy's usual inference rules will be used.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.PandasArray` backed by a NumPy
    array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Or use the dedicated constructor for the array you're expecting, and
    wrap that in a PandasArray

    >>> pd.array(np.array(['a', 'b'], dtype='<U1'))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
    passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
    rather than a ``PandasArray``. This is for symmetry with the case of
    timezone-aware data, which NumPy does not natively support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['01:00:00', '02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    If a dtype is not specified, `data` is passed through to
    :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned.

    >>> pd.array([1, 2])
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int64

    Or the NumPy dtype can be specified

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    Because omitting the `dtype` passes the data through to NumPy,
    a mixture of valid integers and NA will return a floating-point
    NumPy array.

    >>> pd.array([1, 2, np.nan])
    <PandasArray>
    [1.0,  2.0, nan]
    Length: 3, dtype: float64

    To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
    the dtype:

    >>> pd.array([1, 2, np.nan], dtype='Int64')
    <IntegerArray>
    [1, 2, NaN]
    Length: 3, dtype: Int64

    Pandas will infer an ExtensionArray for some types of data:

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    # Imported locally; every array class referenced below must be listed.
    from pandas.core.arrays import (
        period_array, ExtensionArray, IntervalArray, PandasArray,
        DatetimeArray,
        TimedeltaArray,
    )
    from pandas.core.internals.arrays import extract_array

    if lib.is_scalar(data):
        msg = "Cannot pass scalar '{}' to 'pandas.array'."
        raise ValueError(msg.format(data))

    # Unwrap Series / Index / PandasArray down to the underlying array.
    data = extract_array(data, extract_numpy=True)

    if dtype is None and isinstance(data, ExtensionArray):
        dtype = data.dtype

    # this returns None for not-found dtypes.
    if isinstance(dtype, str):
        dtype = registry.find(dtype) or dtype

    if is_extension_array_dtype(dtype):
        cls = dtype.construct_array_type()
        return cls._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        inferred_dtype = lib.infer_dtype(data, skipna=False)
        if inferred_dtype == 'period':
            try:
                return period_array(data, copy=copy)
            except tslibs.IncompatibleFrequency:
                # We may have a mixture of frequencies.
                # We choose to return an ndarray, rather than raising.
                pass
        elif inferred_dtype == 'interval':
            try:
                return IntervalArray(data, copy=copy)
            except ValueError:
                # We may have a mixture of `closed` here.
                # We choose to return an ndarray, rather than raising.
                pass

        elif inferred_dtype.startswith('datetime'):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to PandasArray
                pass

        elif inferred_dtype.startswith('timedelta'):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)

        # TODO(BooleanArray): handle this type

    # Pandas overrides NumPy for
    #   1. datetime64[ns]
    #   2. timedelta64[ns]
    # so that a DatetimeArray or TimedeltaArray is returned.
    if is_datetime64_ns_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    elif is_timedelta64_ns_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)
    return result
Example #37
def listify(x):
    """
    Coerce ``x`` to a list.

    ``None`` becomes an empty list, a value for which ``is_scalar`` is true
    is wrapped in a one-element list, and anything else is passed to
    ``list``.
    """
    if x is None:
        return []
    if is_scalar(x):
        return [x]
    return list(x)
Example #38
    def __truediv__(self, other):
        """
        Implement ``self / other``.

        Parameters
        ----------
        other : scalar or array-like
            Either an instance of ``self._recognized_scalars`` (converted
            with ``Timedelta``), any other scalar (assumed to be numeric),
            or an array-like with the same length as ``self``. Objects
            without a ``dtype`` attribute (e.g. list, tuple) are converted
            with ``np.array`` first.

        Returns
        -------
        ndarray or same type as self
            * recognized scalar that is NaT: float64 ndarray of NaN with
              shape ``self.shape``
            * other recognized scalar, or timedelta64-dtype array: the
              result of ``self._ndarray / other``
            * other scalar, or array of any remaining dtype: ``type(self)``
              wrapping ``self._ndarray / other``
            * object-dtype array: elementwise quotient, re-inferred to
              ``type(self)``, float, or all-NaT as described inline

        Raises
        ------
        ValueError
            If ``other`` is array-like and ``len(other) != len(self)``.
        """
        # timedelta / X is well-defined for timedelta-like or numeric X

        if isinstance(other, self._recognized_scalars):
            other = Timedelta(other)
            if other is NaT:
                # specifically timedelta64-NaT
                # np.full rather than np.empty: np.empty leaves the buffer
                #  uninitialized, so the result would contain arbitrary
                #  values instead of NaN.
                return np.full(self.shape, np.nan, dtype=np.float64)

            # otherwise, dispatch to Timedelta implementation
            return self._ndarray / other

        elif lib.is_scalar(other):
            # assume it is numeric
            result = self._ndarray / other
            freq = None
            if self.freq is not None:
                # Tick division is not implemented, so operate on Timedelta
                freq = self.freq.delta / other
            return type(self)(result, freq=freq)

        if not hasattr(other, "dtype"):
            # e.g. list, tuple
            other = np.array(other)

        if len(other) != len(self):
            raise ValueError("Cannot divide vectors with unequal lengths")

        elif is_timedelta64_dtype(other.dtype):
            # let numpy handle it
            return self._ndarray / other

        elif is_object_dtype(other.dtype):
            # We operate on raveled arrays to avoid problems in inference
            #  on NaT
            srav = self.ravel()
            orav = other.ravel()
            result = [srav[n] / orav[n] for n in range(len(srav))]
            result = np.array(result).reshape(self.shape)

            # We need to do dtype inference in order to keep DataFrame ops
            #  behavior consistent with Series behavior
            inferred = lib.infer_dtype(result, skipna=False)
            if inferred == "timedelta":
                flat = result.ravel()
                result = type(self)._from_sequence(flat).reshape(result.shape)
            elif inferred == "floating":
                result = result.astype(float)
            elif inferred == "datetime":
                # GH#39750 this occurs when result is all-NaT, in which case
                #  we want to interpret these NaTs as td64.
                #  We construct an all-td64NaT result.
                result = self * np.nan

            return result

        else:
            # Any remaining dtype: divide the underlying ndarray and re-wrap.
            #  Without this branch the two statements below would sit after
            #  the `return` above and the method would return None here.
            result = self._ndarray / other
            return type(self)(result)
def array(
    data: Union[Sequence[object], AnyArrayLike],
    dtype: Optional[Dtype] = None,
    copy: bool = True,
) -> "ExtensionArray":
    """
    Create an array.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =====================================
        Scalar Type                    Array Type
        ============================== =====================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        :class:`int`                   :class:`pandas.arrays.IntegerArray`
        :class:`str`                   :class:`pandas.arrays.StringArray`
        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
        ============================== =====================================

        For all other cases, NumPy's usual inference rules will be used.

        .. versionchanged:: 1.0.0

           Pandas infers nullable-integer dtype for integer data,
           string dtype for string data, and nullable-boolean dtype
           for boolean data.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.PandasArray` backed by a NumPy
    array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
    passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
    rather than a ``PandasArray``. This is for symmetry with the case of
    timezone-aware data, which NumPy does not natively support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['0 days 01:00:00', '0 days 02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    If a dtype is not specified, pandas will infer the best dtype from the
    values. See the description of `dtype` for the types pandas infers for.

    >>> pd.array([1, 2])
    <IntegerArray>
    [1, 2]
    Length: 2, dtype: Int64

    >>> pd.array([1, 2, np.nan])
    <IntegerArray>
    [1, 2, <NA>]
    Length: 3, dtype: Int64

    >>> pd.array(["a", None, "c"])
    <StringArray>
    ['a', <NA>, 'c']
    Length: 3, dtype: string

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    If pandas does not infer a dedicated extension type a
    :class:`arrays.PandasArray` is returned.

    >>> pd.array([1.1, 2.2])
    <PandasArray>
    [1.1, 2.2]
    Length: 2, dtype: float64

    As mentioned in the "Notes" section, new extension types may be added
    in the future (by pandas or 3rd party libraries), causing the return
    value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype`
    as a NumPy dtype if you need to ensure there's no future change in
    behavior.

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    # Imported locally; every array class referenced below must be listed.
    from pandas.core.arrays import (
        period_array,
        BooleanArray,
        IntegerArray,
        IntervalArray,
        PandasArray,
        DatetimeArray,
        TimedeltaArray,
        StringArray,
    )

    if lib.is_scalar(data):
        msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
        raise ValueError(msg)

    # Capture the dtype before unwrapping, while `data` is still the
    # Series / Index / ExtensionArray that carries it.
    if dtype is None and isinstance(
        data, (ABCSeries, ABCIndexClass, ABCExtensionArray)
    ):
        dtype = data.dtype

    data = extract_array(data, extract_numpy=True)

    # this returns None for not-found dtypes.
    if isinstance(dtype, str):
        dtype = registry.find(dtype) or dtype

    if is_extension_array_dtype(dtype):
        cls = cast(ExtensionDtype, dtype).construct_array_type()
        return cls._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        inferred_dtype = lib.infer_dtype(data, skipna=True)
        if inferred_dtype == "period":
            try:
                return period_array(data, copy=copy)
            except IncompatibleFrequency:
                # We may have a mixture of frequencies.
                # We choose to return an ndarray, rather than raising.
                pass
        elif inferred_dtype == "interval":
            try:
                return IntervalArray(data, copy=copy)
            except ValueError:
                # We may have a mixture of `closed` here.
                # We choose to return an ndarray, rather than raising.
                pass

        elif inferred_dtype.startswith("datetime"):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to PandasArray
                pass

        elif inferred_dtype.startswith("timedelta"):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)

        elif inferred_dtype == "string":
            return StringArray._from_sequence(data, copy=copy)

        elif inferred_dtype == "integer":
            return IntegerArray._from_sequence(data, copy=copy)

        elif inferred_dtype == "boolean":
            return BooleanArray._from_sequence(data, copy=copy)

    # Pandas overrides NumPy for
    #   1. datetime64[ns]
    #   2. timedelta64[ns]
    # so that a DatetimeArray or TimedeltaArray is returned.
    if is_datetime64_ns_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    elif is_timedelta64_ns_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)
    return result
Example #40
def nanpercentile(
    values: np.ndarray,
    q,
    axis: int,
    na_value,
    mask: np.ndarray,
    ndim: int,
    interpolation,
):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if values.dtype.kind in ["m", "M"]:
        # need to cast to integer to avoid rounding errors in numpy
        result = nanpercentile(
            values.view("i8"), q, axis, na_value.view("i8"),
            mask, ndim, interpolation,
        )

        # Note: we have to do `astype` and not view because in general we
        #  have float result at this point, not i8
        return result.astype(values.dtype)

    if not lib.is_scalar(mask) and mask.any():
        if ndim == 1:
            return _nanpercentile_1d(
                values, mask, q, na_value, interpolation=interpolation
            )
        else:
            # for nonconsolidatable blocks mask is 1D, but values 2D
            if mask.ndim < values.ndim:
                mask = mask.reshape(values.shape)
            if axis == 0:
                values = values.T
                mask = mask.T
            # Handle each 1-D slice separately, paired with its own mask.
            result = [
                _nanpercentile_1d(
                    val, m, q, na_value, interpolation=interpolation
                )
                for (val, m) in zip(values, mask)
            ]
            # np.asarray copies only when needed; np.array(..., copy=False)
            #  raises on NumPy >= 2 whenever a copy is unavoidable, which is
            #  always the case for a list input.
            result = np.asarray(result, dtype=values.dtype).T
            return result
    else:
        # Nothing is masked, so np.percentile can be applied directly.
        return np.percentile(values, q, axis=axis, interpolation=interpolation)