def _ensure_datetimelike_to_i8(other, to_utc=False):
    """
    Helper for coercing an input scalar or array to i8.

    Parameters
    ----------
    other : 1d array
    to_utc : bool, default False
        If True, convert the values to UTC before extracting the i8 values
        If False, extract the i8 values directly.

    Returns
    -------
    i8 1d array
    """
    from pandas import Index
    from pandas.core.arrays import PeriodArray

    if lib.is_scalar(other) and isna(other):
        return iNaT
    elif isinstance(other, (PeriodArray, ABCIndexClass)):
        # convert tz if needed
        if getattr(other, 'tz', None) is not None:
            if to_utc:
                other = other.tz_convert('UTC')
            else:
                other = other.tz_localize(None)
    else:
        try:
            return np.array(other, copy=False).view('i8')
        except TypeError:
            # period array cannot be coerced to int
            other = Index(other)
    return other.asi8
def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    the 1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    # mask is Union[ExtensionArray, ndarray]
    values = values[~mask]

    if len(values) == 0:
        if lib.is_scalar(q):
            return na_value
        else:
            return np.array([na_value] * len(q), dtype=values.dtype)

    return np.percentile(values, q, interpolation=interpolation)
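# Illustrative sketch (not part of the original source) of the masked
# percentile idea above, using plain numpy; the names are made up for the
# demo.
import numpy as np

vals = np.array([1.0, np.nan, 3.0, 4.0])
mask = np.isnan(vals)
# drop masked entries before calling np.percentile, as the helper does
print(np.percentile(vals[~mask], 50))  # -> 3.0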
def _evaluate_compare(self, other, op):
    """
    We have been called because a comparison between two datetimelike
    arrays was requested.

    numpy >= 1.11 will now warn about NaT comparisons
    """
    # Called by comparison methods when comparing datetimelike
    # with datetimelike

    if not isinstance(other, type(self)):
        # coerce to a similar object
        if not is_list_like(other):
            # scalar
            other = [other]
        elif lib.is_scalar(lib.item_from_zerodim(other)):
            # ndarray scalar
            other = [other.item()]
        other = type(self)(other)

    # compare
    result = op(self.asi8, other.asi8)

    # technically we could support bool dtyped Index
    # for now just return the indexing array directly
    mask = (self._isnan) | (other._isnan)

    filler = iNaT
    if is_bool_dtype(result):
        filler = False

    result[mask] = filler
    return result
def __getitem__(self, item):
    if isinstance(item, type(self)):
        item = item._ndarray

    result = self._ndarray[item]
    if not lib.is_scalar(result):
        result = type(self)(result)
    return result
def __setitem__(self, key, value):
    from pandas.core.internals.arrays import extract_array

    value = extract_array(value, extract_numpy=True)

    if not lib.is_scalar(key) and is_list_like(key):
        key = np.asarray(key)

    if not lib.is_scalar(value):
        value = np.asarray(value)

    values = self._ndarray
    t = np.result_type(value, values)
    if t != self._ndarray.dtype:
        values = values.astype(t, casting='safe')
        values[key] = value
        self._dtype = PandasDtype(t)
        self._ndarray = values
    else:
        self._ndarray[key] = value
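# A minimal standalone sketch of the upcast-on-setitem rule used above:
# np.result_type picks the common dtype, and the backing array is recast
# before assignment. (Illustration only, not the pandas implementation.)
import numpy as np

values = np.array([1, 2, 3])            # int64 backing array
new = 1.5                               # float scalar being set
t = np.result_type(new, values)         # -> float64
if t != values.dtype:
    values = values.astype(t, casting='safe')
values[0] = new
print(values)                           # [1.5 2.  3. ]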
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    # Lightly modified version of
    # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/\
    # numpy.lib.mixins.NDArrayOperatorsMixin.html
    # The primary modification is not boxing scalar return values
    # in PandasArray, since pandas' ExtensionArrays are 1-d.
    out = kwargs.get('out', ())
    for x in inputs + out:
        # Only support operations with instances of _HANDLED_TYPES.
        # Use PandasArray instead of type(self) for isinstance to
        # allow subclasses that don't override __array_ufunc__ to
        # handle PandasArray objects.
        if not isinstance(x, self._HANDLED_TYPES + (PandasArray,)):
            return NotImplemented

    # Defer to the implementation of the ufunc on unwrapped values.
    inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x
                   for x in inputs)
    if out:
        kwargs['out'] = tuple(
            x._ndarray if isinstance(x, PandasArray) else x
            for x in out)
    result = getattr(ufunc, method)(*inputs, **kwargs)

    if type(result) is tuple and len(result):
        # multiple return values
        if not lib.is_scalar(result[0]):
            # re-box array-like results
            return tuple(type(self)(x) for x in result)
        else:
            # but not scalar reductions
            return result
    elif method == 'at':
        # no return value
        return None
    else:
        # one return value
        if not lib.is_scalar(result):
            # re-box array-like results, but not scalar reductions
            result = type(self)(result)
        return result
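# Hedged public-API illustration of the protocol above: a numpy ufunc
# applied to a pandas extension array round-trips through
# __array_ufunc__, re-boxing array results but not scalar reductions.
import numpy as np
import pandas as pd

arr = pd.array([1.0, 4.0, 9.0])
print(np.sqrt(arr))                    # array result comes back boxed
print(np.add.reduce(np.asarray(arr)))  # scalar reduction stays a scalar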
def __getitem__(self, key):
    """
    This getitem defers to the underlying array, which by definition can
    only handle list-likes, slices, and integer scalars
    """
    is_int = lib.is_integer(key)
    if lib.is_scalar(key) and not is_int:
        raise IndexError("only integers, slices (`:`), ellipsis (`...`), "
                         "numpy.newaxis (`None`) and integer or boolean "
                         "arrays are valid indices")

    getitem = self._data.__getitem__
    if is_int:
        val = getitem(key)
        return self._box_func(val)

    if com.is_bool_indexer(key):
        key = np.asarray(key, dtype=bool)
        if key.all():
            key = slice(0, None, None)
        else:
            key = lib.maybe_booleans_to_slice(key.view(np.uint8))

    attribs = self._get_attributes_dict()

    is_period = is_period_dtype(self)
    if is_period:
        freq = self.freq
    else:
        freq = None
        if isinstance(key, slice):
            if self.freq is not None and key.step is not None:
                freq = key.step * self.freq
            else:
                freq = self.freq
        elif key is Ellipsis:
            # GH#21282 indexing with Ellipsis is similar to a full slice,
            #  should preserve `freq` attribute
            freq = self.freq

    attribs['freq'] = freq

    result = getitem(key)
    if result.ndim > 1:
        # To support MPL which performs slicing with 2 dim
        #  even though it only has 1 dim by definition
        if is_period:
            return self._simple_new(result, **attribs)
        return result

    return self._simple_new(result, **attribs)
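# Public-API illustration (not from the original source) of the freq
# handling above: a stepped slice of a fixed-frequency index scales freq.
import pandas as pd

dti = pd.date_range("2020-01-01", periods=6, freq="D")
print(dti[::2].freq)   # step multiplies the original freq -> <2 * Days>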
def wrapper(self, other):
    meth = getattr(dtl.DatetimeLikeArrayMixin, opname)

    if isinstance(other, (datetime, np.datetime64, compat.string_types)):
        if isinstance(other, (datetime, np.datetime64)):
            # GH#18435 strings get a pass from tzawareness compat
            self._assert_tzawareness_compat(other)

        try:
            other = _to_m8(other, tz=self.tz)
        except ValueError:
            # string that cannot be parsed to Timestamp
            return ops.invalid_comparison(self, other, op)

        result = meth(self, other)
        if isna(other):
            result.fill(nat_result)
    elif lib.is_scalar(other):
        return ops.invalid_comparison(self, other, op)
    else:
        if isinstance(other, list):
            # FIXME: This can break for object-dtype with mixed types
            other = type(self)(other)
        elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)):
            # Following Timestamp convention, __eq__ is all-False
            # and __ne__ is all True, others raise TypeError.
            return ops.invalid_comparison(self, other, op)

        if is_object_dtype(other):
            result = op(self.astype('O'), np.array(other))
        elif not (is_datetime64_dtype(other) or
                  is_datetime64tz_dtype(other)):
            # e.g. is_timedelta64_dtype(other)
            return ops.invalid_comparison(self, other, op)
        else:
            self._assert_tzawareness_compat(other)
            result = meth(self, np.asarray(other))

        result = com.values_from_object(result)

        # Make sure to pass an array to result[...]; indexing with
        # Series breaks with an older version of numpy
        o_mask = np.array(isna(other))
        if o_mask.any():
            result[o_mask] = nat_result

    if self.hasnans:
        result[self._isnan] = nat_result

    return result
def __truediv__(self, other):
    # timedelta / X is well-defined for timedelta-like or numeric X
    other = lib.item_from_zerodim(other)

    if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
        return NotImplemented

    if isinstance(other, (timedelta, np.timedelta64, Tick)):
        other = Timedelta(other)
        if other is NaT:
            # specifically timedelta64-NaT
            result = np.empty(self.shape, dtype=np.float64)
            result.fill(np.nan)
            return result

        # otherwise, dispatch to Timedelta implementation
        return self._data / other

    elif lib.is_scalar(other):
        # assume it is numeric
        result = self._data / other
        freq = None
        if self.freq is not None:
            # Tick division is not implemented, so operate on Timedelta
            freq = self.freq.delta / other
        return type(self)(result, freq=freq)

    if not hasattr(other, "dtype"):
        # e.g. list, tuple
        other = np.array(other)

    if len(other) != len(self):
        raise ValueError("Cannot divide vectors with unequal lengths")

    elif is_timedelta64_dtype(other):
        # let numpy handle it
        return self._data / other

    elif is_object_dtype(other):
        # Note: we do not do type inference on the result, so either
        #  an object array or numeric-dtyped (if numpy does inference)
        #  will be returned.  GH#23829
        result = [self[n] / other[n] for n in range(len(self))]
        result = np.array(result)
        return result

    else:
        result = self._data / other
        return type(self)(result)
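# Hedged public-API sketch of the division rules above: timedelta divided
# by a timedelta yields floats, while timedelta divided by a number stays
# timedelta.
import pandas as pd

tdi = pd.to_timedelta(["1h", "2h", "3h"])
print(tdi / pd.Timedelta("30min"))   # float values: [2.0, 4.0, 6.0]
print(tdi / 2)                       # timedeltas: 30min, 1h, 1h30min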
def __rtruediv__(self, other):
    # X / timedelta is defined only for timedelta-like X
    other = lib.item_from_zerodim(other)

    if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
        return NotImplemented

    if isinstance(other, (timedelta, np.timedelta64, Tick)):
        other = Timedelta(other)
        if other is NaT:
            # specifically timedelta64-NaT
            result = np.empty(self.shape, dtype=np.float64)
            result.fill(np.nan)
            return result

        # otherwise, dispatch to Timedelta implementation
        return other / self._data

    elif lib.is_scalar(other):
        raise TypeError("Cannot divide {typ} by {cls}"
                        .format(typ=type(other).__name__,
                                cls=type(self).__name__))

    if not hasattr(other, "dtype"):
        # e.g. list, tuple
        other = np.array(other)

    if len(other) != len(self):
        raise ValueError("Cannot divide vectors with unequal lengths")

    elif is_timedelta64_dtype(other):
        # let numpy handle it
        return other / self._data

    elif is_object_dtype(other):
        # Note: unlike in __truediv__, we do not _need_ to do type
        #  inference on the result.  It does not raise, a numeric array
        #  is returned.  GH#23829
        result = [other[n] / self[n] for n in range(len(self))]
        return np.array(result)

    else:
        raise TypeError("Cannot divide {dtype} data by {cls}"
                        .format(dtype=other.dtype,
                                cls=type(self).__name__))
def __rtruediv__(self, other):
    # X / timedelta is defined only for timedelta-like X
    if isinstance(other, self._recognized_scalars):
        other = Timedelta(other)
        if other is NaT:
            # specifically timedelta64-NaT
            result = np.empty(self.shape, dtype=np.float64)
            result.fill(np.nan)
            return result

        # otherwise, dispatch to Timedelta implementation
        return other / self._ndarray

    elif lib.is_scalar(other):
        raise TypeError(
            f"Cannot divide {type(other).__name__} by {type(self).__name__}"
        )

    if not hasattr(other, "dtype"):
        # e.g. list, tuple
        other = np.array(other)

    if len(other) != len(self):
        raise ValueError("Cannot divide vectors with unequal lengths")

    elif is_timedelta64_dtype(other.dtype):
        # let numpy handle it
        return other / self._ndarray

    elif is_object_dtype(other.dtype):
        # Note: unlike in __truediv__, we do not _need_ to do type
        #  inference on the result.  It does not raise, a numeric array
        #  is returned.  GH#23829
        result = [other[n] / self[n] for n in range(len(self))]
        return np.array(result)

    else:
        raise TypeError(
            f"Cannot divide {other.dtype} data by {type(self).__name__}"
        )
def masked_rec_array_to_mgr(
    data, index, columns, dtype: Optional[DtypeObj], copy: bool
):
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        # TODO: numpy docs suggest fv must be scalar, but could it be
        #  non-scalar for object dtype?
        assert lib.is_scalar(fv), fv
        mask = ma.getmaskarray(data[col])
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr
def logical_method(self, other):
    if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
        # Rely on pandas to unbox and dispatch to us.
        return NotImplemented

    assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
    other = lib.item_from_zerodim(other)
    other_is_booleanarray = isinstance(other, BooleanArray)
    other_is_scalar = lib.is_scalar(other)
    mask = None

    if other_is_booleanarray:
        other, mask = other._data, other._mask
    elif is_list_like(other):
        other = np.asarray(other, dtype="bool")
        if other.ndim > 1:
            raise NotImplementedError(
                "can only perform ops with 1-d structures"
            )
        other, mask = coerce_to_array(other, copy=False)
    elif isinstance(other, np.bool_):
        other = other.item()

    if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)):
        raise TypeError(
            "'other' should be pandas.NA or a bool. "
            f"Got {type(other).__name__} instead."
        )

    if not other_is_scalar and len(self) != len(other):
        raise ValueError("Lengths must match to compare")

    if op.__name__ in {"or_", "ror_"}:
        result, mask = ops.kleene_or(self._data, other, self._mask, mask)
    elif op.__name__ in {"and_", "rand_"}:
        result, mask = ops.kleene_and(self._data, other, self._mask, mask)
    elif op.__name__ in {"xor", "rxor"}:
        result, mask = ops.kleene_xor(self._data, other, self._mask, mask)

    return BooleanArray(result, mask)
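# The Kleene three-valued logic dispatched to above, seen through the
# public nullable-boolean API: NA | True is True, but NA | False stays NA.
import pandas as pd

a = pd.array([True, False, None], dtype="boolean")
print(a | True)    # [True, True, True]
print(a | False)   # [True, False, <NA>]
print(a & False)   # [False, False, False]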
def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
    """
    isna check that excludes incompatible dtypes

    Parameters
    ----------
    obj : object
    dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype

    Returns
    -------
    bool
    """
    if not lib.is_scalar(obj) or not isna(obj):
        return False
    elif dtype.kind == "M":
        if isinstance(dtype, np.dtype):
            # i.e. not tzaware
            return not isinstance(obj, (np.timedelta64, Decimal))
        # we have to rule out tznaive dt64("NaT")
        return not isinstance(obj, (np.timedelta64, np.datetime64, Decimal))
    elif dtype.kind == "m":
        return not isinstance(obj, (np.datetime64, Decimal))
    elif dtype.kind in ["i", "u", "f", "c"]:
        # Numeric
        return obj is not NaT and not isinstance(
            obj, (np.datetime64, np.timedelta64))
    elif dtype == np.dtype("object"):
        # This is needed for Categorical, but is kind of weird
        return True
    elif isinstance(dtype, PeriodDtype):
        return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
    elif isinstance(dtype, IntervalDtype):
        return lib.is_float(obj) or obj is None or obj is libmissing.NA

    # fallback, default to allowing NaN, None, NA, NaT
    return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
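# Standalone sketch (a labeled simplification, not the pandas
# implementation) of the compatibility rule above: a timedelta64 NaT is
# rejected as a missing value for datetime64 data, while a datetime64 NaT
# is accepted.
import numpy as np

def na_ok_for_m8(obj):
    # simplified: any NA-like value except a timedelta64 NaT passes for "M8"
    return not isinstance(obj, np.timedelta64)

print(na_ok_for_m8(np.datetime64("NaT")))    # True
print(na_ok_for_m8(np.timedelta64("NaT")))   # False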
def __getitem__(
    self: NDArrayBackedExtensionArrayT,
    key: PositionalIndexer2D,
) -> NDArrayBackedExtensionArrayT | Any:
    if lib.is_integer(key):
        # fast-path
        result = self._ndarray[key]
        if self.ndim == 1:
            return self._box_func(result)
        return self._from_backing_data(result)

    # error: Incompatible types in assignment (expression has type "ExtensionArray",
    # variable has type "Union[int, slice, ndarray]")
    key = extract_array(key, extract_numpy=True)  # type: ignore[assignment]
    key = check_array_indexer(self, key)
    result = self._ndarray[key]
    if lib.is_scalar(result):
        return self._box_func(result)

    result = self._from_backing_data(result)
    return result
def _sanitize_str_dtypes(
    result: np.ndarray, data, dtype: Optional[DtypeObj], copy: bool
) -> np.ndarray:
    """
    Ensure we have a dtype that is supported by pandas.
    """

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(result.dtype.type, str):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, result has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                # error: Argument "dtype" to "array" has incompatible type
                # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any],
                # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
                # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
                data = np.array(data, dtype=dtype, copy=False)  # type: ignore[arg-type]
            result = np.array(data, dtype=object, copy=copy)
    return result
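# What the guard above prevents, sketched with plain numpy: coercing a
# mixed list straight to a string dtype turns NaN into a garbage string,
# so object dtype is used instead. (Illustration, not the pandas code.)
import numpy as np

mixed = ["a", np.nan]
print(np.array(mixed, dtype=str))      # NaN becomes the string 'nan'
print(np.array(mixed, dtype=object))   # NaN survives as a float NaN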
def na_logical_op(x: np.ndarray, y, op):
    try:
        # For exposition, write:
        #  yarr = isinstance(y, np.ndarray)
        #  yint = is_integer(y) or (yarr and y.dtype.kind == "i")
        #  ybool = is_bool(y) or (yarr and y.dtype.kind == "b")
        #  xint = x.dtype.kind == "i"
        #  xbool = x.dtype.kind == "b"
        # Then Cases where this goes through without raising include:
        #  (xint or xbool) and (yint or ybool)
        result = op(x, y)
    except TypeError:
        if isinstance(y, np.ndarray):
            # bool-bool dtype operations should be OK, should not get here
            assert not (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype))
            x = ensure_object(x)
            y = ensure_object(y)
            result = libops.vec_binop(x.ravel(), y.ravel(), op)
        else:
            # let null fall thru
            assert lib.is_scalar(y)
            if not isna(y):
                y = bool(y)
            try:
                result = libops.scalar_binop(x, y, op)
            except (
                TypeError,
                ValueError,
                AttributeError,
                OverflowError,
                NotImplementedError,
            ):
                typ = type(y).__name__
                raise TypeError(
                    f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array "
                    f"and scalar of type [{typ}]"
                )

    return result.reshape(x.shape)
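# Illustration of the error path above through the public API; the exact
# message wording comes from the helper and may vary between pandas
# versions.
import pandas as pd

try:
    pd.Series([1, 2]) & "a"
except TypeError as exc:
    print(exc)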
def is_valid_nat_for_dtype(obj, dtype):
    """
    isna check that excludes incompatible dtypes

    Parameters
    ----------
    obj : object
    dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype

    Returns
    -------
    bool
    """
    if not lib.is_scalar(obj) or not isna(obj):
        return False
    if dtype.kind == "M":
        return not isinstance(obj, np.timedelta64)
    if dtype.kind == "m":
        return not isinstance(obj, np.datetime64)

    # must be PeriodDtype
    return not isinstance(obj, (np.datetime64, np.timedelta64))
def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if not lib.is_scalar(mask) and mask.any():
        if ndim == 1:
            return _nanpercentile_1d(values, mask, q, na_value,
                                     interpolation=interpolation)
        else:
            # for nonconsolidatable blocks mask is 1D, but values 2D
            if mask.ndim < values.ndim:
                mask = mask.reshape(values.shape)
            if axis == 0:
                values = values.T
                mask = mask.T
            result = [_nanpercentile_1d(val, m, q, na_value,
                                        interpolation=interpolation)
                      for (val, m) in zip(list(values), list(mask))]
            result = np.array(result, dtype=values.dtype, copy=False).T
            return result
    else:
        return np.percentile(values, q, axis=axis,
                             interpolation=interpolation)
def _nanpercentile_1d(values, mask, q, na_value, interpolation):
    """
    Wrapper for np.percentile that skips missing values, specialized to
    the 1-dimensional case.

    Parameters
    ----------
    values : array over which to find quantiles
    mask : ndarray[bool]
        locations in values that should be considered missing
    q : scalar or array of quantile indices to find
    na_value : scalar
        value to return for empty or all-null values
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    # mask is Union[ExtensionArray, ndarray]
    if values.dtype.kind == "m":
        # need to cast to integer to avoid rounding errors in numpy
        result = _nanpercentile_1d(values.view("i8"), mask, q, na_value,
                                   interpolation)

        # Note: we have to do `astype` and not view because in general we
        #  have float result at this point, not i8
        return result.astype(values.dtype)

    values = values[~mask]

    if len(values) == 0:
        if lib.is_scalar(q):
            return na_value
        else:
            return np.array([na_value] * len(q), dtype=values.dtype)

    return np.percentile(values, q, interpolation=interpolation)
def __getitem__(
    self: NDArrayBackedExtensionArrayT, key: int | slice | np.ndarray
) -> NDArrayBackedExtensionArrayT | Any:
    if lib.is_integer(key):
        # fast-path
        result = self._ndarray[key]
        if self.ndim == 1:
            return self._box_func(result)
        return self._from_backing_data(result)

    # error: Value of type variable "AnyArrayLike" of "extract_array" cannot be
    # "Union[int, slice, ndarray]"
    # error: Incompatible types in assignment (expression has type "ExtensionArray",
    # variable has type "Union[int, slice, ndarray]")
    key = extract_array(  # type: ignore[type-var,assignment]
        key, extract_numpy=True)
    key = check_array_indexer(self, key)
    result = self._ndarray[key]
    if lib.is_scalar(result):
        return self._box_func(result)

    result = self._from_backing_data(result)
    return result
def _reconstruct(result):
    if lib.is_scalar(result):
        return result
    if result.ndim != self.ndim:
        if method == "outer":
            if self.ndim == 2:
                # we already deprecated for Series
                msg = ("outer method for ufunc {} is not implemented on "
                       "pandas objects. Returning an ndarray, but in the "
                       "future this will raise a 'NotImplementedError'. "
                       "Consider explicitly converting the DataFrame "
                       "to an array with '.to_numpy()' first.")
                warnings.warn(msg.format(ufunc), FutureWarning,
                              stacklevel=find_stack_level())
                return result
            raise NotImplementedError
        return result
    if isinstance(result, BlockManager):
        # we went through BlockManager.apply e.g. np.sqrt
        result = self._constructor(result, **reconstruct_kwargs, copy=False)
    else:
        # we converted an array, lost our axes
        result = self._constructor(
            result, **reconstruct_axes, **reconstruct_kwargs, copy=False
        )
    # TODO: When we support multiple values in __finalize__, this
    # should pass alignable to `__finalize__` instead of self.
    # Then `np.add(a, b)` would consider attrs from both a and b
    # when a and b are NDFrames.
    if len(alignable) == 1:
        result = result.__finalize__(self)
    return result
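# Public-API effect of the reconstruction above: applying a ufunc to a
# DataFrame hands back a DataFrame with the original axes reattached.
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.0, 4.0]}, index=["r1", "r2"])
out = np.sqrt(df)
print(type(out).__name__, list(out.index))  # DataFrame ['r1', 'r2']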
def _logical_method(self, other, op):

    assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
    other_is_booleanarray = isinstance(other, BooleanArray)
    other_is_scalar = lib.is_scalar(other)
    mask = None

    if other_is_booleanarray:
        other, mask = other._data, other._mask
    elif is_list_like(other):
        other = np.asarray(other, dtype="bool")
        if other.ndim > 1:
            raise NotImplementedError(
                "can only perform ops with 1-d structures")
        other, mask = coerce_to_array(other, copy=False)
    elif isinstance(other, np.bool_):
        other = other.item()

    if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
        raise TypeError("'other' should be pandas.NA or a bool. "
                        f"Got {type(other).__name__} instead.")

    if not other_is_scalar and len(self) != len(other):
        raise ValueError("Lengths must match to compare")

    if op.__name__ in {"or_", "ror_"}:
        result, mask = ops.kleene_or(self._data, other, self._mask, mask)
    elif op.__name__ in {"and_", "rand_"}:
        result, mask = ops.kleene_and(self._data, other, self._mask, mask)
    elif op.__name__ in {"xor", "rxor"}:
        result, mask = ops.kleene_xor(self._data, other, self._mask, mask)

    # error: Argument 2 to "BooleanArray" has incompatible type "Optional[Any]";
    # expected "ndarray"
    return BooleanArray(result, mask)  # type: ignore[arg-type]
def sanitize_array(data, index, dtype=None, copy=False,
                   raise_cast_failure=False):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)

            # possibility of nan -> garbage
            if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
                try:
                    subarr = _try_cast(data, True, dtype, copy, True)
                except ValueError:
                    if copy:
                        subarr = data.copy()
            else:
                subarr = _try_cast(data, True, dtype, copy,
                                   raise_cast_failure)
        elif isinstance(data, Index):
            # don't coerce Index types
            # e.g. indexes can have different conversions (so don't fast path
            # them)
            # GH#6140
            subarr = sanitize_index(data, index, copy=copy)
        else:

            # we will try to copy by-definition here
            subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)

    elif isinstance(data, ExtensionArray):
        if isinstance(data, ABCPandasArray):
            # We don't want to let people put our PandasArray wrapper
            # (the output of Series/Index.array), into a Series. So
            # we explicitly unwrap it here.
            subarr = data.to_numpy()
        else:
            subarr = data

        # everything else in this block must also handle ndarray's,
        # because we've unwrapped PandasArray into an ndarray.

        if dtype is not None:
            subarr = data.astype(dtype)

        if copy:
            subarr = data.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            try:
                subarr = _try_cast(data, False, dtype, copy,
                                   raise_cast_failure)
            except Exception:
                if raise_cast_failure:  # pragma: no cover
                    raise
                subarr = np.array(data, dtype=object, copy=copy)
                subarr = lib.maybe_convert_objects(subarr)

        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        start, stop, step = get_range_parameters(data)
        arr = np.arange(start, stop, step, dtype='int64')
        subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, 'ndim', 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception('Data must be 1-dimensional')
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(subarr.dtype.type, compat.string_types):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, subarr has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                data = np.array(data, dtype=dtype, copy=False)
            subarr = np.array(data, dtype=object, copy=copy)

    if is_object_dtype(subarr.dtype) and dtype != 'object':
        inferred = lib.infer_dtype(subarr, skipna=False)
        if inferred == 'period':
            try:
                subarr = period_array(subarr)
            except IncompatibleFrequency:
                pass

    return subarr
def time_is_scalar(self, param):
    is_scalar(param)
def dispatch_to_series(left, right, func, str_rep=None, axis=None):
    """
    Evaluate the frame operation func(left, right) by evaluating
    column-by-column, dispatching to the Series implementation.

    Parameters
    ----------
    left : DataFrame
    right : scalar or DataFrame
    func : arithmetic or comparison operator
    str_rep : str or None, default None
    axis : {None, 0, 1, "index", "columns"}

    Returns
    -------
    DataFrame
    """
    # Note: we use iloc to access columns for compat with cases
    #       with non-unique columns.
    import pandas.core.computation.expressions as expressions

    right = lib.item_from_zerodim(right)
    if lib.is_scalar(right) or np.ndim(right) == 0:

        # Get the appropriate array-op to apply to each block's values.
        array_op = get_array_op(func, str_rep=str_rep)
        bm = left._data.apply(array_op, right=right)
        return type(left)(bm)

    elif isinstance(right, ABCDataFrame):
        assert right._indexed_same(left)

        def column_op(a, b):
            return {
                i: func(a.iloc[:, i], b.iloc[:, i])
                for i in range(len(a.columns))
            }

    elif isinstance(right, ABCSeries) and axis == "columns":
        # We only get here if called via _combine_series_frame,
        # in which case we specifically want to operate row-by-row
        assert right.index.equals(left.columns)

        if right.dtype == "timedelta64[ns]":
            # ensure we treat NaT values as the correct dtype
            # Note: we do not do this unconditionally as it may be lossy or
            #  expensive for EA dtypes.
            right = np.asarray(right)

            def column_op(a, b):
                return {
                    i: func(a.iloc[:, i], b[i])
                    for i in range(len(a.columns))
                }

        else:

            def column_op(a, b):
                return {
                    i: func(a.iloc[:, i], b.iloc[i])
                    for i in range(len(a.columns))
                }

    elif isinstance(right, ABCSeries):
        assert right.index.equals(left.index)  # Handle other cases later

        def column_op(a, b):
            return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}

    else:
        # Remaining cases have less-obvious dispatch rules
        raise NotImplementedError(right)

    new_data = expressions.evaluate(column_op, str_rep, left, right)
    return new_data
def __init__(
    self,
    data=None,
    index=None,
    columns=None,
    default_kind=None,
    default_fill_value=None,
    dtype=None,
    copy=False,
):
    if not is_scalar(default_fill_value):
        raise ValueError("'default_fill_value' must be a scalar")

    warnings.warn(depr_msg, FutureWarning, stacklevel=2)

    # pick up the defaults from the Sparse structures
    if isinstance(data, SparseDataFrame):
        if index is None:
            index = data.index
        if columns is None:
            columns = data.columns
        if default_fill_value is None:
            default_fill_value = data.default_fill_value
        if default_kind is None:
            default_kind = data.default_kind
    elif isinstance(data, (SparseSeries, SparseArray)):
        if index is None:
            index = data.index
        if default_fill_value is None:
            default_fill_value = data.fill_value
        if columns is None and hasattr(data, "name"):
            columns = [data.name]
        if columns is None:
            raise Exception("cannot pass a series w/o a name or columns")
        data = {columns[0]: data}

    if default_fill_value is None:
        default_fill_value = np.nan
    if default_kind is None:
        default_kind = "block"

    self._default_kind = default_kind
    self._default_fill_value = default_fill_value

    if is_scipy_sparse(data):
        mgr = self._init_spmatrix(
            data, index, columns, dtype=dtype, fill_value=default_fill_value
        )
    elif isinstance(data, dict):
        mgr = self._init_dict(data, index, columns, dtype=dtype)
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, index, columns, dtype=dtype)
    elif isinstance(data, SparseDataFrame):
        mgr = self._init_mgr(
            data._data, dict(index=index, columns=columns),
            dtype=dtype, copy=copy
        )
    elif isinstance(data, DataFrame):
        mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
    elif isinstance(data, Series):
        mgr = self._init_dict(
            data.to_frame(), data.index, columns=None, dtype=dtype
        )
    elif isinstance(data, BlockManager):
        mgr = self._init_mgr(
            data, axes=dict(index=index, columns=columns),
            dtype=dtype, copy=copy
        )
    elif data is None:
        data = DataFrame()

        if index is None:
            index = Index([])
        else:
            index = ensure_index(index)

        if columns is None:
            columns = Index([])
        else:
            for c in columns:
                data[c] = SparseArray(
                    self._default_fill_value,
                    index=index,
                    kind=self._default_kind,
                    fill_value=self._default_fill_value,
                )
        mgr = to_manager(data, columns, index)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    else:
        msg = ('SparseDataFrame called with unknown type "{data_type}" '
               "for data argument")
        raise TypeError(msg.format(data_type=type(data).__name__))

    generic.NDFrame.__init__(self, mgr)
def dispatch_to_series(left, right, func, str_rep=None, axis=None):
    """
    Evaluate the frame operation func(left, right) by evaluating
    column-by-column, dispatching to the Series implementation.

    Parameters
    ----------
    left : DataFrame
    right : scalar or DataFrame
    func : arithmetic or comparison operator
    str_rep : str or None, default None
    axis : {None, 0, 1, "index", "columns"}

    Returns
    -------
    DataFrame
    """
    # Note: we use iloc to access columns for compat with cases
    #       with non-unique columns.
    import pandas.core.computation.expressions as expressions

    right = lib.item_from_zerodim(right)
    if lib.is_scalar(right) or np.ndim(right) == 0:

        def column_op(a, b):
            return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}

    elif isinstance(right, ABCDataFrame):
        assert right._indexed_same(left)

        def column_op(a, b):
            return {
                i: func(a.iloc[:, i], b.iloc[:, i])
                for i in range(len(a.columns))
            }

    elif isinstance(right, ABCSeries) and axis == "columns":
        # We only get here if called via left._combine_match_columns,
        # in which case we specifically want to operate row-by-row
        assert right.index.equals(left.columns)

        def column_op(a, b):
            return {
                i: func(a.iloc[:, i], b.iloc[i])
                for i in range(len(a.columns))
            }

    elif isinstance(right, ABCSeries):
        assert right.index.equals(left.index)  # Handle other cases later

        def column_op(a, b):
            return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}

    else:
        # Remaining cases have less-obvious dispatch rules
        raise NotImplementedError(right)

    new_data = expressions.evaluate(column_op, str_rep, left, right)

    result = left._constructor(new_data, index=left.index, copy=False)

    # Pin columns instead of passing to constructor for compat with
    # non-unique columns case
    result.columns = left.columns
    return result
def dispatch_to_series(left, right, func, axis=None):
    """
    Evaluate the frame operation func(left, right) by evaluating
    column-by-column, dispatching to the Series implementation.

    Parameters
    ----------
    left : DataFrame
    right : scalar or DataFrame
    func : arithmetic or comparison operator
    axis : {None, 0, 1, "index", "columns"}

    Returns
    -------
    DataFrame
    """
    # Get the appropriate array-op to apply to each column/block's values.
    array_op = get_array_op(func)

    right = lib.item_from_zerodim(right)
    if lib.is_scalar(right) or np.ndim(right) == 0:
        bm = left._mgr.apply(array_op, right=right)
        return type(left)(bm)

    elif isinstance(right, ABCDataFrame):
        assert left.index.equals(right.index)
        assert left.columns.equals(right.columns)
        # TODO: The previous assertion `assert right._indexed_same(left)`
        #  fails in cases with empty columns reached via
        #  _frame_arith_method_with_reindex

        bm = left._mgr.operate_blockwise(right._mgr, array_op)
        return type(left)(bm)

    elif isinstance(right, ABCSeries) and axis == 1:
        # axis=1 means we want to operate row-by-row
        assert right.index.equals(left.columns)

        if right.dtype == "timedelta64[ns]":
            # ensure we treat NaT values as the correct dtype
            # Note: we do not do this unconditionally as it may be lossy or
            #  expensive for EA dtypes.
            right = np.asarray(right)
        else:
            right = right._values
            # maybe_align_as_frame ensures we do not have an ndarray here
            assert not isinstance(right, np.ndarray)

        arrays = [
            array_op(l, r) for l, r in zip(left._iter_column_arrays(), right)
        ]

    elif isinstance(right, ABCSeries):
        assert right.index.equals(left.index)  # Handle other cases later
        right = right._values

        arrays = [array_op(l, right) for l in left._iter_column_arrays()]

    else:
        # Remaining cases have less-obvious dispatch rules
        raise NotImplementedError(right)

    return type(left)._from_arrays(arrays, left.columns, left.index,
                                   verify_integrity=False)
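# Public-API view of the axis=1 branch above: frame-by-series with axis=1
# broadcasts the series across rows, matching the series index to the
# frame's columns.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [10, 20]})
s = pd.Series({"a": 1, "b": 100})
print(df.sub(s, axis=1))   # row-wise subtraction, aligned by column label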
def _validate_setitem_value(self, value):
    value = extract_array(value, extract_numpy=True)

    if not lib.is_scalar(value):
        value = np.asarray(value, dtype=self._ndarray.dtype)
    return value
def sanitize_array(
    data,
    index: Optional["Index"],
    dtype: Optional[DtypeObj] = None,
    copy: bool = False,
    raise_cast_failure: bool = False,
) -> ArrayLike:
    """
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.
    """

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if (dtype is not None and is_float_dtype(data.dtype)
                and is_integer_dtype(dtype)):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                if copy:
                    subarr = data.copy()
                else:
                    subarr = np.array(data, copy=False)
        else:
            # we will try to copy by-definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype="int64")
        subarr = _try_cast(arr, dtype, copy, raise_cast_failure)

    elif isinstance(data, abc.Set):
        raise TypeError("Set type is unordered")

    elif lib.is_scalar(data) and index is not None and dtype is not None:
        data = maybe_cast_to_datetime(data, dtype)
        if not lib.is_scalar(data):
            data = data[0]
        subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype)

    else:
        subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, "ndim", 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception("Data must be 1-dimensional")
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    if not (is_extension_array_dtype(subarr.dtype) or
            is_extension_array_dtype(dtype)):
        # This is to prevent mixed-type Series getting all casted to
        # NumPy string type, e.g. NaN --> '-1#IND'.
        if issubclass(subarr.dtype.type, str):
            # GH#16605
            # If not empty convert the data to dtype
            # GH#19853: If data is a scalar, subarr has already the result
            if not lib.is_scalar(data):
                if not np.all(isna(data)):
                    data = np.array(data, dtype=dtype, copy=False)
                subarr = np.array(data, dtype=object, copy=copy)

        if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype):
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred in {"interval", "period"}:
                subarr = array(subarr)

    return subarr
def __truediv__(self, other):
    # timedelta / X is well-defined for timedelta-like or numeric X
    if isinstance(other, self._recognized_scalars):
        other = Timedelta(other)
        # mypy assumes that __new__ returns an instance of the class
        # github.com/python/mypy/issues/1020
        if cast("Timedelta | NaTType", other) is NaT:
            # specifically timedelta64-NaT
            result = np.empty(self.shape, dtype=np.float64)
            result.fill(np.nan)
            return result

        # otherwise, dispatch to Timedelta implementation
        return self._ndarray / other

    elif lib.is_scalar(other):
        # assume it is numeric
        result = self._ndarray / other
        freq = None
        if self.freq is not None:
            # Tick division is not implemented, so operate on Timedelta
            freq = self.freq.delta / other
            freq = to_offset(freq)
        return type(self)._simple_new(result, dtype=result.dtype, freq=freq)

    if not hasattr(other, "dtype"):
        # e.g. list, tuple
        other = np.array(other)

    if len(other) != len(self):
        raise ValueError("Cannot divide vectors with unequal lengths")

    elif is_timedelta64_dtype(other.dtype):
        # let numpy handle it
        return self._ndarray / other

    elif is_object_dtype(other.dtype):
        # We operate on raveled arrays to avoid problems in inference
        #  on NaT
        # TODO: tests with non-nano
        srav = self.ravel()
        orav = other.ravel()
        result_list = [srav[n] / orav[n] for n in range(len(srav))]
        result = np.array(result_list).reshape(self.shape)

        # We need to do dtype inference in order to keep DataFrame ops
        #  behavior consistent with Series behavior
        inferred = lib.infer_dtype(result, skipna=False)
        if inferred == "timedelta":
            flat = result.ravel()
            result = type(self)._from_sequence(flat).reshape(result.shape)
        elif inferred == "floating":
            result = result.astype(float)
        elif inferred == "datetime":
            # GH#39750 this occurs when result is all-NaT, in which case
            #  we want to interpret these NaTs as td64.
            #  We construct an all-td64NaT result.
            # error: Incompatible types in assignment (expression has type
            # "TimedeltaArray", variable has type "ndarray[Any,
            # dtype[floating[_64Bit]]]")
            result = self * np.nan  # type: ignore[assignment]

        return result

    else:
        result = self._ndarray / other
        return type(self)._simple_new(result, dtype=result.dtype)
def array(data,         # type: Sequence[object]
          dtype=None,   # type: Optional[Union[str, np.dtype, ExtensionDtype]]
          copy=True,    # type: bool
          ):
    # type: (...) -> ExtensionArray
    """
    Create an array.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =====================================
        Scalar Type                    Array Type
        ============================== =====================================
        :class:`pandas.Interval`       :class:`pandas.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        ============================== =====================================

        For all other cases, NumPy's usual inference rules will be used.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.PandasArray` backed by a NumPy
    array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Or use the dedicated constructor for the array you're expecting, and
    wrap that in a PandasArray

    >>> pd.array(np.array(['a', 'b'], dtype='<U1'))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Examples
    --------
    If a dtype is not specified, `data` is passed through to
    :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned.

    >>> pd.array([1, 2])
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int64

    Or the NumPy dtype can be specified

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    Because omitting the `dtype` passes the data through to NumPy,
    a mixture of valid integers and NA will return a floating-point
    NumPy array.

    >>> pd.array([1, 2, np.nan])
    <PandasArray>
    [1.0, 2.0, nan]
    Length: 3, dtype: float64

    To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
    the dtype:

    >>> pd.array([1, 2, np.nan], dtype='Int64')
    <IntegerArray>
    [1, 2, NaN]
    Length: 3, dtype: Int64

    Pandas will infer an ExtensionArray for some types of data:

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    from pandas.core.arrays import (
        period_array, ExtensionArray, IntervalArray, PandasArray,
        DatetimeArrayMixin, TimedeltaArrayMixin,
    )
    from pandas.core.internals.arrays import extract_array

    if lib.is_scalar(data):
        msg = (
            "Cannot pass scalar '{}' to 'pandas.array'."
        )
        raise ValueError(msg.format(data))

    data = extract_array(data, extract_numpy=True)

    if dtype is None and isinstance(data, ExtensionArray):
        dtype = data.dtype

    # this returns None for not-found dtypes.
    if isinstance(dtype, compat.string_types):
        dtype = registry.find(dtype) or dtype

    if is_extension_array_dtype(dtype):
        cls = dtype.construct_array_type()
        return cls._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        inferred_dtype = lib.infer_dtype(data)
        if inferred_dtype == 'period':
            try:
                return period_array(data, copy=copy)
            except tslibs.IncompatibleFrequency:
                # We may have a mixture of frequencies.
                # We choose to return an ndarray, rather than raising.
                pass
        elif inferred_dtype == 'interval':
            try:
                return IntervalArray(data, copy=copy)
            except ValueError:
                # We may have a mixture of `closed` here.
                # We choose to return an ndarray, rather than raising.
                pass
        elif inferred_dtype.startswith('datetime'):
            # datetime, datetime64
            try:
                return DatetimeArrayMixin._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to PandasArray
                pass
        elif inferred_dtype.startswith('timedelta'):
            # timedelta, timedelta64
            return TimedeltaArrayMixin._from_sequence(data, copy=copy)

        # TODO(BooleanArray): handle this type

    result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)
    return result
def array(data: Sequence[object],
          dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None,
          copy: bool = True,
          ) -> ABCExtensionArray:
    """
    Create an array.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =====================================
        Scalar Type                    Array Type
        ============================== =====================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        ============================== =====================================

        For all other cases, NumPy's usual inference rules will be used.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.PandasArray` backed by a NumPy
    array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Or use the dedicated constructor for the array you're expecting, and
    wrap that in a PandasArray

    >>> pd.array(np.array(['a', 'b'], dtype='<U1'))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
    passed, pandas will always return a ``DatetimeArray`` or
    ``TimedeltaArray`` rather than a ``PandasArray``. This is for symmetry
    with the case of timezone-aware data, which NumPy does not natively
    support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['01:00:00', '02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    If a dtype is not specified, `data` is passed through to
    :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned.

    >>> pd.array([1, 2])
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int64

    Or the NumPy dtype can be specified

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    Because omitting the `dtype` passes the data through to NumPy,
    a mixture of valid integers and NA will return a floating-point
    NumPy array.

    >>> pd.array([1, 2, np.nan])
    <PandasArray>
    [1.0, 2.0, nan]
    Length: 3, dtype: float64

    To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify
    the dtype:

    >>> pd.array([1, 2, np.nan], dtype='Int64')
    <IntegerArray>
    [1, 2, NaN]
    Length: 3, dtype: Int64

    Pandas will infer an ExtensionArray for some types of data:

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    from pandas.core.arrays import (
        period_array, ExtensionArray, IntervalArray, PandasArray,
        DatetimeArray, TimedeltaArray,
    )
    from pandas.core.internals.arrays import extract_array

    if lib.is_scalar(data):
        msg = (
            "Cannot pass scalar '{}' to 'pandas.array'."
        )
        raise ValueError(msg.format(data))

    data = extract_array(data, extract_numpy=True)

    if dtype is None and isinstance(data, ExtensionArray):
        dtype = data.dtype

    # this returns None for not-found dtypes.
    if isinstance(dtype, str):
        dtype = registry.find(dtype) or dtype

    if is_extension_array_dtype(dtype):
        cls = dtype.construct_array_type()
        return cls._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        inferred_dtype = lib.infer_dtype(data, skipna=False)
        if inferred_dtype == 'period':
            try:
                return period_array(data, copy=copy)
            except tslibs.IncompatibleFrequency:
                # We may have a mixture of frequencies.
                # We choose to return an ndarray, rather than raising.
                pass
        elif inferred_dtype == 'interval':
            try:
                return IntervalArray(data, copy=copy)
            except ValueError:
                # We may have a mixture of `closed` here.
                # We choose to return an ndarray, rather than raising.
                pass
        elif inferred_dtype.startswith('datetime'):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to PandasArray
                pass
        elif inferred_dtype.startswith('timedelta'):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)

        # TODO(BooleanArray): handle this type

    # Pandas overrides NumPy for
    # 1. datetime64[ns]
    # 2. timedelta64[ns]
    # so that a DatetimeArray is returned.
    if is_datetime64_ns_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    elif is_timedelta64_ns_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)
    return result
def listify(x):
    if x is None:
        return []
    return [x] if is_scalar(x) else list(x)
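# Usage sketch; listify is redefined locally (with pandas' public
# is_scalar) so the snippet runs standalone.
from pandas.api.types import is_scalar

def listify(x):
    if x is None:
        return []
    return [x] if is_scalar(x) else list(x)

print(listify(None))     # []
print(listify(3))        # [3]
print(listify((1, 2)))   # [1, 2]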
def __truediv__(self, other):
    # timedelta / X is well-defined for timedelta-like or numeric X
    if isinstance(other, self._recognized_scalars):
        other = Timedelta(other)
        if other is NaT:
            # specifically timedelta64-NaT
            result = np.empty(self.shape, dtype=np.float64)
            result.fill(np.nan)
            return result

        # otherwise, dispatch to Timedelta implementation
        return self._ndarray / other

    elif lib.is_scalar(other):
        # assume it is numeric
        result = self._ndarray / other
        freq = None
        if self.freq is not None:
            # Tick division is not implemented, so operate on Timedelta
            freq = self.freq.delta / other
        return type(self)(result, freq=freq)

    if not hasattr(other, "dtype"):
        # e.g. list, tuple
        other = np.array(other)

    if len(other) != len(self):
        raise ValueError("Cannot divide vectors with unequal lengths")

    elif is_timedelta64_dtype(other.dtype):
        # let numpy handle it
        return self._ndarray / other

    elif is_object_dtype(other.dtype):
        # We operate on raveled arrays to avoid problems in inference
        #  on NaT
        srav = self.ravel()
        orav = other.ravel()
        result = [srav[n] / orav[n] for n in range(len(srav))]
        result = np.array(result).reshape(self.shape)

        # We need to do dtype inference in order to keep DataFrame ops
        #  behavior consistent with Series behavior
        inferred = lib.infer_dtype(result, skipna=False)
        if inferred == "timedelta":
            flat = result.ravel()
            result = type(self)._from_sequence(flat).reshape(result.shape)
        elif inferred == "floating":
            result = result.astype(float)
        elif inferred == "datetime":
            # GH#39750 this occurs when result is all-NaT, in which case
            #  we want to interpret these NaTs as td64.
            #  We construct an all-td64NaT result.
            result = self * np.nan

        return result

    else:
        result = self._ndarray / other
        return type(self)(result)
def array(
    data: Union[Sequence[object], AnyArrayLike],
    dtype: Optional[Dtype] = None,
    copy: bool = True,
) -> "ExtensionArray":
    """
    Create an array.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =====================================
        Scalar Type                    Array Type
        ============================== =====================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        :class:`int`                   :class:`pandas.arrays.IntegerArray`
        :class:`str`                   :class:`pandas.arrays.StringArray`
        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
        ============================== =====================================

        For all other cases, NumPy's usual inference rules will be used.

        .. versionchanged:: 1.0.0

           Pandas infers nullable-integer dtype for integer data,
           string dtype for string data, and nullable-boolean dtype
           for boolean data.

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete
    object rather than a string alias or allowing it to be inferred. For
    example, a future version of pandas or a 3rd-party library may
    include a dedicated ExtensionArray for string data. In this event,
    the following would no longer return a :class:`arrays.PandasArray`
    backed by a NumPy array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
    passed, pandas will always return a ``DatetimeArray`` or
    ``TimedeltaArray`` rather than a ``PandasArray``. This is for symmetry
    with the case of timezone-aware data, which NumPy does not natively
    support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['0 days 01:00:00', '0 days 02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    If a dtype is not specified, pandas will infer the best dtype from
    the values. See the description of `dtype` above for the types
    pandas infers.

    >>> pd.array([1, 2])
    <IntegerArray>
    [1, 2]
    Length: 2, dtype: Int64

    >>> pd.array([1, 2, np.nan])
    <IntegerArray>
    [1, 2, <NA>]
    Length: 3, dtype: Int64

    >>> pd.array(["a", None, "c"])
    <StringArray>
    ['a', <NA>, 'c']
    Length: 3, dtype: string

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    [a, b, a]
    Categories (2, object): [a, b]

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    [a, b, a]
    Categories (3, object): [a < b < c]

    If pandas does not infer a dedicated extension type a
    :class:`arrays.PandasArray` is returned.

    >>> pd.array([1.1, 2.2])
    <PandasArray>
    [1.1, 2.2]
    Length: 2, dtype: float64

    As mentioned in the "Notes" section, new extension types may be added
    in the future (by pandas or 3rd party libraries), causing the return
    value to no longer be a :class:`arrays.PandasArray`. Specify the
    `dtype` as a NumPy dtype if you need to ensure there's no future
    change in behavior.

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    from pandas.core.arrays import (
        period_array,
        BooleanArray,
        IntegerArray,
        IntervalArray,
        PandasArray,
        DatetimeArray,
        TimedeltaArray,
        StringArray,
    )

    if lib.is_scalar(data):
        msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
        raise ValueError(msg)

    if dtype is None and isinstance(
        data, (ABCSeries, ABCIndexClass, ABCExtensionArray)
    ):
        dtype = data.dtype

    data = extract_array(data, extract_numpy=True)

    # this returns None for not-found dtypes.
    if isinstance(dtype, str):
        dtype = registry.find(dtype) or dtype

    if is_extension_array_dtype(dtype):
        cls = cast(ExtensionDtype, dtype).construct_array_type()
        return cls._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        inferred_dtype = lib.infer_dtype(data, skipna=True)
        if inferred_dtype == "period":
            try:
                return period_array(data, copy=copy)
            except IncompatibleFrequency:
                # We may have a mixture of frequencies.
                # We choose to return an ndarray, rather than raising.
                pass
        elif inferred_dtype == "interval":
            try:
                return IntervalArray(data, copy=copy)
            except ValueError:
                # We may have a mixture of `closed` here.
                # We choose to return an ndarray, rather than raising.
                pass
        elif inferred_dtype.startswith("datetime"):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to PandasArray
                pass
        elif inferred_dtype.startswith("timedelta"):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)
        elif inferred_dtype == "string":
            return StringArray._from_sequence(data, copy=copy)
        elif inferred_dtype == "integer":
            return IntegerArray._from_sequence(data, copy=copy)
        elif inferred_dtype == "boolean":
            return BooleanArray._from_sequence(data, copy=copy)

    # Pandas overrides NumPy for
    # 1. datetime64[ns]
    # 2. timedelta64[ns]
    # so that a DatetimeArray or TimedeltaArray is returned.
    if is_datetime64_ns_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    elif is_timedelta64_ns_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)
    return result
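# Usage sketch: shows the string-alias path in ``array`` above, where
# ``registry.find`` resolves "Int64" to an ExtensionDtype and dispatches
# to ``IntegerArray._from_sequence``. A hedged illustration, not part of
# the original module.
import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")
type(arr).__name__   # 'IntegerArray'
arr                  # <IntegerArray> [1, 2, <NA>] Length: 3, dtype: Int64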
def nanpercentile(
    values: np.ndarray,
    q,
    axis: int,
    na_value,
    mask: np.ndarray,
    ndim: int,
    interpolation,
):
    """
    Wrapper for np.percentile that skips missing values.

    Parameters
    ----------
    values : array over which to find quantiles
    q : scalar or array of quantile indices to find
    axis : {0, 1}
    na_value : scalar
        value to return for empty or all-null values
    mask : ndarray[bool]
        locations in values that should be considered missing
    ndim : {1, 2}
    interpolation : str

    Returns
    -------
    quantiles : scalar or array
    """
    if values.dtype.kind in ["m", "M"]:
        # need to cast to integer to avoid rounding errors in numpy
        result = nanpercentile(
            values.view("i8"), q, axis, na_value.view("i8"), mask, ndim,
            interpolation,
        )

        # Note: we have to do `astype` and not view because in general we
        #  have float result at this point, not i8
        return result.astype(values.dtype)

    if not lib.is_scalar(mask) and mask.any():
        if ndim == 1:
            return _nanpercentile_1d(
                values, mask, q, na_value, interpolation=interpolation
            )
        else:
            # for nonconsolidatable blocks mask is 1D, but values are 2D
            if mask.ndim < values.ndim:
                mask = mask.reshape(values.shape)
            if axis == 0:
                values = values.T
                mask = mask.T
            result = [
                _nanpercentile_1d(val, m, q, na_value,
                                  interpolation=interpolation)
                for (val, m) in zip(list(values), list(mask))
            ]
            result = np.array(result, dtype=values.dtype, copy=False).T
            return result
    else:
        return np.percentile(values, q, axis=axis, interpolation=interpolation)
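# Usage sketch: the 1-D path of ``nanpercentile`` reduces to dropping the
# masked entries and delegating to ``np.percentile``. A hedged, minimal
# reproduction in plain numpy (the default interpolation is linear,
# matching the common call in the code above).
import numpy as np

values = np.array([1.0, np.nan, 3.0, 4.0])
mask = np.isnan(values)

np.percentile(values[~mask], 50)  # 3.0 -- median of the unmasked values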