def test_decimals(self):
    # GH15690
    # Each entry pairs the raw values with the dtype name expected from
    # inference on the resulting ndarray.
    cases = [
        ([Decimal(1), Decimal(2), Decimal(3)], 'decimal'),
        ([1.0, 2.0, Decimal(3)], 'mixed'),
    ]
    for values, expected in cases:
        inferred = lib.infer_dtype(np.array(values))
        assert inferred == expected
def test_unicode(self):
    # With default arguments the NaN makes the list 'mixed'; skipping
    # missing values leaves only text ('unicode' on PY2, 'string' otherwise).
    scenarios = [
        ({}, 'mixed'),
        ({'skipna': True}, 'unicode' if PY2 else 'string'),
    ]
    for kwargs, expected in scenarios:
        values = [u'a', np.nan, u'c']
        inferred = lib.infer_dtype(values, **kwargs)
        assert inferred == expected
def test_object(self):
    # GH 7431
    # A lone None cannot be inferred any further: 'mixed' when missing
    # values are kept, 'empty' when they are skipped.
    single_none = np.array([None], dtype='O')
    for skipna, expected in ((False, 'mixed'), (True, 'empty')):
        inferred = lib.infer_dtype(single_none, skipna=skipna)
        assert inferred == expected
def test_unicode(self):
    # Keeping the NaN gives 'mixed'; skipping it leaves only strings.
    for skipna, expected in ((False, 'mixed'), (True, 'string')):
        values = ['a', np.nan, 'c']
        inferred = lib.infer_dtype(values, skipna=skipna)
        assert inferred == expected
def test_infer_dtype_bytes(self):
    expected = 'string' if PY2 else 'bytes'

    # The same data is checked as a fixed-width bytes array and as an
    # object array holding bytes.
    fixed_width = np.array(list('abc'), dtype='S1')
    as_object = fixed_width.astype(object)
    for candidate in (fixed_width, as_object):
        assert lib.infer_dtype(candidate) == expected
def test_date(self):
    days = [date(2012, 1, d) for d in range(1, 20)]
    assert Index(days).inferred_type == 'date'

    # Appending a NaN makes the list 'mixed' unless missing values are
    # skipped.
    days_with_nan = [date(2012, 1, d) for d in range(1, 20)] + [np.nan]
    assert lib.infer_dtype(days_with_nan) == 'mixed'
    assert lib.infer_dtype(days_with_nan, skipna=True) == 'date'
def test_length_zero(self):
    # A typed empty array keeps its dtype; an untyped empty list is 'empty'.
    assert lib.infer_dtype(np.array([], dtype='i4')) == 'integer'
    assert lib.infer_dtype([]) == 'empty'

    # GH 18004
    # An array built from empty object arrays is also 'empty'.
    nested_empty = np.array([np.array([], dtype=object),
                             np.array([], dtype=object)])
    assert lib.infer_dtype(nested_empty) == 'empty'
def test_integers(self):
    # (values, array dtype, expected inferred type)
    cases = [
        ([1, 2, 3, np.int64(4), np.int32(5)], 'O', 'integer'),
        ([1, 2, 3, np.int64(4), np.int32(5), 'foo'], 'O', 'mixed-integer'),
        ([1, 2, 3, 4, 5], 'i4', 'integer'),
    ]
    for values, dtype, expected in cases:
        inferred = lib.infer_dtype(np.array(values, dtype=dtype))
        assert inferred == expected
def test_infer_dtype_bytes(self):
    expected = 'string' if PY2 else 'bytes'

    # Fixed-width bytes array and its object-dtype counterpart.
    fixed_width = np.array(list('abc'), dtype='S1')
    as_object = fixed_width.astype(object)
    for candidate in (fixed_width, as_object):
        assert lib.infer_dtype(candidate) == expected

    # Bytes with a missing value in between, skipping the missing value.
    with_missing = [b'a', np.nan, b'c']
    assert lib.infer_dtype(with_missing, skipna=True) == expected
def is_datetime_arraylike(arr):
    """
    Determine whether ``arr`` is a DatetimeIndex or a datetime array-like.

    Parameters
    ----------
    arr : array-like
        The object to inspect.

    Returns
    -------
    boolean
        True for a DatetimeIndex. For an ndarray or Series, whether it has
        object dtype and its values are inferred as 'datetime'. For anything
        else, whether its ``inferred_type`` attribute equals 'datetime'.

    Examples
    --------
    >>> is_datetime_arraylike([1, 2, 3])
    False
    >>> is_datetime_arraylike(pd.Index([1, 2, 3]))
    False
    >>> is_datetime_arraylike(pd.DatetimeIndex([1, 2, 3]))
    True
    """
    if isinstance(arr, ABCDatetimeIndex):
        return True

    if isinstance(arr, (np.ndarray, ABCSeries)):
        # Value inference only runs when the dtype comparison is truthy.
        holds_objects = arr.dtype == object
        return holds_objects and lib.infer_dtype(arr) == 'datetime'

    inferred = getattr(arr, 'inferred_type', None)
    return inferred == 'datetime'
def test_bools(self):
    # (values, array dtype, expected inferred type)
    cases = [
        ([True, False, True, True, True], 'O', 'boolean'),
        ([np.bool_(True), np.bool_(False)], 'O', 'boolean'),
        ([True, False, True, 'foo'], 'O', 'mixed'),
        ([True, False, True], bool, 'boolean'),
    ]
    for values, dtype, expected in cases:
        inferred = lib.infer_dtype(np.array(values, dtype=dtype))
        assert inferred == expected
def test_deprecation(self):
    # GH 24050
    values = np.array([1, 2, 3], dtype=object)

    # Leaving skipna at its default (None) is what should emit the
    # FutureWarning.
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        inferred = lib.infer_dtype(values)
        assert inferred == 'integer'
def is_period_arraylike(arr):
    """ Report whether ``arr`` is a PeriodIndex or a period array-like. """
    if isinstance(arr, ABCPeriodIndex):
        return True

    if not isinstance(arr, (np.ndarray, ABCSeries)):
        # Fall back to whatever the object itself reports, if anything.
        return getattr(arr, 'inferred_type', None) == 'period'

    holds_objects = arr.dtype == object
    return holds_objects and lib.infer_dtype(arr) == 'period'
def is_datetime_arraylike(arr):
    """ Report whether ``arr`` is a DatetimeIndex or a datetime array-like. """
    if isinstance(arr, ABCDatetimeIndex):
        return True

    if not isinstance(arr, (np.ndarray, ABCSeries)):
        # Fall back to whatever the object itself reports, if anything.
        return getattr(arr, 'inferred_type', None) == 'datetime'

    holds_objects = arr.dtype == object
    return holds_objects and lib.infer_dtype(arr) == 'datetime'
def _convert_bin_to_numeric_type(bins, dtype):
    """
    Convert datetime/timedelta bins to their int64 representation.

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Returns
    -------
    The bins viewed as int64 when ``dtype`` is timedelta64 or datetime64,
    otherwise ``bins`` unchanged.

    Raises
    ------
    ValueError if bins are not of a compat dtype to dtype
    """
    inferred = infer_dtype(bins)

    if is_timedelta64_dtype(dtype):
        if inferred not in ('timedelta', 'timedelta64'):
            raise ValueError("bins must be of timedelta64 dtype")
        return to_timedelta(bins).view(np.int64)

    if is_datetime64_dtype(dtype):
        if inferred not in ('datetime', 'datetime64'):
            raise ValueError("bins must be of datetime64 dtype")
        return to_datetime(bins).view(np.int64)

    return bins
def test_object(self):
    # GH 7431
    # A single None gives inference nothing more specific to go on.
    single_none = np.array([None], dtype='O')
    inferred = lib.infer_dtype(single_none)
    self.assertEqual(inferred, 'mixed')
def is_period_arraylike(arr):
    """
    Determine whether ``arr`` is a PeriodIndex or a period array-like.

    Parameters
    ----------
    arr : array-like
        The object to inspect.

    Returns
    -------
    boolean
        True for a PeriodIndex. For an ndarray or Series, whether it has
        object dtype and its values are inferred as 'period'. For anything
        else, whether its ``inferred_type`` attribute equals 'period'.

    Examples
    --------
    >>> is_period_arraylike([1, 2, 3])
    False
    >>> is_period_arraylike(pd.Index([1, 2, 3]))
    False
    >>> is_period_arraylike(pd.PeriodIndex(["2017-01-01"], freq="D"))
    True
    """
    if isinstance(arr, ABCPeriodIndex):
        return True

    if isinstance(arr, (np.ndarray, ABCSeries)):
        # Value inference only runs when the dtype comparison is truthy.
        holds_objects = arr.dtype == object
        return holds_objects and lib.infer_dtype(arr) == 'period'

    inferred = getattr(arr, 'inferred_type', None)
    return inferred == 'period'
def test_bools(self):
    # (values, array dtype, expected inferred type)
    cases = [
        ([True, False, True, True, True], 'O', 'boolean'),
        ([np.bool_(True), np.bool_(False)], 'O', 'boolean'),
        ([True, False, True, 'foo'], 'O', 'mixed'),
        ([True, False, True], bool, 'boolean'),
    ]
    for values, dtype, expected in cases:
        inferred = lib.infer_dtype(np.array(values, dtype=dtype))
        self.assertEqual(inferred, expected)
def test_categorical(self):
    # GH 8974
    from pandas import Categorical, Series

    categoricals = (
        Categorical(list('abc')),
        Categorical(list('abc'), categories=['cegfab'], ordered=True),
    )
    # Both the bare Categorical and a Series wrapping it must be
    # reported as 'categorical'.
    for cat in categoricals:
        for candidate in (cat, Series(cat)):
            assert lib.infer_dtype(candidate) == 'categorical'
def test_decimals(self):
    # GH15690
    # Every array is inferred with skipna=True, so neither Decimal('NaN')
    # nor a float NaN should change the 'decimal' result.
    cases = [
        (np.array([Decimal(1), Decimal(2), Decimal(3)]), 'decimal'),
        (np.array([1.0, 2.0, Decimal(3)]), 'mixed'),
        (np.array([Decimal(1), Decimal('NaN'), Decimal(3)]), 'decimal'),
        (np.array([Decimal(1), np.nan, Decimal(3)], dtype='O'), 'decimal'),
    ]
    for arr, expected in cases:
        inferred = lib.infer_dtype(arr, skipna=True)
        assert inferred == expected
def test_floats(self):
    # (values, array dtype, expected inferred type)
    cases = [
        ([1., 2., 3., np.float64(4), np.float32(5)], 'O', 'floating'),
        ([1, 2, 3, np.float64(4), np.float32(5), 'foo'], 'O',
         'mixed-integer'),
        ([1, 2, 3, 4, 5], 'f4', 'floating'),
        ([1, 2, 3, 4, 5], 'f8', 'floating'),
    ]
    for values, dtype, expected in cases:
        inferred = lib.infer_dtype(np.array(values, dtype=dtype))
        assert inferred == expected
def infer_dtype_from_array(arr, pandas_dtype=False):
    """
    Infer the dtype of a scalar or array without forcing a lossy coercion.

    Parameters
    ----------
    arr : scalar or array
    pandas_dtype : bool, default False
        Whether to infer the dtype including pandas extension types.
        If False, an array belonging to a pandas extension type is
        inferred as object.

    Returns
    -------
    tuple (numpy-compat/pandas-compat dtype, array)

    Notes
    -----
    If pandas_dtype=False, these infer to numpy dtypes exactly, with the
    exception that mixed / object dtypes are not coerced by stringifying
    or conversion.

    If pandas_dtype=True, datetime64tz-aware/categorical types will
    retain their character.

    Examples
    --------
    >>> np.asarray([1, '1'])
    array(['1', '1'], dtype='<U21')

    >>> infer_dtype_from_array([1, '1'])
    (numpy.object_, [1, '1'])
    """
    # ndarrays already carry the answer.
    if isinstance(arr, np.ndarray):
        return arr.dtype, arr

    # Wrap scalars so the remaining logic only deals with list-likes.
    if not is_list_like(arr):
        arr = [arr]

    if pandas_dtype and is_extension_type(arr):
        return arr.dtype, arr

    if isinstance(arr, ABCSeries):
        return arr.dtype, np.asarray(arr)

    # Text-like and mixed content stays as-is under object dtype rather
    # than being coerced by numpy.
    keep_as_object = ('string', 'bytes', 'unicode', 'mixed',
                      'mixed-integer')
    if lib.infer_dtype(arr) in keep_as_object:
        return (np.object_, arr)

    converted = np.asarray(arr)
    return converted.dtype, converted
def _convert_bin_to_numeric_type(x):
    """
    Return datetime/timedelta bins viewed as int64; other input is
    returned unchanged.
    """
    inferred = infer_dtype(x)

    if inferred in ('timedelta', 'timedelta64'):
        return to_timedelta(x).view(np.int64)

    if inferred in ('datetime', 'datetime64'):
        return to_datetime(x).view(np.int64)

    return x
def test_infer_dtype_period(self):
    # GH 13664
    def check(values, expected, **array_kwargs):
        arr = np.array(values, **array_kwargs)
        assert lib.infer_dtype(arr) == expected

    # Same frequency, then differing frequencies.
    check([pd.Period('2011-01', freq='D'),
           pd.Period('2011-02', freq='D')], 'period')
    check([pd.Period('2011-01', freq='D'),
           pd.Period('2011-02', freq='M')], 'period')

    # starts with nan
    for n in [pd.NaT, np.nan]:
        check([n, pd.Period('2011-01', freq='D')], 'period')
        check([n, pd.Period('2011-01', freq='D'), n], 'period')

    # different type of nat
    check([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
          'mixed', dtype=object)
    check([pd.Period('2011-01', freq='M'), np.datetime64('nat')],
          'mixed', dtype=object)
def test_complex(self, skipna):
    # (values, array dtype or None for numpy's default, expected type)
    cases = [
        # gets cast to complex on array construction
        ([1.0, 2.0, 1 + 1j], None, 'complex'),
        ([1.0, 2.0, 1 + 1j], 'O', 'mixed'),
        # gets cast to complex on array construction
        ([1, np.nan, 1 + 1j], None, 'complex'),
        ([1.0, np.nan, 1 + 1j], 'O', 'mixed'),
        # complex with nans stays complex
        ([1 + 1j, np.nan, 3 + 3j], 'O', 'complex'),
        # test smaller complex dtype; will pass through _try_infer_map
        # fastpath
        ([1 + 1j, np.nan, 3 + 3j], np.complex64, 'complex'),
    ]
    for values, dtype, expected in cases:
        arr = np.array(values, dtype=dtype)
        inferred = lib.infer_dtype(arr, skipna=skipna)
        assert inferred == expected
def _infer_fill_value(val):
    """
    Pick the missing-value marker matching a scalar/ndarray/list-like.

    Datetime-like input yields a NaT array of the same dtype; object input
    inferred as datetime or timedelta yields a NaT array of the matching
    nanosecond dtype, so block construction gets a correctly typed element.
    Everything else falls back to ``np.nan``.
    """
    if not is_list_like(val):
        val = [val]
    val = np.array(val, copy=False)

    if is_datetimelike(val):
        return np.array('NaT', dtype=val.dtype)

    if is_object_dtype(val.dtype):
        inferred = lib.infer_dtype(ensure_object(val), skipna=False)
        if inferred in ('datetime', 'datetime64'):
            return np.array('NaT', dtype=_NS_DTYPE)
        if inferred in ('timedelta', 'timedelta64'):
            return np.array('NaT', dtype=_TD_DTYPE)

    return np.nan
def _convert_1d(values, units, axis):
    # Convert ``values`` to date values at the axis frequency. ``units`` is
    # accepted but not used here.
    if not hasattr(axis, 'freq'):
        raise TypeError('Axis must have `freq` set to convert to Periods')

    scalar_types = (str, datetime, Period, pydt.date, pydt.time,
                    np.datetime64)
    is_scalar_like = (isinstance(values, scalar_types) or
                      is_integer(values) or
                      is_float(values))
    if is_scalar_like:
        return get_datevalue(values, axis.freq)

    if isinstance(values, PeriodIndex):
        return values.asfreq(axis.freq)._ndarray_values

    if isinstance(values, Index):
        return values.map(lambda x: get_datevalue(x, axis.freq))

    if lib.infer_dtype(values, skipna=False) == 'period':
        # https://github.com/pandas-dev/pandas/issues/24304
        # convert ndarray[period] -> PeriodIndex
        return PeriodIndex(values, freq=axis.freq)._ndarray_values

    if isinstance(values, (list, tuple, np.ndarray, Index)):
        return [get_datevalue(item, axis.freq) for item in values]

    # Anything unrecognised is handed back untouched.
    return values
def infer_dtype_from_array(arr):
    """
    Infer the dtype of a scalar or array without forcing a lossy coercion.

    Parameters
    ----------
    arr : scalar or array

    Returns
    -------
    tuple (numpy-compat dtype, array)

    Notes
    -----
    The result matches numpy's dtypes exactly, with the exception that
    mixed / object dtypes are not coerced by stringifying or conversion.

    Examples
    --------
    >>> np.asarray([1, '1'])
    array(['1', '1'], dtype='<U21')

    >>> infer_dtype_from_array([1, '1'])
    (numpy.object_, [1, '1'])
    """
    # ndarrays already carry the answer.
    if isinstance(arr, np.ndarray):
        return arr.dtype, arr

    # Wrap scalars so inference always sees a list-like.
    if not is_list_like(arr):
        arr = [arr]

    # Text-like and mixed content stays as-is under object dtype rather
    # than being coerced by numpy.
    keep_as_object = ('string', 'bytes', 'unicode', 'mixed',
                      'mixed-integer')
    if lib.infer_dtype(arr) in keep_as_object:
        return (np.object_, arr)

    converted = np.asarray(arr)
    return converted.dtype, converted
def test_infer_dtype_datetime(self):
    def check(values, expected, skipna=True, **array_kwargs):
        arr = np.array(values, **array_kwargs)
        assert lib.infer_dtype(arr, skipna=skipna) == expected

    check([Timestamp('2011-01-01'), Timestamp('2011-01-02')], 'datetime')
    check([np.datetime64('2011-01-01'), np.datetime64('2011-01-01')],
          'datetime64', dtype=object)
    check([datetime(2011, 1, 1), datetime(2012, 2, 1)], 'datetime')

    # starts with nan
    for n in [pd.NaT, np.nan]:
        check([n, pd.Timestamp('2011-01-02')], 'datetime')
        check([n, np.datetime64('2011-01-02')], 'datetime64')
        check([n, datetime(2011, 1, 1)], 'datetime')

        check([n, pd.Timestamp('2011-01-02'), n], 'datetime')
        check([n, np.datetime64('2011-01-02'), n], 'datetime64')
        check([n, datetime(2011, 1, 1), n], 'datetime')

    # different type of nat
    check([np.timedelta64('nat'), np.datetime64('2011-01-02')],
          'mixed', skipna=False, dtype=object)
    check([np.datetime64('2011-01-02'), np.timedelta64('nat')],
          'mixed', skipna=False, dtype=object)

    # mixed datetime
    check([datetime(2011, 1, 1), pd.Timestamp('2011-01-02')], 'datetime')

    # should be datetime?
    check([np.datetime64('2011-01-01'), pd.Timestamp('2011-01-02')],
          'mixed')
    check([pd.Timestamp('2011-01-02'), np.datetime64('2011-01-01')],
          'mixed')

    check([np.nan, pd.Timestamp('2011-01-02'), 1], 'mixed-integer')
    check([np.nan, pd.Timestamp('2011-01-02'), 1.1], 'mixed')
    check([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')], 'mixed')
def __truediv__(self, other):
    """
    Divide this timedelta array by ``other``.

    The result depends on what ``other`` is:

    * a recognized timedelta-like scalar: the underlying ndarray divided by
      the ``Timedelta``; if that scalar is NaT, a float64 array of NaN with
      this array's shape
    * any other scalar (assumed numeric): a new array of this type, with
      the frequency divided as well when one is set
    * an array-like: it must have the same length; timedelta64 dtype is
      delegated to numpy, object dtype is divided element by element and
      the result dtype is then inferred, anything else is divided through
      the underlying ndarray and re-wrapped in this type

    Raises
    ------
    ValueError
        If ``other`` is array-like and its length differs from ``len(self)``.
    """
    # timedelta / X is well-defined for timedelta-like or numeric X

    if isinstance(other, self._recognized_scalars):
        other = Timedelta(other)
        # mypy assumes that __new__ returns an instance of the class
        # github.com/python/mypy/issues/1020
        if cast("Timedelta | NaTType", other) is NaT:
            # specifically timedelta64-NaT
            result = np.empty(self.shape, dtype=np.float64)
            result.fill(np.nan)
            return result

        # otherwise, dispatch to Timedelta implementation
        return self._ndarray / other

    elif lib.is_scalar(other):
        # assume it is numeric
        result = self._ndarray / other
        freq = None
        if self.freq is not None:
            # Tick division is not implemented, so operate on Timedelta
            freq = self.freq.delta / other
            freq = to_offset(freq)
        return type(self)._simple_new(result, dtype=result.dtype, freq=freq)

    if not hasattr(other, "dtype"):
        # e.g. list, tuple
        other = np.array(other)

    if len(other) != len(self):
        raise ValueError("Cannot divide vectors with unequal lengths")

    elif is_timedelta64_dtype(other.dtype):
        # let numpy handle it
        return self._ndarray / other

    elif is_object_dtype(other.dtype):
        # We operate on raveled arrays to avoid problems in inference
        #  on NaT
        # TODO: tests with non-nano
        srav = self.ravel()
        orav = other.ravel()
        # Element-wise division on the flattened data, restored to the
        # original shape afterwards.
        result_list = [srav[n] / orav[n] for n in range(len(srav))]
        result = np.array(result_list).reshape(self.shape)

        # We need to do dtype inference in order to keep DataFrame ops
        #  behavior consistent with Series behavior
        inferred = lib.infer_dtype(result, skipna=False)
        if inferred == "timedelta":
            flat = result.ravel()
            result = type(self)._from_sequence(flat).reshape(result.shape)
        elif inferred == "floating":
            result = result.astype(float)
        elif inferred == "datetime":
            # GH#39750 this occurs when result is all-NaT, in which case
            #  we want to interpret these NaTs as td64.
            #  We construct an all-td64NaT result.
            # error: Incompatible types in assignment (expression has type
            # "TimedeltaArray", variable has type "ndarray[Any,
            # dtype[floating[_64Bit]]]")
            result = self * np.nan  # type: ignore[assignment]

        # Any other inferred type leaves the object ndarray as computed.
        return result

    else:
        result = self._ndarray / other
        return type(self)._simple_new(result, dtype=result.dtype)
def convert_dtypes(
    input_array,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
) -> Dtype:
    """
    Convert objects to best possible type, and optionally,
    to types supporting ``pd.NA``.

    Only the target dtype is computed; ``input_array`` itself is not
    converted here.

    Parameters
    ----------
    input_array : ExtensionArray or PandasArray
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, defaults True
        Whether object dtypes should be converted to ``BooleanDtypes()``.

    Returns
    -------
    dtype
        new dtype. This is the array's existing dtype when the array already
        has an extension dtype or when all three flags are False.
    """
    is_extension = is_extension_array_dtype(input_array.dtype)
    if (convert_string or convert_integer or convert_boolean) and not is_extension:
        try:
            inferred_dtype = lib.infer_dtype(input_array)
        except ValueError:
            # Required to catch due to Period.  Can remove once GH 23553 is fixed
            inferred_dtype = input_array.dtype

        # Each disabled flag below resets the candidate to the original
        # dtype when inference produced the kind that flag controls.
        if not convert_string and is_string_dtype(inferred_dtype):
            inferred_dtype = input_array.dtype

        if convert_integer:
            target_int_dtype = "Int64"

            if is_integer_dtype(input_array.dtype):
                from pandas.core.arrays.integer import _dtypes

                # Look up the extension dtype registered under the numpy
                # dtype's name, defaulting to "Int64".
                inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype)
            # NOTE(review): every non-integer numeric dtype (e.g. float) is
            # mapped to "Int64" here without inspecting the values - confirm
            # this is intended.
            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
                input_array.dtype
            ):
                inferred_dtype = target_int_dtype

        else:
            if is_integer_dtype(inferred_dtype):
                inferred_dtype = input_array.dtype

        if convert_boolean:
            if is_bool_dtype(input_array.dtype):
                inferred_dtype = "boolean"
        else:
            if isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
                inferred_dtype = input_array.dtype

    else:
        inferred_dtype = input_array.dtype

    return inferred_dtype
def test_infer_dtype_all_nan_nat_like(self):
    def check(values, expected, **array_kwargs):
        arr = np.array(values, **array_kwargs)
        assert lib.infer_dtype(arr) == expected

    check([np.nan, np.nan], 'floating')

    # nan and None mix are result in mixed
    check([np.nan, np.nan, None], 'mixed')
    check([None, np.nan, np.nan], 'mixed')

    # pd.NaT
    nat_combinations = (
        [pd.NaT],
        [pd.NaT, np.nan],
        [np.nan, pd.NaT],
        [np.nan, pd.NaT, np.nan],
        [None, pd.NaT, None],
    )
    for values in nat_combinations:
        check(values, 'datetime')

    # np.datetime64(nat)
    check([np.datetime64('nat')], 'datetime64')
    for n in [np.nan, pd.NaT, None]:
        check([n, np.datetime64('nat'), n], 'datetime64')
        check([pd.NaT, n, np.datetime64('nat'), n], 'datetime64')

    check([np.timedelta64('nat')], 'timedelta', dtype=object)
    for n in [np.nan, pd.NaT, None]:
        check([n, np.timedelta64('nat'), n], 'timedelta')
        check([pd.NaT, n, np.timedelta64('nat'), n], 'timedelta')

    # datetime / timedelta mixed
    check([pd.NaT, np.datetime64('nat'), np.timedelta64('nat'), np.nan],
          'mixed')
    check([np.timedelta64('nat'), np.datetime64('nat')],
          'mixed', dtype=object)
def test_length_zero(self):
    # A typed empty array keeps its dtype; an untyped empty list is 'empty'.
    typed_empty = np.array([], dtype='i4')
    self.assertEqual(lib.infer_dtype(typed_empty), 'integer')

    untyped_empty = []
    self.assertEqual(lib.infer_dtype(untyped_empty), 'empty')
def test_infer_dtype_timedelta(self):
    def check(values, expected, **array_kwargs):
        arr = np.array(values, **array_kwargs)
        self.assertEqual(lib.infer_dtype(arr), expected)

    check([pd.Timedelta('1 days'), pd.Timedelta('2 days')], 'timedelta')
    check([np.timedelta64(1, 'D'), np.timedelta64(2, 'D')],
          'timedelta', dtype=object)
    check([timedelta(1), timedelta(2)], 'timedelta')

    # starts with nan
    for n in [pd.NaT, np.nan]:
        check([n, Timedelta('1 days')], 'timedelta')
        check([n, np.timedelta64(1, 'D')], 'timedelta')
        check([n, timedelta(1)], 'timedelta')

        check([n, pd.Timedelta('1 days'), n], 'timedelta')
        check([n, np.timedelta64(1, 'D'), n], 'timedelta')
        check([n, timedelta(1), n], 'timedelta')

    # different type of nat
    check([np.datetime64('nat'), np.timedelta64(1, 'D')],
          'mixed', dtype=object)
    check([np.timedelta64(1, 'D'), np.datetime64('nat')],
          'mixed', dtype=object)
def maybe_downcast_to_dtype(result, dtype):
    """
    Try to cast ``result`` to the specified dtype (e.g. convert back to
    bool/int, or an astype of float64->float32).

    Parameters
    ----------
    result : scalar, DataFrame or array-like
        Scalars and DataFrames are returned untouched.
    dtype : str or dtype
        Target dtype. The string "infer" picks a target from the inferred
        type of the values.

    Returns
    -------
    The converted object when a conversion applied, otherwise ``result``.
    """
    do_round = False

    if is_scalar(result):
        return result
    elif isinstance(result, ABCDataFrame):
        # occurs in pivot_table doctest
        return result

    if isinstance(dtype, str):
        if dtype == "infer":
            # Map the inferred kind of the flattened values to a concrete
            # dtype string.
            inferred_type = lib.infer_dtype(ensure_object(result.ravel()), skipna=False)
            if inferred_type == "boolean":
                dtype = "bool"
            elif inferred_type == "integer":
                dtype = "int64"
            elif inferred_type == "datetime64":
                dtype = "datetime64[ns]"
            elif inferred_type == "timedelta64":
                dtype = "timedelta64[ns]"

            # try to upcast here
            elif inferred_type == "floating":
                dtype = "int64"
                # Rounding is requested only when the array itself has a
                # numpy numeric dtype.
                if issubclass(result.dtype.type, np.number):
                    do_round = True

            else:
                dtype = "object"

        dtype = np.dtype(dtype)

    # A different object coming back means the numeric downcast applied.
    converted = maybe_downcast_numeric(result, dtype, do_round)
    if converted is not result:
        return converted

    # a datetimelike
    # GH12821, iNaT is casted to float
    if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
        if hasattr(dtype, "tz"):
            # not a numpy dtype
            if dtype.tz:
                # convert to datetime and change timezone
                from pandas import to_datetime

                result = to_datetime(result).tz_localize("utc")
                result = result.tz_convert(dtype.tz)
        else:
            result = result.astype(dtype)

    elif dtype.type is Period:
        # TODO(DatetimeArray): merge with previous elif
        from pandas.core.arrays import PeriodArray

        try:
            return PeriodArray(result, freq=dtype.freq)
        except TypeError:
            # e.g. TypeError: int() argument must be a string, a
            #  bytes-like object or a number, not 'Period'
            # Deliberately best-effort: fall through and return the
            # unconverted result.
            pass

    return result
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs):
    """
    Run the grouped operation ``how`` on ``values`` via a cython function.

    Parameters
    ----------
    kind : str
        Either "transform" or "aggregate" (asserted).
    values : array-like
        Data to operate on.
    how : str
        Name of the operation, used to look up the cython function.
    axis : int
        Axis of ``values`` to operate along; must be 0 or 1 for 2D input.
    min_count : int, default -1
        Forwarded to the aggregation only.
    **kwargs
        Forwarded to the transform only.

    Returns
    -------
    tuple of (result, names)
        ``names`` comes from ``self._name_functions[how]`` or is None.

    Raises
    ------
    NotImplementedError
        For categorical or sparse values, for operations that are not
        supported on datetime64/timedelta64 values, for 2D values with
        arity greater than 1, or when no cython function is available.
    """
    assert kind in ["transform", "aggregate"]
    orig_values = values

    # can we do this operation with our cython functions
    # if not raise NotImplementedError

    # we raise NotImplemented if this is an invalid operation
    # entirely, e.g. adding datetimes

    # categoricals are only 1d, so we
    # are not setup for dim transforming
    if is_categorical_dtype(values) or is_sparse(values):
        raise NotImplementedError("{} dtype not supported".format(
            values.dtype))
    elif is_datetime64_any_dtype(values):
        if how in ["add", "prod", "cumsum", "cumprod"]:
            raise NotImplementedError(
                "datetime64 type does not support {} operations".format(
                    how))
    elif is_timedelta64_dtype(values):
        if how in ["prod", "cumprod"]:
            raise NotImplementedError(
                "timedelta64 type does not support {} operations".format(
                    how))

    if is_datetime64tz_dtype(values.dtype):
        # Cast to naive; we'll cast back at the end of the function
        # TODO: possible need to reshape?  kludge can be avoided when
        #  2D EA is allowed.
        values = values.view("M8[ns]")

    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    # Normalise the input dtype to one the cython functions accept.
    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_float64(values)
    elif is_integer_dtype(values):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int_or_float(values)
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    arity = self._cython_arity.get(how, 1)

    vdim = values.ndim
    swapped = False
    if vdim == 1:
        # Work on a 2D column; the extra axis is dropped again before
        # returning when arity is 1.
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            swapped = True
            assert axis == 1, axis
            values = values.T
        if arity > 1:
            raise NotImplementedError(
                "arity of more than 1 is not supported for the 'how' argument"
            )
        out_shape = (self.ngroups, ) + values.shape[1:]

    try:
        func = self._get_cython_function(kind, how, values, is_numeric)
    except NotImplementedError:
        if is_numeric:
            # Retry the lookup once after casting to float64, or to complex
            # when the values are inferred as complex.
            try:
                values = ensure_float64(values)
            except TypeError:
                if lib.infer_dtype(values, skipna=False) == "complex":
                    values = values.astype(complex)
                else:
                    raise
            func = self._get_cython_function(kind, how, values, is_numeric)
        else:
            raise

    if how == "rank":
        out_dtype = "float"
    else:
        if is_numeric:
            # Same kind and item size as the (possibly converted) input.
            out_dtype = "{kind}{itemsize}".format(
                kind=values.dtype.kind, itemsize=values.dtype.itemsize)
        else:
            out_dtype = "object"

    labels, _, _ = self.group_info

    if kind == "aggregate":
        result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                             fill_value=np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(result, counts, values, labels, func,
                                 is_datetimelike, min_count)
    elif kind == "transform":
        result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                             fill_value=np.nan)

        # TODO: min_count
        result = self._transform(result, values, labels, func,
                                 is_datetimelike, **kwargs)

    if is_integer_dtype(result) and not is_datetimelike:
        # iNaT marks missing entries in integer results; expose them as NaN,
        # which requires a float result.
        mask = result == iNaT
        if mask.any():
            result = result.astype("float64")
            result[mask] = np.nan

    if kind == "aggregate" and self._filter_empty_groups and not counts.all(
    ):
        # Drop the rows of groups whose count is zero.
        assert result.ndim != 2
        result = result[counts > 0]

    if vdim == 1 and arity == 1:
        result = result[:, 0]

    if how in self._name_functions:
        names = self._name_functions[how]()
    else:
        names = None

    if swapped:
        result = result.swapaxes(0, axis)

    # Restore the original dtype for tz-aware input and for datetimelike
    # aggregations.
    if is_datetime64tz_dtype(orig_values.dtype):
        result = type(orig_values)(result.astype(np.int64),
                                   dtype=orig_values.dtype)
    elif is_datetimelike and kind == "aggregate":
        result = result.astype(orig_values.dtype)

    return result, names
def coerce_to_array(values, dtype, mask=None, copy=False):
    """
    Coerce the input values array to numpy arrays with a mask

    Parameters
    ----------
    values : 1D list-like
    dtype : integer dtype
        May be None, in which case the dtype of an integer ``values`` array
        is preserved and int64 is used otherwise.
    mask : boolean 1D array, optional
        When omitted, the mask is computed from the missing values.
    copy : boolean, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    ValueError
        If ``dtype`` does not correspond to a known integer dtype.
    TypeError
        If the values cannot be converted to an integer dtype, or if
        ``values`` or ``mask`` is not one-dimensional.
    """
    # if values is integer numpy array, preserve it's dtype
    if dtype is None and hasattr(values, 'dtype'):
        if is_integer_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if (isinstance(dtype, string_types) and
                (dtype.startswith("Int") or dtype.startswith("UInt"))):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()
        if not issubclass(type(dtype), _IntegerDtype):
            try:
                dtype = _dtypes[str(np.dtype(dtype))]
            except KeyError:
                raise ValueError("invalid dtype specified {}".format(dtype))

    if isinstance(values, IntegerArray):
        # Already split into data and mask; only cast/copy as requested.
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    values = np.array(values, copy=copy)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values)
        # Compare by value: an identity check against a string literal
        # depends on string interning, so an all-NA object array could
        # wrongly fall through to the TypeError below.
        if inferred_type == 'mixed' and isna(values).all():
            values = np.empty(len(values))
            values.fill(np.nan)
        elif inferred_type not in ['floating', 'integer',
                                   'mixed-integer', 'mixed-integer-float']:
            raise TypeError("{} cannot be converted to an IntegerDtype".format(
                values.dtype))

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError("{} cannot be converted to an IntegerDtype".format(
            values.dtype))

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)

    if not values.ndim == 1:
        raise TypeError("values must be a 1D list-like")
    if not mask.ndim == 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype('int64')
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    if mask.any():
        # Masked slots get a placeholder value of 1 so that the cast
        # below never sees NaN.
        values = values.copy()
        values[mask] = 1
        values = safe_cast(values, dtype, copy=False)
    else:
        values = safe_cast(values, dtype, copy=False)

    return values, mask
def coerce_to_array(
    values, mask=None, copy: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)
    """
    if isinstance(values, BooleanArray):
        # A BooleanArray already carries its own mask.
        if mask is not None:
            raise ValueError("cannot pass mask for BooleanArray input")
        data, na_mask = values._data, values._mask
        if copy:
            data = data.copy()
            na_mask = na_mask.copy()
        return data, na_mask

    # Mask derived from missing entries found in `values` (if any).
    inferred_mask = None
    is_ndarray = isinstance(values, np.ndarray)

    if is_ndarray and values.dtype == np.bool_:
        if copy:
            values = values.copy()
    elif is_ndarray and is_numeric_dtype(values.dtype):
        inferred_mask = isna(values)
        valid = ~inferred_mask
        as_bool = np.zeros(len(values), dtype=bool)
        as_bool[valid] = values[valid].astype(bool)
        # Round-trip through bool must reproduce the valid entries.
        roundtrip_ok = np.all(
            as_bool[valid].astype(values.dtype) == values[valid]
        )
        if not roundtrip_ok:
            raise TypeError("Need to pass bool-like values")
        values = as_bool
    else:
        as_object = np.asarray(values, dtype=object)
        inferred_dtype = lib.infer_dtype(as_object, skipna=True)
        integer_like = ("floating", "integer", "mixed-integer-float")
        if inferred_dtype not in ("boolean", "empty") + integer_like:
            raise TypeError("Need to pass bool-like values")

        inferred_mask = isna(as_object)
        valid = ~inferred_mask
        values = np.zeros(len(values), dtype=bool)
        values[valid] = as_object[valid].astype(bool)

        # integer-like input is only acceptable when it consists of 0/1's
        if inferred_dtype in integer_like:
            roundtrip_ok = np.all(
                values[valid].astype(float) == as_object[valid].astype(float)
            )
            if not roundtrip_ok:
                raise TypeError("Need to pass bool-like values")

    if mask is None:
        if inferred_mask is None:
            mask = np.zeros(len(values), dtype=bool)
        else:
            mask = inferred_mask
    else:
        is_bool_ndarray = isinstance(mask, np.ndarray) and mask.dtype == np.bool_
        if not is_bool_ndarray:
            mask = np.array(mask, dtype=bool)
        elif inferred_mask is None and copy:
            mask = mask.copy()
        if inferred_mask is not None:
            # combine the user mask with the missing values found above
            mask = mask | inferred_mask

    if values.ndim != 1:
        raise ValueError("values must be a 1D list-like")
    if mask.ndim != 1:
        raise ValueError("mask must be a 1D list-like")

    return values, mask
def test_object_empty(self, box, missing, dtype, skipna, expected):
    # GH 23421
    # A container holding nothing but two copies of the missing marker.
    container = box([missing] * 2, dtype=dtype)
    assert lib.infer_dtype(container, skipna=skipna) == expected
def __floordiv__(self, other):
    """
    Floor-divide this timedelta array by `other`.

    Returns NotImplemented for Series / DataFrame / Index operands.
    Timedelta-like scalars and timedelta64 arrays give a numeric result
    (NaN where either side is missing); numeric scalars and numeric
    arrays give a new array of the same type as `self`.

    Raises
    ------
    ValueError
        If an array-like `other` has a different length than `self`.
    TypeError
        If `other` has a dtype that cannot be divided into a timedelta.
    """
    if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
        return NotImplemented

    other = lib.item_from_zerodim(other)
    if is_scalar(other):
        if isinstance(other, (timedelta, np.timedelta64, Tick)):
            other = Timedelta(other)
            if other is NaT:
                # treat this specifically as timedelta-NaT
                return np.full(self.shape, np.nan, dtype=np.float64)

            # dispatch to Timedelta implementation
            return other.__rfloordiv__(self._data)

        # at this point we should only have numeric scalars; anything
        # else will raise
        result = self.asi8 // other
        result[self._isnan] = iNaT
        freq = None
        if self.freq is not None:
            # Note: freq gets division, not floor-division
            freq = self.freq / other
        return type(self)(result.view('m8[ns]'), freq=freq)

    if not hasattr(other, "dtype"):
        # list, tuple
        other = np.array(other)
    if len(other) != len(self):
        raise ValueError("Cannot divide with unequal lengths")

    elif is_timedelta64_dtype(other):
        other = type(self)(other)

        # numpy timedelta64 does not natively support floordiv, so operate
        # on the i8 values
        result = self.asi8 // other.asi8
        mask = self._isnan | other._isnan
        if mask.any():
            # NaN cannot be stored in an integer array (the assignment
            # raises ValueError), so widen to float before masking
            result = result.astype(np.float64)
            result[mask] = np.nan
        return result

    elif is_object_dtype(other):
        # element-wise fallback for object-dtype operands
        result = [self[n] // other[n] for n in range(len(self))]
        result = np.array(result)
        if lib.infer_dtype(result, skipna=False) == 'timedelta':
            result, _ = sequence_to_td64ns(result)
            return type(self)(result)
        return result

    elif is_integer_dtype(other) or is_float_dtype(other):
        result = self._data // other
        return type(self)(result)

    else:
        dtype = getattr(other, "dtype", type(other).__name__)
        raise TypeError("Cannot divide {typ} by {cls}"
                        .format(typ=dtype, cls=type(self).__name__))
def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
            periods=None, tz=None, dtype=None, copy=False, name=None,
            **fields):
    """
    Construct a new index from `data`, from `ordinal`, or from a range.

    When `data` is None the values come from `ordinal` if given, otherwise
    from ``cls._generate_range(start, end, periods, freq, fields)``.
    Otherwise `data` may be a PeriodIndex, datetime64 data, or an iterable
    that is converted to ordinals.

    Raises
    ------
    TypeError
        For unexpected keyword arguments in `fields`, a non-numeric
        `periods`, or floating point data.
    ValueError
        If `dtype` is given but is not a period dtype.
    IncompatibleFrequency
        If both `freq` and `dtype` are given and their frequencies differ.
    """
    # only date-field keywords may be passed through **fields
    valid_field_set = {'year', 'month', 'day', 'quarter',
                       'hour', 'minute', 'second'}

    if not set(fields).issubset(valid_field_set):
        raise TypeError('__new__() got an unexpected keyword argument {}'.
                        format(list(set(fields) - valid_field_set)[0]))

    if periods is not None:
        # float periods are truncated to int; anything else non-integer
        # is rejected
        if is_float(periods):
            periods = int(periods)
        elif not is_integer(periods):
            msg = 'periods must be a number, got {periods}'
            raise TypeError(msg.format(periods=periods))

    # inherit the name from the data when none was given explicitly
    if name is None and hasattr(data, 'name'):
        name = data.name

    if dtype is not None:
        dtype = pandas_dtype(dtype)
        if not is_period_dtype(dtype):
            raise ValueError('dtype must be PeriodDtype')
        if freq is None:
            freq = dtype.freq
        elif freq != dtype.freq:
            msg = 'specified freq and dtype are different'
            raise IncompatibleFrequency(msg)

    # coerce freq to freq object, otherwise it can be coerced elementwise
    # which is slow
    if freq:
        freq = Period._maybe_convert_freq(freq)

    if data is None:
        # no data: build from explicit ordinals or from a generated range
        if ordinal is not None:
            data = np.asarray(ordinal, dtype=np.int64)
        else:
            data, freq = cls._generate_range(start, end, periods,
                                             freq, fields)
        return cls._from_ordinals(data, name=name, freq=freq)

    if isinstance(data, PeriodIndex):
        if freq is None or freq == data.freq:  # no freq change
            freq = data.freq
            data = data._ndarray_values
        else:
            # convert the existing ordinals to the requested frequency
            base1, _ = _gfc(data.freq)
            base2, _ = _gfc(freq)
            data = period.period_asfreq_arr(data._ndarray_values,
                                            base1, base2, 1)
        return cls._simple_new(data, name=name, freq=freq)

    # not array / index
    if not isinstance(data, (np.ndarray, PeriodIndex,
                             DatetimeIndex, Int64Index)):
        if is_scalar(data) or isinstance(data, Period):
            cls._scalar_data_error(data)

        # other iterable of some kind
        if not isinstance(data, (list, tuple)):
            data = list(data)

        data = np.asarray(data)

    # datetime other than period
    if is_datetime64_dtype(data.dtype):
        data = dt64arr_to_periodarr(data, freq, tz)
        return cls._from_ordinals(data, name=name, freq=freq)

    # check not floats
    if infer_dtype(data) == 'floating' and len(data) > 0:
        raise TypeError("PeriodIndex does not allow "
                        "floating point in construction")

    # anything else, likely an array of strings or periods
    data = _ensure_object(data)
    # fall back to a frequency extracted from the data when none was given
    freq = freq or period.extract_freq(data)
    data = period.extract_ordinals(data, freq)
    return cls._from_ordinals(data, name=name, freq=freq)
def __floordiv__(self, other):
    """
    Floor-divide this timedelta array by a scalar or an array-like.

    Timedelta-like scalars and timedelta64 arrays produce a numeric
    result (NaN where either operand is missing); numeric scalars and
    numeric arrays produce a new array of the same type as `self`.
    """
    if is_scalar(other):
        if not isinstance(other, self._recognized_scalars):
            # only numeric scalars are expected here; anything else will
            # raise inside the division itself
            quotient = self._ndarray // other
            new_freq = None
            if self.freq is not None:
                # Note: freq gets true division, not floor-division
                new_freq = self.freq / other
                if new_freq.nanos == 0 and self.freq.nanos != 0:
                    # e.g. if self.freq is Nano(1) then dividing by 2
                    # rounds down to zero
                    new_freq = None
            return type(self)(quotient, freq=new_freq)

        other = Timedelta(other)
        if other is NaT:
            # treat this specifically as timedelta-NaT
            return np.full(self.shape, np.nan, dtype=np.float64)

        # dispatch to the Timedelta implementation
        return other.__rfloordiv__(self._ndarray)

    if not hasattr(other, "dtype"):
        # list, tuple
        other = np.array(other)

    if len(other) != len(self):
        raise ValueError("Cannot divide with unequal lengths")

    if is_timedelta64_dtype(other.dtype):
        other = type(self)(other)

        # numpy timedelta64 does not natively support floordiv, so work
        # on the i8 values instead
        quotient = self.asi8 // other.asi8
        na_positions = self._isnan | other._isnan
        if na_positions.any():
            quotient = quotient.astype(np.float64)
            np.putmask(quotient, na_positions, np.nan)
        return quotient

    if is_object_dtype(other.dtype):
        # element-wise division over the flattened operands
        self_flat = self.ravel()
        other_flat = other.ravel()
        pieces = [
            self_flat[pos] // other_flat[pos] for pos in range(len(self_flat))
        ]
        flat_result = np.asarray(pieces)
        inferred = lib.infer_dtype(flat_result, skipna=False)

        shaped = flat_result.reshape(self.shape)

        if inferred == "timedelta":
            shaped, _ = sequence_to_td64ns(shaped)
            return type(self)(shaped)
        if inferred == "datetime":
            # GH#39750 occurs when result is all-NaT, which in this
            # case should be interpreted as td64nat. This can only
            # occur when self is all-td64nat
            return self * np.nan
        return shaped

    if is_integer_dtype(other.dtype) or is_float_dtype(other.dtype):
        return type(self)(self._ndarray // other)

    dtype = getattr(other, "dtype", type(other).__name__)
    raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}")
def maybe_infer_to_datetimelike(value, convert_dates=False):
    """
    Try to convert object-dtype input to a datetime-like result.

    We might have an array (or single object) that is datetime-like, and
    no dtype is passed. The value is not changed unless a
    datetime/timedelta set is found. This is pretty strict in that a
    datetime/timedelta is REQUIRED in addition to possible nulls/string
    likes.

    Parameters
    ----------
    value : np.array / Series / Index / list-like
    convert_dates : boolean, default False
       if True try really hard to convert dates (such as datetime.date),
       otherwise leave inferred dtype 'date' alone

    Returns
    -------
    The converted values when a conversion applied, otherwise `value`
    (or, for a Series backed by a DatetimeIndex, its ``_values``).
    """

    # TODO: why not timedelta?
    if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex,
                          ABCDatetimeArray, ABCPeriodArray)):
        return value
    elif isinstance(value, ABCSeries):
        if isinstance(value._values, ABCDatetimeIndex):
            return value._values

    v = value

    if not is_list_like(v):
        v = [v]
    v = np.array(v, copy=False)

    # we only care about object dtypes
    if not is_object_dtype(v):
        return value

    # remember the original shape; inference below works on a 1D view
    shape = v.shape
    if not v.ndim == 1:
        v = v.ravel()

    if not len(v):
        return value

    def try_datetime(v):
        # safe coerce to datetime64; on failure the input is returned
        # reshaped to the original shape
        try:
            # GH19671
            v = tslib.array_to_datetime(v,
                                        require_iso8601=True,
                                        errors="raise")[0]
        except ValueError:

            # we might have a sequence of the same-datetimes with tz's
            # if so coerce to a DatetimeIndex; if they are not the same,
            # then these stay as object dtype, xref GH19671
            try:
                from pandas._libs.tslibs import conversion
                from pandas import DatetimeIndex

                values, tz = conversion.datetime_to_datetime64(v)
                return DatetimeIndex(values).tz_localize(
                    "UTC").tz_convert(tz=tz)
            except (ValueError, TypeError):
                pass

        except Exception:
            # deliberate best-effort: any other failure leaves v unchanged
            pass

        return v.reshape(shape)

    def try_timedelta(v):
        # safe coerce to timedelta64

        # will try first with a string & object conversion
        from pandas import to_timedelta
        try:
            return to_timedelta(v)._ndarray_values.reshape(shape)
        except ValueError:
            return v.reshape(shape)

    inferred_type = lib.infer_datetimelike_array(ensure_object(v))

    if inferred_type == "date" and convert_dates:
        value = try_datetime(v)
    elif inferred_type == "datetime":
        value = try_datetime(v)
    elif inferred_type == "timedelta":
        value = try_timedelta(v)
    elif inferred_type == "nat":

        # if all NaT, return as datetime
        if isna(v).all():
            value = try_datetime(v)
        else:

            # We have at least a NaT and a string
            # try timedelta first to avoid spurious datetime conversions
            # e.g. '00:00:01' is a timedelta but technically is also a datetime
            value = try_timedelta(v)
            if lib.infer_dtype(value, skipna=False) in ["mixed"]:
                # cannot skip missing values, as NaT implies that the string
                # is actually a datetime
                value = try_datetime(v)

    return value
def array(
    data: Sequence[object] | AnyArrayLike,
    dtype: Dtype | None = None,
    copy: bool = True,
) -> ExtensionArray:
    """
    Create an array.

    Parameters
    ----------
    data : Sequence of objects
        The scalars inside `data` should be instances of the
        scalar type for `dtype`. It's expected that `data`
        represents a 1-dimensional array of data.

        When `data` is an Index or Series, the underlying array
        will be extracted from `data`.

    dtype : str, np.dtype, or ExtensionDtype, optional
        The dtype to use for the array. This may be a NumPy
        dtype or an extension type registered with pandas using
        :meth:`pandas.api.extensions.register_extension_dtype`.

        If not specified, there are two possibilities:

        1. When `data` is a :class:`Series`, :class:`Index`, or
           :class:`ExtensionArray`, the `dtype` will be taken
           from the data.
        2. Otherwise, pandas will attempt to infer the `dtype`
           from the data.

        Note that when `data` is a NumPy array, ``data.dtype`` is
        *not* used for inferring the array type. This is because
        NumPy cannot represent all the types of data that can be
        held in extension arrays.

        Currently, pandas will infer an extension dtype for sequences of

        ============================== =======================================
        Scalar Type                    Array Type
        ============================== =======================================
        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
        :class:`int`                   :class:`pandas.arrays.IntegerArray`
        :class:`float`                 :class:`pandas.arrays.FloatingArray`
        :class:`str`                   :class:`pandas.arrays.StringArray` or
                                       :class:`pandas.arrays.ArrowStringArray`
        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
        ============================== =======================================

        The ExtensionArray created when the scalar type is :class:`str` is
        determined by ``pd.options.mode.string_storage`` if the dtype is not
        explicitly given.

        For all other cases, NumPy's usual inference rules will be used.

        .. versionchanged:: 1.0.0

           Pandas infers nullable-integer dtype for integer data,
           string dtype for string data, and nullable-boolean dtype
           for boolean data.

        .. versionchanged:: 1.2.0

            Pandas now also infers nullable-floating dtype for float-like
            input data

    copy : bool, default True
        Whether to copy the data, even if not necessary. Depending
        on the type of `data`, creating the new array may require
        copying data, even if ``copy=False``.

    Returns
    -------
    ExtensionArray
        The newly created array.

    Raises
    ------
    ValueError
        When `data` is not 1-dimensional.

    See Also
    --------
    numpy.array : Construct a NumPy array.
    Series : Construct a pandas Series.
    Index : Construct a pandas Index.
    arrays.PandasArray : ExtensionArray wrapping a NumPy array.
    Series.array : Extract the array stored within a Series.

    Notes
    -----
    Omitting the `dtype` argument means pandas will attempt to infer the
    best array type from the values in the data. As new array types are
    added by pandas and 3rd party libraries, the "best" array type may
    change. We recommend specifying `dtype` to ensure that

    1. the correct array type for the data is returned
    2. the returned array type doesn't change as new extension types
       are added by pandas and third-party libraries

    Additionally, if the underlying memory representation of the returned
    array matters, we recommend specifying the `dtype` as a concrete object
    rather than a string alias or allowing it to be inferred. For example,
    a future version of pandas or a 3rd-party library may include a
    dedicated ExtensionArray for string data. In this event, the following
    would no longer return a :class:`arrays.PandasArray` backed by a NumPy
    array.

    >>> pd.array(['a', 'b'], dtype=str)
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    This would instead return the new ExtensionArray dedicated for string
    data. If you really need the new array to be backed by a  NumPy array,
    specify that in the dtype.

    >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
    <PandasArray>
    ['a', 'b']
    Length: 2, dtype: str32

    Finally, Pandas has arrays that mostly overlap with NumPy

      * :class:`arrays.DatetimeArray`
      * :class:`arrays.TimedeltaArray`

    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
    passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
    rather than a ``PandasArray``. This is for symmetry with the case of
    timezone-aware data, which NumPy does not natively support.

    >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
    <DatetimeArray>
    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
    Length: 2, dtype: datetime64[ns]

    >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
    <TimedeltaArray>
    ['0 days 01:00:00', '0 days 02:00:00']
    Length: 2, dtype: timedelta64[ns]

    Examples
    --------
    If a dtype is not specified, pandas will infer the best dtype from the values.
    See the description of `dtype` for the types pandas infers for.

    >>> pd.array([1, 2])
    <IntegerArray>
    [1, 2]
    Length: 2, dtype: Int64

    >>> pd.array([1, 2, np.nan])
    <IntegerArray>
    [1, 2, <NA>]
    Length: 3, dtype: Int64

    >>> pd.array([1.1, 2.2])
    <FloatingArray>
    [1.1, 2.2]
    Length: 2, dtype: Float64

    >>> pd.array(["a", None, "c"])
    <StringArray>
    ['a', <NA>, 'c']
    Length: 3, dtype: string

    >>> with pd.option_context("string_storage", "pyarrow"):
    ...     arr = pd.array(["a", None, "c"])
    ...
    >>> arr
    <ArrowStringArray>
    ['a', <NA>, 'c']
    Length: 3, dtype: string

    >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
    <PeriodArray>
    ['2000-01-01', '2000-01-01']
    Length: 2, dtype: period[D]

    You can use the string alias for `dtype`

    >>> pd.array(['a', 'b', 'a'], dtype='category')
    ['a', 'b', 'a']
    Categories (2, object): ['a', 'b']

    Or specify the actual dtype

    >>> pd.array(['a', 'b', 'a'],
    ...          dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
    ['a', 'b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']

    If pandas does not infer a dedicated extension type a
    :class:`arrays.PandasArray` is returned.

    >>> pd.array([1 + 1j, 3 + 2j])
    <PandasArray>
    [(1+1j), (3+2j)]
    Length: 2, dtype: complex128

    As mentioned in the "Notes" section, new extension types may be added
    in the future (by pandas or 3rd party libraries), causing the return
    value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype`
    as a NumPy dtype if you need to ensure there's no future change in
    behavior.

    >>> pd.array([1, 2], dtype=np.dtype("int32"))
    <PandasArray>
    [1, 2]
    Length: 2, dtype: int32

    `data` must be 1-dimensional. A ValueError is raised when the input
    has the wrong dimensionality.

    >>> pd.array(1)
    Traceback (most recent call last):
      ...
    ValueError: Cannot pass scalar '1' to 'pandas.array'.
    """
    from pandas.core.arrays import (
        BooleanArray,
        DatetimeArray,
        ExtensionArray,
        FloatingArray,
        IntegerArray,
        IntervalArray,
        PandasArray,
        PeriodArray,
        TimedeltaArray,
    )
    from pandas.core.arrays.string_ import StringDtype

    if lib.is_scalar(data):
        raise ValueError(f"Cannot pass scalar '{data}' to 'pandas.array'.")

    # Note: np.ndarray is deliberately excluded here; type inference is
    # run on it further below.
    takes_dtype_from_data = isinstance(data, (ABCSeries, ABCIndex, ExtensionArray))
    if dtype is None and takes_dtype_from_data:
        dtype = data.dtype

    data = extract_array(data, extract_numpy=True)

    if isinstance(dtype, str):
        # registry.find returns None for not-found dtypes, in which case
        # the string alias is kept as-is.
        dtype = registry.find(dtype) or dtype

    if is_extension_array_dtype(dtype):
        array_type = cast(ExtensionDtype, dtype).construct_array_type()
        return array_type._from_sequence(data, dtype=dtype, copy=copy)

    if dtype is None:
        kind = lib.infer_dtype(data, skipna=True)

        if kind == "period":
            return PeriodArray._from_sequence(data, copy=copy)

        if kind == "interval":
            return IntervalArray(data, copy=copy)

        if kind.startswith("datetime"):
            # datetime, datetime64
            try:
                return DatetimeArray._from_sequence(data, copy=copy)
            except ValueError:
                # Mixture of timezones, fall back to PandasArray
                pass

        if kind.startswith("timedelta"):
            # timedelta, timedelta64
            return TimedeltaArray._from_sequence(data, copy=copy)

        if kind == "string":
            # StringArray/ArrowStringArray depending on
            # pd.options.mode.string_storage
            string_array_type = StringDtype().construct_array_type()
            return string_array_type._from_sequence(data, copy=copy)

        if kind == "integer":
            return IntegerArray._from_sequence(data, copy=copy)

        if kind in ("floating", "mixed-integer-float"):
            return FloatingArray._from_sequence(data, copy=copy)

        if kind == "boolean":
            return BooleanArray._from_sequence(data, copy=copy)

    # Pandas overrides NumPy for
    #   1. datetime64[ns]
    #   2. timedelta64[ns]
    # so that a DatetimeArray is returned.
    if is_datetime64_ns_dtype(dtype):
        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
    if is_timedelta64_ns_dtype(dtype):
        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)

    return PandasArray._from_sequence(data, dtype=dtype, copy=copy)
def test_infer_dtype_all_nan_nat_like(self):
    def check(values, expected, **array_kwargs):
        # Build the array exactly as specified and compare the inference.
        arr = np.array(values, **array_kwargs)
        self.assertEqual(lib.infer_dtype(arr), expected)

    check([np.nan, np.nan], 'floating')

    # a mix of nan and None results in mixed
    check([np.nan, np.nan, None], 'mixed')
    check([None, np.nan, np.nan], 'mixed')

    # pd.NaT
    check([pd.NaT], 'datetime')
    check([pd.NaT, np.nan], 'datetime')
    check([np.nan, pd.NaT], 'datetime')
    check([np.nan, pd.NaT, np.nan], 'datetime')
    check([None, pd.NaT, None], 'datetime')

    # np.datetime64(nat)
    check([np.datetime64('nat')], 'datetime64')
    for n in [np.nan, pd.NaT, None]:
        check([n, np.datetime64('nat'), n], 'datetime64')
        check([pd.NaT, n, np.datetime64('nat'), n], 'datetime64')

    # np.timedelta64(nat)
    check([np.timedelta64('nat')], 'timedelta', dtype=object)
    for n in [np.nan, pd.NaT, None]:
        check([n, np.timedelta64('nat'), n], 'timedelta')
        check([pd.NaT, n, np.timedelta64('nat'), n], 'timedelta')

    # datetime / timedelta mixed
    check([pd.NaT, np.datetime64('nat'), np.timedelta64('nat'), np.nan],
          'mixed')
    check([np.timedelta64('nat'), np.datetime64('nat')], 'mixed',
          dtype=object)
def maybe_downcast_to_dtype(result, dtype):
    """
    Try to cast `result` to the specified dtype (e.g. convert back to
    bool/int, or an astype of float64->float32).

    Parameters
    ----------
    result : scalar or array-like
        Scalars are returned unchanged.
    dtype : str, np.dtype or other dtype object
        The string 'infer' picks a target dtype from the inferred type
        of the values.

    Returns
    -------
    The downcast values when the cast is lossless, otherwise `result`
    unchanged. Any exception raised while attempting the cast is
    swallowed and `result` is returned as-is.
    """

    if is_scalar(result):
        return result

    def trans(x):
        return x

    if isinstance(dtype, string_types):
        if dtype == 'infer':
            inferred_type = lib.infer_dtype(ensure_object(result.ravel()),
                                            skipna=False)
            if inferred_type == 'boolean':
                dtype = 'bool'
            elif inferred_type == 'integer':
                dtype = 'int64'
            elif inferred_type == 'datetime64':
                dtype = 'datetime64[ns]'
            elif inferred_type == 'timedelta64':
                dtype = 'timedelta64[ns]'

            # try to upcast here
            elif inferred_type == 'floating':
                dtype = 'int64'
                if issubclass(result.dtype.type, np.number):

                    def trans(x):  # noqa
                        return x.round()
            else:
                dtype = 'object'

    if isinstance(dtype, string_types):
        dtype = np.dtype(dtype)

    try:

        # don't allow upcasts here (except if empty)
        if dtype.kind == result.dtype.kind:
            if (result.dtype.itemsize <= dtype.itemsize and
                    np.prod(result.shape)):
                return result

        if is_bool_dtype(dtype) or is_integer_dtype(dtype):

            # if we don't have any elements, just astype it
            if not np.prod(result.shape):
                return trans(result).astype(dtype)

            # do a test on the first element, if it fails then we are done
            r = result.ravel()
            arr = np.array([r[0]])

            # if we have any nulls, then we are done
            if (isna(arr).any() or
                    not np.allclose(arr, trans(arr).astype(dtype), rtol=0)):
                return result

            # a comparable, e.g. a Decimal may slip in here
            # NOTE: the deprecated ``np.bool`` alias (identical to builtin
            # ``bool`` when this was written) is intentionally not listed:
            # on NumPy releases where the alias is removed, the
            # AttributeError would be swallowed by the outer ``except`` and
            # silently disable downcasting.
            elif not isinstance(r[0], (np.integer, np.floating, int,
                                       float, bool)):
                return result

            if (issubclass(result.dtype.type, (np.object_, np.number)) and
                    notna(result).all()):
                new_result = trans(result).astype(dtype)
                try:
                    if np.allclose(new_result, result, rtol=0):
                        return new_result
                except Exception:

                    # comparison of an object dtype with a number type could
                    # hit here
                    if (new_result == result).all():
                        return new_result
        elif (issubclass(dtype.type, np.floating) and
                not is_bool_dtype(result.dtype)):
            return result.astype(dtype)

        # a datetimelike
        # GH12821, iNaT is casted to float
        elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']:
            try:
                result = result.astype(dtype)
            except Exception:
                if dtype.tz:
                    # convert to datetime and change timezone
                    from pandas import to_datetime
                    result = to_datetime(result).tz_localize('utc')
                    result = result.tz_convert(dtype.tz)

        elif dtype.type == Period:
            # TODO(DatetimeArray): merge with previous elif
            from pandas.core.arrays import PeriodArray

            return PeriodArray(result, freq=dtype.freq)

    except Exception:
        # deliberate best-effort: a failed downcast returns the input
        pass

    return result
def test_infer_dtype_datetime(self):
    def check(values, expected, **array_kwargs):
        # Build the array exactly as specified and compare the inference.
        arr = np.array(values, **array_kwargs)
        self.assertEqual(lib.infer_dtype(arr), expected)

    check([Timestamp('2011-01-01'), Timestamp('2011-01-02')], 'datetime')
    check([np.datetime64('2011-01-01'), np.datetime64('2011-01-01')],
          'datetime64', dtype=object)
    check([datetime(2011, 1, 1), datetime(2012, 2, 1)], 'datetime')

    # starts with nan
    for n in [pd.NaT, np.nan]:
        check([n, pd.Timestamp('2011-01-02')], 'datetime')
        check([n, np.datetime64('2011-01-02')], 'datetime64')
        check([n, datetime(2011, 1, 1)], 'datetime')
        check([n, pd.Timestamp('2011-01-02'), n], 'datetime')
        check([n, np.datetime64('2011-01-02'), n], 'datetime64')
        check([n, datetime(2011, 1, 1), n], 'datetime')

    # different type of nat
    check([np.timedelta64('nat'), np.datetime64('2011-01-02')],
          'mixed', dtype=object)
    check([np.datetime64('2011-01-02'), np.timedelta64('nat')],
          'mixed', dtype=object)

    # mixed datetime
    check([datetime(2011, 1, 1), pd.Timestamp('2011-01-02')], 'datetime')

    # should be datetime?
    check([np.datetime64('2011-01-01'), pd.Timestamp('2011-01-02')],
          'mixed')
    check([pd.Timestamp('2011-01-02'), np.datetime64('2011-01-01')],
          'mixed')

    check([np.nan, pd.Timestamp('2011-01-02'), 1], 'mixed-integer')
    check([np.nan, pd.Timestamp('2011-01-02'), 1.1], 'mixed')
    check([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')], 'mixed')
def sanitize_array(
    data,
    index: Index | None,
    dtype: DtypeObj | None = None,
    copy: bool = False,
    raise_cast_failure: bool = True,
) -> ArrayLike:
    """
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.

    Parameters
    ----------
    data : Any
    index : Index or None, default None
        Required when `data` is not list-like; its length determines the
        length of the constructed result.
    dtype : np.dtype, ExtensionDtype, or None, default None
    copy : bool, default False
    raise_cast_failure : bool, default True

    Returns
    -------
    np.ndarray or ExtensionArray

    Raises
    ------
    ValueError
        If `data` is not list-like and `index` is None.
    TypeError
        If `data` is a set or frozenset (unordered).

    Notes
    -----
    raise_cast_failure=False is only intended to be used when called from
    the DataFrame constructor, as the dtype keyword there may be
    interpreted as only applying to a subset of columns, see GH#24435.
    """
    if isinstance(data, ma.MaskedArray):
        data = sanitize_masked_array(data)

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    if isinstance(data, np.ndarray) and data.ndim == 0:
        # zero-dimensional array: unwrap to a scalar, keeping its dtype
        # unless one was requested explicitly
        if dtype is None:
            dtype = data.dtype
        data = lib.item_from_zerodim(data)
    elif isinstance(data, range):
        # GH#16804
        data = np.arange(data.start, data.stop, data.step, dtype="int64")
        # the freshly built array is not copied again
        copy = False

    if not is_list_like(data):
        if index is None:
            raise ValueError("index must be specified when data is not list-like")
        data = construct_1d_arraylike_from_scalar(data, len(index), dtype)
        return data

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                # cast failed: keep the float data as-is
                subarr = np.array(data, copy=copy)
        else:
            # we will try to copy by-definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        return subarr

    else:
        if isinstance(data, (set, frozenset)):
            # Raise only for unordered sets, e.g., not for dict_keys
            raise TypeError(f"'{type(data).__name__}' type is unordered")

        # materialize e.g. generators, convert e.g. tuples, abc.ValueView
        # TODO: non-standard array-likes we can convert to ndarray more efficiently?
        data = list(data)

        if dtype is not None or len(data) == 0:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)
            # error: Incompatible types in assignment (expression has type
            # "Union[ExtensionArray, ndarray, List[Any]]", variable has type
            # "ExtensionArray")
            subarr = maybe_cast_to_datetime(subarr, dtype)  # type: ignore[assignment]

    subarr = _sanitize_ndim(subarr, data, dtype, index)

    if not (
        isinstance(subarr.dtype, ExtensionDtype) or isinstance(dtype, ExtensionDtype)
    ):
        subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)

        is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(dtype)
        if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype:
            # object result without an explicit object/str dtype: promote
            # interval / period values to their dedicated array types
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred in {"interval", "period"}:
                subarr = array(subarr)
                subarr = extract_array(subarr, extract_numpy=True)

    return subarr
def sanitize_array(
    data,
    index: Optional[Index],
    dtype: Optional[DtypeObj] = None,
    copy: bool = False,
    raise_cast_failure: bool = False,
) -> ArrayLike:
    """
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.

    Parameters
    ----------
    data : Any
    index : Index or None
        When given, scalar and length-1 results are expanded to
        ``len(index)``.
    dtype : np.dtype, ExtensionDtype, or None, default None
    copy : bool, default False
    raise_cast_failure : bool, default False
        Forwarded to ``_try_cast``.

    Returns
    -------
    np.ndarray or ExtensionArray
        A zero-dimensional result with no `index` is returned as a scalar
        via ``.item()``.

    Raises
    ------
    TypeError
        If `data` is a non-empty ``set``.
    ValueError
        If `data` is an ndarray with more than one dimension.
    """

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            # fill masked slots with the fill value of the upcast dtype
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(
                data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                # cast failed: keep the float data as-is
                if copy:
                    subarr = data.copy()
                else:
                    subarr = np.array(data, copy=False)
        else:
            # we will try to copy by-definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        return subarr

    elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0:
        if isinstance(data, set):
            # Raise only for unordered sets, e.g., not for dict_keys
            raise TypeError("Set type is unordered")
        data = list(data)

        if dtype is not None:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype="int64")
        subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
    elif lib.is_scalar(data) and index is not None and dtype is not None:
        data = maybe_cast_to_datetime(data, dtype)
        if not lib.is_scalar(data):
            data = data[0]
        subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype)
    else:
        subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    # scalar-like result (zero-dimensional, or no ndim attribute at all)
    if getattr(subarr, "ndim", 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray is expanded to the length of the index
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise ValueError("Data must be 1-dimensional")
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    if not (is_extension_array_dtype(subarr.dtype)
            or is_extension_array_dtype(dtype)):
        # This is to prevent mixed-type Series getting all casted to
        # NumPy string type, e.g. NaN --> '-1#IND'.
        if issubclass(subarr.dtype.type, str):
            # GH#16605
            # If not empty convert the data to dtype
            # GH#19853: If data is a scalar, subarr has already the result
            if not lib.is_scalar(data):
                if not np.all(isna(data)):
                    data = np.array(data, dtype=dtype, copy=False)
                subarr = np.array(data, dtype=object, copy=copy)

        is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(
            dtype)
        if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype:
            # object result without an explicit object/str dtype: promote
            # interval / period values to their dedicated array types
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred in {"interval", "period"}:
                subarr = array(subarr)

    return subarr
def maybe_infer_to_datetimelike(value, convert_dates=False):
    """
    Convert object-dtype input to datetime64 / timedelta64 when its contents
    are datetime-like and no dtype was passed.

    The input is returned untouched unless a datetime/timedelta is found;
    the check is strict in that a datetime/timedelta is REQUIRED, possibly
    together with nulls / string-likes.

    Parameters
    ----------
    value : np.array / Series / Index / list-like
    convert_dates : boolean, default False
       if True try really hard to convert dates (such as datetime.date),
       otherwise leave inferred dtype 'date' alone

    Returns
    -------
    The converted values, or ``value`` itself when nothing was converted.
    """
    if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex)):
        return value
    if isinstance(value, ABCSeries) and isinstance(value._values,
                                                   ABCDatetimeIndex):
        return value._values

    candidates = value if is_list_like(value) else [value]
    flat = np.array(candidates, copy=False)

    # we only care about object dtypes
    if not is_object_dtype(flat):
        return value

    shape = flat.shape
    if flat.ndim != 1:
        flat = flat.ravel()

    if len(flat) == 0:
        return value

    def _as_datetime(arr):
        # safe coerce to datetime64
        try:
            arr = tslib.array_to_datetime(arr, errors='raise')
        except ValueError:
            # we might have a sequence of the same-datetimes with tz's
            # if so coerce to a DatetimeIndex; if they are not the same,
            # then these stay as object dtype
            try:
                from pandas import to_datetime
                return to_datetime(arr)
            except Exception:
                pass
        except Exception:
            pass

        return arr.reshape(shape)

    def _as_timedelta(arr):
        # safe coerce to timedelta64
        # will try first with a string & object conversion
        from pandas import to_timedelta
        try:
            return to_timedelta(arr)._values.reshape(shape)
        except Exception:
            return arr.reshape(shape)

    kind = lib.infer_datetimelike_array(_ensure_object(flat))

    if kind == 'datetime' or (kind == 'date' and convert_dates):
        return _as_datetime(flat)

    if kind == 'timedelta':
        return _as_timedelta(flat)

    if kind == 'nat':
        # if all NaT, return as datetime
        if isna(flat).all():
            return _as_datetime(flat)

        # We have at least a NaT and a string
        # try timedelta first to avoid spurious datetime conversions
        # e.g. '00:00:01' is a timedelta but
        # technically is also a datetime
        converted = _as_timedelta(flat)
        if lib.infer_dtype(converted) == 'mixed':
            converted = _as_datetime(flat)
        return converted

    return value
def test_length_zero(self):
    """Zero-length inputs: a typed ndarray keeps its kind, a list is empty."""
    typed_empty = np.array([], dtype='i4')
    assert lib.infer_dtype(typed_empty) == 'integer'

    assert lib.infer_dtype([]) == 'empty'
def __floordiv__(self, other):
    """
    Floor-divide this timedelta array by ``other``.

    ``other`` may be a timedelta-like scalar, a numeric scalar, or a
    list/tuple/array with the same length as ``self``.

    Raises
    ------
    ValueError
        If an array-like ``other`` has a different length.
    TypeError
        If an array-like ``other`` has an unsupported dtype.
    """
    if is_scalar(other):
        if isinstance(other, (timedelta, np.timedelta64, Tick)):
            td = Timedelta(other)
            if td is NaT:
                # treat this specifically as timedelta-NaT
                return np.full(self.shape, np.nan, dtype=np.float64)

            # dispatch to Timedelta implementation
            return td.__rfloordiv__(self._data)

        # at this point we should only have numeric scalars; anything
        #  else will raise
        i8_quotient = self.asi8 // other
        i8_quotient[self._isnan] = iNaT

        new_freq = None
        if self.freq is not None:
            # Note: freq gets division, not floor-division
            new_freq = self.freq / other
            if new_freq.nanos == 0 and self.freq.nanos != 0:
                # e.g. if self.freq is Nano(1) then dividing by 2
                #  rounds down to zero
                new_freq = None
        return type(self)(i8_quotient.view("m8[ns]"), freq=new_freq)

    if not hasattr(other, "dtype"):
        # list, tuple
        other = np.array(other)

    if len(other) != len(self):
        raise ValueError("Cannot divide with unequal lengths")

    other_dtype = other.dtype

    if is_timedelta64_dtype(other_dtype):
        divisor = type(self)(other)

        # numpy timedelta64 does not natively support floordiv, so operate
        #  on the i8 values
        quotient = self.asi8 // divisor.asi8
        invalid = self._isnan | divisor._isnan
        if invalid.any():
            quotient = quotient.astype(np.float64)
            quotient[invalid] = np.nan
        return quotient

    if is_object_dtype(other_dtype):
        pointwise = np.array([self[n] // other[n] for n in range(len(self))])
        if lib.infer_dtype(pointwise, skipna=False) != "timedelta":
            return pointwise
        td64_values, _ = sequence_to_td64ns(pointwise)
        return type(self)(td64_values)

    if is_integer_dtype(other_dtype) or is_float_dtype(other_dtype):
        return type(self)(self._data // other)

    dtype = getattr(other, "dtype", type(other).__name__)
    raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}")
def sanitize_array(data, index, dtype=None, copy=False,
                   raise_cast_failure=False):
    """
    Sanitize input data to an ndarray (or ExtensionArray), copy if
    specified, coerce to the dtype if specified.

    Parameters
    ----------
    data : object
        A masked array, ndarray, ExtensionArray, list, tuple, range, or
        anything else ``_try_cast`` accepts (including scalars).
    index : Index or None
        Used for its length when a scalar or one-element result has to be
        broadcast, and passed to ``sanitize_index``.
    dtype : dtype, optional
        Normalised with ``pandas_dtype`` when given.
    copy : bool, default False
    raise_cast_failure : bool, default False
        Forwarded to ``_try_cast``.

    Returns
    -------
    ndarray or ExtensionArray
        When the intermediate result is zero-dimensional and ``index`` is
        None, ``subarr.item()`` is returned instead.

    Raises
    ------
    ValueError
        If ``data`` is an ndarray and the result has more than one dimension.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)

            # possibility of nan -> garbage
            if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
                try:
                    subarr = _try_cast(data, True, dtype, copy, True)
                except ValueError:
                    if copy:
                        subarr = data.copy()
            else:
                subarr = _try_cast(data, True, dtype, copy,
                                   raise_cast_failure)
        elif isinstance(data, Index):
            # don't coerce Index types
            # e.g. indexes can have different conversions (so don't fast path
            # them)
            # GH#6140
            subarr = sanitize_index(data, index, copy=copy)
        else:

            # we will try to copy by definition here
            subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)

    elif isinstance(data, ExtensionArray):
        if isinstance(data, ABCPandasArray):
            # We don't want to let people put our PandasArray wrapper
            # (the output of Series/Index.array), into a Series. So
            # we explicitly unwrap it here.
            subarr = data.to_numpy()
        else:
            subarr = data

        # everything else in this block must also handle ndarray's,
        # because we've unwrapped PandasArray into an ndarray.

        # Operate on ``subarr`` (not ``data``) so the unwrapping above is
        # kept, and never let the copy overwrite the result of the cast.
        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            try:
                subarr = _try_cast(data, False, dtype, copy,
                                   raise_cast_failure)
            except Exception:
                if raise_cast_failure:  # pragma: no cover
                    raise
                # best effort: fall back to object and let inference decide
                subarr = np.array(data, dtype=object, copy=copy)
                subarr = lib.maybe_convert_objects(subarr)

        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype='int64')
        subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, 'ndim', 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray is broadcast to the length of the index
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            # ValueError is still caught by callers handling ``Exception``
            raise ValueError('Data must be 1-dimensional')
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(subarr.dtype.type, str):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, subarr has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                data = np.array(data, dtype=dtype, copy=False)
            subarr = np.array(data, dtype=object, copy=copy)

    if (not (is_extension_array_dtype(subarr.dtype) or
             is_extension_array_dtype(dtype)) and
            is_object_dtype(subarr.dtype) and
            not is_object_dtype(dtype)):
        inferred = lib.infer_dtype(subarr, skipna=False)
        if inferred == 'period':
            try:
                subarr = period_array(subarr)
            except IncompatibleFrequency:
                # mixed frequencies: deliberately keep the object array
                pass

    return subarr
def test_constructor_with_datetime_tz(self):
    """
    Exercise Series built from tz-aware datetimes: dtype, export to
    ndarray, indexing, concat, string formatting and dtype inference.
    """

    # 8260
    # support datetime64 with tz

    dr = date_range('20130101', periods=3, tz='US/Eastern')
    s = Series(dr)
    assert s.dtype.name == 'datetime64[ns, US/Eastern]'
    assert s.dtype == 'datetime64[ns, US/Eastern]'
    assert is_datetime64tz_dtype(s.dtype)
    assert 'datetime64[ns, US/Eastern]' in str(s)

    # export: .values is a tz-naive datetime64[ns] ndarray
    result = s.values
    assert isinstance(result, np.ndarray)
    assert result.dtype == 'datetime64[ns]'

    # localizing the exported values as UTC and converting back to the
    # Series' tz must reproduce the original range
    exp = pd.DatetimeIndex(result)
    exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz)
    tm.assert_index_equal(dr, exp)

    # indexing
    result = s.iloc[0]
    assert result == Timestamp('2013-01-01 00:00:00-0500',
                               tz='US/Eastern', freq='D')
    result = s[0]
    assert result == Timestamp('2013-01-01 00:00:00-0500',
                               tz='US/Eastern', freq='D')

    result = s[Series([True, True, False], index=s.index)]
    assert_series_equal(result, s[0:2])

    result = s.iloc[0:1]
    assert_series_equal(result, Series(dr[0:1]))

    # concat
    result = pd.concat([s.iloc[0:1], s.iloc[1:]])
    assert_series_equal(result, s)

    # short str
    assert 'datetime64[ns, US/Eastern]' in str(s)

    # formatting with NaT
    result = s.shift()
    assert 'datetime64[ns, US/Eastern]' in str(result)
    assert 'NaT' in str(result)

    # long str
    t = Series(date_range('20130101', periods=1000, tz='US/Eastern'))
    assert 'datetime64[ns, US/Eastern]' in str(t)

    result = pd.DatetimeIndex(s, freq='infer')
    tm.assert_index_equal(result, dr)

    # inference: a single shared tz gives a tz-aware dtype
    s = Series([
        pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
        pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')
    ])
    assert s.dtype == 'datetime64[ns, US/Pacific]'
    assert lib.infer_dtype(s, skipna=True) == 'datetime64'

    # inference: mixed tzs stay object dtype
    s = Series([
        pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
        pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')
    ])
    assert s.dtype == 'object'
    assert lib.infer_dtype(s, skipna=True) == 'datetime'

    # with all NaT
    s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
    expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern'))
    assert_series_equal(s, expected)
def test_infer_dtype_timedelta(self):
    """infer_dtype reports 'timedelta' for timedelta-likes, with or
    without surrounding nulls, and 'mixed' when a datetime NaT is mixed in.
    """
    homogeneous = [
        np.array([pd.Timedelta('1 days'), pd.Timedelta('2 days')]),
        np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D')],
                 dtype=object),
        np.array([timedelta(1), timedelta(2)]),
    ]
    for arr in homogeneous:
        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

    # starts with nan
    for missing in [pd.NaT, np.nan]:
        element_lists = [
            [missing, Timedelta('1 days')],
            [missing, np.timedelta64(1, 'D')],
            [missing, timedelta(1)],
            [missing, pd.Timedelta('1 days'), missing],
            [missing, np.timedelta64(1, 'D'), missing],
            [missing, timedelta(1), missing],
        ]
        for elements in element_lists:
            arr = np.array(elements)
            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

    # different type of nat
    td = np.timedelta64(1, 'D')
    dt_nat = np.datetime64('nat')
    for elements in ([dt_nat, td], [td, dt_nat]):
        arr = np.array(elements, dtype=object)
        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
def coerce_to_array(values, dtype=None, mask=None,
                    copy: bool = False) -> Tuple[np.ndarray, np.ndarray]:
    """
    Turn ``values`` into a float ndarray plus a boolean mask.

    Parameters
    ----------
    values : 1D list-like
    dtype : float dtype
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    ValueError
        If ``dtype`` cannot be mapped to a floating dtype.
    TypeError
        If ``values`` cannot be converted, or values / mask are not 1D.
    """
    # a floating numpy input keeps its own dtype
    if (dtype is None and hasattr(values, "dtype")
            and is_float_dtype(values.dtype)):
        dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and dtype.startswith("Float"):
            # Avoid DeprecationWarning from NumPy about np.dtype("Float64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not issubclass(type(dtype), FloatingDtype):
            try:
                dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, FloatingArray):
        # already masked: reuse the underlying data and mask
        data, na_mask = values._data, values._mask
        if dtype is not None:
            data = data.astype(dtype.numpy_dtype, copy=False)

        if copy:
            data = data.copy()
            na_mask = na_mask.copy()
        return data, na_mask

    values = np.array(values, copy=copy)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            values = np.full(len(values), np.nan)
        elif inferred_type not in (
            "floating",
            "integer",
            "mixed-integer",
            "integer-na",
            "mixed-integer-float",
        ):
            raise TypeError(
                f"{values.dtype} cannot be converted to a FloatingDtype")

    elif is_bool_dtype(values) and is_float_dtype(dtype):
        values = np.array(values, dtype=float, copy=copy)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(
            f"{values.dtype} cannot be converted to a FloatingDtype")

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")
    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    target = np.dtype("float64") if dtype is None else dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    # TODO should this be a safe cast?
    if mask.any():
        values = values.copy()
        values[mask] = np.nan

    values = values.astype(target, copy=False)  # , casting="safe")
    return values, mask
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False,
              verify=True):
    """
    Sort ``values`` and reorder corresponding ``labels``.
    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list_like
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.
    verify : bool, default True
        Check if labels are out of bound for the values and put out of bound
        labels equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound labels. Ignored when ``labels`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
        nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        # note the trailing space: adjacent literals are concatenated as-is
        raise TypeError("Only list-like objects are allowed to be passed to "
                        "safe_sort as values")

    if (not isinstance(values, np.ndarray)
            and not is_extension_array_dtype(values)):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, str) for x in values],
                           dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    # ``sorter`` stays None when the values had to be sorted by sort_mixed
    sorter = None
    if (not is_extension_array_dtype(values)
            and lib.infer_dtype(values, skipna=False) == 'mixed-integer'):
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        raise TypeError("Only list-like objects or None are allowed to be "
                        "passed to safe_sort as labels")
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index
    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: recover the permutation by looking up the sorted
        # values in a hashtable built from the original ones
        (hash_klass, _), values = algorithms._get_data_algo(
            values, algorithms._hashtables)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_1d is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_labels = algorithms.take_1d(order2, labels, fill_value=-1)
        if verify:
            mask = (labels < -len(values)) | (labels >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_labels = reverse_indexer.take(labels, mode='wrap')

        mask = labels == na_sentinel
        if verify:
            mask = mask | (labels < -len(values)) | (labels >= len(values))

    if mask is not None:
        np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)
def __truediv__(self, other):
    """
    Divide this timedelta array by ``other``.

    timedelta / X is well-defined for timedelta-like or numeric X; ``other``
    may be a scalar or a list/tuple/array with the same length as ``self``.

    Raises
    ------
    ValueError
        If an array-like ``other`` has a different length.
    """
    if isinstance(other, self._recognized_scalars):
        td = Timedelta(other)
        if td is NaT:
            # specifically timedelta64-NaT
            return np.full(self.shape, np.nan, dtype=np.float64)

        # otherwise, dispatch to Timedelta implementation
        return self._ndarray / td

    if lib.is_scalar(other):
        # assume it is numeric
        quotient = self._ndarray / other
        # Tick division is not implemented, so operate on Timedelta
        new_freq = None if self.freq is None else self.freq.delta / other
        return type(self)(quotient, freq=new_freq)

    if not hasattr(other, "dtype"):
        # e.g. list, tuple
        other = np.array(other)

    if len(other) != len(self):
        raise ValueError("Cannot divide vectors with unequal lengths")

    if is_timedelta64_dtype(other.dtype):
        # let numpy handle it
        return self._ndarray / other

    if not is_object_dtype(other.dtype):
        return type(self)(self._ndarray / other)

    # We operate on raveled arrays to avoid problems in inference
    #  on NaT
    srav = self.ravel()
    orav = other.ravel()
    pointwise = np.array(
        [srav[n] / orav[n] for n in range(len(srav))]
    ).reshape(self.shape)

    # We need to do dtype inference in order to keep DataFrame ops
    #  behavior consistent with Series behavior
    inferred = lib.infer_dtype(pointwise, skipna=False)
    if inferred == "timedelta":
        flat = pointwise.ravel()
        return type(self)._from_sequence(flat).reshape(pointwise.shape)
    if inferred == "floating":
        return pointwise.astype(float)
    if inferred == "datetime":
        # GH#39750 this occurs when result is all-NaT, in which case
        #  we want to interpret these NaTs as td64.
        #  We construct an all-td64NaT result.
        return self * np.nan

    return pointwise
def test_constructor_with_datetime_tz(self):
    """
    Exercise Series built from tz-aware datetimes: dtype, export to
    ndarray, indexing, concat, string formatting and dtype inference.
    """

    # 8260
    # support datetime64 with tz

    dr = date_range('20130101', periods=3, tz='US/Eastern')
    s = Series(dr)
    assert s.dtype.name == 'datetime64[ns, US/Eastern]'
    assert s.dtype == 'datetime64[ns, US/Eastern]'
    assert is_datetime64tz_dtype(s.dtype)
    assert 'datetime64[ns, US/Eastern]' in str(s)

    # export: .values is a tz-naive datetime64[ns] ndarray
    result = s.values
    assert isinstance(result, np.ndarray)
    assert result.dtype == 'datetime64[ns]'

    # localizing the exported values as UTC and converting back to the
    # Series' tz must reproduce the original range
    exp = pd.DatetimeIndex(result)
    exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz)
    tm.assert_index_equal(dr, exp)

    # indexing
    result = s.iloc[0]
    assert result == Timestamp('2013-01-01 00:00:00-0500',
                               tz='US/Eastern', freq='D')
    result = s[0]
    assert result == Timestamp('2013-01-01 00:00:00-0500',
                               tz='US/Eastern', freq='D')

    result = s[Series([True, True, False], index=s.index)]
    assert_series_equal(result, s[0:2])

    result = s.iloc[0:1]
    assert_series_equal(result, Series(dr[0:1]))

    # concat
    result = pd.concat([s.iloc[0:1], s.iloc[1:]])
    assert_series_equal(result, s)

    # short str
    assert 'datetime64[ns, US/Eastern]' in str(s)

    # formatting with NaT
    result = s.shift()
    assert 'datetime64[ns, US/Eastern]' in str(result)
    assert 'NaT' in str(result)

    # long str
    t = Series(date_range('20130101', periods=1000, tz='US/Eastern'))
    assert 'datetime64[ns, US/Eastern]' in str(t)

    result = pd.DatetimeIndex(s, freq='infer')
    tm.assert_index_equal(result, dr)

    # inference: a single shared tz gives a tz-aware dtype
    s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')])
    assert s.dtype == 'datetime64[ns, US/Pacific]'
    assert lib.infer_dtype(s) == 'datetime64'

    # inference: mixed tzs stay object dtype
    s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')])
    assert s.dtype == 'object'
    assert lib.infer_dtype(s) == 'datetime'

    # with all NaT
    s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
    expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern'))
    assert_series_equal(s, expected)
def __floordiv__(self, other):
    """
    Floor-divide this timedelta array by ``other``.

    ``other`` may be a timedelta-like scalar, a numeric scalar, or a
    list/tuple/array with the same length as ``self``. Series, DataFrame
    and Index operands are deferred by returning ``NotImplemented``.

    Returns
    -------
    ndarray or same type as ``self``
        A plain ndarray when dividing by timedelta-likes (float64 filled
        with NaN wherever either operand is missing), otherwise a new
        array of ``type(self)``.

    Raises
    ------
    ValueError
        If an array-like ``other`` has a different length.
    TypeError
        If an array-like ``other`` has an unsupported dtype.
    """
    if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
        return NotImplemented

    other = lib.item_from_zerodim(other)
    if is_scalar(other):
        if isinstance(other, (timedelta, np.timedelta64, Tick)):
            other = Timedelta(other)
            if other is NaT:
                # treat this specifically as timedelta-NaT
                result = np.empty(self.shape, dtype=np.float64)
                result.fill(np.nan)
                return result

            # dispatch to Timedelta implementation
            result = other.__rfloordiv__(self._data)
            return result

        # at this point we should only have numeric scalars; anything
        #  else will raise
        result = self.asi8 // other
        result[self._isnan] = iNaT
        freq = None
        if self.freq is not None:
            # Note: freq gets division, not floor-division
            freq = self.freq / other
            if freq.nanos == 0 and self.freq.nanos != 0:
                # e.g. if self.freq is Nano(1) then dividing by 2
                #  rounds down to zero, which is not a usable frequency
                freq = None
        return type(self)(result.view('m8[ns]'), freq=freq)

    if not hasattr(other, "dtype"):
        # list, tuple
        other = np.array(other)
    if len(other) != len(self):
        raise ValueError("Cannot divide with unequal lengths")

    elif is_timedelta64_dtype(other):
        other = type(self)(other)

        # numpy timedelta64 does not natively support floordiv, so operate
        #  on the i8 values
        result = self.asi8 // other.asi8
        mask = self._isnan | other._isnan
        if mask.any():
            # an integer array cannot hold NaN; upcast to float first
            result = result.astype(np.float64)
            result[mask] = np.nan
        return result

    elif is_object_dtype(other):
        result = [self[n] // other[n] for n in range(len(self))]
        result = np.array(result)
        if lib.infer_dtype(result, skipna=False) == 'timedelta':
            result, _ = sequence_to_td64ns(result)
            return type(self)(result)
        return result

    elif is_integer_dtype(other) or is_float_dtype(other):
        result = self._data // other
        return type(self)(result)

    else:
        dtype = getattr(other, "dtype", type(other).__name__)
        raise TypeError("Cannot divide {typ} by {cls}".format(
            typ=dtype, cls=type(self).__name__))