def test_datetime_with_timezone(self): # GH 15426 ts = Timestamp("2016-01-01 12:00:00", tz='US/Pacific') exp = DatetimeIndex([ts, ts]) res = maybe_downcast_to_dtype(exp, exp.dtype) tm.assert_index_equal(res, exp) res = maybe_downcast_to_dtype(exp.asi8, exp.dtype) tm.assert_index_equal(res, exp)
def test_datetimelikes_nan(self): arr = np.array([1, 2, np.nan]) exp = np.array([1, 2, np.datetime64('NaT')], dtype='datetime64[ns]') res = maybe_downcast_to_dtype(arr, 'datetime64[ns]') tm.assert_numpy_array_equal(res, exp) exp = np.array([1, 2, np.timedelta64('NaT')], dtype='timedelta64[ns]') res = maybe_downcast_to_dtype(arr, 'timedelta64[ns]') tm.assert_numpy_array_equal(res, exp)
def test_datetimelikes_nan(self): arr = np.array([1, 2, np.nan]) exp = np.array([1, 2, np.datetime64('NaT')], dtype='datetime64[ns]') res = maybe_downcast_to_dtype(arr, 'datetime64[ns]') tm.assert_numpy_array_equal(res, exp) exp = np.array([1, 2, np.timedelta64('NaT')], dtype='timedelta64[ns]') res = maybe_downcast_to_dtype(arr, 'timedelta64[ns]') tm.assert_numpy_array_equal(res, exp)
def test_datetime_with_timezone(self): # GH 15426 ts = Timestamp("2016-01-01 12:00:00", tz='US/Pacific') exp = DatetimeIndex([ts, ts]) res = maybe_downcast_to_dtype(exp, exp.dtype) tm.assert_index_equal(res, exp) res = maybe_downcast_to_dtype(exp.asi8, exp.dtype) tm.assert_index_equal(res, exp)
def test_downcast_booleans(): # see gh-16875: coercing of booleans. ser = Series([True, True, False]) result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) expected = ser tm.assert_series_equal(result, expected)
def test_downcast_conversion_no_nan(any_real_dtype): dtype = any_real_dtype expected = np.array([1, 2]) arr = np.array([1.0, 2.0], dtype=dtype) result = maybe_downcast_to_dtype(arr, "infer") tm.assert_almost_equal(result, expected, check_dtype=False)
def test_downcast_conversion_no_nan(any_real_dtype): dtype = any_real_dtype expected = np.array([1, 2]) arr = np.array([1.0, 2.0], dtype=dtype) result = maybe_downcast_to_dtype(arr, "infer") tm.assert_almost_equal(result, expected, check_dtype=False)
def test_datetime_likes_nan(klass): dtype = klass.__name__ + "[ns]" arr = np.array([1, 2, np.nan]) exp = np.array([1, 2, klass("NaT")], dtype) res = maybe_downcast_to_dtype(arr, dtype) tm.assert_numpy_array_equal(res, exp)
def test_downcast_booleans(): # see gh-16875: coercing of booleans. ser = Series([True, True, False]) result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) expected = ser tm.assert_series_equal(result, expected)
def test_datetime_likes_nan(klass): dtype = klass.__name__ + "[ns]" arr = np.array([1, 2, np.nan]) exp = np.array([1, 2, klass("NaT")], dtype) res = maybe_downcast_to_dtype(arr, dtype) tm.assert_numpy_array_equal(res, exp)
def test_datetime_with_timezone(as_asi): # see gh-15426 ts = Timestamp("2016-01-01 12:00:00", tz="US/Pacific") exp = DatetimeIndex([ts, ts]) obj = exp.asi8 if as_asi else exp res = maybe_downcast_to_dtype(obj, exp.dtype) tm.assert_index_equal(res, exp)
def test_datetime_with_timezone(as_asi): # see gh-15426 ts = Timestamp("2016-01-01 12:00:00", tz="US/Pacific") exp = DatetimeIndex([ts, ts]) obj = exp.asi8 if as_asi else exp res = maybe_downcast_to_dtype(obj, exp.dtype) tm.assert_index_equal(res, exp)
def test_downcast_conversion_nan(float_dtype): dtype = float_dtype data = [1.0, 2.0, np.nan] expected = np.array(data, dtype=dtype) arr = np.array(data, dtype=dtype) result = maybe_downcast_to_dtype(arr, "infer") tm.assert_almost_equal(result, expected)
def test_downcast_conversion_nan(float_dtype): dtype = float_dtype data = [1.0, 2.0, np.nan] expected = np.array(data, dtype=dtype) arr = np.array(data, dtype=dtype) result = maybe_downcast_to_dtype(arr, "infer") tm.assert_almost_equal(result, expected)
def test_downcast(self): # test downcasting arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') tm.assert_numpy_array_equal(result, arr) arr = np.array([8., 8., 8., 8., 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) arr = np.array([8., 8., 8., 8., 9.0000000000005]) result = maybe_downcast_to_dtype(arr, 'infer') expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) # see gh-16875: coercing of booleans. ser = Series([True, True, False]) result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) expected = ser tm.assert_series_equal(result, expected)
def test_downcast(self): # test downcasting arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') tm.assert_numpy_array_equal(result, arr) arr = np.array([8., 8., 8., 8., 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) arr = np.array([8., 8., 8., 8., 9.0000000000005]) result = maybe_downcast_to_dtype(arr, 'infer') expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) # see gh-16875: coercing of booleans. ser = Series([True, True, False]) result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) expected = ser tm.assert_series_equal(result, expected)
def test_downcast_conv(self): # test downcasting arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') tm.assert_numpy_array_equal(result, arr) arr = np.array([8., 8., 8., 8., 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) arr = np.array([8., 8., 8., 8., 9.0000000000005]) result = maybe_downcast_to_dtype(arr, 'infer') expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) # GH16875 coercing of bools ser = Series([True, True, False]) result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) expected = ser tm.assert_series_equal(result, expected) # conversions expected = np.array([1, 2]) for dtype in [np.float64, object, np.int64]: arr = np.array([1.0, 2.0], dtype=dtype) result = maybe_downcast_to_dtype(arr, 'infer') tm.assert_almost_equal(result, expected, check_dtype=False) for dtype in [np.float64, object]: expected = np.array([1.0, 2.0, np.nan], dtype=dtype) arr = np.array([1.0, 2.0, np.nan], dtype=dtype) result = maybe_downcast_to_dtype(arr, 'infer') tm.assert_almost_equal(result, expected) # empties for dtype in [ np.int32, np.float64, np.float32, np.bool_, np.int64, object ]: arr = np.array([], dtype=dtype) result = maybe_downcast_to_dtype(arr, 'int64') tm.assert_almost_equal(result, np.array([], dtype=np.int64)) assert result.dtype == np.int64
def test_downcast_conv(self): # test downcasting arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') tm.assert_numpy_array_equal(result, arr) arr = np.array([8., 8., 8., 8., 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) arr = np.array([8., 8., 8., 8., 9.0000000000005]) result = maybe_downcast_to_dtype(arr, 'infer') expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) # GH16875 coercing of bools ser = Series([True, True, False]) result = maybe_downcast_to_dtype(ser, np.dtype(np.float64)) expected = ser tm.assert_series_equal(result, expected) # conversions expected = np.array([1, 2]) for dtype in [np.float64, object, np.int64]: arr = np.array([1.0, 2.0], dtype=dtype) result = maybe_downcast_to_dtype(arr, 'infer') tm.assert_almost_equal(result, expected, check_dtype=False) for dtype in [np.float64, object]: expected = np.array([1.0, 2.0, np.nan], dtype=dtype) arr = np.array([1.0, 2.0, np.nan], dtype=dtype) result = maybe_downcast_to_dtype(arr, 'infer') tm.assert_almost_equal(result, expected) # empties for dtype in [np.int32, np.float64, np.float32, np.bool_, np.int64, object]: arr = np.array([], dtype=dtype) result = maybe_downcast_to_dtype(arr, 'int64') tm.assert_almost_equal(result, np.array([], dtype=np.int64)) assert result.dtype == np.int64
def test_downcast(arr, expected, dtype): result = maybe_downcast_to_dtype(arr, dtype) tm.assert_numpy_array_equal(result, expected)
def test_downcast_conversion_nan(self, dtype): expected = np.array([1.0, 2.0, np.nan], dtype=dtype) arr = np.array([1.0, 2.0, np.nan], dtype=dtype) result = maybe_downcast_to_dtype(arr, "infer") tm.assert_almost_equal(result, expected)
def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. The default return dtype is `float64` or `int64` depending on the data supplied. Use the `downcast` parameter to obtain other dtypes. Please note that precision loss may occur if really large numbers are passed in. Due to the internal limitations of `ndarray`, if numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are passed in, it is very likely they will be converted to float so that they can stored in an `ndarray`. These warnings apply similarly to `Series` since it internally leverages `ndarray`. Parameters ---------- arg : scalar, list, tuple, 1-d array, or Series errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input downcast : {'integer', 'signed', 'unsigned', 'float'} , default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) As this behaviour is separate from the core conversion to numeric values, any errors raised during the downcasting will be surfaced regardless of the value of the 'errors' input. In addition, downcasting will only occur if the size of the resulting data's dtype is strictly larger than the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. .. versionadded:: 0.19.0 Returns ------- ret : numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray. See Also -------- DataFrame.astype : Cast argument to a specified dtype. to_datetime : Convert argument to datetime. to_timedelta : Convert argument to timedelta. numpy.ndarray.astype : Cast a numpy array to a specified type. Examples -------- Take separate series and convert to numeric, coercing when told to >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 >>> pd.to_numeric(s, downcast='float') 0 1.0 1 2.0 2 -3.0 dtype: float32 >>> pd.to_numeric(s, downcast='signed') 0 1 1 2 2 -3 dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') 0 apple 1 1.0 2 2 3 -3 dtype: object >>> pd.to_numeric(s, errors='coerce') 0 NaN 1 1.0 2 2.0 3 -3.0 dtype: float64 """ if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): raise ValueError('invalid downcasting method provided') is_series = False is_index = False is_scalars = False if isinstance(arg, ABCSeries): is_series = True values = arg.values elif isinstance(arg, ABCIndexClass): is_index = True values = arg.asi8 if values is None: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif is_scalar(arg): if is_decimal(arg): return float(arg) if is_number(arg): return arg is_scalars = True values = np.array([arg], dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a list, tuple, 1-d array, or Series') else: values = arg try: if is_numeric_dtype(values): pass elif is_datetime_or_timedelta_dtype(values): values = values.astype(np.int64) else: values = ensure_object(values) coerce_numeric = errors not in ('ignore', 'raise') values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) except Exception: if errors == 'raise': raise # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values): typecodes = None if downcast in ('integer', 'signed'): typecodes = np.typecodes['Integer'] elif downcast == 'unsigned' and np.min(values) >= 0: typecodes = np.typecodes['UnsignedInteger'] elif downcast == 'float': typecodes = np.typecodes['Float'] # pandas support goes only to np.float32, # as float dtypes smaller than that are # extremely rare and not well supported float_32_char = np.dtype(np.float32).char float_32_ind = typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: # from smallest to largest for dtype in typecodes: if np.dtype(dtype).itemsize <= values.dtype.itemsize: values = maybe_downcast_to_dtype(values, dtype) # successful conversion if values.dtype == dtype: break if is_series: return pd.Series(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy_with_infer return pd.Index(values, name=arg.name) elif is_scalars: return values[0] else: return values
def test_downcast_conversion_empty(any_real_dtype): dtype = any_real_dtype arr = np.array([], dtype=dtype) result = maybe_downcast_to_dtype(arr, "int64") tm.assert_numpy_array_equal(result, np.array([], dtype=np.int64))
def _call_cython_op( self, values: np.ndarray, # np.ndarray[ndim=2] *, min_count: int, ngroups: int, comp_ids: np.ndarray, mask: npt.NDArray[np.bool_] | None, result_mask: npt.NDArray[np.bool_] | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values dtype = values.dtype is_numeric = is_numeric_dtype(dtype) is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.view("uint8") if values.dtype == "float16": values = values.astype(np.float32) values = values.T if mask is not None: mask = mask.T if result_mask is not None: result_mask = result_mask.T out_shape = self._get_output_shape(ngroups, values) func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric) values = self._get_cython_vals(values) out_dtype = self._get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if self.how in ["min", "max", "mean", "last", "first"]: func( out=result, counts=counts, values=values, labels=comp_ids, min_count=min_count, mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, ) elif self.how in ["sum"]: # We support datetimelike func( out=result, counts=counts, values=values, labels=comp_ids, min_count=min_count, is_datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) else: # TODO: min_count if self.uses_mask(): if self.how != "rank": # TODO: should rank take result_mask? kwargs["result_mask"] = result_mask func( out=result, values=values, labels=comp_ids, ngroups=ngroups, is_datetimelike=is_datetimelike, mask=mask, **kwargs, ) else: func( out=result, values=values, labels=comp_ids, ngroups=ngroups, is_datetimelike=is_datetimelike, **kwargs, ) if self.kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): if result_mask is not None and self.uses_mask(): assert result_mask[empty_groups].all() else: # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan result = result.T if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here # Casting only needed for float16, bool, datetimelike, # and self.how in ["sum", "prod", "ohlc", "cumprod"] res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) else: op_result = result return op_result
def test_downcast(arr, expected, dtype): result = maybe_downcast_to_dtype(arr, dtype) tm.assert_numpy_array_equal(result, expected)
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(getattr(func, '__name__', func)) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError, KeyError): pass values = list(values) grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how='all') # gh-21133 # we want to down cast if # the original values are ints # as we grouped with a NaN value # and then dropped, coercing to floats for v in values: if (v in data and is_integer_dtype(data[v]) and v in agged and not is_integer_dtype(agged[v])): agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: from pandas import MultiIndex if table.index.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex(m, axis=0) if table.columns.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, observed=dropna, margins_name=margins_name, fill_value=fill_value) # discard the top level if (values_passed and not values_multi and not table.empty and (table.columns.nlevels > 1)): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how='all', axis=1) return table
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(getattr(func, '__name__', func)) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError, KeyError): pass values = list(values) # group by the cartesian product of the grouper # if we have a categorical grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how='all') # gh-21133 # we want to down cast if # the original values are ints # as we grouped with a NaN value # and then dropped, coercing to floats for v in [v for v in values if v in data and v in agged]: if (is_integer_dtype(data[v]) and not is_integer_dtype(agged[v])): agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: from pandas import MultiIndex if table.index.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex(m, axis=0) if table.columns.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, observed=dropna, margins_name=margins_name, fill_value=fill_value) # discard the top level if (values_passed and not values_multi and not table.empty and (table.columns.nlevels > 1)): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how='all', axis=1) return table
def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: np.ndarray | None = None, **kwargs, ) -> ArrayLike: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis dtype = values.dtype is_numeric = is_numeric_dtype(dtype) cy_op = WrappedCythonOp(kind=kind, how=how) # can we do this operation with our cython functions # if not raise NotImplementedError cy_op.disallow_invalid_ops(dtype, is_numeric) func_uses_mask = cy_op.uses_mask() if is_extension_array_dtype(dtype): if isinstance(values, BaseMaskedArray) and func_uses_mask: return self._masked_ea_wrap_cython_operation( kind, values, how, axis, min_count, **kwargs ) else: return self._ea_wrap_cython_operation( kind, values, how, axis, min_count, **kwargs ) elif values.ndim == 1: # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] res = self._cython_operation( kind=kind, values=values2d, how=how, axis=1, min_count=min_count, mask=mask, **kwargs, ) if res.shape[0] == 1: return res[0] # otherwise we have OHLC return res.T is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): # e.g. uint8 -> uint64, int16 -> int64 dtype = dtype.kind + "8" values = values.astype(dtype, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) ngroups = self.ngroups comp_ids, _, _ = self.group_info assert axis == 1 values = values.T if mask is not None: mask = mask.reshape(values.shape, order="C") out_shape = cy_op.get_output_shape(ngroups, values) func, values = cy_op.get_cython_func_and_vals(values, is_numeric) out_dtype = cy_op.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if how in ["min", "max"]: func( result, counts, values, comp_ids, min_count, is_datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) elif kind == "transform": # TODO: min_count if func_uses_mask: func( result, values, comp_ids, ngroups, is_datetimelike, mask=mask, **kwargs, ) else: func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan if self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] result = result.T if how not in cy_op.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) op_result = maybe_downcast_to_dtype(result, dtype) else: op_result = result return op_result
def __internal_pivot_table( data: DataFrame, values, index, columns, aggfunc: AggFuncTypeBase | AggFuncTypeDict, fill_value, margins: bool, dropna: bool, margins_name: str, observed: bool, sort: bool, ) -> DataFrame: """ Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``. """ keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError, KeyError): pass values = list(values) grouped = data.groupby(keys, observed=observed, sort=sort) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") # gh-21133 # we want to down cast if # the original values are ints # as we grouped with a NaN value # and then dropped, coercing to floats for v in values: if ( v in data and is_integer_dtype(data[v]) and v in agged and not is_integer_dtype(agged[v]) ): if not isinstance(agged[v], ABCDataFrame): # exclude DataFrame case bc maybe_downcast_to_dtype expects # ArrayLike # e.g. test_pivot_table_multiindex_columns_doctest_case # agged.columns is a MultiIndex and 'v' is indexing only # on its first level. agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged # GH17038, this check should only happen if index is defined (not None) if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[: len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: if isinstance(table.index, MultiIndex): m = MultiIndex.from_arrays( cartesian_product(table.index.levels), names=table.index.names ) table = table.reindex(m, axis=0) if isinstance(table.columns, MultiIndex): m = MultiIndex.from_arrays( cartesian_product(table.columns.levels), names=table.columns.names ) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(fill_value, downcast="infer") if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins( table, data, values, rows=index, cols=columns, aggfunc=aggfunc, observed=dropna, margins_name=margins_name, fill_value=fill_value, ) # discard the top level if values_passed and not values_multi and table.columns.nlevels > 1: table = table.droplevel(0, axis=1) if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how="all", axis=1) return table
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> ArrayLike: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis dtype = values.dtype is_numeric = is_numeric_dtype(dtype) # can we do this operation with our cython functions # if not raise NotImplementedError self._disallow_invalid_ops(dtype, how, is_numeric) if is_extension_array_dtype(dtype): return self._ea_wrap_cython_operation(kind, values, how, axis, min_count, **kwargs) is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = ensure_int_or_float(values) elif is_integer_dtype(dtype): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(dtype): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = maybe_fill(np.empty(values.shape, dtype=out_dtype)) # TODO: min_count result = self._transform(result, values, codes, func, is_datetimelike, **kwargs) if is_integer_dtype(result.dtype) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if swapped: result = result.swapaxes(0, axis) if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) op_result = maybe_downcast_to_dtype(result, dtype) else: op_result = result return op_result
def test_downcast_conversion_nan(self, dtype): expected = np.array([1.0, 2.0, np.nan], dtype=dtype) arr = np.array([1.0, 2.0, np.nan], dtype=dtype) result = maybe_downcast_to_dtype(arr, "infer") tm.assert_almost_equal(result, expected)
def interval_range(start=None, end=None, periods=None, freq=None, name=None, closed='right'): """ Return a fixed frequency IntervalIndex Parameters ---------- start : numeric or datetime-like, default None Left bound for generating intervals end : numeric or datetime-like, default None Right bound for generating intervals periods : integer, default None Number of periods to generate freq : numeric, string, or DateOffset, default None The length of each interval. Must be consistent with the type of start and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1 for numeric and 'D' for datetime-like. name : string, default None Name of the resulting IntervalIndex closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or neither. Returns ------- rng : IntervalIndex See Also -------- IntervalIndex : An Index of intervals that are all closed on the same side. Notes ----- Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, exactly three must be specified. If ``freq`` is omitted, the resulting ``IntervalIndex`` will have ``periods`` linearly spaced elements between ``start`` and ``end``, inclusively. To learn more about datetime-like frequency strings, please see `this link <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__. Examples -------- Numeric ``start`` and ``end`` is supported. >>> pd.interval_range(start=0, end=5) IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], closed='right', dtype='interval[int64]') Additionally, datetime-like input is also supported. >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... end=pd.Timestamp('2017-01-04')) IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], (2017-01-03, 2017-01-04]], closed='right', dtype='interval[datetime64[ns]]') The ``freq`` parameter specifies the frequency between the left and right. endpoints of the individual intervals within the ``IntervalIndex``. For numeric ``start`` and ``end``, the frequency must also be numeric. >>> pd.interval_range(start=0, periods=4, freq=1.5) IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], closed='right', dtype='interval[float64]') Similarly, for datetime-like ``start`` and ``end``, the frequency must be convertible to a DateOffset. >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... periods=3, freq='MS') IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01], (2017-03-01, 2017-04-01]], closed='right', dtype='interval[datetime64[ns]]') Specify ``start``, ``end``, and ``periods``; the frequency is generated automatically (linearly spaced). >>> pd.interval_range(start=0, end=6, periods=4) IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], closed='right', dtype='interval[float64]') The ``closed`` parameter specifies which endpoints of the individual intervals within the ``IntervalIndex`` are closed. >>> pd.interval_range(end=5, periods=4, closed='both') IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], closed='both', dtype='interval[int64]') """ start = com.maybe_box_datetimelike(start) end = com.maybe_box_datetimelike(end) endpoint = start if start is not None else end if freq is None and com._any_none(periods, start, end): freq = 1 if is_number(endpoint) else 'D' if com.count_not_none(start, end, periods, freq) != 3: raise ValueError('Of the four parameters: start, end, periods, and ' 'freq, exactly three must be specified') if not _is_valid_endpoint(start): msg = 'start must be numeric or datetime-like, got {start}' raise ValueError(msg.format(start=start)) elif not _is_valid_endpoint(end): msg = 'end must be numeric or datetime-like, got {end}' raise ValueError(msg.format(end=end)) if is_float(periods): periods = int(periods) elif not is_integer(periods) and periods is not None: msg = 'periods must be a number, got {periods}' raise TypeError(msg.format(periods=periods)) if freq is not None and not is_number(freq): try: freq = to_offset(freq) except ValueError: raise ValueError('freq must be numeric or convertible to ' 'DateOffset, got {freq}'.format(freq=freq)) # verify type compatibility if not all([_is_type_compatible(start, end), _is_type_compatible(start, freq), _is_type_compatible(end, freq)]): raise TypeError("start, end, freq need to be type compatible") # +1 to convert interval count to breaks count (n breaks = n-1 intervals) if periods is not None: periods += 1 if is_number(endpoint): # force consistency between start/end/freq (lower end if freq skips it) if com._all_not_none(start, end, freq): end -= (end - start) % freq # compute the period/start/end if unspecified (at most one) if periods is None: periods = int((end - start) // freq) + 1 elif start is None: start = end - (periods - 1) * freq elif end is None: end = start + (periods - 1) * freq breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com._not_none(start, end, freq)): # np.linspace always produces float output breaks = maybe_downcast_to_dtype(breaks, 'int64') else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): range_func = date_range else: range_func = timedelta_range breaks = range_func(start=start, end=end, periods=periods, freq=freq) return IntervalIndex.from_breaks(breaks, name=name, closed=closed)
def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> ArrayLike: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis dtype = values.dtype is_numeric = is_numeric_dtype(dtype) cy_op = WrappedCythonOp(kind=kind, how=how) # can we do this operation with our cython functions # if not raise NotImplementedError cy_op.disallow_invalid_ops(dtype, is_numeric) if is_extension_array_dtype(dtype): return self._ea_wrap_cython_operation( kind, values, how, axis, min_count, **kwargs ) elif values.ndim == 1: # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] res = self._cython_operation( kind=kind, values=values2d, how=how, axis=1, min_count=min_count, **kwargs, ) if res.shape[0] == 1: return res[0] # otherwise we have OHLC return res.T is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = ensure_int_or_float(values) elif is_integer_dtype(dtype): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) ngroups = self.ngroups comp_ids, _, _ = self.group_info assert axis == 1 values = values.T out_shape = cy_op.get_output_shape(ngroups, values) func, values = cy_op.get_cython_func_and_vals(values, is_numeric) out_dtype = cy_op.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) func(result, counts, values, comp_ids, min_count) elif kind == "transform": # TODO: min_count func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if is_integer_dtype(result.dtype) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] result = result.T if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) op_result = maybe_downcast_to_dtype(result, dtype) else: op_result = result return op_result
def test_downcast_conversion_empty(any_real_dtype): dtype = any_real_dtype arr = np.array([], dtype=dtype) result = maybe_downcast_to_dtype(arr, "int64") tm.assert_numpy_array_equal(result, np.array([], dtype=np.int64))
def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right" ): """ Return a fixed frequency IntervalIndex. Parameters ---------- start : numeric or datetime-like, default None Left bound for generating intervals. end : numeric or datetime-like, default None Right bound for generating intervals. periods : int, default None Number of periods to generate. freq : numeric, str, or DateOffset, default None The length of each interval. Must be consistent with the type of start and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1 for numeric and 'D' for datetime-like. name : str, default None Name of the resulting IntervalIndex. closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or neither. Returns ------- IntervalIndex See Also -------- IntervalIndex : An Index of intervals that are all closed on the same side. Notes ----- Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, exactly three must be specified. If ``freq`` is omitted, the resulting ``IntervalIndex`` will have ``periods`` linearly spaced elements between ``start`` and ``end``, inclusively. To learn more about datetime-like frequency strings, please see `this link <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__. Examples -------- Numeric ``start`` and ``end`` is supported. >>> pd.interval_range(start=0, end=5) IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], closed='right', dtype='interval[int64]') Additionally, datetime-like input is also supported. >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... end=pd.Timestamp('2017-01-04')) IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], (2017-01-03, 2017-01-04]], closed='right', dtype='interval[datetime64[ns]]') The ``freq`` parameter specifies the frequency between the left and right. endpoints of the individual intervals within the ``IntervalIndex``. For numeric ``start`` and ``end``, the frequency must also be numeric. >>> pd.interval_range(start=0, periods=4, freq=1.5) IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], closed='right', dtype='interval[float64]') Similarly, for datetime-like ``start`` and ``end``, the frequency must be convertible to a DateOffset. >>> pd.interval_range(start=pd.Timestamp('2017-01-01'), ... periods=3, freq='MS') IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01], (2017-03-01, 2017-04-01]], closed='right', dtype='interval[datetime64[ns]]') Specify ``start``, ``end``, and ``periods``; the frequency is generated automatically (linearly spaced). >>> pd.interval_range(start=0, end=6, periods=4) IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], closed='right', dtype='interval[float64]') The ``closed`` parameter specifies which endpoints of the individual intervals within the ``IntervalIndex`` are closed. >>> pd.interval_range(end=5, periods=4, closed='both') IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], closed='both', dtype='interval[int64]') """ start = com.maybe_box_datetimelike(start) end = com.maybe_box_datetimelike(end) endpoint = start if start is not None else end if freq is None and com.any_none(periods, start, end): freq = 1 if is_number(endpoint) else "D" if com.count_not_none(start, end, periods, freq) != 3: raise ValueError( "Of the four parameters: start, end, periods, and " "freq, exactly three must be specified" ) if not _is_valid_endpoint(start): raise ValueError(f"start must be numeric or datetime-like, got {start}") elif not _is_valid_endpoint(end): raise ValueError(f"end must be numeric or datetime-like, got {end}") if is_float(periods): periods = int(periods) elif not is_integer(periods) and periods is not None: raise TypeError(f"periods must be a number, got {periods}") if freq is not None and not is_number(freq): try: freq = to_offset(freq) except ValueError as err: raise ValueError( f"freq must be numeric or convertible to DateOffset, got {freq}" ) from err # verify type compatibility if not all( [ _is_type_compatible(start, end), _is_type_compatible(start, freq), _is_type_compatible(end, freq), ] ): raise TypeError("start, end, freq need to be type compatible") # +1 to convert interval count to breaks count (n breaks = n-1 intervals) if periods is not None: periods += 1 if is_number(endpoint): # force consistency between start/end/freq (lower end if freq skips it) if com.all_not_none(start, end, freq): end -= (end - start) % freq # compute the period/start/end if unspecified (at most one) if periods is None: periods = int((end - start) // freq) + 1 elif start is None: start = end - (periods - 1) * freq elif end is None: end = start + (periods - 1) * freq breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com.not_none(start, end, freq)): # np.linspace always produces float output breaks = maybe_downcast_to_dtype(breaks, "int64") else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): range_func = date_range else: range_func = timedelta_range breaks = range_func(start=start, end=end, periods=periods, freq=freq) return IntervalIndex.from_breaks(breaks, name=name, closed=closed)
def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. The default return dtype is `float64` or `int64` depending on the data supplied. Use the `downcast` parameter to obtain other dtypes. Parameters ---------- arg : list, tuple, 1-d array, or Series errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input downcast : {'integer', 'signed', 'unsigned', 'float'} , default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) As this behaviour is separate from the core conversion to numeric values, any errors raised during the downcasting will be surfaced regardless of the value of the 'errors' input. In addition, downcasting will only occur if the size of the resulting data's dtype is strictly larger than the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. .. versionadded:: 0.19.0 Returns ------- ret : numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray Examples -------- Take separate series and convert to numeric, coercing when told to >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 >>> pd.to_numeric(s, downcast='float') 0 1.0 1 2.0 2 -3.0 dtype: float32 >>> pd.to_numeric(s, downcast='signed') 0 1 1 2 2 -3 dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') 0 apple 1 1.0 2 2 3 -3 dtype: object >>> pd.to_numeric(s, errors='coerce') 0 NaN 1 1.0 2 2.0 3 -3.0 dtype: float64 See Also -------- pandas.DataFrame.astype : Cast argument to a specified dtype. pandas.to_datetime : Convert argument to datetime. pandas.to_timedelta : Convert argument to timedelta. numpy.ndarray.astype : Cast a numpy array to a specified type. """ if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): raise ValueError('invalid downcasting method provided') is_series = False is_index = False is_scalars = False if isinstance(arg, ABCSeries): is_series = True values = arg.values elif isinstance(arg, ABCIndexClass): is_index = True values = arg.asi8 if values is None: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif is_scalar(arg): if is_decimal(arg): return float(arg) if is_number(arg): return arg is_scalars = True values = np.array([arg], dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a list, tuple, 1-d array, or Series') else: values = arg try: if is_numeric_dtype(values): pass elif is_datetime_or_timedelta_dtype(values): values = values.astype(np.int64) else: values = ensure_object(values) coerce_numeric = False if errors in ('ignore', 'raise') else True values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) except Exception: if errors == 'raise': raise # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values): typecodes = None if downcast in ('integer', 'signed'): typecodes = np.typecodes['Integer'] elif downcast == 'unsigned' and np.min(values) >= 0: typecodes = np.typecodes['UnsignedInteger'] elif downcast == 'float': typecodes = np.typecodes['Float'] # pandas support goes only to np.float32, # as float dtypes smaller than that are # extremely rare and not well supported float_32_char = np.dtype(np.float32).char float_32_ind = typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: # from smallest to largest for dtype in typecodes: if np.dtype(dtype).itemsize <= values.dtype.itemsize: values = maybe_downcast_to_dtype(values, dtype) # successful conversion if values.dtype == dtype: break if is_series: return pd.Series(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy_with_infer return pd.Index(values, name=arg.name) elif is_scalars: return values[0] else: return values
def _call_cython_op( self, values: np.ndarray, # np.ndarray[ndim=2] *, min_count: int, ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None, result_mask: np.ndarray | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values dtype = values.dtype is_numeric = is_numeric_dtype(dtype) is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): # GH#43329 If the dtype is explicitly of type uint64 the type is not # changed to prevent overflow. if dtype != np.uint64: values = values.astype(np.int64, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) values = values.T if mask is not None: mask = mask.T if result_mask is not None: result_mask = result_mask.T out_shape = self._get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) out_dtype = self.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if self.how in ["min", "max", "mean"]: func( result, counts, values, comp_ids, min_count, mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, ) elif self.how in ["add"]: # We support datetimelike func( result, counts, values, comp_ids, min_count, datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) else: # TODO: min_count if self.uses_mask(): func( result, values, comp_ids, ngroups, is_datetimelike, mask=mask, **kwargs, ) else: func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if self.kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan result = result.T if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) else: op_result = result # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]", # expected "ndarray") return op_result # type: ignore[return-value]
def _call_cython_op( self, values: np.ndarray, # np.ndarray[ndim=2] *, min_count: int, ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values dtype = values.dtype is_numeric = is_numeric_dtype(dtype) is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): # e.g. uint8 -> uint64, int16 -> int64 dtype_str = dtype.kind + "8" values = values.astype(dtype_str, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) values = values.T if mask is not None: mask = mask.reshape(values.shape, order="C") out_shape = self.get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) out_dtype = self.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if self.how in ["min", "max"]: func( result, counts, values, comp_ids, min_count, is_datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) else: # TODO: min_count if self.uses_mask(): func( result, values, comp_ids, ngroups, is_datetimelike, mask=mask, **kwargs, ) else: func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if self.kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan result = result.T if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here res_dtype = self.get_result_dtype(orig_values.dtype) # error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type # "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]" op_result = maybe_downcast_to_dtype( result, res_dtype # type: ignore[arg-type] ) else: op_result = result # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]", # expected "ndarray") return op_result # type: ignore[return-value]