def _assign_feature_type(feature_type, unique_count=0): if is_string_dtype(feature_type) or ( is_numeric_dtype(feature_type) and unique_count <= 2 ): return "categorical" elif is_numeric_dtype(feature_type): return "continuous" else: return "unknown"
def test_is_numeric_dtype(): assert not com.is_numeric_dtype(str) assert not com.is_numeric_dtype(np.datetime64) assert not com.is_numeric_dtype(np.timedelta64) assert not com.is_numeric_dtype(np.array(['a', 'b'])) assert not com.is_numeric_dtype(np.array([], dtype=np.timedelta64)) assert com.is_numeric_dtype(int) assert com.is_numeric_dtype(float) assert com.is_numeric_dtype(np.uint64) assert com.is_numeric_dtype(pd.Series([1, 2])) assert com.is_numeric_dtype(pd.Index([1, 2.]))
def _na_for_min_count(values, axis): """Return the missing value for `values` Parameters ---------- values : ndarray axis : int or None axis for the reduction Returns ------- result : scalar or ndarray For 1-D values, returns a scalar of the correct missing type. For 2-D values, returns a 1-D array where each element is missing. """ # we either return np.nan or pd.NaT if is_numeric_dtype(values): values = values.astype('float64') fill_value = na_value_for_dtype(values.dtype) if values.ndim == 1: return fill_value else: result_shape = (values.shape[:axis] + values.shape[axis + 1:]) result = np.empty(result_shape, dtype=values.dtype) result.fill(fill_value) return result
def autogen_schema(X, ordinal_max_items=2, feature_names=None, feature_types=None): """ Generates data schema for a given dataset as JSON representable. Args: X: Dataframe/ndarray to build schema from. ordinal_max_items: If a numeric column's cardinality is at most this integer, consider it as ordinal instead of continuous. feature_names: Feature names feature_types: Feature types Returns: A dictionary - schema that encapsulates column information, such as type and domain. """ schema = OrderedDict() col_number = 0 if isinstance(X, np.ndarray): log.warning( "Passing a numpy array to schema autogen when it should be dataframe." ) if feature_names is None: X = pd.DataFrame( X, columns=["col_" + str(i) for i in range(X.shape[1])] ).infer_objects() else: X = pd.DataFrame(X, columns=feature_names).infer_objects() if isinstance(X, NDFrame): for name, col_dtype in zip(X.dtypes.index, X.dtypes): schema[name] = {} if is_numeric_dtype(col_dtype): # schema[name]['type'] = 'continuous' # TODO: Fix this once we know it works. if len(set(X[name])) > ordinal_max_items: schema[name]["type"] = "continuous" else: # TODO: Work with ordinal later. schema[name]["type"] = "categorical" # schema[name]['type'] = 'ordinal' # schema[name]['order'] = list(set(X[name])) elif is_string_dtype(col_dtype): schema[name]["type"] = "categorical" else: warnings.warn("Unknown column: " + name, RuntimeWarning) schema[name]["type"] = "unknown" schema[name]["column_number"] = col_number col_number += 1 # Override if feature_types is passed as arg. if feature_types is not None: for idx, name in enumerate(X.dtypes.index): schema[name]["type"] = feature_types[idx] else: raise TypeError("GA2M only supports numpy arrays or pandas dataframes.") return schema
def as_json_table_type(x): """ Convert a NumPy / pandas type to its corresponding json_table. Parameters ---------- x : array or dtype Returns ------- t : str the Table Schema data types Notes ----- This table shows the relationship between NumPy / pandas dtypes, and Table Schema dtypes. ============== ================= Pandas type Table Schema type ============== ================= int64 integer float64 number bool boolean datetime64[ns] datetime timedelta64[ns] duration object str categorical any =============== ================= """ if is_integer_dtype(x): return 'integer' elif is_bool_dtype(x): return 'boolean' elif is_numeric_dtype(x): return 'number' elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x)): return 'datetime' elif is_timedelta64_dtype(x): return 'duration' elif is_categorical_dtype(x): return 'any' elif is_string_dtype(x): return 'string' else: return 'any'
def f(values, axis=None, skipna=True, **kwds): if len(self.kwargs) > 0: for k, v in compat.iteritems(self.kwargs): if k not in kwds: kwds[k] = v try: if values.size == 0: # we either return np.nan or pd.NaT if is_numeric_dtype(values): values = values.astype('float64') fill_value = na_value_for_dtype(values.dtype) if values.ndim == 1: return fill_value else: result_shape = (values.shape[:axis] + values.shape[axis + 1:]) result = np.empty(result_shape, dtype=values.dtype) result.fill(fill_value) return result if (_USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name)): result = bn_func(values, axis=axis, **kwds) # prefer to treat inf/-inf as NA, but must compute the func # twice :( if _has_infs(result): result = alt(values, axis=axis, skipna=skipna, **kwds) else: result = alt(values, axis=axis, skipna=skipna, **kwds) except Exception: try: result = alt(values, axis=axis, skipna=skipna, **kwds) except ValueError as e: # we want to transform an object array # ValueError message to the more typical TypeError # e.g. this is normally a disallowed function on # object arrays that contain strings if is_object_dtype(values): raise TypeError(e) raise return result
def _maybe_null_out(result, axis, mask, min_count=1): if axis is not None and getattr(result, 'ndim', False): null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 if np.any(null_mask): if is_numeric_dtype(result): if np.iscomplexobj(result): result = result.astype('c16') else: result = result.astype('f8') result[null_mask] = np.nan else: # GH12941, use None to auto cast null result[null_mask] = None elif result is not tslib.NaT: null_mask = mask.size - mask.sum() if null_mask < min_count: result = np.nan return result
def test_apply_custom_column_types(self): data = { 'to_numerical': [ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10, 'to_text': [ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10, 'to_categorical': [150, 200, 50, 10, 5, 150, 200, 50, 10, 5, 1] * 10, 'stay_categorical': [ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10 } df1 = df2 = pd.DataFrame.from_dict(data) custom_column_types = { 'to_numerical': ColumnType.numerical, 'to_text': ColumnType.text, 'to_categorical': ColumnType.categorical } store = Store(df1, df2, custom_column_types=custom_column_types) with self.subTest("Apply custom_column_types"): self.assertEqual(['to_categorical', 'stay_categorical'], store.type_to_columns[ColumnType.categorical]) self.assertEqual(['to_text'], store.type_to_columns[ColumnType.text]) self.assertEqual(['to_numerical'], store.type_to_columns[ColumnType.numerical]) with self.subTest( "Apply numerical conversion for custom_column_types to dataframes" ): self.assertTrue(is_numeric_dtype(store.df1['to_numerical'])) self.assertTrue(store.df1['to_numerical'].equals( pd.Series([ 150.0, 200.0, 50.0, 10.0, 5.0, 150.0, 200.0, 50.0, 10.0, 5.0, 1.0 ] * 10))) with self.subTest( "Apply categorical conversion for custom_column_types to dataframes" ): self.assertTrue(is_string_dtype(store.df1['to_categorical'])) self.assertTrue(store.df1['to_categorical'].equals( pd.Series([ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10))) with self.subTest( "Apply textual conversion for custom_column_types to dataframes" ): self.assertTrue(is_string_dtype(store.df1['to_text'])) self.assertTrue(store.df1['to_text'].equals( pd.Series([ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10)))
def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.float64, np.nan if is_uniform_reindex(join_units): # XXX: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = 'category' elif is_datetime64tz_dtype(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' elif issubclass(dtype.type, np.object_): upcast_cls = 'object' elif is_datetime64_dtype(dtype): upcast_cls = 'datetime' elif is_timedelta64_dtype(dtype): upcast_cls = 'timedelta' elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_extension_array_dtype(dtype): upcast_cls = 'object' elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: upcast_cls = 'float' # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_na: null_upcast_classes[upcast_cls].append(dtype) else: upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes # create the result if 'object' in upcast_classes: return np.dtype(np.object_), np.nan elif 'bool' in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif 'category' in upcast_classes: return np.dtype(np.object_), np.nan elif 'datetimetz' in upcast_classes: dtype = upcast_classes['datetimetz'] return dtype[0], tslibs.iNaT elif 'datetime' in upcast_classes: return np.dtype('M8[ns]'), tslibs.iNaT elif 'timedelta' in upcast_classes: return np.dtype('m8[ns]'), tslibs.iNaT else: # pragma try: g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(g): return g, g.type(np.nan) elif is_numeric_dtype(g): if has_none_blocks: return np.float64, np.nan else: return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. .. versionadded:: 0.19.2 Parameters ---------- vals : ndarray, Categorical encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. .. versionadded:: 0.20.0 Returns ------- 1d uint64 numpy array of hash values, same length as the vals """ if not hasattr(vals, 'dtype'): raise TypeError("must pass a ndarray-like") if hash_key is None: hash_key = _default_hash_key # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. if is_categorical_dtype(vals.dtype): return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(vals.dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. if is_bool_array(vals): vals = vals.astype('u8') elif (is_datetime64_dtype(vals) or is_timedelta64_dtype(vals)): vals = vals.view('i8').astype('u8', copy=False) elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = _hash.hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types vals = _hash.hash_object_array( vals.astype(str).astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xbf58476d1ce4e5b9) vals ^= vals >> 27 vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> ArrayLike: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis dtype = values.dtype is_numeric = is_numeric_dtype(dtype) cy_op = WrappedCythonOp(kind=kind, how=how) # can we do this operation with our cython functions # if not raise NotImplementedError cy_op.disallow_invalid_ops(dtype, is_numeric) if is_extension_array_dtype(dtype): return self._ea_wrap_cython_operation(kind, values, how, axis, min_count, **kwargs) elif values.ndim == 1: # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] res = self._cython_operation( kind=kind, values=values2d, how=how, axis=1, min_count=min_count, **kwargs, ) if res.shape[0] == 1: return res[0] # otherwise we have OHLC return res.T is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): # e.g. uint8 -> uint64, int16 -> int64 dtype = dtype.kind + "8" values = values.astype(dtype, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) ngroups = self.ngroups comp_ids, _, _ = self.group_info assert axis == 1 values = values.T out_shape = cy_op.get_output_shape(ngroups, values) func, values = cy_op.get_cython_func_and_vals(values, is_numeric) out_dtype = cy_op.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if how in ["min", "max"]: func( result, counts, values, comp_ids, min_count, is_datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) elif kind == "transform": # TODO: min_count func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan if self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] result = result.T if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) op_result = maybe_downcast_to_dtype(result, dtype) else: op_result = result return op_result
def _get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.dtype(np.float64), np.nan if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = "category" elif is_datetime64tz_dtype(dtype): upcast_cls = "datetimetz" elif issubclass(dtype.type, np.bool_): upcast_cls = "bool" elif issubclass(dtype.type, np.object_): upcast_cls = "object" elif is_datetime64_dtype(dtype): upcast_cls = "datetime" elif is_timedelta64_dtype(dtype): upcast_cls = "timedelta" elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_extension_array_dtype(dtype): upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: upcast_cls = "float" # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_na: null_upcast_classes[upcast_cls].append(dtype) else: upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? # create the result if "object" in upcast_classes: return np.dtype(np.object_), np.nan elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif "category" in upcast_classes: return np.dtype(np.object_), np.nan elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. dtype = upcast_classes["datetimetz"] return dtype[0], NaT elif "datetime" in upcast_classes: return np.dtype("M8[ns]"), np.datetime64("NaT", "ns") elif "timedelta" in upcast_classes: return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(g): return g, g.type(np.nan) elif is_numeric_dtype(g): if has_none_blocks: return np.dtype(np.float64), np.nan else: return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def is_numeric_mixed_type(self) -> bool: return all(is_numeric_dtype(t) for t in self.get_dtypes())
def assert_series_equal( left, right, check_dtype=True, check_index_type="equiv", check_series_type=True, check_less_precise=no_default, check_names=True, check_exact=False, check_datetimelike_compat=False, check_categorical=True, check_category_order=True, check_freq=True, check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", *, check_index=True, ): """ Check that left and right Series are equal. Parameters ---------- left : Series right : Series check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True Whether to check the Series class is identical. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the digits to compare. When comparing two numbers, if the first number has magnitude less than 1e-5, we compare the two numbers directly and check whether they are equivalent within the specified precision. Otherwise, we compare the **ratio** of the second number to the first number and check whether it is equivalent to 1 within the specified precision. .. deprecated:: 1.1.0 Use `rtol` and `atol` instead to define relative/absolute tolerance, respectively. Similar to :func:`math.isclose`. check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False Whether to compare number exactly. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals. .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. check_index : bool, default True Whether to check index equivalence. If False, then compare only values. .. versionadded:: 1.3.0 Examples -------- >>> from pandas import testing as tm >>> a = pd.Series([1, 2, 3, 4]) >>> b = pd.Series([1, 2, 3, 4]) >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True if check_less_precise is not no_default: warnings.warn( "The 'check_less_precise' keyword in testing.assert_*_equal " "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, stacklevel=find_stack_level(), ) rtol = atol = _get_tol_from_less_precise(check_less_precise) # instance validation _check_isinstance(left, right, Series) if check_series_type: assert_class_equal(left, right, obj=obj) # length comparison if len(left) != len(right): msg1 = f"{len(left)}, {left.index}" msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) if check_flags: assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" if check_index: # GH #38183 assert_index_equal( left.index, right.index, exact=check_index_type, check_names=check_names, check_exact=check_exact, check_categorical=check_categorical, rtol=rtol, atol=atol, obj=f"{obj}.index", ) if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)): lidx = left.index ridx = right.index assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq) if check_dtype: # We want to skip exact dtype checking when `check_categorical` # is False. We'll still raise if only one is a `Categorical`, # regardless of `check_categorical` if (isinstance(left.dtype, CategoricalDtype) and isinstance(right.dtype, CategoricalDtype) and not check_categorical): pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype( right.dtype): left_values = left._values right_values = right._values # Only check exact if dtype is numeric if isinstance(left_values, ExtensionArray) and isinstance( right_values, ExtensionArray): assert_extension_array_equal( left_values, right_values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) else: assert_numpy_array_equal( left_values, right_values, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif check_datetimelike_compat and (needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)): # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check # the values in that case # datetimelike may have different objects (e.g. datetime.datetime # vs Timestamp) but will compare equal if not Index(left._values).equals(Index(right._values)): msg = (f"[datetimelike_compat=True] {left._values} " f"is not equal to {right._values}.") raise AssertionError(msg) elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype): assert_interval_array_equal(left.array, right.array) elif isinstance(left.dtype, CategoricalDtype) or isinstance( right.dtype, CategoricalDtype): _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype( right.dtype): assert_extension_array_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif is_extension_array_dtype_and_needs_i8_conversion( left.dtype, right.dtype) or is_extension_array_dtype_and_needs_i8_conversion( right.dtype, left.dtype): assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) # metadata comparison if check_names: assert_attr_equal("name", left, right, obj=obj) if check_categorical: if isinstance(left.dtype, CategoricalDtype) or isinstance( right.dtype, CategoricalDtype): assert_categorical_equal( left._values, right._values, obj=f"{obj} category", check_category_order=check_category_order, )
def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.dtype(np.float64) if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype return empty_dtype has_none_blocks = any(unit.block is None for unit in join_units) dtypes = [ None if unit.block is None else unit.dtype for unit in join_units ] filtered_dtypes = [ unit.dtype for unit in join_units if unit.block is not None and not unit.is_na ] if not len(filtered_dtypes): filtered_dtypes = [ unit.dtype for unit in join_units if unit.block is not None ] dtype_alt = find_common_type(filtered_dtypes) upcast_classes = _get_upcast_classes(join_units, dtypes) if is_extension_array_dtype(dtype_alt): return dtype_alt elif dtype_alt == object: return dtype_alt # TODO: de-duplicate with maybe_promote? # create the result if "extension" in upcast_classes: return np.dtype("object") elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_) else: return np.dtype(np.bool_) elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. dtype = upcast_classes["datetimetz"] return dtype[0] elif "datetime" in upcast_classes: return np.dtype("M8[ns]") elif "timedelta" in upcast_classes: return np.dtype("m8[ns]") else: try: common_dtype = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_) else: if is_float_dtype(common_dtype): return common_dtype elif is_numeric_dtype(common_dtype): if has_none_blocks: return np.dtype(np.float64) else: return common_dtype msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: # If we ever have BoolIndex or ComplexIndex, this may need to be tightened return is_numeric_dtype(dtype)
def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, mask: np.ndarray | None = None, **kwargs, ) -> ArrayLike: """ Returns the values of a cython operation. """ assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis dtype = values.dtype is_numeric = is_numeric_dtype(dtype) cy_op = WrappedCythonOp(kind=kind, how=how) # can we do this operation with our cython functions # if not raise NotImplementedError cy_op.disallow_invalid_ops(dtype, is_numeric) comp_ids, _, _ = self.group_info ngroups = self.ngroups func_uses_mask = cy_op.uses_mask() if is_extension_array_dtype(dtype): if isinstance(values, BaseMaskedArray) and func_uses_mask: return cy_op._masked_ea_wrap_cython_operation( values, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, **kwargs, ) else: return cy_op._ea_wrap_cython_operation( values, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, **kwargs, ) return cy_op._cython_op_ndim_compat( values, min_count=min_count, ngroups=self.ngroups, comp_ids=comp_ids, mask=mask, **kwargs, )
def _call_cython_op( self, values: np.ndarray, # np.ndarray[ndim=2] *, min_count: int, ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values dtype = values.dtype is_numeric = is_numeric_dtype(dtype) is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): # e.g. uint8 -> uint64, int16 -> int64 dtype_str = dtype.kind + "8" values = values.astype(dtype_str, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) values = values.T if mask is not None: mask = mask.reshape(values.shape, order="C") out_shape = self.get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) out_dtype = self.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if self.how in ["min", "max"]: func( result, counts, values, comp_ids, min_count, is_datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) else: # TODO: min_count if self.uses_mask(): func( result, values, comp_ids, ngroups, is_datetimelike, mask=mask, **kwargs, ) else: func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if self.kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan result = result.T if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here res_dtype = self.get_result_dtype(orig_values.dtype) # error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type # "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]" op_result = maybe_downcast_to_dtype( result, res_dtype # type: ignore[arg-type] ) else: op_result = result # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]", # expected "ndarray") return op_result # type: ignore[return-value]
def plot_all(self, row=None, cols=None, xlabel="", ylabel="", legend="", cont_colors=False, plot_filter_pos="", plot_filter_neg="", save=False): figr = plt.figure() if row is None: row = self.x if cols is None: cols = self.ys good_cols = [] text_cols = [] for col in cols: if plot_filter_neg and col.startswith(plot_filter_neg): continue if plot_filter_pos and not col.startswith(plot_filter_pos): continue if is_numeric_dtype(self.dataframe[col]): good_cols.append(col) else: try: if "+-" in self.dataframe[col][0]: good_cols.append(col) except: pass for col in cols: if (col not in good_cols): try: if isinstance(self.dataframe[col][0], list) and None in self.dataframe[col][0]: continue if is_numeric_dtype(self.dataframe[col] ) or "+-" in self.dataframe[col][0]: continue text_cols.append(col) except: pass self.dataframe = self.dataframe.sort_values(by=text_cols, ascending=True) sidex = sidey = int(np.sqrt(len(good_cols) + 1)) while sidex * sidey < len(good_cols) + 1: # sidex += int((len(good_cols) - sidex ** 2) / sidex) + 1 sidex += 1 fig, axs = plt.subplots(sidex, sidey) # fig.subplots_adjust(bottom=0.2) fig.set_size_inches(25, 14) axes = axs.ravel() if cont_colors: # len(self.dataframe[col]) >= 15: plt.rcParams["axes.prop_cycle"] = plt.cycler( "color", plt.cm.viridis(np.linspace(0, 1, len(self.dataframe[col])))) for i, col in enumerate(good_cols): ax = axes[i] leg = legend if legend else col col = self.dataframe[col] if is_numeric_dtype(col): self.normal_plot(col, row, ax, xlabel=xlabel, ylabel=ylabel, legend=leg) else: try: self.conf_plot(col, row, ax, xlabel=xlabel, ylabel=ylabel, legend=leg) except: pass while i < len(axes) - 1: i += 1 axes[i].set_axis_off() leg = self.global_legend(text_cols, fig, axes[len(good_cols)]) plt.gcf().tight_layout() if save: fig.savefig('./img/samplefigure', bbox_extra_artists=[leg], bbox_inches='tight') plt.show()
def _cython_operation(self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. Names is only useful when dealing with 2D results, like ohlc (see self._name_functions). """ assert kind in ["transform", "aggregate"] orig_values = values if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( f"datetime64 type does not support {how} operations") elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( f"timedelta64 type does not support {how} operations") if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? kludge can be avoided when # 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, is_datetimelike, min_count) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, codes, func, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) return result, names
def convert_dtypes( input_array, convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, ) -> Dtype: """ Convert objects to best possible type, and optionally, to types supporting ``pd.NA``. Parameters ---------- input_array : ExtensionArray or PandasArray convert_string : bool, default True Whether object dtypes should be converted to ``StringDtype()``. convert_integer : bool, default True Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. Returns ------- dtype new dtype """ if convert_string or convert_integer or convert_boolean: try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: # Required to catch due to Period. Can remove once GH 23553 is fixed inferred_dtype = input_array.dtype if not convert_string and is_string_dtype(inferred_dtype): inferred_dtype = input_array.dtype if convert_integer: target_int_dtype = "Int64" if isinstance(inferred_dtype, str) and (inferred_dtype == "mixed-integer" or inferred_dtype == "mixed-integer-float"): inferred_dtype = target_int_dtype if is_integer_dtype( input_array.dtype) and not is_extension_array_dtype( input_array.dtype): from pandas.core.arrays.integer import _dtypes inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( input_array.dtype): inferred_dtype = target_int_dtype else: if is_integer_dtype(inferred_dtype): inferred_dtype = input_array.dtype if convert_boolean: if is_bool_dtype( input_array.dtype) and not is_extension_array_dtype( input_array.dtype): inferred_dtype = "boolean" else: if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": inferred_dtype = input_array.dtype else: inferred_dtype = input_array.dtype return inferred_dtype
def _get_empty_dtype_and_na( join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]: """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.dtype(np.float64), np.nan if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = _get_upcast_classes(join_units, dtypes) # TODO: de-duplicate with maybe_promote? # create the result if "extension" in upcast_classes: if len(upcast_classes) == 1: cls = upcast_classes["extension"][0] return cls, cls.na_value else: return np.dtype("object"), np.nan elif "object" in upcast_classes: return np.dtype(np.object_), np.nan elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif "category" in upcast_classes: return np.dtype(np.object_), np.nan elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. dtype = upcast_classes["datetimetz"] return dtype[0], NaT elif "datetime" in upcast_classes: return np.dtype("M8[ns]"), np.datetime64("NaT", "ns") elif "timedelta" in upcast_classes: return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: try: common_dtype = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(common_dtype): return common_dtype, common_dtype.type(np.nan) elif is_numeric_dtype(common_dtype): if has_none_blocks: return np.dtype(np.float64), np.nan else: return common_dtype, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> np.ndarray: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis dtype = values.dtype is_numeric = is_numeric_dtype(dtype) # can we do this operation with our cython functions # if not raise NotImplementedError self._disallow_invalid_ops(dtype, how, is_numeric) if is_extension_array_dtype(dtype): return self._ea_wrap_cython_operation( kind, values, how, axis, min_count, **kwargs ) is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = ensure_int_or_float(values) elif is_integer_dtype(dtype): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(dtype): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups,) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = maybe_fill(np.empty(values.shape, dtype=out_dtype)) # TODO: min_count result = self._transform( result, values, codes, func, is_datetimelike, **kwargs ) if is_integer_dtype(result.dtype) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if swapped: result = result.swapaxes(0, axis) if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) # error: Incompatible types in assignment (expression has type # "Union[ExtensionArray, ndarray]", variable has type "ndarray") result = maybe_downcast_to_dtype(result, dtype) # type: ignore[assignment] return result
def _can_union_without_object_cast(self, other) -> bool: # See GH#26778, further casting may occur in NumericIndex._union return is_numeric_dtype(other.dtype)
def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.float64, np.nan if is_uniform_reindex(join_units): # XXX: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = 'category' elif is_datetime64tz_dtype(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' elif issubclass(dtype.type, np.object_): upcast_cls = 'object' elif is_datetime64_dtype(dtype): upcast_cls = 'datetime' elif is_timedelta64_dtype(dtype): upcast_cls = 'timedelta' elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: upcast_cls = 'float' # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_na: null_upcast_classes[upcast_cls].append(dtype) else: upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes # create the result if 'object' in upcast_classes: return np.dtype(np.object_), np.nan elif 'bool' in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif 'category' in upcast_classes: return np.dtype(np.object_), np.nan elif 'datetimetz' in upcast_classes: dtype = upcast_classes['datetimetz'] return dtype[0], tslibs.iNaT elif 'datetime' in upcast_classes: return np.dtype('M8[ns]'), tslibs.iNaT elif 'timedelta' in upcast_classes: return np.dtype('m8[ns]'), tslibs.iNaT else: # pragma try: g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(g): return g, g.type(np.nan) elif is_numeric_dtype(g): if has_none_blocks: return np.float64, np.nan else: return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values): raise NotImplementedError( "categoricals are not support in cython ops ATM") elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( "datetime64 type does not support {} " "operations".format(how)) elif is_timedelta64_dtype(values): if how in ['prod', 'cumprod']: raise NotImplementedError( "timedelta64 type does not support {} " "operations".format(how)) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True values = values.swapaxes(0, axis) if arity > 1: raise NotImplementedError("arity of more than 1 is not " "supported for the 'how' argument") out_shape = (self.ngroups, ) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int64_or_float64(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == 'rank': out_dtype = 'float' else: if is_numeric: out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) else: out_dtype = 'object' labels, _, _ = self.group_info if kind == 'aggregate': result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, labels, func, is_numeric, is_datetimelike, min_count) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype('float64') result[mask] = np.nan if (kind == 'aggregate' and self._filter_empty_groups and not counts.all()): if result.ndim == 2: try: result = lib.row_bool_subset(result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( ensure_object(result), (counts > 0).view(np.uint8)) else: result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) return result, names
def _call_cython_op( self, values: np.ndarray, # np.ndarray[ndim=2] *, min_count: int, ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None, result_mask: np.ndarray | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values dtype = values.dtype is_numeric = is_numeric_dtype(dtype) is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): # GH#43329 If the dtype is explicitly of type uint64 the type is not # changed to prevent overflow. if dtype != np.uint64: values = values.astype(np.int64, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) values = values.T if mask is not None: mask = mask.T if result_mask is not None: result_mask = result_mask.T out_shape = self._get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) out_dtype = self.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if self.how in ["min", "max", "mean"]: func( result, counts, values, comp_ids, min_count, mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, ) elif self.how in ["add"]: # We support datetimelike func( result, counts, values, comp_ids, min_count, datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) else: # TODO: min_count if self.uses_mask(): func( result, values, comp_ids, ngroups, is_datetimelike, mask=mask, **kwargs, ) else: func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if self.kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan result = result.T if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) else: op_result = result # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]", # expected "ndarray") return op_result # type: ignore[return-value]
def normalize_vocabularies(dfs, vocabularies=None, train_vocabularies=True, unk=None, verbose=0): """ Categorize the columns of the dataframes so that they share the same categories if they share the same columns If a column's name ends up with '_id', do not categorize it since it is no something we want to train on Parameters ---------- dfs: list of pd.DataFrame DataFrame whose columns will be categorized vocabularies: dict or None Existing vocabulary to use if any train_vocabularies: bool or dict of (str, bool) Which category to extend/create in the voc ? unk: dict of (str, any) Which filler should we put for an unknown object if we cannot train the corresponding voc ? verbose: int Returns ------- list of pd.DataFrame, dict """ # Define label vocabulary if unk is None: unk = {} if vocabularies is None: vocabularies = {} voc_order = list(vocabularies.keys()) if train_vocabularies is False: train_vocabularies = defaultdict(lambda: False) else: train_vocabularies_ = defaultdict(lambda: True) if isinstance(train_vocabularies, dict): train_vocabularies_.update(train_vocabularies) train_vocabularies = train_vocabularies_ del train_vocabularies_ for col_name in vocabularies: if col_name not in train_vocabularies: train_vocabularies[col_name] = False for df in dfs: for col_name in df: if not col_name.endswith('_id') and not is_numeric_dtype( df[col_name].dtype): if train_vocabularies[col_name]: train_vocabularies[col_name] = True else: train_vocabularies[col_name] = False for col_name, will_train in train_vocabularies.items(): if will_train and verbose: print(f"Will train vocabulary for {col_name}") for df in dfs: for col_name in df: if hasattr( df[col_name], 'cat' ) and col_name not in vocabularies and not col_name.endswith( '_id'): if verbose: print( f"Discovered existing vocabulary ({len(df[col_name].cat.categories)} entities) for {col_name}" ) vocabularies[col_name] = list(df[col_name].dtype.categories) for voc_name, train_voc in train_vocabularies.items(): if train_voc: voc = list(vocabularies.get(voc_name, [])) if voc_name in unk and unk[voc_name] not in voc: voc.append(unk[voc_name]) if hasattr(voc, 'categories'): voc = list(voc.categories) for df in dfs: if voc_name in df: voc.extend(df[voc_name].astype("category").cat.categories) voc = pd.factorize(voc)[1] dtype = pd.CategoricalDtype(pd.factorize(voc)[1]) for df in dfs: if voc_name in df: df[voc_name] = df[voc_name].astype(dtype) vocabularies[voc_name] = voc if voc_name in unk: df[voc_name].fillna(unk[voc_name], inplace=True) else: voc = vocabularies.get(voc_name) if not hasattr(voc, 'categories'): voc = pd.CategoricalDtype(voc) for df in dfs: if voc_name in df: df[voc_name] = df[voc_name].astype(voc) if verbose: unk_msg = f"unk {unk[voc_name]}" if voc_name in unk else "no unk" print( f"Normalized {voc_name}, with given vocabulary and {unk_msg}" ) if voc_name in unk: df[voc_name].fillna(unk[voc_name], inplace=True) # Reorder vocabularies to keep same order as the vocabulary passed in parameters vocabularies = dict( (*((c, vocabularies[c]) for c in voc_order if c in vocabularies), *((c, vocabularies[c]) for c in vocabularies if c not in voc_order))) # Reorder dataframes according to vocabulary order dfs = [ df[[ *(c for c in vocabularies if c in df.columns), *(c for c in df.columns if c not in vocabularies) ]] for df in dfs ] return dfs, vocabularies
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ["transform", "aggregate"] # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( "{} are not support in cython ops".format(values.dtype)) elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( "datetime64 type does not support {} " "operations".format(how)) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( "timedelta64 type does not support {} " "operations".format(how)) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError("arity of more than 1 is not " "supported for the 'how' argument") out_shape = (self.ngroups, ) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = "{kind}{itemsize}".format( kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = "object" labels, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, labels, func, is_numeric, is_datetimelike, min_count, ) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) return result, names
def _adjust_to_origin(arg, origin, unit): """ Helper function for to_datetime. Adjust input argument to the specified origin Parameters ---------- arg : list, tuple, ndarray, Series, Index date to be adjusted origin : 'julian' or Timestamp origin offset for the arg unit : string passed unit from to_datetime, must be 'D' Returns ------- ndarray or scalar of adjusted date(s) """ if origin == "julian": original = arg j0 = Timestamp(0).to_julian_date() if unit != "D": raise ValueError("unit must be 'D' for origin='julian'") try: arg = arg - j0 except TypeError: raise ValueError( "incompatible 'arg' type for given 'origin'='julian'") # preemptively check this for a nice range j_max = Timestamp.max.to_julian_date() - j0 j_min = Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): raise tslibs.OutOfBoundsDatetime( f"{original} is Out of Bounds for origin='julian'") else: # arg must be numeric if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or is_numeric_dtype(np.asarray(arg))): raise ValueError( f"'{arg}' is not compatible with origin='{origin}'; " "it must be numeric with a unit specified") # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) except tslibs.OutOfBoundsDatetime: raise tslibs.OutOfBoundsDatetime( f"origin {origin} is Out of Bounds") except ValueError: raise ValueError( f"origin {origin} cannot be converted to a Timestamp") if offset.tz is not None: raise ValueError(f"origin offset {offset} must be tz-naive") offset -= Timestamp(0) # convert the offset to the unit of the arg # this should be lossless in terms of precision offset = offset // tslibs.Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance( arg, (ABCSeries, ABCIndexClass, np.ndarray)): arg = np.asarray(arg) arg = arg + offset return arg
def to_numeric(arg, errors="raise", downcast=None): """ Convert argument to a numeric type. The default return dtype is `float64` or `int64` depending on the data supplied. Use the `downcast` parameter to obtain other dtypes. Please note that precision loss may occur if really large numbers are passed in. Due to the internal limitations of `ndarray`, if numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are passed in, it is very likely they will be converted to float so that they can stored in an `ndarray`. These warnings apply similarly to `Series` since it internally leverages `ndarray`. Parameters ---------- arg : scalar, list, tuple, 1-d array, or Series Argument to be converted. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. downcast : {'integer', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) As this behaviour is separate from the core conversion to numeric values, any errors raised during the downcasting will be surfaced regardless of the value of the 'errors' input. In addition, downcasting will only occur if the size of the resulting data's dtype is strictly larger than the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. Returns ------- ret Numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray. See Also -------- DataFrame.astype : Cast argument to a specified dtype. to_datetime : Convert argument to datetime. to_timedelta : Convert argument to timedelta. numpy.ndarray.astype : Cast a numpy array to a specified type. DataFrame.convert_dtypes : Convert dtypes. Examples -------- Take separate series and convert to numeric, coercing when told to >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 >>> pd.to_numeric(s, downcast='float') 0 1.0 1 2.0 2 -3.0 dtype: float32 >>> pd.to_numeric(s, downcast='signed') 0 1 1 2 2 -3 dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') 0 apple 1 1.0 2 2 3 -3 dtype: object >>> pd.to_numeric(s, errors='coerce') 0 NaN 1 1.0 2 2.0 3 -3.0 dtype: float64 """ if downcast not in (None, "integer", "signed", "unsigned", "float"): raise ValueError("invalid downcasting method provided") if errors not in ("ignore", "raise", "coerce"): raise ValueError("invalid error value specified") is_series = False is_index = False is_scalars = False if isinstance(arg, ABCSeries): is_series = True values = arg.values elif isinstance(arg, ABCIndex): is_index = True if needs_i8_conversion(arg.dtype): values = arg.asi8 else: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype="O") elif is_scalar(arg): if is_decimal(arg): return float(arg) if is_number(arg): return arg is_scalars = True values = np.array([arg], dtype="O") elif getattr(arg, "ndim", 1) > 1: raise TypeError("arg must be a list, tuple, 1-d array, or Series") else: values = arg values_dtype = getattr(values, "dtype", None) if is_numeric_dtype(values_dtype): pass elif is_datetime_or_timedelta_dtype(values_dtype): values = values.astype(np.int64) else: values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) except (ValueError, TypeError): if errors == "raise": raise # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values.dtype): typecodes = None if downcast in ("integer", "signed"): typecodes = np.typecodes["Integer"] elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0): typecodes = np.typecodes["UnsignedInteger"] elif downcast == "float": typecodes = np.typecodes["Float"] # pandas support goes only to np.float32, # as float dtypes smaller than that are # extremely rare and not well supported float_32_char = np.dtype(np.float32).char float_32_ind = typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: # from smallest to largest for dtype in typecodes: dtype = np.dtype(dtype) if dtype.itemsize <= values.dtype.itemsize: values = maybe_downcast_numeric(values, dtype) # successful conversion if values.dtype == dtype: break if is_series: return arg._constructor(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy return pd.Index(values, name=arg.name) elif is_scalars: return values[0] else: return values
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. .. versionadded:: 0.19.2 Parameters ---------- vals : ndarray, Categorical encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. .. versionadded:: 0.20.0 Returns ------- 1d uint64 numpy array of hash values, same length as the vals """ if not hasattr(vals, 'dtype'): raise TypeError("must pass a ndarray-like") if hash_key is None: hash_key = _default_hash_key # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. if is_categorical_dtype(vals.dtype): return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(vals.dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. if is_bool_array(vals): vals = vals.astype('u8') elif (is_datetime64_dtype(vals) or is_timedelta64_dtype(vals)): vals = vals.view('i8').astype('u8', copy=False) elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: from pandas import factorize, Categorical, Index codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = hashing.hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types vals = hashing.hash_object_array(vals.astype(str).astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xbf58476d1ce4e5b9) vals ^= vals >> 27 vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix'): """ Convert argument to datetime. Parameters ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series .. versionadded: 0.18.1 or DataFrame/dict-like errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input dayfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. If True, parses dates with the day first, eg 10/11/12 is parsed as 2012-11-10. Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug, based on dateutil behavior). yearfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. - If True parses dates with the year first, eg 10/11/12 is parsed as 2010-11-12. - If both dayfirst and yearfirst are True, yearfirst is preceded (same as dateutil). Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil beahavior). .. versionadded: 0.16.1 utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). box : boolean, default True - If True returns a DatetimeIndex - If False returns ndarray of values. format : string, default None strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. exact : boolean, True by default - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. unit : string, default 'ns' unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. Example, with unit='ms' and origin='unix' (the default), this would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default is 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. - If 'unix' (or POSIX) time; origin is set to 1970-01-01. - If 'julian', unit must be 'D', and origin is set to beginning of Julian Calendar. Julian day number 0 is assigned to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible, origin is set to Timestamp identified by origin. .. versionadded: 0.20.0 Returns ------- ret : datetime if parsing succeeded. Return type depends on input: - list-like: DatetimeIndex - Series: Series of datetime64 dtype - scalar: Timestamp In case when it is not possible to return designated types (e.g. when any element of input is before Timestamp.min or after Timestamp.max) return will have datetime.datetime type (or correspoding array/Series). Examples -------- Assembling a datetime from multiple columns of a DataFrame. The keys can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same >>> df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}) >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 dtype: datetime64[ns] If a date does not meet the `timestamp limitations <http://pandas.pydata.org/pandas-docs/stable/timeseries.html #timeseries-timestamp-limits>`_, passing errors='ignore' will return the original input instead of raising any exception. Passing errors='coerce' will force an out-of-bounds date to NaT, in addition to forcing non-dates (or non-parseable dates) to NaT. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') datetime.datetime(1300, 1, 1, 0, 0) >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT Passing infer_datetime_format=True can often-times speedup a parsing if its not an ISO8601 format exactly, but in a regular format. >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000) >>> s.head() 0 3/11/2000 1 3/12/2000 2 3/13/2000 3 3/11/2000 4 3/12/2000 dtype: object >>> %timeit pd.to_datetime(s,infer_datetime_format=True) 100 loops, best of 3: 10.4 ms per loop >>> %timeit pd.to_datetime(s,infer_datetime_format=False) 1 loop, best of 3: 471 ms per loop Using a unix epoch time >>> pd.to_datetime(1490195805, unit='s') Timestamp('2017-03-22 15:16:45') >>> pd.to_datetime(1490195805433502912, unit='ns') Timestamp('2017-03-22 15:16:45.433502912') .. warning:: For float arg, precision rounding might happen. To prevent unexpected behavior use a fixed-width exact type. Using a non-unix epoch origin >>> pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')) 0 1960-01-02 1 1960-01-03 2 1960-01-04 """ from pandas.core.indexes.datetimes import DatetimeIndex tz = 'utc' if utc else None def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg elif is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index return Index(result) return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601) if is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e if arg is None: return None # handle origin if origin == 'julian': original = arg j0 = tslib.Timestamp(0).to_julian_date() if unit != 'D': raise ValueError("unit must be 'D' for origin='julian'") try: arg = arg - j0 except: raise ValueError("incompatible 'arg' type for given " "'origin'='julian'") # premptively check this for a nice range j_max = tslib.Timestamp.max.to_julian_date() - j0 j_min = tslib.Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): raise tslib.OutOfBoundsDatetime( "{original} is Out of Bounds for " "origin='julian'".format(original=original)) elif origin not in ['unix', 'julian']: # arg must be a numeric original = arg if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or is_numeric_dtype(np.asarray(arg))): raise ValueError( "'{arg}' is not compatible with origin='{origin}'; " "it must be numeric with a unit specified ".format( arg=arg, origin=origin)) # we are going to offset back to unix / epoch time try: offset = tslib.Timestamp(origin) - tslib.Timestamp(0) except tslib.OutOfBoundsDatetime: raise tslib.OutOfBoundsDatetime( "origin {} is Out of Bounds".format(origin)) except ValueError: raise ValueError("origin {} cannot be converted " "to a Timestamp".format(origin)) # convert the offset to the unit of the arg # this should be lossless in terms of precision offset = offset // tslib.Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance( arg, (ABCSeries, ABCIndexClass, np.ndarray)): arg = np.asarray(arg) arg = arg + offset if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): from pandas import Series values = _convert_listlike(arg._values, False, format) result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): result = _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): result = _convert_listlike(arg, box, format) else: result = _convert_listlike(np.array([arg]), box, format)[0] return result
def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. The default return dtype is `float64` or `int64` depending on the data supplied. Use the `downcast` parameter to obtain other dtypes. Parameters ---------- arg : list, tuple, 1-d array, or Series errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input downcast : {'integer', 'signed', 'unsigned', 'float'} , default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) As this behaviour is separate from the core conversion to numeric values, any errors raised during the downcasting will be surfaced regardless of the value of the 'errors' input. In addition, downcasting will only occur if the size of the resulting data's dtype is strictly larger than the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. .. versionadded:: 0.19.0 Returns ------- ret : numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray Examples -------- Take separate series and convert to numeric, coercing when told to >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 >>> pd.to_numeric(s, downcast='float') 0 1.0 1 2.0 2 -3.0 dtype: float32 >>> pd.to_numeric(s, downcast='signed') 0 1 1 2 2 -3 dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') 0 apple 1 1.0 2 2 3 -3 dtype: object >>> pd.to_numeric(s, errors='coerce') 0 NaN 1 1.0 2 2.0 3 -3.0 dtype: float64 See Also -------- pandas.DataFrame.astype : Cast argument to a specified dtype. pandas.to_datetime : Convert argument to datetime. pandas.to_timedelta : Convert argument to timedelta. numpy.ndarray.astype : Cast a numpy array to a specified type. """ if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): raise ValueError('invalid downcasting method provided') is_series = False is_index = False is_scalars = False if isinstance(arg, ABCSeries): is_series = True values = arg.values elif isinstance(arg, ABCIndexClass): is_index = True values = arg.asi8 if values is None: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif is_scalar(arg): if is_decimal(arg): return float(arg) if is_number(arg): return arg is_scalars = True values = np.array([arg], dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a list, tuple, 1-d array, or Series') else: values = arg try: if is_numeric_dtype(values): pass elif is_datetime_or_timedelta_dtype(values): values = values.astype(np.int64) else: values = ensure_object(values) coerce_numeric = False if errors in ('ignore', 'raise') else True values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) except Exception: if errors == 'raise': raise # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values): typecodes = None if downcast in ('integer', 'signed'): typecodes = np.typecodes['Integer'] elif downcast == 'unsigned' and np.min(values) >= 0: typecodes = np.typecodes['UnsignedInteger'] elif downcast == 'float': typecodes = np.typecodes['Float'] # pandas support goes only to np.float32, # as float dtypes smaller than that are # extremely rare and not well supported float_32_char = np.dtype(np.float32).char float_32_ind = typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: # from smallest to largest for dtype in typecodes: if np.dtype(dtype).itemsize <= values.dtype.itemsize: values = maybe_downcast_to_dtype(values, dtype) # successful conversion if values.dtype == dtype: break if is_series: return pd.Series(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy_with_infer return pd.Index(values, name=arg.name) elif is_scalars: return values[0] else: return values
def _adjust_to_origin(arg, origin, unit): """ Helper function for to_datetime. Adjust input argument to the specified origin Parameters ---------- arg : list, tuple, ndarray, Series, Index date to be adjusted origin : 'julian' or Timestamp origin offset for the arg unit : string passed unit from to_datetime, must be 'D' Returns ------- ndarray or scalar of adjusted date(s) """ if origin == 'julian': original = arg j0 = Timestamp(0).to_julian_date() if unit != 'D': raise ValueError("unit must be 'D' for origin='julian'") try: arg = arg - j0 except TypeError: raise ValueError("incompatible 'arg' type for given " "'origin'='julian'") # premptively check this for a nice range j_max = Timestamp.max.to_julian_date() - j0 j_min = Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): raise tslibs.OutOfBoundsDatetime( "{original} is Out of Bounds for " "origin='julian'".format(original=original)) else: # arg must be numeric if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or is_numeric_dtype(np.asarray(arg))): raise ValueError( "'{arg}' is not compatible with origin='{origin}'; " "it must be numeric with a unit specified ".format( arg=arg, origin=origin)) # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) except tslibs.OutOfBoundsDatetime: raise tslibs.OutOfBoundsDatetime( "origin {origin} is Out of Bounds".format(origin=origin)) except ValueError: raise ValueError("origin {origin} cannot be converted " "to a Timestamp".format(origin=origin)) if offset.tz is not None: raise ValueError( "origin offset {} must be tz-naive".format(offset)) offset -= Timestamp(0) # convert the offset to the unit of the arg # this should be lossless in terms of precision offset = offset // tslibs.Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance( arg, (ABCSeries, ABCIndexClass, np.ndarray)): arg = np.asarray(arg) arg = arg + offset return arg
def assertPandasEqual(self, left, right, check_exact=True): if isinstance(left, pd.DataFrame) and isinstance(right, pd.DataFrame): try: if LooseVersion(pd.__version__) >= LooseVersion("1.1"): kwargs = dict(check_freq=False) else: kwargs = dict() if LooseVersion(pd.__version__) < LooseVersion("1.1.1"): # Due to https://github.com/pandas-dev/pandas/issues/35446 check_exact = (check_exact and all([ is_numeric_dtype(dtype) for dtype in left.dtypes ]) and all( [is_numeric_dtype(dtype) for dtype in right.dtypes])) assert_frame_equal( left, right, check_index_type=("equiv" if len(left.index) > 0 else False), check_column_type=("equiv" if len(left.columns) > 0 else False), check_exact=check_exact, **kwargs, ) except AssertionError as e: msg = (str(e) + "\n\nLeft:\n%s\n%s" % (left, left.dtypes) + "\n\nRight:\n%s\n%s" % (right, right.dtypes)) raise AssertionError(msg) from e elif isinstance(left, pd.Series) and isinstance(right, pd.Series): try: if LooseVersion(pd.__version__) >= LooseVersion("1.1"): kwargs = dict(check_freq=False) else: kwargs = dict() if LooseVersion(pd.__version__) < LooseVersion("1.1.1"): # Due to https://github.com/pandas-dev/pandas/issues/35446 check_exact = (check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype)) assert_series_equal( left, right, check_index_type=("equiv" if len(left.index) > 0 else False), check_exact=check_exact, **kwargs, ) except AssertionError as e: msg = (str(e) + "\n\nLeft:\n%s\n%s" % (left, left.dtype) + "\n\nRight:\n%s\n%s" % (right, right.dtype)) raise AssertionError(msg) from e elif isinstance(left, pd.Index) and isinstance(right, pd.Index): try: if LooseVersion(pd.__version__) < LooseVersion("1.1.1"): # Due to https://github.com/pandas-dev/pandas/issues/35446 check_exact = (check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype)) assert_index_equal(left, right, check_exact=check_exact) except AssertionError as e: msg = (str(e) + "\n\nLeft:\n%s\n%s" % (left, left.dtype) + "\n\nRight:\n%s\n%s" % (right, right.dtype)) raise AssertionError(msg) from e else: raise ValueError("Unexpected values: (%s, %s)" % (left, right))
def align_text(series: pd.Series) -> List[str]: """ Right align numeric data; Left align text data""" return [ "text-align: right;" if is_numeric_dtype(series) else "text-align: left" for v in series ]
def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. Parameters ---------- arg : list, tuple, 1-d array, or Series errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input downcast : {'integer', 'signed', 'unsigned', 'float'} , default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) As this behaviour is separate from the core conversion to numeric values, any errors raised during the downcasting will be surfaced regardless of the value of the 'errors' input. In addition, downcasting will only occur if the size of the resulting data's dtype is strictly larger than the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. .. versionadded:: 0.19.0 Returns ------- ret : numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray Examples -------- Take separate series and convert to numeric, coercing when told to >>> import pandas as pd >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 >>> pd.to_numeric(s, downcast='float') 0 1.0 1 2.0 2 -3.0 dtype: float32 >>> pd.to_numeric(s, downcast='signed') 0 1 1 2 2 -3 dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') 0 apple 1 1.0 2 2 3 -3 dtype: object >>> pd.to_numeric(s, errors='coerce') 0 NaN 1 1.0 2 2.0 3 -3.0 dtype: float64 """ if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): raise ValueError('invalid downcasting method provided') is_series = False is_index = False is_scalar = False if isinstance(arg, pd.Series): is_series = True values = arg.values elif isinstance(arg, pd.Index): is_index = True values = arg.asi8 if values is None: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif isscalar(arg): if is_decimal(arg): return float(arg) if is_number(arg): return arg is_scalar = True values = np.array([arg], dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a list, tuple, 1-d array, or Series') else: values = arg try: if is_numeric_dtype(values): pass elif is_datetime_or_timedelta_dtype(values): values = values.astype(np.int64) else: values = _ensure_object(values) coerce_numeric = False if errors in ('ignore', 'raise') else True values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) except Exception: if errors == 'raise': raise # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values): typecodes = None if downcast in ('integer', 'signed'): typecodes = np.typecodes['Integer'] elif downcast == 'unsigned' and np.min(values) >= 0: typecodes = np.typecodes['UnsignedInteger'] elif downcast == 'float': typecodes = np.typecodes['Float'] # pandas support goes only to np.float32, # as float dtypes smaller than that are # extremely rare and not well supported float_32_char = np.dtype(np.float32).char float_32_ind = typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: # from smallest to largest for dtype in typecodes: if np.dtype(dtype).itemsize <= values.dtype.itemsize: values = maybe_downcast_to_dtype(values, dtype) # successful conversion if values.dtype == dtype: break if is_series: return pd.Series(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy_with_infer return Index(values, name=arg.name) elif is_scalar: return values[0] else: return values
def autogen_schema(X, ordinal_max_items=2, feature_names=None, feature_types=None): """ Generates data schema for a given dataset as JSON representable. Args: X: Dataframe/ndarray to build schema from. ordinal_max_items: If a numeric column's cardinality is at most this integer, consider it as ordinal instead of continuous. feature_names: Feature names feature_types: Feature types Returns: A dictionary - schema that encapsulates column information, such as type and domain. """ schema = OrderedDict() col_number = 0 if isinstance(X, np.ndarray): log.warning( "Passing a numpy array to schema autogen when it should be dataframe." ) if feature_names is None: X = pd.DataFrame(X, columns=[ 'col_' + str(i) for i in range(X.shape[1]) ]).infer_objects() else: X = pd.DataFrame(X, columns=feature_names).infer_objects() if isinstance(X, NDFrame): for name, col_dtype in zip(X.dtypes.index, X.dtypes): schema[name] = {} if is_numeric_dtype(col_dtype): # schema[name]['type'] = 'continuous' # TODO: Fix this once we know it works. if len(set(X[name])) > ordinal_max_items: schema[name]['type'] = 'continuous' else: # TODO: Work with ordinal later. schema[name]['type'] = 'categorical' # schema[name]['type'] = 'ordinal' # schema[name]['order'] = list(set(X[name])) elif is_string_dtype(col_dtype): schema[name]['type'] = 'categorical' else: warnings.warn("Unknown column: " + name, RuntimeWarning) schema[name]['type'] = 'unknown' schema[name]['column_number'] = col_number col_number += 1 # Override if feature_types is passed as arg. if feature_types is not None: for idx, name in enumerate(X.dtypes.index): schema[name]['type'] = feature_types[idx] else: raise TypeError( "GA2M only supports numpy arrays or pandas dataframes.") return schema
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values): raise NotImplementedError( "categoricals are not support in cython ops ATM") elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( "datetime64 type does not support {} " "operations".format(how)) elif is_timedelta64_dtype(values): if how in ['prod', 'cumprod']: raise NotImplementedError( "timedelta64 type does not support {} " "operations".format(how)) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True values = values.swapaxes(0, axis) if arity > 1: raise NotImplementedError("arity of more than 1 is not " "supported for the 'how' argument") out_shape = (self.ngroups,) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int64_or_float64(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) try: func = self._get_cython_function( kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function( kind, how, values, is_numeric) else: raise if how == 'rank': out_dtype = 'float' else: if is_numeric: out_dtype = '{kind}{itemsize}'.format( kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = 'object' labels, _, _ = self.group_info if kind == 'aggregate': result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, labels, func, is_numeric, is_datetimelike, min_count) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform( result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype('float64') result[mask] = np.nan if (kind == 'aggregate' and self._filter_empty_groups and not counts.all()): if result.ndim == 2: try: result = lib.row_bool_subset( result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( ensure_object(result), (counts > 0).view(np.uint8)) else: result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) return result, names
def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ): assert kind in ["transform", "aggregate"] orig_values = values # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( "{dtype} dtype not supported".format(dtype=values.dtype) ) elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( "datetime64 type does not support {how} operations".format(how=how) ) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( "timedelta64 type does not support {how} operations".format(how=how) ) if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? kludge can be avoided when # 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups,) + values.shape[1:] try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: try: values = ensure_float64(values) except TypeError: if lib.infer_dtype(values, skipna=False) == "complex": values = values.astype(complex) else: raise func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = "{kind}{itemsize}".format( kind=values.dtype.kind, itemsize=values.dtype.itemsize ) else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill( np.empty(out_shape, dtype=out_dtype), fill_value=np.nan ) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, codes, func, is_datetimelike, min_count ) elif kind == "transform": result = _maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan ) # TODO: min_count result = self._transform( result, values, codes, func, is_datetimelike, **kwargs ) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: names = self._name_functions[how]() # type: Optional[List[str]] else: names = None if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) return result, names
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', cache=False): """ Convert argument to datetime. Parameters ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series .. versionadded:: 0.18.1 or DataFrame/dict-like errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input dayfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. If True, parses dates with the day first, eg 10/11/12 is parsed as 2012-11-10. Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug, based on dateutil behavior). yearfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. - If True parses dates with the year first, eg 10/11/12 is parsed as 2010-11-12. - If both dayfirst and yearfirst are True, yearfirst is preceded (same as dateutil). Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil beahavior). .. versionadded:: 0.16.1 utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). box : boolean, default True - If True returns a DatetimeIndex - If False returns ndarray of values. format : string, default None strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. exact : boolean, True by default - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. unit : string, default 'ns' unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. Example, with unit='ms' and origin='unix' (the default), this would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default is 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. - If 'unix' (or POSIX) time; origin is set to 1970-01-01. - If 'julian', unit must be 'D', and origin is set to beginning of Julian Calendar. Julian day number 0 is assigned to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible, origin is set to Timestamp identified by origin. .. versionadded:: 0.20.0 cache : boolean, default False If True, use a cache of unique, converted dates to apply the datetime conversion. May produce sigificant speed-up when parsing duplicate date strings, especially ones with timezone offsets. .. versionadded:: 0.22.0 Returns ------- ret : datetime if parsing succeeded. Return type depends on input: - list-like: DatetimeIndex - Series: Series of datetime64 dtype - scalar: Timestamp In case when it is not possible to return designated types (e.g. when any element of input is before Timestamp.min or after Timestamp.max) return will have datetime.datetime type (or correspoding array/Series). Examples -------- Assembling a datetime from multiple columns of a DataFrame. The keys can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same >>> df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}) >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 dtype: datetime64[ns] If a date does not meet the `timestamp limitations <http://pandas.pydata.org/pandas-docs/stable/timeseries.html #timeseries-timestamp-limits>`_, passing errors='ignore' will return the original input instead of raising any exception. Passing errors='coerce' will force an out-of-bounds date to NaT, in addition to forcing non-dates (or non-parseable dates) to NaT. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') datetime.datetime(1300, 1, 1, 0, 0) >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT Passing infer_datetime_format=True can often-times speedup a parsing if its not an ISO8601 format exactly, but in a regular format. >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000) >>> s.head() 0 3/11/2000 1 3/12/2000 2 3/13/2000 3 3/11/2000 4 3/12/2000 dtype: object >>> %timeit pd.to_datetime(s,infer_datetime_format=True) 100 loops, best of 3: 10.4 ms per loop >>> %timeit pd.to_datetime(s,infer_datetime_format=False) 1 loop, best of 3: 471 ms per loop Using a unix epoch time >>> pd.to_datetime(1490195805, unit='s') Timestamp('2017-03-22 15:16:45') >>> pd.to_datetime(1490195805433502912, unit='ns') Timestamp('2017-03-22 15:16:45.433502912') .. warning:: For float arg, precision rounding might happen. To prevent unexpected behavior use a fixed-width exact type. Using a non-unix epoch origin >>> pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')) 0 1960-01-02 1 1960-01-03 2 1960-01-04 See also -------- pandas.DataFrame.astype : Cast argument to a specified dtype. pandas.to_timedelta : Convert argument to timedelta. """ from pandas.core.indexes.datetimes import DatetimeIndex tz = 'utc' if utc else None def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg elif is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index return Index(result) return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result = array_strptime(arg, format, exact=exact, errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601 ) if is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: try: values, tz = conversion.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e if arg is None: return None # handle origin if origin == 'julian': original = arg j0 = tslib.Timestamp(0).to_julian_date() if unit != 'D': raise ValueError("unit must be 'D' for origin='julian'") try: arg = arg - j0 except: raise ValueError("incompatible 'arg' type for given " "'origin'='julian'") # premptively check this for a nice range j_max = tslib.Timestamp.max.to_julian_date() - j0 j_min = tslib.Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): raise tslib.OutOfBoundsDatetime( "{original} is Out of Bounds for " "origin='julian'".format(original=original)) elif origin not in ['unix', 'julian']: # arg must be a numeric original = arg if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or is_numeric_dtype(np.asarray(arg))): raise ValueError( "'{arg}' is not compatible with origin='{origin}'; " "it must be numeric with a unit specified ".format( arg=arg, origin=origin)) # we are going to offset back to unix / epoch time try: offset = tslib.Timestamp(origin) except tslib.OutOfBoundsDatetime: raise tslib.OutOfBoundsDatetime( "origin {origin} is Out of Bounds".format(origin=origin)) except ValueError: raise ValueError("origin {origin} cannot be converted " "to a Timestamp".format(origin=origin)) if offset.tz is not None: raise ValueError( "origin offset {} must be tz-naive".format(offset)) offset -= tslib.Timestamp(0) # convert the offset to the unit of the arg # this should be lossless in terms of precision offset = offset // tslib.Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance( arg, (ABCSeries, ABCIndexClass, np.ndarray)): arg = np.asarray(arg) arg = arg + offset if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) if not cache_array.empty: result = arg.map(cache_array) else: from pandas import Series values = _convert_listlike(arg._values, True, format) result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array, box, errors, name=arg.name) else: result = _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): cache_array = _maybe_cache(arg, format, cache, tz, _convert_listlike) if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array, box, errors) else: result = _convert_listlike(arg, box, format) else: result = _convert_listlike(np.array([arg]), box, format)[0] return result
def coerce_to_array(values, mask=None, copy: bool = False): """ Coerce the input values array to numpy arrays with a mask. Parameters ---------- values : 1D list-like mask : bool 1D array, optional copy : bool, default False if True, copy the input Returns ------- tuple of (values, mask) """ if isinstance(values, BooleanArray): if mask is not None: raise ValueError("cannot pass mask for BooleanArray input") values, mask = values._data, values._mask if copy: values = values.copy() mask = mask.copy() return values, mask mask_values = None if isinstance(values, np.ndarray) and values.dtype == np.bool_: if copy: values = values.copy() elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype): mask_values = isna(values) values_bool = np.zeros(len(values), dtype=bool) values_bool[~mask_values] = values[~mask_values].astype(bool) if not np.all(values_bool[~mask_values].astype(values.dtype) == values[~mask_values]): raise TypeError("Need to pass bool-like values") values = values_bool else: values_object = np.asarray(values, dtype=object) inferred_dtype = lib.infer_dtype(values_object, skipna=True) integer_like = ("floating", "integer", "mixed-integer-float") if inferred_dtype not in ("boolean", "empty") + integer_like: raise TypeError("Need to pass bool-like values") mask_values = isna(values_object) values = np.zeros(len(values), dtype=bool) values[~mask_values] = values_object[~mask_values].astype(bool) # if the values were integer-like, validate it were actually 0/1's if inferred_dtype in integer_like: if not np.all(values[~mask_values].astype(float) == values_object[~mask_values].astype(float)): raise TypeError("Need to pass bool-like values") if mask is None and mask_values is None: mask = np.zeros(len(values), dtype=bool) elif mask is None: mask = mask_values else: if isinstance(mask, np.ndarray) and mask.dtype == np.bool_: if mask_values is not None: mask = mask | mask_values else: if copy: mask = mask.copy() else: mask = np.array(mask, dtype=bool) if mask_values is not None: mask = mask | mask_values if not values.ndim == 1: raise ValueError("values must be a 1D list-like") if not mask.ndim == 1: raise ValueError("mask must be a 1D list-like") return values, mask