def test_datetime_series_binops_pandas(lhs_dtype, rhs_dtype):
    """Check that datetime comparison operators agree between pandas and GPU Series.

    Two pandas datetime series are built over the same date span with
    slightly different frequencies (400h vs. 401h), mirrored into GPU
    ``Series`` cast to ``lhs_dtype`` / ``rhs_dtype``, and every comparison
    operator is evaluated on both sides and compared with ``assert_eq``.
    """
    first_day, last_day = "20010101", "20020215"
    pd_lhs = pd.Series(
        pd.date_range(first_day, last_day, freq="400h", name="times")
    )
    pd_rhs = pd.Series(
        pd.date_range(first_day, last_day, freq="401h", name="times")
    )
    gdf_lhs = Series(pd_lhs).astype(lhs_dtype)
    gdf_rhs = Series(pd_rhs).astype(rhs_dtype)

    # Casting the GPU series back to nanoseconds must reproduce the
    # pandas inputs before any comparison is attempted.
    assert_eq(pd_lhs, gdf_lhs.astype("datetime64[ns]"))
    assert_eq(pd_rhs, gdf_rhs.astype("datetime64[ns]"))

    # Evaluated in the order <, >, ==, <=, >=.
    comparisons = (
        lambda left, right: left < right,
        lambda left, right: left > right,
        lambda left, right: left == right,
        lambda left, right: left <= right,
        lambda left, right: left >= right,
    )
    for compare in comparisons:
        assert_eq(compare(pd_lhs, pd_rhs), compare(gdf_lhs, gdf_rhs))
def _enforce_str(y: cudf.Series) -> cudf.Series:
    """
    Make sure nvcategory receives strings.

    Parameters
    ----------
    y : cudf.Series
        Series to check.

    Returns
    -------
    cudf.Series
        ``y`` itself when its dtype compares equal to ``"object"``,
        otherwise the result of ``y.astype("str")``.
    """
    return y.astype("str") if y.dtype != "object" else y
def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype):
    """Casting a column from ``from_dtype`` to ``to_dtype`` must match NumPy.

    ``data`` is first cast to ``from_dtype`` on the host; the same array is
    then cast to ``to_dtype`` both with NumPy and through the column backing
    a GPU ``Series``, and the two results are compared element-wise.
    """
    source = data.astype(from_dtype)

    expected = source.astype(to_dtype)
    actual = Series(source)._column.astype(to_dtype)

    np.testing.assert_equal(expected, actual.to_array())
def test_typecast_from_datetime_to_int64_to_datetime(data, dtype):
    """Round-tripping through ``int64`` into ``dtype`` must match NumPy.

    A copy of ``data`` is wrapped in a pandas Series; that series is cast
    to ``np.int64`` and then to ``dtype`` both as a NumPy array and as a GPU
    ``Series``, and the results are compared element-wise.
    """
    host_series = pd.Series(data.copy())

    expected = np.array(host_series).astype(np.int64).astype(dtype)
    actual = Series(host_series).astype(np.int64).astype(dtype)

    np.testing.assert_equal(expected, actual.to_array())
def fit_transform(self, y: cudf.Series) -> cudf.Series:
    """
    Fit the encoder on ``y`` and encode ``y`` in a single step.

    This is functionally equivalent to (but faster than)
    `LabelEncoder().fit(y).transform(y)`

    Parameters
    ----------
    y : cudf.Series
        Series of keys to learn the categories from and to encode.

    Returns
    -------
    cudf.Series
        The category codes of ``y``, carrying the index of ``y``.
    """
    # Remember the input dtype; object-dtype input is recorded as ``str``.
    if y.dtype != cp.dtype('O'):
        self.dtype = y.dtype
    else:
        self.dtype = str

    categorical = y.astype('category')
    self.classes_ = categorical._column.categories
    self._fitted = True

    return cudf.Series(categorical._column.codes, index=categorical.index)
def test_date_minmax():
    """``min``/``max`` of a millisecond datetime Series must match NumPy.

    1000 normally distributed floats are cast to ``datetime64[ms]`` on both
    the host and the GPU, and the extrema of both results are compared.
    """
    samples = np.random.normal(size=10 ** 3)

    host_dates = samples.astype("datetime64[ms]")
    device_dates = Series(samples).astype("datetime64[ms]")

    assert host_dates.min() == device_dates.min()
    assert host_dates.max() == device_dates.max()
def transform(self, y: cudf.Series) -> cudf.Series:
    """
    Encode the keys in ``y`` as the codes of the fitted categories.

    Intended for inputs that are small relative to the dataset used for
    fitting. To fit and encode a whole dataset at once, prefer
    `fit_transform`.

    Parameters
    ----------
    y : cudf.Series
        Keys to encode; a pandas Series is converted with
        ``cudf.from_pandas`` first. Values are expected to be among the
        categories learned during fitting.

    Returns
    -------
    encoded : cudf.Series
        Ordinal codes for ``y``, carrying the index of ``y``.

    Raises
    ------
    KeyError
        If the encoded result contains nulls (a key was not among the
        fitted categories) and ``handle_unknown`` is ``'error'``.
    """
    if isinstance(y, pdSeries):
        y = cudf.from_pandas(y)

    self._check_is_fitted()

    as_category = y.astype('category')
    # Re-express the input against the fitted categories; keys outside
    # them end up as nulls in the resulting codes.
    realigned = as_category.cat.set_categories(self.classes_)
    encoded = cudf.Series(realigned._column.codes, index=as_category.index)

    if encoded.has_nulls:
        if self.handle_unknown == 'error':
            raise KeyError("Attempted to encode unseen key")

    return encoded
def inverse_transform(self, y: cudf.Series) -> cudf.Series:
    """
    Map ordinal labels back to the original labels.

    Parameters
    ----------
    y : cudf.Series, dtype=int32
        Ordinal labels to be reverted

    Returns
    -------
    reverted : cudf.Series
        Reverted labels

    Raises
    ------
    TypeError
        If ``y`` is not a ``cudf.Series``.
    ValueError
        If ``handle_unknown`` is ``'error'`` and ``y`` holds a label
        below zero or not below the number of fitted classes.
    """
    # The encoder must have been fitted before labels can be reverted.
    self._check_is_fitted()

    # Only cudf.Series input is accepted.
    if not isinstance(y, cudf.Series):
        raise TypeError(
            'Input of type {} is not cudf.Series'.format(type(y)))

    # Range-check each distinct label once, on the host.
    distinct_labels = y.unique()
    n_classes = len(self.classes_)
    if self.handle_unknown == 'error':
        unseen = next(
            (label for label in distinct_labels.values_host
             if label < 0 or label >= n_classes),
            None,
        )
        if unseen is not None:
            raise ValueError(
                'y contains previously unseen label {}'.format(unseen))

    # Replace each ordinal position with the class stored at that position.
    typed_labels = y.astype(self.dtype)
    positions = cudf.Series(cp.arange(n_classes)).astype(self.dtype)
    reverted = typed_labels._column.find_and_replace(
        positions, self.classes_, False)

    return cudf.Series(reverted)
def inverse_transform(self, y: cudf.Series) -> cudf.Series:
    """
    Map ordinal labels back to the original labels.

    Parameters
    ----------
    y : cudf.Series, pandas.Series, cupy.ndarray or numpy.ndarray
        dtype=int32
        Ordinal labels to be reverted; converted with
        ``self._to_cudf_series`` before use.

    Returns
    -------
    reverted : cudf.Series
        Reverted labels

    Raises
    ------
    ValueError
        If ``handle_unknown`` is ``'error'`` and ``y`` holds a label
        below zero or not below the number of fitted classes.
    """
    # The encoder must have been fitted before labels can be reverted.
    self._check_is_fitted()

    # Normalise the input through the encoder's own conversion helper.
    labels = self._to_cudf_series(y)

    # Range-check each distinct label once, on the host.
    distinct_labels = labels.unique()
    n_classes = len(self.classes_)
    if self.handle_unknown == 'error':
        unseen = next(
            (label for label in distinct_labels.values_host
             if label < 0 or label >= n_classes),
            None,
        )
        if unseen is not None:
            raise ValueError(
                'y contains previously unseen label {}'.format(unseen))

    # Replace each ordinal position with the class stored at that position.
    typed_labels = labels.astype(self.dtype)
    positions = cudf.Series(cp.arange(n_classes)).astype(self.dtype)
    reverted = typed_labels._column.find_and_replace(
        positions, self.classes_, False)

    return cudf.Series(reverted)
def _enforce_npint32(y: cudf.Series) -> cudf.Series:
    """
    Return ``y`` with dtype ``np.int32``.

    Parameters
    ----------
    y : cudf.Series
        Series to check.

    Returns
    -------
    cudf.Series
        ``y`` itself when its dtype already compares equal to
        ``np.int32``, otherwise the result of ``y.astype(np.int32)``.
    """
    return y.astype(np.int32) if y.dtype != np.int32 else y
def test_str_null_to_datetime(data, dtype):
    """Casting ``data`` to ``dtype`` must give equal pandas and GPU results.

    The same ``data`` is wrapped in a pandas Series and a GPU ``Series``;
    both are cast to ``dtype`` and compared with ``assert_eq``.
    """
    expected = pd.Series(data).astype(dtype)
    actual = Series(data).astype(dtype)
    assert_eq(expected, actual)