def test_applymap_round(nelem, masked):
    # Generate data
    np.random.seed(0)
    data = np.random.random(nelem) * 100

    if masked:
        # Make mask
        bitmask = utils.random_bitmask(nelem)
        boolmask = np.asarray(
            utils.expand_bits_to_bytes(bitmask), dtype=np.bool_
        )[:nelem]
        data[~boolmask] = np.nan

    sr = Series(data)
    if masked:
        # Mask the Series
        sr = sr.set_mask(bitmask)

    # Call applymap
    out = sr.applymap(
        lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x))
    )

    if masked:
        # Fill masked values
        out = out.fillna(np.nan)

    # Check
    expect = np.round(data)
    got = out.to_array()
    np.testing.assert_array_almost_equal(expect, got)

def _compute_drop_idx(self):
    """Helper to compute the index to drop for each feature,
    based on the ``drop`` parameter."""
    if self.drop is None:
        return None
    elif isinstance(self.drop, str) and self.drop == 'first':
        return {feature: 0 for feature in self._encoders.keys()}
    elif isinstance(self.drop, (dict, list)):
        if isinstance(self.drop, list):
            self.drop = dict(zip(range(len(self.drop)), self.drop))
        if len(self.drop.keys()) != len(self._encoders):
            msg = ("`drop` should have as many columns as the number "
                   "of features ({}), got {}")
            raise ValueError(msg.format(len(self._encoders),
                                        len(self.drop.keys())))
        drop_idx = dict()
        for feature in self.drop.keys():
            self.drop[feature] = Series(self.drop[feature])
            if len(self.drop[feature]) != 1:
                msg = ("Trying to drop multiple values for feature {}, "
                       "this is not supported.").format(feature)
                raise ValueError(msg)
            cats = self._encoders[feature].classes_
            if not self.drop[feature].isin(cats).all():
                msg = ("Some categories for feature {} were supposed "
                       "to be dropped, but were not found in the encoder "
                       "categories.".format(feature))
                raise ValueError(msg)
            cats = Series(cats)
            idx = cats.isin(self.drop[feature])
            drop_idx[feature] = cp.asarray(cats[idx].index)
        return drop_idx
    else:
        msg = ("Wrong input for parameter `drop`. Expected "
               "'first', None, a dict or a list, got {}")
        raise ValueError(msg.format(type(self.drop)))

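# A minimal usage sketch of the ``drop`` semantics handled above, assuming
# cuml.preprocessing.OneHotEncoder and its ``drop``/``sparse`` parameters;
# the DataFrame and column names are made up for illustration. With
# drop='first' every feature drops its first category, while a dict drops
# exactly one named value per feature.
import cudf
from cuml.preprocessing import OneHotEncoder

df = cudf.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "S"]})

enc_first = OneHotEncoder(drop="first", sparse=False)
print(enc_first.fit_transform(df))  # first category of each feature dropped

enc_dict = OneHotEncoder(drop={"color": "blue", "size": "M"}, sparse=False)
print(enc_dict.fit_transform(df))  # one explicit value dropped per feature
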
def read_polygon_shapefile(filename):
    """
    Reads polygon geometry from an ESRI shapefile into GPU memory.

    Parameters
    ----------
    filename : str, pathlike
        ESRI Shapefile file path (usually ends in ``.shp``)

    Returns
    -------
    result : tuple (cudf.Series, cudf.Series, cudf.DataFrame)
        poly_offsets : cudf.Series(dtype=np.int32)
            Offsets of the first ring in each polygon
        ring_offsets : cudf.Series(dtype=np.int32)
            Offsets of the first point in each ring
        points : cudf.DataFrame
            DataFrame of all points in the shapefile
                x : cudf.Series(dtype=np.float64)
                    x-components of each polygon's points
                y : cudf.Series(dtype=np.float64)
                    y-components of each polygon's points
    """
    result = cpp_read_polygon_shapefile(filename)
    f_pos = Series(result[0], name="f_pos")
    r_pos = Series(result[1], name="r_pos")
    return (f_pos, r_pos, DataFrame({"x": result[2], "y": result[3]}))

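# Hedged usage sketch for read_polygon_shapefile: "polys.shp" is a
# hypothetical path, and the indexing below only illustrates how
# poly_offsets points into ring_offsets, which in turn points into the
# points DataFrame.
poly_offsets, ring_offsets, points = read_polygon_shapefile("polys.shp")

first_ring = int(poly_offsets.iloc[0])            # polygon 0's first ring
first_point = int(ring_offsets.iloc[first_ring])  # that ring's first point
print(points.iloc[first_point])                   # x/y of that first vertex
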
def test_categorical_compare_ordered(data):
    cat1 = data[0]
    cat2 = data[1]
    pdsr1 = pd.Series(cat1)
    pdsr2 = pd.Series(cat2)
    sr1 = Series(cat1)
    sr2 = Series(cat2)
    dsr1 = dgd.from_cudf(sr1, npartitions=2)
    dsr2 = dgd.from_cudf(sr2, npartitions=2)

    # Test equality
    out = dsr1 == dsr1
    assert out.dtype == np.bool_
    assert np.all(out.compute().to_array())
    assert np.all(pdsr1 == pdsr1)

    # Test inequality
    out = dsr1 != dsr1
    assert not np.any(out.compute().to_array())
    assert not np.any(pdsr1 != pdsr1)

    assert dsr1.cat.ordered
    assert pdsr1.cat.ordered

    # Test ordered operators
    np.testing.assert_array_equal(
        pdsr1 < pdsr2, (dsr1 < dsr2).compute().to_array()
    )
    np.testing.assert_array_equal(
        pdsr1 > pdsr2, (dsr1 > dsr2).compute().to_array()
    )

def test_generic_ptx(dtype):
    size = 500

    lhs_arr = np.random.random(size).astype(dtype)
    lhs_col = Series(lhs_arr)._column

    rhs_arr = np.random.random(size).astype(dtype)
    rhs_col = Series(rhs_arr)._column

    def generic_function(a, b):
        return a ** 3 + b

    nb_type = numpy_support.from_dtype(cudf.dtype(dtype))
    type_signature = (nb_type, nb_type)

    ptx_code, output_type = compile_ptx(
        generic_function, type_signature, device=True
    )

    dtype = numpy_support.as_dtype(output_type).type

    out_col = libcudf.binaryop.binaryop_udf(lhs_col, rhs_col, ptx_code, dtype)

    result = lhs_arr ** 3 + rhs_arr
    np.testing.assert_almost_equal(result, out_col.to_array())

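# Standalone sketch of the numba step used above: compile_ptx lowers a
# Python scalar function to PTX and infers the return type, with no cuDF
# involvement. The float64 signature is just an illustrative choice.
from numba import float64
from numba.cuda import compile_ptx

def cube_plus(a, b):
    return a ** 3 + b

ptx, restype = compile_ptx(cube_plus, (float64, float64), device=True)
print(restype)   # float64
print(ptx[:40])  # the generated PTX text starts with a version header
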
def test_series_sort_values_ignore_index(ignore_index):
    gsr = Series([1, 3, 5, 2, 4])
    psr = gsr.to_pandas()

    expect = psr.sort_values(ignore_index=ignore_index)
    got = gsr.sort_values(ignore_index=ignore_index)
    assert_eq(expect, got)

def test_get(data, index, expectation):
    with expectation:
        expect = Series(data).list.get(index)

    if expectation == does_not_raise():
        ds = dgd.from_cudf(Series(data), 5)
        assert_eq(expect, ds.list.get(index).compute())

def test_take(data, list_indices, expectation):
    with expectation:
        expect = Series(data).list.take(list_indices)

    if expectation == does_not_raise():
        ds = dgd.from_cudf(Series(data), 5)
        assert_eq(expect, ds.list.take(list_indices).compute())

def test_series_nlargest_nelem(nelem):
    np.random.seed(0)
    elems = np.random.random(nelem)
    gds = Series(elems).nlargest(nelem)
    pds = pd.Series(elems).nlargest(nelem)

    assert (pds == gds.to_pandas()).all()

def test_vectorizer_empty_token_case():
    """
    We ignore empty tokens right now, but sklearn treats them as a
    character. We might want to look into this more, but it should not
    be a concern for most pipelines.
    """
    corpus = [
        "a b ",
    ]

    # we have an extra null token here;
    # we slightly diverge from sklearn by not treating it as a token
    res = CountVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    ref = SkCountVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())

    res = HashingVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    ref = SkHashVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())

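# Plain-Python illustration of the divergence documented above: splitting
# on a single space leaves a trailing empty string, which sklearn's custom
# tokenizer path counts as a token while the cuML vectorizers drop it.
print("a b ".split(" "))  # ['a', 'b', ''] -- note the trailing empty token
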
def test_series_floor():
    arr = np.random.random(100) * 100
    sr = Series(arr)
    with pytest.warns(
        FutureWarning, match="Series.floor and DataFrame.floor are deprecated"
    ):
        sr = sr.floor()
    np.testing.assert_equal(sr.to_numpy(), np.floor(arr))

def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype):
    np_data = data.astype(from_dtype)
    gdf_col = Series(np_data)._column

    np_casted = np_data.astype(to_dtype)
    gdf_casted = gdf_col.astype(to_dtype)

    np.testing.assert_equal(np_casted, gdf_casted.to_array())

def test_max(dtype, nelem):
    dtype = cudf.dtype(dtype).type
    data = gen_rand(dtype, nelem)
    sr = Series(data)

    got = sr.max()
    expect = dtype(data.max())

    assert expect == got

def test_series_sort_index(nelem, asc):
    np.random.seed(0)
    sr = Series(100 * np.random.random(nelem))
    orig = sr.to_array()
    got = sr.sort_values().sort_index(ascending=asc).to_array()

    if not asc:
        # Reverse the array for descending sort
        got = got[::-1]

    np.testing.assert_array_equal(orig, got)

def test_series_sort_index(nelem, asc):
    np.random.seed(0)
    sr = Series(100 * np.random.random(nelem))
    psr = sr.to_pandas()

    expected = psr.sort_index(ascending=asc)
    got = sr.sort_index(ascending=asc)

    assert_eq(expected, got)

def test_sum(dtype, nelem):
    dtype = cudf.dtype(dtype).type
    data = gen_rand(dtype, nelem)
    sr = Series(data)

    got = sr.sum()
    expect = data.sum()
    significant = 4 if dtype == np.float32 else 6
    np.testing.assert_approx_equal(expect, got, significant=significant)

def test_typecast_from_datetime_to_int64_to_datetime(data, dtype):
    pd_data = pd.Series(data.copy())
    np_data = np.array(pd_data)
    gdf_data = Series(pd_data)

    np_casted = np_data.astype(np.int64).astype(dtype)
    gdf_casted = gdf_data.astype(np.int64).astype(dtype)

    np.testing.assert_equal(np_casted, gdf_casted.to_array())

def test_pickle_categorical_column(slices):
    sr = Series(["a", "b", None, "a", "c", "b"]).astype("category")
    sliced_sr = sr.iloc[slices]
    input_col = sliced_sr._column

    pickled = pickle.dumps(input_col)
    out = pickle.loads(pickled)

    assert_eq(Series(out), Series(input_col))

def test_pickle_string_column(slices):
    sr = Series(["a", "b", None, "a", "c", "b"])
    sliced_sr = sr.iloc[slices]
    input_col = sliced_sr._column

    pickled = pickle.dumps(input_col)
    out = pickle.loads(pickled)

    assert_eq(Series(out), Series(input_col))

def test_fillna():
    _, schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)
    masked_col = gar[8]
    sr = Series(data=masked_col.data)
    dense = sr.nans_to_nulls().fillna(123)
    np.testing.assert_equal(123, dense.to_array())
    assert len(dense) == len(sr)
    assert dense.null_count == 0

def test_series_argsort(nelem, dtype, asc):
    np.random.seed(0)
    sr = Series((100 * np.random.random(nelem)).astype(dtype))
    res = sr.argsort(ascending=asc)

    if asc:
        expected = np.argsort(sr.to_array(), kind="mergesort")
    else:
        expected = np.argsort(sr.to_array() * -1, kind="mergesort")
    np.testing.assert_array_equal(expected, res.to_array())

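# Small numpy illustration of the negation trick used above: argsort of
# the negated array yields a descending-order argsort.
import numpy as np

a = np.array([10.0, 30.0, 20.0])
print(np.argsort(a))       # [0 2 1] -- ascending
print(np.argsort(a * -1))  # [1 2 0] -- descending
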
def test_pickle_series(named):
    np.random.seed(0)
    if named:
        ser = Series(np.random.random(10), name="a")
    else:
        ser = Series(np.random.random(10))

    pickled = pickle.dumps(ser)
    out = pickle.loads(pickled)
    assert (ser == out).all()

def test_sorting(data, ascending, na_position, ignore_index):
    expect = Series(data).list.sort_values(
        ascending=ascending, na_position=na_position, ignore_index=ignore_index
    )
    got = (
        dgd.from_cudf(Series(data), 5)
        .list.sort_values(
            ascending=ascending,
            na_position=na_position,
            ignore_index=ignore_index,
        )
        .compute()
        .reset_index(drop=True)
    )

    assert_eq(expect, got)

def test_applymap_python_lambda(dtype, udf, testfunc):
    size = 500

    lhs_arr = np.random.random(size).astype(dtype)
    lhs_ser = Series(lhs_arr)

    out_ser = lhs_ser.applymap(udf)
    result = testfunc(lhs_arr)
    np.testing.assert_almost_equal(result, out_ser.to_array())

def _categories_equal(self, new_categories, **kwargs):
    cur_categories = self._column.categories
    if len(new_categories) != len(cur_categories):
        return False
    # if order doesn't matter, sort before the equals call below
    if not kwargs.get("ordered", self.ordered):
        from cudf.core.series import Series

        cur_categories = Series(cur_categories).sort_values()
        new_categories = Series(new_categories).sort_values()
    return cur_categories.equals(new_categories)

def can_cast_safely(self, to_dtype):
    """
    Returns True if all the values in self can be
    safely cast to to_dtype
    """
    if self.dtype.kind == to_dtype.kind:
        if self.dtype <= to_dtype:
            return True
        else:
            # Kinds are the same but to_dtype is smaller
            if "float" in to_dtype.name:
                info = np.finfo(to_dtype)
            elif "int" in to_dtype.name:
                info = np.iinfo(to_dtype)
            min_, max_ = info.min, info.max

            if (self.min() > min_) and (self.max() < max_):
                return True
            else:
                return False

    # want to cast int to float
    elif to_dtype.kind == "f" and self.dtype.kind == "i":
        info = np.finfo(to_dtype)
        biggest_exact_int = 2 ** (info.nmant + 1)
        if (self.min() >= -biggest_exact_int) and (
            self.max() <= biggest_exact_int
        ):
            return True
        else:
            from cudf import Series

            if (
                Series(self).astype(to_dtype).astype(self.dtype)
                == Series(self)
            ).all():
                return True
            else:
                return False

    # want to cast float to int:
    elif to_dtype.kind == "i" and self.dtype.kind == "f":
        info = np.iinfo(to_dtype)
        min_, max_ = info.min, info.max
        # best we can do is hope to catch it here and avoid compare
        if (self.min() >= min_) and (self.max() <= max_):
            from cudf import Series

            if (Series(self) % 1 == 0).all():
                return True
            else:
                return False
        else:
            return False

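# Worked check of the exact-integer bound used above: a float can
# represent every integer with magnitude up to 2 ** (nmant + 1), so for
# float32 (nmant == 23) the bound is 2 ** 24 == 16777216.
import numpy as np

info = np.finfo(np.float32)
bound = 2 ** (info.nmant + 1)
print(bound)                               # 16777216
print(np.float32(bound) == bound)          # True: exactly representable
print(np.float32(bound + 1) == bound + 1)  # False: rounds back to 2**24
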
def test_multiindex_take(pdf, gdf, pdfIndex):
    gdfIndex = cudf.from_pandas(pdfIndex)
    pdf.index = pdfIndex
    gdf.index = gdfIndex
    assert_eq(pdf.index.take([0]), gdf.index.take([0]))
    assert_eq(pdf.index.take(np.array([0])), gdf.index.take(np.array([0])))
    from cudf import Series

    assert_eq(pdf.index.take(Series([0])), gdf.index.take(Series([0])))
    assert_eq(pdf.index.take([0, 1]), gdf.index.take([0, 1]))
    assert_eq(
        pdf.index.take(np.array([0, 1])), gdf.index.take(np.array([0, 1]))
    )
    assert_eq(pdf.index.take(Series([0, 1])), gdf.index.take(Series([0, 1])))

def codes(self):
    from cudf import Series

    data = self._parent.data
    if self._parent.has_null_mask:
        mask = self._parent.mask
        null_count = self._parent.null_count
        return Series.from_masked_array(
            data=data.mem, mask=mask.mem, null_count=null_count
        )
    else:
        return Series(data, name=self._parent.name)

def remove_categories(self, removals, **kwargs):
    from cudf import Series

    cats = self.categories.to_series()
    removals = Series(removals, dtype=cats.dtype)
    removals_mask = removals.isin(cats)

    # ensure all the removals are in the current categories
    # list. If not, raise an error to match Pandas behavior
    if not removals_mask.all():
        vals = removals[~removals_mask].to_array()
        msg = "removals must all be in old categories: {}".format(vals)
        raise ValueError(msg)

    return self.set_categories(cats[~cats.isin(removals)], **kwargs)

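# Hedged parity sketch: the error above mirrors pandas, where removing a
# value that is not a current category raises ValueError with the same
# "removals must all be in old categories" wording.
import pandas as pd

s = pd.Series(["a", "b", "a"], dtype="category")
print(s.cat.remove_categories(["b"]).cat.categories)  # only 'a' remains
# s.cat.remove_categories(["z"])  # raises ValueError in pandas as well
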
def test_applymap_change_out_dtype():
    # Test for changing the out_dtype using applymap
    data = list(range(10))
    sr = Series(data)

    out = sr.applymap(lambda x: float(x), out_dtype=float)

    # Check
    expect = np.array(data, dtype=float)
    got = out.to_array()
    np.testing.assert_array_equal(expect, got)