def test_onehot_drop_idx_first(client):
    """Check ``drop='first'`` encoding on a dask_cudf frame.

    The distributed ``OneHotEncoder`` output is compared with the output of
    the reference ``SkOneHotEncoder`` fitted on the same rows, and the
    inverse transform must give back the original frame.
    """
    host_rows = [['c', 2, 'a'], ['b', 2, 'b']]
    X = DataFrame({
        'chars': ['c', 'b'],
        'int': [2, 2],
        'letters': ['a', 'b'],
    })
    ddf = dask_cudf.from_cudf(X, npartitions=2)

    encoder = OneHotEncoder(sparse=False, drop='first')
    reference_encoder = SkOneHotEncoder(sparse=False, drop='first')

    encoded = encoder.fit_transform(ddf)
    expected = reference_encoder.fit_transform(host_rows)
    cp.testing.assert_array_equal(encoded.compute(), expected)

    # Round trip: decoding the encoded data must reproduce the input.
    decoded = encoder.inverse_transform(encoded)
    assert_frame_equal(decoded.compute().to_pandas(), X.to_pandas())
def transform(self, columns, gdf: cudf.DataFrame) -> cudf.DataFrame:
    """Merge ``gdf`` with ``self._ext`` and return the rows in ``gdf``'s order.

    A temporary ``"__tmp__"`` column holding the original row positions is
    added to ``gdf`` before the merge, used to sort the merged result, and
    then dropped from both frames.  ``columns`` is not used here.

    Returns
    -------
    cudf.DataFrame
        The merged frame, sorted by original row position, with a fresh
        default index.
    """
    order_col = "__tmp__"  # Temporary column for sorting
    gdf[order_col] = cupy.arange(len(gdf), dtype="int32")

    merged = gdf.merge(
        self._ext,
        left_on=self.on,
        right_on=self.on_ext,
        how=self.how,
    )
    merged = merged.sort_values(order_col)

    # Remove the helper column from the result and from the caller's frame.
    for frame in (merged, gdf):
        frame.drop(columns=[order_col], inplace=True)

    merged.reset_index(drop=True, inplace=True)
    return merged
def test_query_env_changing():
    """``DataFrame.query`` must see the current value of a referenced local.

    The same expression is evaluated twice with different values bound to
    ``c``; each result has to match the equivalent NumPy boolean filter.
    """
    df = DataFrame()
    host_values = np.arange(100)
    df["a"] = host_values
    expr = "a < @c"

    # ``c`` must stay a local of this function: the query expression refers
    # to it by name through ``@c``.  First attempt uses 10, then the
    # environment is changed to 50.
    for c in (10, 50):
        got = df.query(expr)
        np.testing.assert_array_equal(
            host_values[host_values < c], got["a"].to_array()
        )
def test_onehot_inverse_transform_handle_unknown(as_array):
    """Inverse transform with ``handle_unknown='ignore'`` yields a null.

    The first encoded row has no active 'chars' category, so the decoded
    'chars' value for that row is expected to be ``None``.  With
    ``as_array`` set, the encoder is fitted on a cupy array and the
    expectation uses positional column names and ``ord('b')``.
    """
    X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    Y_ohe = cp.array([[0., 0., 1., 0.],
                      [0., 1., 0., 1.]])

    if as_array:
        X = _from_df_to_cupy(X)
        expected = DataFrame({0: [None, ord('b')], 1: [0, 2]})
    else:
        expected = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})

    encoder = OneHotEncoder(handle_unknown='ignore').fit(X)
    decoded = encoder.inverse_transform(Y_ohe)
    assert_inverse_equal(decoded, expected)
def test_onehot_fit_handle_unknown(cluster):
    """Fitting with user-supplied categories honours ``handle_unknown``.

    ``X`` contains the value 'a', which is absent from the categories given
    through ``Y``.  With ``handle_unknown='error'`` the fit must raise
    ``KeyError``; with ``'ignore'`` it must complete.

    The client is closed in a ``finally`` block so a failing assertion does
    not leave a connection to the shared ``cluster`` fixture open.
    """
    client = Client(cluster)
    try:
        X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
        Y = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
        X = dask_cudf.from_cudf(X, npartitions=2)

        enc = OneHotEncoder(handle_unknown='error', categories=Y)
        with pytest.raises(KeyError):
            enc.fit(X)

        enc = OneHotEncoder(handle_unknown='ignore', categories=Y)
        enc.fit(X)
    finally:
        client.close()
def read_polygon_shapefile(filename):
    """Read a polygon shapefile and return its contents as two DataFrames.

    The values produced by ``cpp_read_polygon_shapefile`` are split into a
    frame with columns ``f_pos`` and ``r_pos`` (elements 0 and 1) and a
    frame with columns ``x`` and ``y`` (elements 2 and 3).

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(positions, points)`` in that order.
    """
    parsed = cpp_read_polygon_shapefile(filename)
    positions = DataFrame({"f_pos": parsed[0], "r_pos": parsed[1]})
    points = DataFrame({"x": parsed[2], "y": parsed[3]})
    return positions, points
def test_onehot_inverse_transform_handle_unknown(cluster):
    """Distributed inverse transform with ``handle_unknown='ignore'``.

    The first encoded row has no active 'chars' category, so the decoded
    'chars' value for that row is expected to be ``None``.

    The client is closed in a ``finally`` block so a failing assertion does
    not leave a connection to the shared ``cluster`` fixture open.
    """
    client = Client(cluster)
    try:
        X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
        X = dask_cudf.from_cudf(X, npartitions=2)

        Y_ohe = cp.array([[0., 0., 1., 0.],
                          [0., 1., 0., 1.]])
        Y_ohe = da.from_array(Y_ohe)

        enc = OneHotEncoder(handle_unknown='ignore')
        enc = enc.fit(X)
        df = enc.inverse_transform(Y_ohe)

        ref = DataFrame({'chars': [None, 'b'], 'int': [0, 2]})
        assert_frame_equal(df.compute().to_pandas(), ref.to_pandas())
    finally:
        client.close()
def _popn(self, n):
    """Build a new MultiIndex that omits the left-most ``n`` levels.

    The first ``n`` entries of the levels, code columns and names are
    skipped; everything from position ``n`` onwards is carried over into
    the returned index.
    """
    from cudf import DataFrame

    kept_codes = DataFrame()
    for code_column in self.codes.columns[n:]:
        kept_codes.add_column(code_column, self.codes[code_column])

    trimmed = MultiIndex(self.levels[n:], kept_codes)
    trimmed.names = self.names[n:]
    return trimmed
def test_onehot_fit_handle_unknown(as_array):
    """Fitting with user-supplied categories honours ``handle_unknown``.

    ``X`` contains the value 'a', which is absent from the categories given
    through ``Y``.  With ``handle_unknown='error'`` the fit must raise
    ``KeyError``; with ``'ignore'`` it must complete.  When ``as_array`` is
    set both inputs are first converted with ``_from_df_to_cupy``.
    """
    X = DataFrame({'chars': ['a', 'b'], 'int': [0, 2]})
    Y = DataFrame({'chars': ['c', 'b'], 'int': [0, 2]})
    if as_array:
        X, Y = _from_df_to_cupy(X), _from_df_to_cupy(Y)

    strict_encoder = OneHotEncoder(handle_unknown='error', categories=Y)
    with pytest.raises(KeyError):
        strict_encoder.fit(X)

    lenient_encoder = OneHotEncoder(handle_unknown='ignore', categories=Y)
    lenient_encoder.fit(X)
def test_categorical_basic(data):
    """Exercise categorical Series and DataFrame columns through dask.

    Part one round-trips a categorical Series through ``dgd.from_cudf`` and
    checks codes, dtype, ``cat`` attributes and the leading tokens of its
    string form against pandas.  Part two compares a categorical column on
    a dask-cudf frame with the same column on a dask-pandas frame, before
    and after ``categorize()``.
    """
    cat = data.copy()
    pdsr = pd.Series(cat)
    sr = Series(cat)
    dsr = dgd.from_cudf(sr, npartitions=2)
    result = dsr.compute()
    computed_codes = result.to_array()

    np.testing.assert_array_equal(cat.codes, computed_codes)
    assert dsr.dtype.to_pandas() == pdsr.dtype

    # Test attributes
    assert pdsr.cat.ordered == dsr.cat.ordered
    assert tuple(pdsr.cat.categories) == tuple(dsr.cat.categories)

    np.testing.assert_array_equal(pdsr.cat.codes.data, result.to_array())
    np.testing.assert_array_equal(pdsr.cat.codes.dtype, dsr.cat.codes.dtype)

    # Only whitespace-separated tokens are compared, and only as many as
    # the shorter of the two token lists provides.
    rendered = str(result)
    expect_str = """ 0 a 1 a 2 b 3 c 4 a """
    rendered_tokens = rendered.split()
    expected_tokens = expect_str.split()
    assert all(
        seen == wanted
        for seen, wanted in zip(rendered_tokens, expected_tokens)
    )

    from cudf.tests.utils import assert_eq

    df = DataFrame()
    df["a"] = ["xyz", "abc", "def"] * 10
    pdf = df.to_pandas()

    cddf = dgd.from_cudf(df, 1)
    cddf["b"] = cddf["a"].astype("category")

    ddf = dd.from_pandas(pdf, 1)
    ddf["b"] = ddf["a"].astype("category")

    assert_eq(ddf._meta_nonempty["b"], cddf._meta_nonempty["b"])

    # Before categorize(), asking for the categories must raise on both.
    with pytest.raises(NotImplementedError):
        cddf["b"].cat.categories
    with pytest.raises(NotImplementedError):
        ddf["b"].cat.categories

    cddf = cddf.categorize()
    ddf = ddf.categorize()

    assert_eq(ddf["b"].cat.categories, cddf["b"].cat.categories)
    assert_eq(ddf["b"].cat.ordered, cddf["b"].cat.ordered)
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """Sorting by every column with ``ignore_index`` must match pandas.

    The frame is re-indexed by ``index`` first, then sorted on all of its
    remaining columns on both the GPU frame and its pandas copy.
    """
    source = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gdf = DataFrame(source).set_index(index)
    pdf = gdf.to_pandas()

    expected = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index)
    actual = gdf.sort_values(gdf.columns, ignore_index=ignore_index)

    assert_eq(expected, actual)
def _to_frame(self):
    """Return a frame in which every code column is replaced by its labels.

    For each column of ``self.codes`` a two-column lookup frame
    (``idx`` -> ``level``) is built from the matching entry of
    ``self.levels`` and merged with the codes; the resulting ``level``
    column takes the place of the code column.
    """
    from cudf import DataFrame

    # for each column of codes
    # replace column with mapping from integers to levels
    frame = self.codes.copy(deep=False)
    for position, col_name in enumerate(frame.columns):
        level_values = self.levels[position]
        # Integer positions 0..len(level)-1, in the dtype of the codes.
        positions = Series(
            cudautils.arange(len(level_values), dtype=frame[col_name].dtype)
        )
        lookup = DataFrame({'idx': positions, 'level': level_values})
        codes = DataFrame({'idx': frame[col_name]})
        # use merge as a replace fn
        frame[col_name] = codes.merge(lookup).level
    return frame
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    """Row-slicing a frame with masked columns must match pandas.

    Both columns receive a random bitmask; the same ``start:end`` slice is
    then applied once after and once before conversion to pandas.
    """
    gdf = DataFrame()
    gdf["a"] = list(range(nelem))
    gdf["b"] = list(range(nelem, 2 * nelem))
    gdf["a"] = gdf["a"].set_mask(utils.random_bitmask(nelem))
    gdf["b"] = gdf["b"].set_mask(utils.random_bitmask(nelem))

    window = slice(slice_start, slice_end)

    expected = gdf.to_pandas()[window]
    actual = gdf[window].to_pandas()

    assert_eq(expected, actual, check_dtype=False)
def _get_row_major(self, df, row_tuple):
    """Select the rows of ``df`` matching ``row_tuple`` and rebuild the index.

    The rows flagged by ``self._compute_validity_mask`` are taken from
    ``df``.  The index of the result is then reduced to the levels that
    ``row_tuple`` did not address (positions ``len(row_tuple)`` onwards).

    Parameters
    ----------
    df : DataFrame
        Frame indexed by a multi-level index (``df.index`` is read for
        ``levels``, ``names`` and ``codes``).
    row_tuple : sequence
        The key; only its length is used directly in this function.

    Returns
    -------
    The taken rows with the adjusted index.
    """
    valid_indices = self._compute_validity_mask(df, row_tuple)
    from cudf import Series
    result = df.take(Series(valid_indices))
    # Build new index - INDEX based MultiIndex
    # ---------------
    from cudf import DataFrame
    out_index = DataFrame()
    # Select the last n-k columns where n is the number of source
    # levels and k is the length of the indexing tuple
    for k in range(len(row_tuple), len(df.index.levels)):
        out_index.add_column(df.index.names[k],
                             df.index.codes[df.index.codes.columns[k]])
    # If there's only one column remaining in the output index, convert
    # it into a StringIndex and name the final index values according
    # to the proper codes.
    if len(out_index.columns) == 1:
        # From here on ``out_index`` is reused as a plain list of labels.
        out_index = []
        for val in result.index.codes[result.index.codes.columns[len(result.index.codes.columns)-1]]:  # noqa: E501
            out_index.append(result.index.levels[
                len(result.index.codes.columns)-1][val])
        # TODO: Warning! The final index column could be arbitrarily
        # ordered integers, not Strings, so we need to check for that
        # dtype and produce a GenericIndex instead of a StringIndex
        out_index = StringIndex(out_index)
        out_index.name = result.index.names[len(result.index.names)-1]
        result.index = out_index
    else:
        # Otherwise pop the leftmost levels, names, and codes from the
        # source index until it has the correct number of columns (n-k)
        if(len(out_index.columns)) > 0:
            # NOTE(review): the return value of reset_index is discarded;
            # unless it mutates in place this call has no effect - confirm.
            result.reset_index(drop=True)
            result.index = result.index._popn(len(row_tuple))
    return result
def test_str_slice():
    """``str.split(expand=True)`` on dask-cudf must match pandas.

    Checked for split limits ``n=1`` and ``n=2`` on comma-separated strings.
    """
    df = DataFrame({"a": ["abc,def,123", "xyz,hi,bye"]})
    ddf = dgd.from_cudf(df, 1)
    pdf = df.to_pandas()

    for max_splits in (1, 2):
        expected = pdf.a.str.split(",", expand=True, n=max_splits)
        actual = ddf.a.str.split(",", expand=True, n=max_splits)
        dd.assert_eq(expected, actual)
def test_get_hostname_split_df():
    """``dns.get_hostname_split_df`` splits hostnames into labelled columns.

    The expected frame has columns 4..0; hostnames with fewer parts carry
    empty strings in the higher-numbered columns.
    """
    hostnames = [
        "forums.news.cnn.com.ac",
        "forums.news.cnn.ac",
        "b.cnn.com",
    ]
    input_df = DataFrame({"hostname": hostnames})

    expected_output_df = DataFrame({
        4: ["ac", "", ""],
        3: ["com", "ac", ""],
        2: ["cnn", "cnn", "com"],
        1: ["news", "news", "cnn"],
        0: ["forums", "forums", "b"],
    })

    actual_output_df = dns.get_hostname_split_df(input_df["hostname"])

    assert actual_output_df.equals(expected_output_df)
def test_onehot_drop_one_of_each(cluster):
    """Dropping one explicit category per feature on a dask_cudf frame.

    The distributed ``OneHotEncoder`` receives the categories to drop as a
    per-column mapping, the reference ``SkOneHotEncoder`` as a positional
    list.  Both encodings must agree and the inverse transform must give
    back the original frame.

    The client is closed in a ``finally`` block so a failing assertion does
    not leave a connection to the shared ``cluster`` fixture open.
    """
    client = Client(cluster)
    try:
        X_ary = [['c', 2, 'a'], ['b', 2, 'b']]
        X = DataFrame({'chars': ['c', 'b'],
                       'int': [2, 2],
                       'letters': ['a', 'b']})
        ddf = dask_cudf.from_cudf(X, npartitions=2)

        drop = {'chars': 'b', 'int': 2, 'letters': 'b'}
        enc = OneHotEncoder(sparse=False, drop=drop)
        sk_enc = SkOneHotEncoder(sparse=False, drop=['b', 2, 'b'])

        ohe = enc.fit_transform(ddf)
        ref = sk_enc.fit_transform(X_ary)
        cp.testing.assert_array_equal(ohe.compute(), ref)

        # Round trip: decoding the encoded data must reproduce the input.
        inv = enc.inverse_transform(ohe)
        assert_frame_equal(inv.compute().to_pandas(), X.to_pandas())
    finally:
        client.close()
def test_onehot_generic_index():
    """One-hot encoding must work on a column with a non-default index.

    The column is built on a random integer index; every generated
    ``fo_<k>`` column must equal the boolean mask ``values == k``.
    """
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    df = DataFrame()
    values = np.random.randint(low=0, high=4, size=size)
    df["fo"] = Series(values, index=Index(indices))

    out = df.one_hot_encoding(
        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
    )

    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}
    for category in range(4):
        encoded = getattr(out, "fo_" + str(category))
        np.testing.assert_array_equal(values == category, encoded.to_array())
def test_memory_usage_dataframe():
    """Reported memory usage is bounded by raw data size and pickle size.

    ``memory_usage().sum()`` must be at least the byte size of the host
    arrays, and the pickled frame must be at least that large in turn.
    """
    np.random.seed(0)
    nelem = 1000

    hkeys = np.arange(nelem, dtype=np.float64)
    hvals = np.random.random(nelem)

    df = DataFrame()
    df["keys"] = hkeys
    df["vals"] = hvals

    raw_nbytes = hkeys.nbytes + hvals.nbytes
    reported = df.memory_usage().sum()
    assert reported >= raw_nbytes

    payload = pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL)
    # assert at least sizeof bytes were serialized
    assert len(payload) >= reported
def test_categorical_categories():
    """Categories of a categorical column match between dask-cudf and dask.

    The comparison ignores the index of the two category series.
    """
    df = DataFrame(
        {"a": ["a", "b", "c", "d", "e", "e", "a", "d"], "b": range(8)}
    )
    df["a"] = df["a"].astype("category")
    pdf = df.to_pandas(nullable_pd_dtype=False)

    ddf = dgd.from_cudf(df, 2)
    dpdf = dd.from_pandas(pdf, 2)

    gpu_categories = ddf.a.cat.categories.to_series().to_pandas(
        nullable_pd_dtype=False
    )
    cpu_categories = dpdf.a.cat.categories.to_series()

    dd.assert_eq(gpu_categories, cpu_categories, check_index=False)
def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skiprows=None,
    num_rows=None,
    **kwargs,
):
    """{docstring}"""
    # NOTE: the docstring above is a template placeholder and is kept
    # verbatim; presumably it is filled in elsewhere - confirm.
    #
    # Flow: reject multi-file input, resolve the path/buffer, reject
    # compressed URL content, then hand off to the libcudf Avro reader.
    from cudf import DataFrame

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_avro` does not yet support reading multiple files")

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs)
    if compression is not None:
        # The exception used to be constructed without ``raise``, which made
        # this guard a silent no-op.
        raise ValueError(
            "URL content-encoding decompression is not supported")

    if engine == "cudf":
        return DataFrame._from_table(
            libcudf.avro.read_avro(filepath_or_buffer, columns,
                                   skiprows, num_rows))
    else:
        raise NotImplementedError("read_avro currently only supports cudf")
def test_dataframe_iloc(nelem):
    """``DataFrame.iloc`` with slices and scalars must match pandas.

    Covers empty, partial, full and out-of-range slices as well as positive
    and negative scalar positions.
    """
    ha = np.random.randint(low=0, high=100, size=nelem).astype(np.int32)
    hb = np.random.random(nelem).astype(np.float32)

    gdf = DataFrame()
    gdf["a"] = ha
    gdf["b"] = hb

    pdf = pd.DataFrame()
    pdf["a"] = ha
    pdf["b"] = hb

    row_slices = [
        slice(-1, 1),
        slice(nelem - 1, -1),
        slice(0, nelem - 1),
        slice(0, nelem),
        slice(1, 1),
        slice(1, 2),
        slice(nelem - 1, nelem + 1),
        slice(nelem, nelem * 2),
    ]
    row_positions = [-1 * nelem, -1, 0, 1, nelem - 1]

    # Slices first, then scalar positions, in the listed order.
    for key in row_slices + row_positions:
        assert_eq(gdf.iloc[key], pdf.iloc[key])
def _to_frame(this_index, index=True, name=None):
    """Wrap the values of an Index in a single-column DataFrame.

    Parameters
    ----------
    index : boolean, default True
        When true, the original Index also becomes the index of the
        returned DataFrame; otherwise no index is passed.
    name : str, default None
        Column name to use.  When omitted, the Index's own name is used,
        or ``0`` if the Index has no name.

    Returns
    -------
    DataFrame
        cudf DataFrame
    """
    from cudf import DataFrame

    column_label = name
    if column_label is None:
        column_label = 0 if this_index.name is None else this_index.name

    frame_index = this_index if index else None
    return DataFrame({column_label: this_index._values}, index=frame_index)
def read_polygon_shapefile(filename):
    """
    Load the polygon geometry of an ESRI shapefile.

    Parameters
    ----------
    filename : str, pathlike
        ESRI Shapefile file path (usually ends in ``.shp``)

    Returns
    -------
    result : tuple (cudf.Series, cudf.Series, cudf.DataFrame)
        poly_offsets : cudf.Series(dtype=np.int32)
            Offsets of the first ring in each polygon
        ring_offsets : cudf.Series(dtype=np.int32)
            Offsets of the first point in each ring
        points : cudf.DataFrame
            DataFrame of all points in the shapefile
                x : cudf.Series(dtype=np.float64)
                    x-components of each polygon's points
                y : cudf.Series(dtype=np.float64)
                    y-components of each polygon's points
    """
    parsed = cpp_read_polygon_shapefile(filename)

    # The returned series are named "f_pos" and "r_pos".
    poly_offsets = Series(parsed[0], name="f_pos")
    ring_offsets = Series(parsed[1], name="r_pos")
    points = DataFrame({"x": parsed[2], "y": parsed[3]})

    return poly_offsets, ring_offsets, points
def test_factorize_index_obj(ncats, nelem):
    """``Index.factorize`` returns codes and sorted unique labels.

    The labels must equal the sorted distinct input values and the codes
    must match a hand-built label -> position mapping.  ``ncats`` and
    ``nelem`` are not used in the body.
    """
    df = DataFrame()
    np.random.seed(0)

    # initialize data frame
    arr = np.random.randint(2, size=10, dtype=np.int32)
    df["cats"] = arr
    df = df.set_index("cats")

    uvals, labels = df.index.factorize()

    np.testing.assert_array_equal(labels.values.get(), sorted(set(arr)))
    assert isinstance(uvals, cp.ndarray)
    assert isinstance(labels, Index)

    # Map every label to its position, then encode the input by hand.
    encoder = {labels[pos]: pos for pos in range(len(labels))}
    handcoded = [encoder[value] for value in arr]
    np.testing.assert_array_equal(uvals.get(), handcoded)
def _cubic_spline_coefficients(x, y, ids, prefix_sums):
    """Call ``cubicspline_coefficients`` on the underlying columns.

    Each argument's ``_column`` attribute is passed through, and the values
    returned are unpacked into ``DataFrame._from_data``.
    """
    columns = (x._column, y._column, ids._column, prefix_sums._column)
    coefficients = cubicspline_coefficients(*columns)
    return DataFrame._from_data(*coefficients)
def test_dataframe_take(ntake):
    """``DataFrame.take`` with random positions must match pandas.

    The result must also contain no nulls in either column.
    """
    np.random.seed(0)
    nelem = 123

    df = DataFrame()
    df["ii"] = np.random.randint(0, 20, nelem)
    df["ff"] = np.random.random(nelem)

    take_indices = np.random.randint(0, len(df), ntake)

    expected = df.to_pandas().take(take_indices)
    actual = df.take(take_indices)

    for column in (actual.ii, actual.ff):
        assert column.null_count == 0
    assert_eq(actual, expected)
def test_onehot_transform_handle_unknown(client):
    """Transforming unseen categories honours ``handle_unknown``.

    ``Y`` contains 'c', which was not present when fitting on ``X``.  With
    ``'error'`` computing the transform must raise ``KeyError``; with
    ``'ignore'`` the row for 'c' has no active 'chars' column.
    """
    X = dask_cudf.from_cudf(
        DataFrame({'chars': ['a', 'b'], 'int': [0, 2]}), npartitions=2
    )
    Y = dask_cudf.from_cudf(
        DataFrame({'chars': ['c', 'b'], 'int': [0, 2]}), npartitions=2
    )

    strict_encoder = OneHotEncoder(handle_unknown='error', sparse=False)
    strict_encoder = strict_encoder.fit(X)
    with pytest.raises(KeyError):
        strict_encoder.transform(Y).compute()

    lenient_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    lenient_encoder = lenient_encoder.fit(X)
    encoded = lenient_encoder.transform(Y)

    expected = cp.array([[0., 0., 1., 0.],
                         [0., 1., 0., 1.]])
    cp.testing.assert_array_equal(encoded.compute(), expected)
def _get_column_major(self, df, row_tuple):
    """Select the columns of ``df`` matching ``row_tuple`` and relabel them.

    The column positions come from ``self._get_valid_indices_by_tuple``.
    The selection's column index is then rebuilt in one of three ways
    (empty selection, partial key, full or sliced key), flattened to a
    plain index when a single level remains, and finally reduced to a
    single column object when the key addressed every level and exactly
    one column matched.

    Parameters
    ----------
    df : DataFrame
        Frame whose ``columns`` is a multi-level index.
    row_tuple : number, slice or sequence
        The key; a bare number or slice is wrapped in a list.

    Returns
    -------
    The selected columns, or a single column when the key is fully
    specified and matches exactly one column.
    """
    from cudf import Series
    from cudf import DataFrame
    valid_indices = self._get_valid_indices_by_tuple(
        df.columns, row_tuple, len(df._cols))
    result = df._take_columns(valid_indices)
    # Normalise scalar keys so that len() and indexing work below.
    if isinstance(row_tuple, (numbers.Number, slice)):
        row_tuple = [row_tuple]
    if len(result) == 0 and len(result.columns) == 0:
        # Nothing selected: keep the source column index structure, but
        # with an empty code column for every level name.
        result_columns = df.columns.copy(deep=False)
        clear_codes = DataFrame()
        for name in df.columns.names:
            clear_codes[name] = Series([])
        result_columns._codes = clear_codes
        result_columns._source_data = clear_codes
        result.columns = result_columns
    elif len(row_tuple) < len(
            self.levels) and (not slice(None) in row_tuple and
                              not isinstance(row_tuple[0], slice)):
        # Partial key without slices: drop the levels the key addressed.
        columns = self._popn(len(row_tuple))
        result.columns = columns.take(valid_indices)
    else:
        result.columns = self.take(valid_indices)
    if len(result.columns.levels) == 1:
        # One level left: replace the codes by their labels and use a
        # flat index built with as_index.
        columns = []
        for code in result.columns.codes[result.columns.codes.columns[0]]:
            columns.append(result.columns.levels[0][code])
        name = result.columns.names[0]
        result.columns = as_index(columns, name=name)
    if len(row_tuple) == len(self.levels) and len(result.columns) == 1:
        # Fully specified key with a single match: return that column.
        result = list(result._cols.values())[0]
    return result
def test_dataframe_multi_column_nulls(
    num_cols, num_rows, dtype, nulls, ascending, na_position
):
    """Multi-column sort with nulls must match pandas.

    Random columns named 'a', 'b', ... are built and partly
    (``nulls == "some"``, a quarter of the rows) or entirely
    (``nulls == "all"``) set to NaN.  The frame is sorted by the first
    ``num_cols`` columns on both libraries and the sort columns are
    compared with the index reset.

    At least three columns are always generated, as before, but the count
    now grows with ``num_cols`` so every name in ``by`` exists in the
    frame.  For ``num_cols <= 3`` the generated data is unchanged.
    """
    np.random.seed(0)
    by = list(string.ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()

    for colname in string.ascii_lowercase[:max(num_cols, 3)]:
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            # With zero rows there is nothing to draw from, so the random
            # choice is skipped and an empty selection is used instead.
            if num_rows > 0:
                idx = np.random.choice(
                    num_rows, size=int(num_rows / 4), replace=False
                )
            else:
                idx = np.array([], dtype="int64")
            data[idx] = np.nan
        elif nulls == "all":
            data[:] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)

    got = gdf.sort_values(by, ascending=ascending, na_position=na_position)
    expect = pdf.sort_values(by, ascending=ascending, na_position=na_position)

    assert_eq(
        got[by].reset_index(drop=True), expect[by].reset_index(drop=True)
    )