def generate_input(self):
    """Build a random pandas DataFrame and return it as the next input.

    When ``self._regression`` is truthy, the dtype metadata, row count,
    column count and seed come from ``self.get_next_regression_params()``.
    Otherwise a fresh seed is drawn, the ``random`` module is re-seeded
    with it, random column metadata is generated, and all four values are
    recorded in ``self._current_params``.

    The generated DataFrame is also stored in ``self._current_buffer``.
    """
    if not self._regression:
        seed = random.randint(0, 2**32 - 1)
        random.seed(seed)

        candidate_dtypes = cudf.utils.dtypes.ALL_TYPES - {
            "category",
            "timedelta64[ns]",
            "datetime64[ns]",
        }
        # TODO: Remove uint32 below after this bug is fixed
        # https://github.com/pandas-dev/pandas/issues/37327
        candidate_dtypes = candidate_dtypes - {"uint32"}
        candidate_dtypes = candidate_dtypes | {"list", "decimal64"}
        dtypes_list = list(candidate_dtypes)

        dtypes_meta, num_rows, num_cols = _generate_rand_meta(
            self, dtypes_list
        )
        self._current_params["dtypes_meta"] = dtypes_meta
        self._current_params["seed"] = seed
        self._current_params["num_rows"] = num_rows
        self._current_params["num_columns"] = num_cols
    else:
        regression_params = self.get_next_regression_params()
        dtypes_meta, num_rows, num_cols, seed = regression_params

    logging.info(
        f"Generating DataFrame with rows: {num_rows} and columns: {num_cols}"
    )
    arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
    frame = pyarrow_to_pandas(arrow_table)
    logging.info(f"Shape of DataFrame generated: {frame.shape}")

    self._current_buffer = frame
    return frame
def generate_input(self):
    """Build a random DataFrame and return its CSV serialization.

    When ``self._regression`` is truthy, the generation parameters come
    from ``self.get_next_regression_params()``.  Otherwise a fresh seed is
    drawn, the ``random`` module is re-seeded with it, metadata is
    generated over every dtype in ``cudf.utils.dtypes.ALL_TYPES``, and the
    parameters are recorded in ``self._current_params``.

    The DataFrame itself (not the CSV text) is kept in
    ``self._current_buffer``.
    """
    if not self._regression:
        seed = random.randint(0, 2**32 - 1)
        random.seed(seed)

        dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
        dtypes_meta, num_rows, num_cols = _generate_rand_meta(
            self, dtypes_list
        )
        self._current_params["dtypes_meta"] = dtypes_meta
        self._current_params["seed"] = seed
        self._current_params["num_rows"] = num_rows
        self._current_params["num_columns"] = num_cols
    else:
        regression_params = self.get_next_regression_params()
        dtypes_meta, num_rows, num_cols, seed = regression_params

    logging.info(
        f"Generating DataFrame with rows: {num_rows} and columns: {num_cols}"
    )
    arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
    frame = pyarrow_to_pandas(arrow_table)
    logging.info(f"Shape of DataFrame generated: {frame.shape}")

    self._current_buffer = frame
    return frame.to_csv()
def test_rolling_var_std_large(agg, ddof, center, seed, window_size):
    """Check cudf rolling ``agg`` against pandas on wide-range random data.

    Three nullable columns (int64, float64, decimal64) are generated with
    value bounds of ``+/- sqrt(type_max / window_size)``; presumably this
    keeps squared values summed over one window within range (TODO
    confirm).  The rolling aggregation named by ``agg`` is then evaluated
    with ``ddof`` on both the pandas and the cudf frame and compared.
    """
    import platform

    kwargs = {"check_freq": False} if PANDAS_GE_110 else {}

    int_limits = np.iinfo(np.int64)
    float_limits = np.finfo(np.float64)
    int_hi = math.sqrt(int_limits.max / window_size)
    int_lo = -math.sqrt(abs(int_limits.min) / window_size)
    float_hi = math.sqrt(float_limits.max / window_size)
    float_lo = -math.sqrt(abs(float_limits.min) / window_size)

    n_rows = 1_000
    column_specs = (
        ("int64", int_lo, int_hi),
        ("float64", float_lo, float_hi),
        ("decimal64", int_lo, int_hi),
    )
    columns_meta = [
        {
            "dtype": dtype_name,
            "null_frequency": 0.4,
            "cardinality": n_rows,
            "min_bound": lower,
            "max_bound": upper,
        }
        for dtype_name, lower, upper in column_specs
    ]
    data = rand_dataframe(
        dtypes_meta=columns_meta,
        rows=n_rows,
        use_threads=False,
        seed=seed,
    )

    pdf = data.to_pandas()
    gdf = cudf.from_pandas(pdf)

    pandas_agg = getattr(pdf.rolling(window_size, 1, center), agg)
    expect = pandas_agg(ddof=ddof)
    cudf_agg = getattr(gdf.rolling(window_size, 1, center), agg)
    got = cudf_agg(ddof=ddof)

    if platform.machine() != "aarch64":
        assert_eq(expect, got, **kwargs)
        return

    # Due to pandas-37051, pandas rolling var/std on uniform window is
    # not reliable. Skipping these rows when comparing.
    for col in expect:
        keep = (got[col].fillna(-1) != 0).to_pandas()
        expect[col] = expect[col][keep]
        got[col] = got[col][keep]
        assert_eq(expect[col], got[col], **kwargs)
def generate_input(self):
    """Generate a random DataFrame, write it to parquet, return the path.

    When ``self._regression`` is truthy, the generation parameters come
    from ``self.get_next_regression_params()``; otherwise random metadata
    and a fresh seed are produced and recorded in ``self._current_params``.

    The DataFrame is written to the fixed path ``"temp_file"`` and is also
    kept in ``self._df``.
    """
    if self._regression:
        regression_params = self.get_next_regression_params()
        dtypes_meta, num_rows, num_cols, seed = regression_params
    else:
        candidate_dtypes = cudf.utils.dtypes.ALL_TYPES - {
            "category",
            "datetime64[ns]",
        }
        candidate_dtypes = candidate_dtypes - cudf.utils.dtypes.TIMEDELTA_TYPES
        # TODO: Remove uint32 below after this bug is fixed
        # https://github.com/pandas-dev/pandas/issues/37327
        candidate_dtypes = candidate_dtypes - {"uint32"}
        candidate_dtypes = candidate_dtypes | {"list", "decimal64"}
        dtypes_list = list(candidate_dtypes)

        dtypes_meta, num_rows, num_cols = _generate_rand_meta(
            self, dtypes_list
        )
        self._current_params["dtypes_meta"] = dtypes_meta
        seed = random.randint(0, 2 ** 32 - 1)
        self._current_params["seed"] = seed
        self._current_params["num_rows"] = num_rows
        self._current_params["num_cols"] = num_cols

    logging.info(
        f"Generating DataFrame with rows: {num_rows} and columns: {num_cols}"
    )
    arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
    frame = pyarrow_to_pandas(arrow_table)
    logging.info(f"Shape of DataFrame generated: {arrow_table.shape}")

    # TODO: Change this to write into a BytesIO object (storing a copy of
    # its contents in self._current_buffer and returning that) once the
    # issue below is fixed:
    # https://issues.apache.org/jira/browse/ARROW-10123
    output_path = "temp_file"
    frame.to_parquet(output_path)

    self._df = frame
    return output_path
def generate_input(self):
    """Generate a random DataFrame and its ORC-encoded bytes.

    When ``self._regression`` is truthy, the generation parameters come
    from ``self.get_next_regression_params()``; otherwise random metadata
    and a fresh seed are produced and recorded in ``self._current_params``.

    Returns a ``(df, buf)`` tuple: the pandas DataFrame and the bytes that
    ``pandas_to_orc`` wrote for it.  The DataFrame is kept in ``self._df``
    and a copy of the bytes in ``self._current_buffer``.
    """
    if self._regression:
        regression_params = self.get_next_regression_params()
        dtypes_meta, num_rows, num_cols, seed = regression_params
    else:
        candidate_dtypes = cudf.utils.dtypes.ALL_TYPES - {"category"}
        # Following dtypes are not supported by orc
        # https://orc.apache.org/specification/ORCv0/
        candidate_dtypes = candidate_dtypes - cudf.utils.dtypes.TIMEDELTA_TYPES
        candidate_dtypes = candidate_dtypes - cudf.utils.dtypes.UNSIGNED_TYPES
        candidate_dtypes = candidate_dtypes - {"datetime64[ns]"}
        dtypes_list = list(candidate_dtypes)

        dtypes_meta, num_rows, num_cols = _generate_rand_meta(
            self, dtypes_list
        )
        self._current_params["dtypes_meta"] = dtypes_meta
        seed = random.randint(0, 2 ** 32 - 1)
        self._current_params["seed"] = seed
        self._current_params["num_rows"] = num_rows
        self._current_params["num_cols"] = num_cols

    logging.info(
        f"Generating DataFrame with rows: {num_rows} and columns: {num_cols}"
    )
    arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
    df = pyarrow_to_pandas(arrow_table)
    logging.info(f"Shape of DataFrame generated: {arrow_table.shape}")
    self._df = df

    # Serialize into an in-memory file, then rewind to read the bytes back.
    orc_stream = io.BytesIO()
    pandas_to_orc(
        df,
        file_io_obj=orc_stream,
        stripe_size=self._rand(len(df)),
        arrow_table_schema=arrow_table.schema,
    )
    orc_stream.seek(0)
    buf = orc_stream.read()

    self._current_buffer = copy.copy(buf)
    return (df, buf)
def test_days_in_months(dtype):
    """Check that ``dt.days_in_month`` matches between pandas and cudf.

    A single random column of ``dtype`` with nulls is generated with a
    fixed seed, and the accessor is compared on the pandas Series and its
    cudf counterpart.
    """
    nrows = 1000
    column_meta = {
        "dtype": dtype,
        "null_frequency": 0.4,
        "cardinality": nrows,
    }
    data = dataset_generator.rand_dataframe(
        dtypes_meta=[column_meta],
        rows=nrows,
        use_threads=False,
        seed=23,
    )

    pandas_series = data.to_pandas()["0"]
    cudf_series = cudf.from_pandas(pandas_series)

    assert_eq(pandas_series.dt.days_in_month, cudf_series.dt.days_in_month)
def test_avro_compression(rows, codec):
    """Round-trip a random two-column table through Avro with ``codec``.

    The table (an int32 column and a string column, no nulls) is written
    with ``fastavro.writer`` into an in-memory buffer, read back with
    ``cudf.read_avro`` and compared against the source data.
    """
    avro_fields = [
        {"name": "0", "type": "int"},
        {"name": "1", "type": "string"},
    ]
    schema = {"name": "root", "type": "record", "fields": avro_fields}

    int_column = {"dtype": "int32", "null_frequency": 0, "cardinality": 1000}
    str_column = {
        "dtype": "str",
        "null_frequency": 0,
        "cardinality": 100,
        "max_string_length": 10,
    }
    df = rand_dataframe([int_column, str_column], rows)

    expected_df = cudf.DataFrame.from_arrow(df)
    records = df.to_pandas().to_dict(orient="records")

    # Write the records, then rewind so the reader starts at byte 0.
    buffer = io.BytesIO()
    fastavro.writer(buffer, schema, records, codec=codec)
    buffer.seek(0)

    got_df = cudf.read_avro(buffer)
    assert_eq(expected_df, got_df)
def generate_input(self):
    """Generate a random DataFrame and its Avro-encoded bytes.

    When ``self._regression`` is truthy, the generation parameters come
    from ``self.get_next_regression_params()``; otherwise random metadata
    and a fresh seed are produced and recorded in ``self._current_params``.

    Returns a ``(df, buf)`` tuple: the pandas DataFrame and the bytes that
    ``pandas_to_avro`` wrote for it.  The DataFrame is kept in ``self._df``
    and a copy of the bytes in ``self._current_buffer``.
    """
    if self._regression:
        regression_params = self.get_next_regression_params()
        dtypes_meta, num_rows, num_cols, seed = regression_params
    else:
        candidate_dtypes = cudf.utils.dtypes.ALL_TYPES - {"category"}
        # No unsigned support in avro:
        # https://avro.apache.org/docs/current/spec.html
        candidate_dtypes = candidate_dtypes - cudf.utils.dtypes.UNSIGNED_TYPES
        # TODO: Remove DATETIME_TYPES once
        # following bug is fixed:
        # https://github.com/rapidsai/cudf/issues/6482
        candidate_dtypes = candidate_dtypes - cudf.utils.dtypes.DATETIME_TYPES
        # TODO: Remove DURATION_TYPES once
        # following bug is fixed:
        # https://github.com/rapidsai/cudf/issues/6604
        candidate_dtypes = candidate_dtypes - cudf.utils.dtypes.TIMEDELTA_TYPES
        dtypes_list = list(candidate_dtypes)

        dtypes_meta, num_rows, num_cols = _generate_rand_meta(
            self, dtypes_list
        )
        self._current_params["dtypes_meta"] = dtypes_meta
        seed = random.randint(0, 2**32 - 1)
        self._current_params["seed"] = seed
        self._current_params["num_rows"] = num_rows
        self._current_params["num_cols"] = num_cols

    logging.info(
        f"Generating DataFrame with rows: {num_rows} and columns: {num_cols}"
    )
    arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
    df = pyarrow_to_pandas(arrow_table)
    self._df = df
    logging.info(f"Shape of DataFrame generated: {arrow_table.shape}")

    # Serialize into an in-memory file, then rewind to read the bytes back.
    avro_stream = io.BytesIO()
    pandas_to_avro(df, file_io_obj=avro_stream)
    avro_stream.seek(0)
    buf = avro_stream.read()

    self._current_buffer = copy.copy(buf)
    return (df, buf)
def generate_input(self):
    """Generate a random pandas DataFrame restricted to the dtypes below.

    When ``self._regression`` is truthy, the generation parameters come
    from ``self.get_next_regression_params()``; otherwise random metadata
    and a fresh seed are produced and recorded in ``self._current_params``.

    The DataFrame is returned and also kept in ``self._df``.
    """
    if self._regression:
        regression_params = self.get_next_regression_params()
        dtypes_meta, num_rows, num_cols, seed = regression_params
    else:
        # TODO: Remove "bool" from below
        # list after following issue is fixed:
        # https://github.com/rapidsai/cudf/issues/6763
        candidate_dtypes = cudf.utils.dtypes.ALL_TYPES - {"category", "bool"}
        # Following dtypes are not supported by orc
        # https://orc.apache.org/specification/ORCv0/
        candidate_dtypes = candidate_dtypes - cudf.utils.dtypes.TIMEDELTA_TYPES
        candidate_dtypes = candidate_dtypes - cudf.utils.dtypes.UNSIGNED_TYPES
        # TODO: Remove `DATETIME_TYPES` once
        # following bug is fixed:
        # https://github.com/rapidsai/cudf/issues/7355
        candidate_dtypes = candidate_dtypes - cudf.utils.dtypes.DATETIME_TYPES
        dtypes_list = list(candidate_dtypes)

        dtypes_meta, num_rows, num_cols = _generate_rand_meta(
            self, dtypes_list
        )
        self._current_params["dtypes_meta"] = dtypes_meta
        seed = random.randint(0, 2 ** 32 - 1)
        self._current_params["seed"] = seed
        self._current_params["num_rows"] = num_rows
        self._current_params["num_cols"] = num_cols

    logging.info(
        f"Generating DataFrame with rows: {num_rows} and columns: {num_cols}"
    )
    arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
    frame = pyarrow_to_pandas(arrow_table)
    logging.info(f"Shape of DataFrame generated: {arrow_table.shape}")

    self._df = frame
    return frame
def generate_input(self):
    """Build a random DataFrame and return it as JSON-lines text.

    When ``self._regression`` is truthy, the generation parameters come
    from ``self.get_next_regression_params()``.  Otherwise a fresh seed is
    drawn, the ``random`` module is re-seeded with it, random column
    metadata is generated, and the parameters are recorded in
    ``self._current_params``.

    The DataFrame itself (not the JSON text) is kept in
    ``self._current_buffer``; the return value is
    ``df.to_json(orient="records", lines=True)``.
    """
    if not self._regression:
        seed = random.randint(0, 2 ** 32 - 1)
        random.seed(seed)

        # https://github.com/pandas-dev/pandas/issues/20599
        candidate_dtypes = cudf.utils.dtypes.ALL_TYPES - {"uint64"}
        # TODO: Remove DATETIME_TYPES after this is fixed:
        # https://github.com/rapidsai/cudf/issues/6586
        candidate_dtypes = candidate_dtypes - set(
            cudf.utils.dtypes.DATETIME_TYPES
        )
        dtypes_list = list(candidate_dtypes)
        # TODO: Uncomment following after following
        # issue is fixed:
        # https://github.com/rapidsai/cudf/issues/7086
        # dtypes_list.extend(["list"])

        dtypes_meta, num_rows, num_cols = _generate_rand_meta(
            self, dtypes_list
        )
        self._current_params["dtypes_meta"] = dtypes_meta
        self._current_params["seed"] = seed
        self._current_params["num_rows"] = num_rows
        self._current_params["num_columns"] = num_cols
    else:
        regression_params = self.get_next_regression_params()
        dtypes_meta, num_rows, num_cols, seed = regression_params

    logging.info(
        f"Generating DataFrame with rows: {num_rows} and columns: {num_cols}"
    )
    arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
    frame = pyarrow_to_pandas(arrow_table)
    self._current_buffer = frame
    logging.info(f"Shape of DataFrame generated: {frame.shape}")

    return frame.to_json(orient="records", lines=True)