def generate_input(self):
    """Generate a random DataFrame and return it rendered by ``to_csv()``.

    When ``self._regression`` is set, the generation parameters are taken
    from ``self.get_next_regression_params()``. Otherwise a new seed is
    drawn and installed with ``random.seed``, random metadata is built from
    every entry of ``cudf.utils.dtypes.ALL_TYPES``, and the chosen values
    are stored in ``self._current_params``.

    The generated frame is also stored on ``self._current_buffer``.

    Returns:
        Whatever ``df.to_csv()`` yields for the generated frame.
    """
    if not self._regression:
        seed = random.randint(0, 2**32 - 1)
        random.seed(seed)
        candidate_dtypes = list(cudf.utils.dtypes.ALL_TYPES)
        meta, num_rows, num_cols = _generate_rand_meta(self, candidate_dtypes)
        # Record everything needed to regenerate this exact input later.
        recorded = (
            ("dtypes_meta", meta),
            ("seed", seed),
            ("num_rows", num_rows),
            ("num_columns", num_cols),
        )
        for key, value in recorded:
            self._current_params[key] = value
    else:
        meta, num_rows, num_cols, seed = self.get_next_regression_params()

    logging.info(f"Generating DataFrame with rows: {num_rows} "
                 f"and columns: {num_cols}")
    arrow_table = dg.rand_dataframe(meta, num_rows, seed)
    df = pyarrow_to_pandas(arrow_table)
    logging.info(f"Shape of DataFrame generated: {df.shape}")

    self._current_buffer = df
    return df.to_csv()
def generate_input(self):
    """Generate a random DataFrame and return the frame itself.

    When ``self._regression`` is set, the generation parameters are taken
    from ``self.get_next_regression_params()``. Otherwise a new seed is
    drawn and installed with ``random.seed``, random metadata is built from
    a filtered dtype set, and the chosen values are stored in
    ``self._current_params``.

    The generated frame is also stored on ``self._current_buffer``.

    Returns:
        The DataFrame produced by ``pyarrow_to_pandas``.
    """
    if not self._regression:
        seed = random.randint(0, 2**32 - 1)
        random.seed(seed)

        allowed = cudf.utils.dtypes.ALL_TYPES - {
            "category",
            "timedelta64[ns]",
            "datetime64[ns]",
        }
        # TODO: Remove uint32 below after this bug is fixed
        # https://github.com/pandas-dev/pandas/issues/37327
        allowed = allowed - {"uint32"}
        # Applied last, so these two are always part of the candidates.
        allowed = allowed | {"list", "decimal64"}
        candidate_dtypes = list(allowed)

        meta, num_rows, num_cols = _generate_rand_meta(self, candidate_dtypes)
        # Record everything needed to regenerate this exact input later.
        recorded = (
            ("dtypes_meta", meta),
            ("seed", seed),
            ("num_rows", num_rows),
            ("num_columns", num_cols),
        )
        for key, value in recorded:
            self._current_params[key] = value
    else:
        meta, num_rows, num_cols, seed = self.get_next_regression_params()

    logging.info(f"Generating DataFrame with rows: {num_rows} "
                 f"and columns: {num_cols}")
    arrow_table = dg.rand_dataframe(meta, num_rows, seed)
    df = pyarrow_to_pandas(arrow_table)
    logging.info(f"Shape of DataFrame generated: {df.shape}")

    self._current_buffer = df
    return df
def generate_input(self):
    """Generate a random DataFrame and return it as line-delimited JSON.

    When ``self._regression`` is set, the generation parameters are taken
    from ``self.get_next_regression_params()``. Otherwise a new seed is
    drawn and installed with ``random.seed``, random metadata is built from
    a filtered dtype set, and the chosen values are stored in
    ``self._current_params``.

    The generated frame is also stored on ``self._current_buffer``.

    Returns:
        The result of ``df.to_json(orient="records", lines=True)``.
    """
    if not self._regression:
        seed = random.randint(0, 2**32 - 1)
        random.seed(seed)

        # https://github.com/pandas-dev/pandas/issues/20599
        allowed = cudf.utils.dtypes.ALL_TYPES - {"uint64"}
        # TODO: Remove DATETIME_TYPES after this is fixed:
        # https://github.com/rapidsai/cudf/issues/6586
        allowed = allowed - set(cudf.utils.dtypes.DATETIME_TYPES)
        candidate_dtypes = list(allowed)

        meta, num_rows, num_cols = _generate_rand_meta(self, candidate_dtypes)
        # Record everything needed to regenerate this exact input later.
        recorded = (
            ("dtypes_meta", meta),
            ("seed", seed),
            ("num_rows", num_rows),
            ("num_columns", num_cols),
        )
        for key, value in recorded:
            self._current_params[key] = value
    else:
        meta, num_rows, num_cols, seed = self.get_next_regression_params()

    logging.info(f"Generating DataFrame with rows: {num_rows} "
                 f"and columns: {num_cols}")
    arrow_table = dg.rand_dataframe(meta, num_rows, seed)
    df = pyarrow_to_pandas(arrow_table)

    self._current_buffer = df
    logging.info(f"Shape of DataFrame generated: {df.shape}")
    return df.to_json(orient="records", lines=True)
def generate_input(self):
    """Generate a random DataFrame and return the frame itself.

    When ``self._regression`` is set, the generation parameters are taken
    from ``self.get_next_regression_params()``. Otherwise random metadata
    is built from a filtered dtype set, a seed is drawn afterwards, and the
    chosen values are stored in ``self._current_params``.

    The generated frame is also stored on ``self._df``.

    Returns:
        The DataFrame produced by ``pyarrow_to_pandas``.
    """
    if not self._regression:
        allowed = cudf.utils.dtypes.ALL_TYPES - {"category"}
        # Following dtypes are not supported by orc
        # https://orc.apache.org/specification/ORCv0/
        allowed = allowed - cudf.utils.dtypes.TIMEDELTA_TYPES
        allowed = allowed - cudf.utils.dtypes.UNSIGNED_TYPES
        candidate_dtypes = list(allowed)

        meta, num_rows, num_cols = _generate_rand_meta(self, candidate_dtypes)
        # The seed is drawn only after the metadata has been generated.
        seed = random.randint(0, 2**32 - 1)
        # Record everything needed to regenerate this exact input later.
        recorded = (
            ("dtypes_meta", meta),
            ("seed", seed),
            ("num_rows", num_rows),
            ("num_cols", num_cols),
        )
        for key, value in recorded:
            self._current_params[key] = value
    else:
        meta, num_rows, num_cols, seed = self.get_next_regression_params()

    logging.info(f"Generating DataFrame with rows: {num_rows} "
                 f"and columns: {num_cols}")
    table = dg.rand_dataframe(meta, num_rows, seed)
    frame = pyarrow_to_pandas(table)
    logging.info(f"Shape of DataFrame generated: {table.shape}")

    self._df = frame
    return frame
def generate_input(self):
    """Generate a random DataFrame, write it as Parquet, return the path.

    When ``self._regression`` is set, the generation parameters are taken
    from ``self.get_next_regression_params()``. Otherwise random metadata
    is built from a filtered dtype set, a seed is drawn afterwards, and the
    chosen values are stored in ``self._current_params``.

    The frame is written with ``to_parquet("temp_file")`` and also stored
    on ``self._df``.

    Returns:
        The literal string ``"temp_file"``.
    """
    if not self._regression:
        allowed = cudf.utils.dtypes.ALL_TYPES - {"category", "datetime64[ns]"}
        allowed = allowed - cudf.utils.dtypes.TIMEDELTA_TYPES
        # TODO: Remove uint32 below after this bug is fixed
        # https://github.com/pandas-dev/pandas/issues/37327
        allowed = allowed - {"uint32"}
        # Applied last, so these two are always part of the candidates.
        allowed = allowed | {"list", "decimal64"}
        candidate_dtypes = list(allowed)

        meta, num_rows, num_cols = _generate_rand_meta(self, candidate_dtypes)
        # The seed is drawn only after the metadata has been generated.
        seed = random.randint(0, 2 ** 32 - 1)
        # Record everything needed to regenerate this exact input later.
        recorded = (
            ("dtypes_meta", meta),
            ("seed", seed),
            ("num_rows", num_rows),
            ("num_cols", num_cols),
        )
        for key, value in recorded:
            self._current_params[key] = value
    else:
        meta, num_rows, num_cols, seed = self.get_next_regression_params()

    logging.info(
        f"Generating DataFrame with rows: {num_rows} "
        f"and columns: {num_cols}"
    )
    table = dg.rand_dataframe(meta, num_rows, seed)
    frame = pyarrow_to_pandas(table)
    logging.info(f"Shape of DataFrame generated: {table.shape}")

    # TODO: Change this to write into a BytesIO object once below issue
    # is fixed: https://issues.apache.org/jira/browse/ARROW-10123
    # The intended end state is to serialise into an in-memory buffer,
    # keep a copy of its bytes in self._current_buffer and return those
    # bytes instead of a file path.
    frame.to_parquet("temp_file")

    self._df = frame
    return "temp_file"
def generate_input(self):
    """Generate a random DataFrame plus its ORC-serialised bytes.

    When ``self._regression`` is set, the generation parameters are taken
    from ``self.get_next_regression_params()``. Otherwise random metadata
    is built from a filtered dtype set, a seed is drawn afterwards, and the
    chosen values are stored in ``self._current_params``.

    The frame is stored on ``self._df``; a ``copy.copy`` of the serialised
    bytes is stored on ``self._current_buffer``.

    Returns:
        A ``(frame, bytes)`` tuple: the generated frame and the bytes that
        ``pandas_to_orc`` wrote into an in-memory buffer.
    """
    if not self._regression:
        allowed = cudf.utils.dtypes.ALL_TYPES - {"category"}
        # Following dtypes are not supported by orc
        # https://orc.apache.org/specification/ORCv0/
        allowed = allowed - cudf.utils.dtypes.TIMEDELTA_TYPES
        allowed = allowed - cudf.utils.dtypes.UNSIGNED_TYPES
        allowed = allowed - {"datetime64[ns]"}
        candidate_dtypes = list(allowed)

        meta, num_rows, num_cols = _generate_rand_meta(self, candidate_dtypes)
        # The seed is drawn only after the metadata has been generated.
        seed = random.randint(0, 2 ** 32 - 1)
        # Record everything needed to regenerate this exact input later.
        recorded = (
            ("dtypes_meta", meta),
            ("seed", seed),
            ("num_rows", num_rows),
            ("num_cols", num_cols),
        )
        for key, value in recorded:
            self._current_params[key] = value
    else:
        meta, num_rows, num_cols, seed = self.get_next_regression_params()

    logging.info(
        f"Generating DataFrame with rows: {num_rows} "
        f"and columns: {num_cols}"
    )
    table = dg.rand_dataframe(meta, num_rows, seed)
    frame = pyarrow_to_pandas(table)
    logging.info(f"Shape of DataFrame generated: {table.shape}")
    self._df = frame

    sink = io.BytesIO()
    # Stripe size comes from self._rand, called with the row count.
    stripe_size = self._rand(len(frame))
    pandas_to_orc(
        frame,
        file_io_obj=sink,
        stripe_size=stripe_size,
        arrow_table_schema=table.schema,
    )
    # Rewind so the read below returns everything that was written.
    sink.seek(0)
    payload = sink.read()

    self._current_buffer = copy.copy(payload)
    return (frame, payload)
def generate_input(self):
    """Generate a random DataFrame plus its Avro-serialised bytes.

    When ``self._regression`` is set, the generation parameters are taken
    from ``self.get_next_regression_params()``. Otherwise random metadata
    is built from a filtered dtype set, a seed is drawn afterwards, and the
    chosen values are stored in ``self._current_params``.

    The frame is stored on ``self._df``; a ``copy.copy`` of the serialised
    bytes is stored on ``self._current_buffer``.

    Returns:
        A ``(frame, bytes)`` tuple: the generated frame and the bytes that
        ``pandas_to_avro`` wrote into an in-memory buffer.
    """
    if not self._regression:
        allowed = cudf.utils.dtypes.ALL_TYPES - {"category"}
        # No unsigned support in avro:
        # https://avro.apache.org/docs/current/spec.html
        allowed = allowed - cudf.utils.dtypes.UNSIGNED_TYPES
        # TODO: Remove DATETIME_TYPES once following bug is fixed:
        # https://github.com/rapidsai/cudf/issues/6482
        allowed = allowed - cudf.utils.dtypes.DATETIME_TYPES
        # TODO: Remove DURATION_TYPES once following bug is fixed:
        # https://github.com/rapidsai/cudf/issues/6604
        allowed = allowed - cudf.utils.dtypes.TIMEDELTA_TYPES
        candidate_dtypes = list(allowed)

        meta, num_rows, num_cols = _generate_rand_meta(self, candidate_dtypes)
        # The seed is drawn only after the metadata has been generated.
        seed = random.randint(0, 2**32 - 1)
        # Record everything needed to regenerate this exact input later.
        recorded = (
            ("dtypes_meta", meta),
            ("seed", seed),
            ("num_rows", num_rows),
            ("num_cols", num_cols),
        )
        for key, value in recorded:
            self._current_params[key] = value
    else:
        meta, num_rows, num_cols, seed = self.get_next_regression_params()

    logging.info(f"Generating DataFrame with rows: {num_rows} "
                 f"and columns: {num_cols}")
    table = dg.rand_dataframe(meta, num_rows, seed)
    frame = pyarrow_to_pandas(table)
    self._df = frame
    logging.info(f"Shape of DataFrame generated: {table.shape}")

    sink = io.BytesIO()
    pandas_to_avro(frame, file_io_obj=sink)
    # Rewind so the read below returns everything that was written.
    sink.seek(0)
    payload = sink.read()

    self._current_buffer = copy.copy(payload)
    return (frame, payload)
def generate_input(self):
    """Generate a random DataFrame and return the frame itself.

    When ``self._regression`` is set, the generation parameters are taken
    from ``self.get_next_regression_params()``. Otherwise random metadata
    is built from a filtered dtype set, a seed is drawn afterwards, and the
    chosen values are stored in ``self._current_params``.

    The generated frame is also stored on ``self._df``.

    Returns:
        The DataFrame produced by ``pyarrow_to_pandas``.
    """
    if not self._regression:
        # TODO: Remove "bool" from below list after following issue is
        # fixed: https://github.com/rapidsai/cudf/issues/6763
        allowed = cudf.utils.dtypes.ALL_TYPES - {"category", "bool"}
        # Following dtypes are not supported by orc
        # https://orc.apache.org/specification/ORCv0/
        allowed = allowed - cudf.utils.dtypes.TIMEDELTA_TYPES
        allowed = allowed - cudf.utils.dtypes.UNSIGNED_TYPES
        # TODO: Remove `DATETIME_TYPES` once following bug is fixed:
        # https://github.com/rapidsai/cudf/issues/7355
        allowed = allowed - cudf.utils.dtypes.DATETIME_TYPES
        candidate_dtypes = list(allowed)

        meta, num_rows, num_cols = _generate_rand_meta(self, candidate_dtypes)
        # The seed is drawn only after the metadata has been generated.
        seed = random.randint(0, 2 ** 32 - 1)
        # Record everything needed to regenerate this exact input later.
        recorded = (
            ("dtypes_meta", meta),
            ("seed", seed),
            ("num_rows", num_rows),
            ("num_cols", num_cols),
        )
        for key, value in recorded:
            self._current_params[key] = value
    else:
        meta, num_rows, num_cols, seed = self.get_next_regression_params()

    logging.info(
        f"Generating DataFrame with rows: {num_rows} "
        f"and columns: {num_cols}"
    )
    table = dg.rand_dataframe(meta, num_rows, seed)
    frame = pyarrow_to_pandas(table)
    logging.info(f"Shape of DataFrame generated: {table.shape}")

    self._df = frame
    return frame