コード例 #1
0
ファイル: parquet.py プロジェクト: TravisHester/cudf
    def generate_input(self):
        """Produce the next input as a pandas DataFrame.

        When ``self._regression`` is truthy, the generation parameters are
        taken from ``self.get_next_regression_params()``. Otherwise a new
        seed is drawn, the ``random`` module is re-seeded with it, the
        parameters come from ``_generate_rand_meta`` and they are recorded
        in ``self._current_params``.

        Returns
        -------
        The generated DataFrame; it is also stored in
        ``self._current_buffer``.
        """
        if not self._regression:
            seed = random.randint(0, 2**32 - 1)
            random.seed(seed)

            # The same set arithmetic as one chained expression, applied
            # one operation at a time.
            candidates = cudf.utils.dtypes.ALL_TYPES - {
                "category", "timedelta64[ns]", "datetime64[ns]"
            }
            # TODO: Remove uint32 below after this bug is fixed
            # https://github.com/pandas-dev/pandas/issues/37327
            candidates = candidates - {"uint32"}
            candidates = candidates | {"list", "decimal64"}
            dtypes_list = list(candidates)

            dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                self, dtypes_list)

            # Record the parameters used for this run.
            recorded = (
                ("dtypes_meta", dtypes_meta),
                ("seed", seed),
                ("num_rows", num_rows),
                ("num_columns", num_cols),
            )
            for key, value in recorded:
                self._current_params[key] = value
        else:
            replayed = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replayed

        start_message = (f"Generating DataFrame with rows: {num_rows} "
                         f"and columns: {num_cols}")
        logging.info(start_message)

        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(arrow_table)

        logging.info(f"Shape of DataFrame generated: {frame.shape}")
        self._current_buffer = frame
        return frame
コード例 #2
0
    def generate_input(self):
        """Produce the next input as CSV text.

        When ``self._regression`` is truthy, the generation parameters are
        taken from ``self.get_next_regression_params()``. Otherwise a new
        seed is drawn, the ``random`` module is re-seeded with it, the
        parameters come from ``_generate_rand_meta`` (offered every entry of
        ``cudf.utils.dtypes.ALL_TYPES``) and they are recorded in
        ``self._current_params``.

        Returns
        -------
        The result of ``to_csv()`` on the generated DataFrame. The DataFrame
        itself is kept in ``self._current_buffer``.
        """
        if not self._regression:
            seed = random.randint(0, 2**32 - 1)
            random.seed(seed)
            dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated

            # Record the parameters used for this run.
            recorded = (
                ("dtypes_meta", dtypes_meta),
                ("seed", seed),
                ("num_rows", num_rows),
                ("num_columns", num_cols),
            )
            for key, value in recorded:
                self._current_params[key] = value
        else:
            replayed = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replayed

        start_message = (f"Generating DataFrame with rows: {num_rows} "
                         f"and columns: {num_cols}")
        logging.info(start_message)
        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(arrow_table)

        logging.info(f"Shape of DataFrame generated: {frame.shape}")
        self._current_buffer = frame
        return frame.to_csv()
コード例 #3
0
ファイル: test_rolling.py プロジェクト: rongou/cudf
def test_rolling_var_std_large(agg, ddof, center, seed, window_size):
    """Check cudf rolling ``agg`` against pandas on wide-ranging values.

    Three columns (int64, float64, decimal64) with 40% nulls are generated.
    Their value bounds are the square root of the dtype extreme divided by
    ``window_size`` (presumably so squared sums over one window stay
    representable -- confirm). The rolling aggregation named by ``agg`` is
    then run with ``ddof`` on both the pandas and the cudf frame and the
    results are compared.
    """
    import platform

    # ``check_freq`` is only passed along when PANDAS_GE_110 is set.
    kwargs = {"check_freq": False} if PANDAS_GE_110 else {}

    int_limits = np.iinfo(np.int64)
    float_limits = np.finfo(np.float64)

    iupper_bound = math.sqrt(int_limits.max / window_size)
    ilower_bound = -math.sqrt(abs(int_limits.min) / window_size)

    fupper_bound = math.sqrt(float_limits.max / window_size)
    flower_bound = -math.sqrt(abs(float_limits.min) / window_size)

    n_rows = 1_000
    # (dtype, lower bound, upper bound) for each generated column.
    column_specs = (
        ("int64", ilower_bound, iupper_bound),
        ("float64", flower_bound, fupper_bound),
        ("decimal64", ilower_bound, iupper_bound),
    )
    columns_meta = [
        {
            "dtype": dtype_name,
            "null_frequency": 0.4,
            "cardinality": n_rows,
            "min_bound": lower,
            "max_bound": upper,
        }
        for dtype_name, lower, upper in column_specs
    ]
    data = rand_dataframe(
        dtypes_meta=columns_meta,
        rows=n_rows,
        use_threads=False,
        seed=seed,
    )
    pdf = data.to_pandas()
    gdf = cudf.from_pandas(pdf)

    pandas_window = pdf.rolling(window_size, 1, center)
    expect = getattr(pandas_window, agg)(ddof=ddof)
    cudf_window = gdf.rolling(window_size, 1, center)
    got = getattr(cudf_window, agg)(ddof=ddof)

    if platform.machine() != "aarch64":
        assert_eq(expect, got, **kwargs)
        return

    # Due to pandas-37051, pandas rolling var/std on uniform window is
    # not reliable. Skipping these rows when comparing.
    for col in expect:
        keep = (got[col].fillna(-1) != 0).to_pandas()
        expect[col] = expect[col][keep]
        got[col] = got[col][keep]
        assert_eq(expect[col], got[col], **kwargs)
コード例 #4
0
    def generate_input(self):
        """Produce the next input as a parquet file on disk.

        When ``self._regression`` is truthy, the generation parameters are
        taken from ``self.get_next_regression_params()``. Otherwise they come
        from ``_generate_rand_meta``, a new seed is drawn, and everything is
        recorded in ``self._current_params``.

        The generated DataFrame is written with ``to_parquet`` to the path
        ``"temp_file"`` and kept in ``self._df``.

        Returns
        -------
        The string ``"temp_file"``.
        """
        if not self._regression:
            # Same set arithmetic as one chained expression, applied one
            # operation at a time.
            candidates = cudf.utils.dtypes.ALL_TYPES - {
                "category", "datetime64[ns]"
            }
            candidates = candidates - cudf.utils.dtypes.TIMEDELTA_TYPES
            # TODO: Remove uint32 below after this bug is fixed
            # https://github.com/pandas-dev/pandas/issues/37327
            candidates = candidates - {"uint32"}
            candidates = candidates | {"list", "decimal64"}
            dtypes_list = list(candidates)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated
            # The seed is drawn only after the metadata has been generated.
            seed = random.randint(0, 2 ** 32 - 1)

            recorded = (
                ("dtypes_meta", dtypes_meta),
                ("seed", seed),
                ("num_rows", num_rows),
                ("num_cols", num_cols),
            )
            for key, value in recorded:
                self._current_params[key] = value
        else:
            replayed = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replayed

        start_message = (
            f"Generating DataFrame with rows: {num_rows} "
            f"and columns: {num_cols}"
        )
        logging.info(start_message)
        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(arrow_table)
        logging.info(f"Shape of DataFrame generated: {arrow_table.shape}")

        # TODO: Change this to write into a BytesIO object (and keep/return
        # the bytes read back from it via self._current_buffer) once the
        # issue below is fixed:
        # https://issues.apache.org/jira/browse/ARROW-10123
        output_path = "temp_file"
        frame.to_parquet(output_path)
        self._df = frame
        return output_path
コード例 #5
0
ファイル: orc.py プロジェクト: rongou/cudf
    def generate_input(self):
        """Produce the next input as a DataFrame plus its ORC encoding.

        When ``self._regression`` is truthy, the generation parameters are
        taken from ``self.get_next_regression_params()``. Otherwise they come
        from ``_generate_rand_meta``, a new seed is drawn, and everything is
        recorded in ``self._current_params``.

        The DataFrame is kept in ``self._df`` and serialized with
        ``pandas_to_orc`` into an in-memory buffer, using a stripe size
        obtained from ``self._rand(len(df))``.

        Returns
        -------
        A ``(DataFrame, bytes)`` tuple; a copy of the bytes is stored in
        ``self._current_buffer``.
        """
        if not self._regression:
            candidates = cudf.utils.dtypes.ALL_TYPES - {"category"}
            # Following dtypes are not supported by orc
            # https://orc.apache.org/specification/ORCv0/
            candidates = candidates - cudf.utils.dtypes.TIMEDELTA_TYPES
            candidates = candidates - cudf.utils.dtypes.UNSIGNED_TYPES
            candidates = candidates - {"datetime64[ns]"}
            dtypes_list = list(candidates)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated
            # The seed is drawn only after the metadata has been generated.
            seed = random.randint(0, 2 ** 32 - 1)

            recorded = (
                ("dtypes_meta", dtypes_meta),
                ("seed", seed),
                ("num_rows", num_rows),
                ("num_cols", num_cols),
            )
            for key, value in recorded:
                self._current_params[key] = value
        else:
            replayed = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replayed

        start_message = (
            f"Generating DataFrame with rows: {num_rows} "
            f"and columns: {num_cols}"
        )
        logging.info(start_message)
        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(arrow_table)
        logging.info(f"Shape of DataFrame generated: {arrow_table.shape}")
        self._df = frame

        orc_stream = io.BytesIO()
        pandas_to_orc(
            frame,
            file_io_obj=orc_stream,
            stripe_size=self._rand(len(frame)),
            arrow_table_schema=arrow_table.schema,
        )
        # Rewind so the whole serialized payload is read back.
        orc_stream.seek(0)
        payload = orc_stream.read()
        self._current_buffer = copy.copy(payload)
        return (frame, payload)
コード例 #6
0
def test_days_in_months(dtype):
    """Check that ``dt.days_in_month`` agrees between pandas and cudf.

    A single column of ``dtype`` with 1000 rows and 40% nulls is generated
    with a fixed seed, then the accessor result of the pandas Series is
    compared with that of the cudf Series built from it.
    """
    nrows = 1000
    column_meta = {
        "dtype": dtype,
        "null_frequency": 0.4,
        "cardinality": nrows,
    }

    data = dataset_generator.rand_dataframe(
        dtypes_meta=[column_meta],
        rows=nrows,
        use_threads=False,
        seed=23,
    )

    # The generated frame's only column is named "0".
    pandas_series = data.to_pandas()["0"]
    cudf_series = cudf.from_pandas(pandas_series)

    assert_eq(pandas_series.dt.days_in_month, cudf_series.dt.days_in_month)
コード例 #7
0
def test_avro_compression(rows, codec):
    """Round-trip random data through Avro with ``codec`` and read via cudf.

    An int32 column and a string column (no nulls) with ``rows`` rows are
    generated, written with ``fastavro.writer`` using ``codec``, then read
    back with ``cudf.read_avro`` and compared against the cudf DataFrame
    built directly from the generated Arrow data.
    """
    # Avro record schema matching the two generated columns "0" and "1".
    avro_fields = [
        {"name": "0", "type": "int"},
        {"name": "1", "type": "string"},
    ]
    schema = {"name": "root", "type": "record", "fields": avro_fields}

    int_column = {
        "dtype": "int32",
        "null_frequency": 0,
        "cardinality": 1000,
    }
    str_column = {
        "dtype": "str",
        "null_frequency": 0,
        "cardinality": 100,
        "max_string_length": 10,
    }
    df = rand_dataframe([int_column, str_column], rows)
    expected_df = cudf.DataFrame.from_arrow(df)

    records = df.to_pandas().to_dict(orient="records")

    avro_stream = io.BytesIO()
    fastavro.writer(avro_stream, schema, records, codec=codec)
    # Rewind so the reader starts at the beginning of the payload.
    avro_stream.seek(0)
    got_df = cudf.read_avro(avro_stream)

    assert_eq(expected_df, got_df)
コード例 #8
0
    def generate_input(self):
        """Produce the next input as a DataFrame plus its Avro encoding.

        When ``self._regression`` is truthy, the generation parameters are
        taken from ``self.get_next_regression_params()``. Otherwise they come
        from ``_generate_rand_meta``, a new seed is drawn, and everything is
        recorded in ``self._current_params``.

        The DataFrame is kept in ``self._df`` and serialized with
        ``pandas_to_avro`` into an in-memory buffer.

        Returns
        -------
        A ``(DataFrame, bytes)`` tuple; a copy of the bytes is stored in
        ``self._current_buffer``.
        """
        if not self._regression:
            candidates = cudf.utils.dtypes.ALL_TYPES - {"category"}
            # No unsigned support in avro:
            # https://avro.apache.org/docs/current/spec.html
            candidates = candidates - cudf.utils.dtypes.UNSIGNED_TYPES
            # TODO: Remove DATETIME_TYPES once
            # following bug is fixed:
            # https://github.com/rapidsai/cudf/issues/6482
            candidates = candidates - cudf.utils.dtypes.DATETIME_TYPES
            # TODO: Remove DURATION_TYPES once
            # following bug is fixed:
            # https://github.com/rapidsai/cudf/issues/6604
            candidates = candidates - cudf.utils.dtypes.TIMEDELTA_TYPES
            dtypes_list = list(candidates)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated
            # The seed is drawn only after the metadata has been generated.
            seed = random.randint(0, 2**32 - 1)

            recorded = (
                ("dtypes_meta", dtypes_meta),
                ("seed", seed),
                ("num_rows", num_rows),
                ("num_cols", num_cols),
            )
            for key, value in recorded:
                self._current_params[key] = value
        else:
            replayed = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replayed

        start_message = (f"Generating DataFrame with rows: {num_rows} "
                         f"and columns: {num_cols}")
        logging.info(start_message)
        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(arrow_table)
        self._df = frame
        logging.info(f"Shape of DataFrame generated: {arrow_table.shape}")

        avro_stream = io.BytesIO()
        pandas_to_avro(frame, file_io_obj=avro_stream)
        # Rewind so the whole serialized payload is read back.
        avro_stream.seek(0)
        payload = avro_stream.read()
        self._current_buffer = copy.copy(payload)
        return (frame, payload)
コード例 #9
0
ファイル: orc.py プロジェクト: rongou/cudf
    def generate_input(self):
        """Produce the next input as a pandas DataFrame.

        When ``self._regression`` is truthy, the generation parameters are
        taken from ``self.get_next_regression_params()``. Otherwise they come
        from ``_generate_rand_meta``, a new seed is drawn, and everything is
        recorded in ``self._current_params``.

        Returns
        -------
        The generated DataFrame; it is also stored in ``self._df``.
        """
        if not self._regression:
            # TODO: Remove "bool" from below
            # list after following issue is fixed:
            # https://github.com/rapidsai/cudf/issues/6763
            candidates = cudf.utils.dtypes.ALL_TYPES - {"category", "bool"}
            # Following dtypes are not supported by orc
            # https://orc.apache.org/specification/ORCv0/
            candidates = candidates - cudf.utils.dtypes.TIMEDELTA_TYPES
            candidates = candidates - cudf.utils.dtypes.UNSIGNED_TYPES
            # TODO: Remove `DATETIME_TYPES` once
            # following bug is fixed:
            # https://github.com/rapidsai/cudf/issues/7355
            candidates = candidates - cudf.utils.dtypes.DATETIME_TYPES
            dtypes_list = list(candidates)

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated
            # The seed is drawn only after the metadata has been generated.
            seed = random.randint(0, 2 ** 32 - 1)

            recorded = (
                ("dtypes_meta", dtypes_meta),
                ("seed", seed),
                ("num_rows", num_rows),
                ("num_cols", num_cols),
            )
            for key, value in recorded:
                self._current_params[key] = value
        else:
            replayed = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replayed

        start_message = (
            f"Generating DataFrame with rows: {num_rows} "
            f"and columns: {num_cols}"
        )
        logging.info(start_message)
        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(arrow_table)
        logging.info(f"Shape of DataFrame generated: {arrow_table.shape}")
        self._df = frame
        return frame
コード例 #10
0
    def generate_input(self):
        """Produce the next input as JSON-lines text.

        When ``self._regression`` is truthy, the generation parameters are
        taken from ``self.get_next_regression_params()``. Otherwise a new
        seed is drawn, the ``random`` module is re-seeded with it, the
        parameters come from ``_generate_rand_meta`` and they are recorded
        in ``self._current_params``.

        Returns
        -------
        The result of ``to_json(orient="records", lines=True)`` on the
        generated DataFrame. The DataFrame itself is kept in
        ``self._current_buffer``.
        """
        if not self._regression:
            seed = random.randint(0, 2 ** 32 - 1)
            random.seed(seed)

            # https://github.com/pandas-dev/pandas/issues/20599
            candidates = cudf.utils.dtypes.ALL_TYPES - {"uint64"}
            # TODO: Remove DATETIME_TYPES after this is fixed:
            # https://github.com/rapidsai/cudf/issues/6586
            candidates = candidates - set(cudf.utils.dtypes.DATETIME_TYPES)
            dtypes_list = list(candidates)
            # TODO: Also offer the "list" dtype (extend dtypes_list with it)
            # after the following issue is fixed:
            # https://github.com/rapidsai/cudf/issues/7086

            generated = _generate_rand_meta(self, dtypes_list)
            dtypes_meta, num_rows, num_cols = generated

            recorded = (
                ("dtypes_meta", dtypes_meta),
                ("seed", seed),
                ("num_rows", num_rows),
                ("num_columns", num_cols),
            )
            for key, value in recorded:
                self._current_params[key] = value
        else:
            replayed = self.get_next_regression_params()
            dtypes_meta, num_rows, num_cols, seed = replayed

        start_message = (
            f"Generating DataFrame with rows: {num_rows} "
            f"and columns: {num_cols}"
        )
        logging.info(start_message)
        arrow_table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
        frame = pyarrow_to_pandas(arrow_table)
        self._current_buffer = frame
        logging.info(f"Shape of DataFrame generated: {frame.shape}")

        return frame.to_json(orient="records", lines=True)