コード例 #1
0
def test_iloc_decimal():
    """Reordering a decimal Series through ``iloc`` keeps values and dtype."""
    decimal_dtype = cudf.Decimal64Dtype(scale=2, precision=3)

    sr = cudf.Series(["1.00", "2.00", "3.00", "4.00"]).astype(decimal_dtype)
    got = sr.iloc[[3, 2, 1, 0]]

    expect = cudf.Series(["4.00", "3.00", "2.00", "1.00"]).astype(
        decimal_dtype)

    # Only the values are compared: both indexes are dropped first.
    assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True))
コード例 #2
0
ファイル: dtypes.py プロジェクト: sperlingxx/cudf
def _decimal_normalize_types(*args):
    """Cast every argument to one shared ``Decimal64Dtype``.

    The shared dtype uses the largest scale among the arguments and enough
    precision for the largest ``precision - scale`` among them, capped at
    ``cudf.Decimal64Dtype.MAX_PRECISION``.

    Returns a list with each argument cast via ``astype`` to that dtype.
    """
    widest_scale = max(arg.dtype.scale for arg in args)
    widest_integral = max(
        arg.dtype.precision - arg.dtype.scale for arg in args)
    precision = min(cudf.Decimal64Dtype.MAX_PRECISION,
                    widest_scale + widest_integral)
    common = cudf.Decimal64Dtype(precision, widest_scale)

    return [arg.astype(common) for arg in args]
コード例 #3
0
def _find_common_type_decimal(dtypes):
    """Return one ``Decimal64Dtype`` able to hold all of ``dtypes``.

    The result keeps the largest scale found and the largest
    ``precision - scale`` found; the resulting precision is clipped at
    ``cudf.Decimal64Dtype.MAX_PRECISION``.
    """
    widest_scale = max(dt.scale for dt in dtypes)
    widest_integral = max(dt.precision - dt.scale for dt in dtypes)

    needed_precision = widest_scale + widest_integral
    clipped_precision = min(cudf.Decimal64Dtype.MAX_PRECISION,
                            needed_precision)
    return cudf.Decimal64Dtype(clipped_precision, widest_scale)
コード例 #4
0
def test_empty_columns():
    """An empty string/decimal DataFrame round-trips through ORC."""
    # string and decimal columns have additional steps that need to be skipped
    empty_string = cudf.Series([], dtype="str")
    empty_decimal = cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1))
    expected = cudf.DataFrame({
        "string": empty_string,
        "decimal": empty_decimal,
    })

    buffer = BytesIO()
    expected.to_orc(buffer, compression="snappy")
    got_df = cudf.read_orc(buffer)

    assert_eq(expected, got_df)
コード例 #5
0
ファイル: dtypes.py プロジェクト: rongou/cudf
def _find_common_type_decimal(dtypes):
    """Pick a decimal dtype wide enough for every dtype in ``dtypes``.

    The required precision is the largest scale plus the largest
    ``precision - scale`` among the inputs. The narrowest of Decimal32,
    Decimal64 and Decimal128 whose ``MAX_PRECISION`` is not exceeded is
    returned; for Decimal128 the precision is clipped at its maximum.
    """
    scale = max(dt.scale for dt in dtypes)
    integral_digits = max(dt.precision - dt.scale for dt in dtypes)
    precision = scale + integral_digits

    # Widest type first: anything beyond Decimal64 goes to Decimal128.
    if precision > cudf.Decimal64Dtype.MAX_PRECISION:
        clipped = min(cudf.Decimal128Dtype.MAX_PRECISION, precision)
        return cudf.Decimal128Dtype(clipped, scale)

    if precision > cudf.Decimal32Dtype.MAX_PRECISION:
        clipped = min(cudf.Decimal64Dtype.MAX_PRECISION, precision)
        return cudf.Decimal64Dtype(clipped, scale)

    clipped = min(cudf.Decimal32Dtype.MAX_PRECISION, precision)
    return cudf.Decimal32Dtype(clipped, scale)
コード例 #6
0
import pandas as pd
import pyarrow as pa
import pytest

import cudf
from cudf import Scalar as pycudf_scalar
from cudf._lib.copying import get_element
from cudf.testing._utils import (
    ALL_TYPES,
    DATETIME_TYPES,
    NUMERIC_TYPES,
    TIMEDELTA_TYPES,
)

# Decimal dtypes shared by the tests in this module. The two positional
# arguments are presumably (precision, scale) -- TODO confirm against the
# cudf.Decimal64Dtype signature. The last entry uses a negative second value.
TEST_DECIMAL_TYPES = [
    cudf.Decimal64Dtype(1, 1),
    cudf.Decimal64Dtype(4, 2),
    cudf.Decimal64Dtype(4, -2),
]

SCALAR_VALUES = [
    0,
    -1,
    42,
    0.0,
    1.0,
    np.int8(0),
    np.int8(1),
    np.int8(-1),
    np.iinfo(np.int8).min,
    np.iinfo(np.int8).max,
コード例 #7
0
        if op == "ne":
            expect_all = True
        else:
            expect_all = False
        assert (result == expect_all).all()
    elif dtype in DATETIME_TYPES & TIMEDELTA_TYPES:
        assert result._column.null_count == len(data)


@pytest.mark.parametrize(
    "args",
    [
        (
            operator.add,
            ["1.5", "2.0"],
            cudf.Decimal64Dtype(scale=2, precision=2),
            ["1.5", "2.0"],
            cudf.Decimal64Dtype(scale=2, precision=2),
            ["3.0", "4.0"],
            cudf.Decimal64Dtype(scale=2, precision=3),
        ),
        (
            operator.add,
            ["1.5", "2.0"],
            cudf.Decimal64Dtype(scale=2, precision=2),
            ["2.25", "1.005"],
            cudf.Decimal64Dtype(scale=3, precision=4),
            ["3.75", "3.005"],
            cudf.Decimal64Dtype(scale=3, precision=5),
        ),
        (
コード例 #8
0
def rand_dataframe(dtypes_meta,
                   rows,
                   seed=random.randint(0, 2**32 - 1),
                   use_threads=True):
    """
    Generates a random table.

    Parameters
    ----------
    dtypes_meta : List of dict
        Specifies list of dtype meta data. dtype meta data should
        be a dictionary of the form example:
            {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}
        `"str"` dtype can contain an extra key `max_string_length` to
        control the maximum size of the strings being generated in each row.
        If not specified, it will default to 1000.
        `"list"` dtype additionally requires the keys `lists_max_length`,
        `nesting_max_depth` and `value_type`.
        `"decimal64"` dtype can contain an extra key `max_precision`; if not
        specified, it defaults to `cudf.Decimal64Dtype.MAX_PRECISION`.
    rows : int
        Specifies the number of rows to be generated.
    seed : int
        Specifies the `seed` value to be utilized by all downstream
        random data generation APIs. Note that the default value is
        computed once, when this function is defined, so all calls that
        omit `seed` share the same value within one process.
    use_threads : bool
        Indicates whether to use threads pools to build the columns

    Returns
    -------
    PyArrow Table
        A Table with columns of corresponding dtypes mentioned in `dtypes_meta`

    Raises
    ------
    TypeError
        If a dtype resolved through `cudf.dtype` has a kind that none of
        the branches below handles.
    """
    # Apply seed
    random.seed(seed)
    np.random.seed(seed)
    mimesis.random.random.seed(seed)

    column_params = []
    for meta in dtypes_meta:
        dtype = copy.deepcopy(meta["dtype"])
        null_frequency = copy.deepcopy(meta["null_frequency"])
        cardinality = copy.deepcopy(meta["cardinality"])

        if dtype == "list":
            lists_max_length = meta["lists_max_length"]
            nesting_max_depth = meta["nesting_max_depth"]
            value_type = meta["value_type"]
            # np.random.randint excludes the upper bound, so the drawn
            # depth is always strictly below `nesting_max_depth`.
            nesting_depth = np.random.randint(1, nesting_max_depth)

            dtype = cudf.core.dtypes.ListDtype(value_type)

            # Determining the `dtype` from the `value_type`
            # and the nesting_depth
            i = 1
            while i < nesting_depth:
                dtype = cudf.core.dtypes.ListDtype(dtype)
                i += 1

            column_params.append(
                ColumnParameters(
                    cardinality=cardinality,
                    null_frequency=null_frequency,
                    generator=list_generator(
                        dtype=value_type,
                        size=cardinality,
                        nesting_depth=nesting_depth,
                        lists_max_length=lists_max_length,
                    ),
                    is_sorted=False,
                    dtype=dtype,
                ))
        elif dtype == "decimal64":
            max_precision = meta.get("max_precision",
                                     cudf.Decimal64Dtype.MAX_PRECISION)
            # Upper bounds are exclusive: precision < max_precision and
            # scale < precision.
            precision = np.random.randint(1, max_precision)
            scale = np.random.randint(0, precision)
            dtype = cudf.Decimal64Dtype(precision=precision, scale=scale)
            column_params.append(
                ColumnParameters(
                    cardinality=cardinality,
                    null_frequency=null_frequency,
                    generator=decimal_generator(dtype=dtype, size=cardinality),
                    is_sorted=False,
                    dtype=dtype,
                ))
        elif dtype == "category":
            column_params.append(
                ColumnParameters(
                    cardinality=cardinality,
                    null_frequency=null_frequency,
                    # `cardinality` is bound as a default so the lambda
                    # keeps this column's value after the loop moves on.
                    generator=lambda cardinality=cardinality: [
                        mimesis.random.random.randstr(unique=True, length=2000)
                        for _ in range(cardinality)
                    ],
                    is_sorted=False,
                    dtype="category",
                ))
        else:
            dtype = cudf.dtype(dtype)
            if dtype.kind in ("i", "u"):
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=int_generator(dtype=dtype, size=cardinality),
                        is_sorted=False,
                        dtype=dtype,
                    ))
            elif dtype.kind == "f":
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=float_generator(dtype=dtype,
                                                  size=cardinality),
                        is_sorted=False,
                        dtype=dtype,
                    ))
            elif dtype.kind in ("U", "O"):
                # Resolve this column's length limit now and bind it as a
                # lambda default. The lambda is not invoked inside this
                # loop, so reading `meta` from within it would pick up
                # whichever entry the loop variable refers to at call time
                # (the last one) instead of this column's entry.
                max_string_length = meta.get("max_string_length", 1000)
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=lambda cardinality=cardinality,
                        max_string_length=max_string_length: [
                            mimesis.random.random.schoice(
                                string.printable,
                                max_string_length,
                            ) for _ in range(cardinality)
                        ],
                        is_sorted=False,
                        dtype=dtype,
                    ))
            elif dtype.kind == "M":
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=datetime_generator(dtype=dtype,
                                                     size=cardinality),
                        is_sorted=False,
                        dtype=cudf.dtype(dtype),
                    ))
            elif dtype.kind == "m":
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=timedelta_generator(dtype=dtype,
                                                      size=cardinality),
                        is_sorted=False,
                        dtype=cudf.dtype(dtype),
                    ))
            elif dtype.kind == "b":
                column_params.append(
                    ColumnParameters(
                        cardinality=cardinality,
                        null_frequency=null_frequency,
                        generator=boolean_generator(cardinality),
                        is_sorted=False,
                        dtype=cudf.dtype(dtype),
                    ))
            else:
                raise TypeError(f"Unsupported dtype: {dtype}")

    df = get_dataframe(
        Parameters(
            num_rows=rows,
            column_parameters=column_params,
            seed=seed,
        ),
        use_threads=use_threads,
    )

    return df
コード例 #9
0
 (pd.Series(dtype="category"), True),
 (pd.Series(dtype="object"), False),
 # cuDF dtypes.
 (cudf.CategoricalDtype, True),
 (cudf.ListDtype, False),
 (cudf.StructDtype, False),
 (cudf.Decimal128Dtype, False),
 (cudf.Decimal64Dtype, False),
 (cudf.Decimal32Dtype, False),
 (cudf.IntervalDtype, False),
 # cuDF dtype instances.
 (cudf.CategoricalDtype("a"), True),
 (cudf.ListDtype(int), False),
 (cudf.StructDtype({"a": int}), False),
 (cudf.Decimal128Dtype(5, 2), False),
 (cudf.Decimal64Dtype(5, 2), False),
 (cudf.Decimal32Dtype(5, 2), False),
 (cudf.IntervalDtype(int), False),
 # cuDF objects
 (cudf.Series(dtype="bool"), False),
 (cudf.Series(dtype="int"), False),
 (cudf.Series(dtype="float"), False),
 (cudf.Series(dtype="str"), False),
 (cudf.Series(dtype="datetime64[s]"), False),
 (cudf.Series(dtype="timedelta64[s]"), False),
 (cudf.Series(dtype="category"), True),
 (cudf.Series(dtype=cudf.Decimal128Dtype(5, 2)), False),
 (cudf.Series(dtype=cudf.Decimal64Dtype(5, 2)), False),
 (cudf.Series(dtype=cudf.Decimal32Dtype(5, 2)), False),
 # TODO: Currently creating an empty Series of list type ignores the
 # provided type and instead makes a float64 Series.
コード例 #10
0
ファイル: test_serialize.py プロジェクト: sperlingxx/cudf
            "a": [[1, 2, 3, None], [4, None, 5], None, [6, 7]]
        },
    ],
)
def test_serialize_list_columns(data):
    """A DataFrame with list columns survives serialize/deserialize."""
    original = cudf.DataFrame(data)
    serialized = original.serialize()
    roundtripped = original.__class__.deserialize(*serialized)
    assert_eq(roundtripped, original)


@pytest.mark.parametrize(
    "data",
    [
        {
            "a": _decimal_series(["1", "2", "3"],
                                 dtype=cudf.Decimal64Dtype(1, 0))
        },
        {
            "a":
            _decimal_series(["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)),
            "b":
            _decimal_series(["1.0", "2.0", "3.0"],
                            dtype=cudf.Decimal64Dtype(2, 1)),
            "c":
            _decimal_series(["10.1", "20.2", "30.3"],
                            dtype=cudf.Decimal64Dtype(3, 1)),
        },
        {
            "a":
            _decimal_series(["1", None, "3"], dtype=cudf.Decimal64Dtype(1, 0)),
            "b":
コード例 #11
0
ファイル: test_decimal.py プロジェクト: TravisHester/cudf
def test_decimal_invalid_precision():
    """Building a Series whose values do not fit the dtype must fail."""
    cases = [
        ([10, 20, 30], (2, 2)),
        ([Decimal("300")], (2, 1)),
    ]
    for values, dtype_args in cases:
        # The dtype is built inside the context manager, as in a direct
        # call, so an error from either constructor is treated the same.
        with pytest.raises(pa.ArrowInvalid):
            cudf.Series(values, dtype=cudf.Decimal64Dtype(*dtype_args))
コード例 #12
0
ファイル: test_decimal.py プロジェクト: TravisHester/cudf
@pytest.mark.parametrize(
    "input_obj",
    [
        [decimal.Decimal(1), cudf.NA, decimal.Decimal(3)],
    ],
)
def test_series_construction_with_nulls(input_obj):
    """A cudf Series built from decimals with NA matches the arrow array."""
    arrow_reference = pa.array(input_obj, from_pandas=True)
    series = cudf.Series(input_obj)
    series_as_arrow = series.to_arrow()

    assert arrow_reference == series_as_arrow


@pytest.mark.parametrize(
    "data",
    [
        {
            "a": _decimal_series(
                ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
            )
        },
        {
            "a": _decimal_series(
                ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0)
            ),
            "b": _decimal_series(
                ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1)
            ),
            "c": _decimal_series(
                ["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1)
            ),
        },
        {
            "a": _decimal_series(
コード例 #13
0
ファイル: test_udf_masked_ops.py プロジェクト: rongou/cudf
        },
    ],
)
def test_masked_udf_subset_selection(data):
    """Run the masked-UDF check with a UDF that reads columns "a" and "b"."""
    frame = cudf.DataFrame(data)

    def func(row):
        # Only "a" and "b" are read from each row.
        return row["a"] + row["b"]

    run_masked_udf_test(func, frame)


@pytest.mark.parametrize(
    "unsupported_col",
    [
        ["a", "b", "c"],
        _decimal_series(["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2,
                                                                         1)),
        cudf.Series([1, 2, 3], dtype="category"),
        cudf.interval_range(start=0, end=3, closed=True),
        [[1, 2], [3, 4], [5, 6]],
        [{
            "a": 1
        }, {
            "a": 2
        }, {
            "a": 3
        }],
    ],
)
def test_masked_udf_unsupported_dtype(unsupported_col):
    data = cudf.DataFrame()
    data["unsupported_col"] = unsupported_col
コード例 #14
0
def test_decimal_column_slicing(offset, size, precision, scale):
    """Run the column slicing check on a randomly filled decimal column."""
    random_values = pd.Series(np.random.rand(1000))
    decimal_dtype = cudf.Decimal64Dtype(precision, scale)
    decimal_col = cudf.core.column.as_column(random_values).astype(
        decimal_dtype)
    column_slicing_test(decimal_col, offset, size, True)
コード例 #15
0
import numpy as np
import pandas as pd
import pytest

import cudf
from cudf import Scalar as pycudf_scalar
from cudf._lib.copying import get_element
from cudf.tests.utils import (
    ALL_TYPES,
    DATETIME_TYPES,
    NUMERIC_TYPES,
    TIMEDELTA_TYPES,
)

# Decimal dtypes shared by the tests in this module. The two positional
# arguments are presumably (precision, scale) -- TODO confirm against the
# cudf.Decimal64Dtype signature. The last entry uses a negative second value.
TEST_DECIMAL_TYPES = [
    cudf.Decimal64Dtype(1, 1),
    cudf.Decimal64Dtype(4, 2),
    cudf.Decimal64Dtype(4, -2),
]

SCALAR_VALUES = [
    0,
    -1,
    42,
    0.0,
    1.0,
    np.int8(0),
    np.int8(1),
    np.int8(-1),
    np.iinfo(np.int8).min,
    np.iinfo(np.int8).max,