Ejemplo n.º 1
0
def test_categorical_binary_add():
    """Check that ``+`` on two categorical series raises ``TypeError``.

    Both the pandas series and the ``Series`` under test are exercised; each
    is expected to fail with its own error message.
    """
    categorical = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    pandas_sr = pd.Series(categorical)
    tested_sr = Series(categorical)

    # pandas side: the message is matched as a regular expression.
    with pytest.raises(TypeError) as pandas_error:
        pandas_sr + pandas_sr
    pandas_error.match("unsupported operand")

    # Series under test: the message names the dtype and the operation.
    with pytest.raises(TypeError) as tested_error:
        tested_sr + tested_sr
    tested_error.match(
        "Series of dtype `category` cannot perform the operation: add"
    )
Ejemplo n.º 2
0
def test_str_to_datetime_error():
    """Check the error raised when casting a "None" string to datetime.

    ``astype("datetime64[s]")`` is called on a pandas series and on the
    ``Series`` under test through ``assert_exceptions_equal``; the exception
    type is not compared, only the expected message is supplied.
    """
    pandas_sr = pd.Series(["2001-01-01", "2002-02-02", "2000-01-05", "None"])
    tested_sr = Series(["2001-01-01", "2002-02-02", "2000-01-05", "None"])

    # The message contains backticks, so it is escaped before being used as
    # a pattern.
    expected_message = re.escape("Could not convert `None` value to datetime")

    assert_exceptions_equal(
        lfunc=pandas_sr.astype,
        rfunc=tested_sr.astype,
        lfunc_args_and_kwargs=(["datetime64[s]"],),
        rfunc_args_and_kwargs=(["datetime64[s]"],),
        check_exception_type=False,
        expected_error_message=expected_message,
    )
Ejemplo n.º 3
0
def test_typecast_on_join_no_float_round():
    """Left-merge an int8 key column against a float32 key column.

    The expected frame has a float32 key column holding the left-hand keys
    and nulls in ``B_y`` for the last two rows, i.e. the right-hand keys
    4.01 and 4.99 are expected not to match the left-hand 4 and 5.
    """
    labels = ["a", "b", "c", "d", "e"]

    left_keys = Series([1, 2, 3, 4, 5], dtype="int8")
    right_keys = Series([1, 2, 3, 4.01, 4.99], dtype="float32")

    left = DataFrame({"join_col": left_keys, "B": labels})
    right = DataFrame({"join_col": right_keys, "B": labels})

    expect = DataFrame(
        {
            "join_col": Series([1, 2, 3, 4, 5], dtype="float32"),
            "B_x": ["a", "b", "c", "d", "e"],
            "B_y": ["a", "b", "c", None, None],
        }
    )

    got = left.merge(right, on="join_col", how="left")

    assert_eq(expect, got)
Ejemplo n.º 4
0
def test_series_not(dtype):
    """Compare logical and bitwise NOT of a ``Series`` with NumPy/pandas.

    Parameters
    ----------
    dtype
        Anything accepted by ``np.dtype``; the data is cast to its scalar
        type.
    """
    import pandas as pd

    scalar_type = np.dtype(dtype).type
    flags = np.random.choice([True, False], 1000)
    host_values = pd.Series(flags).astype(scalar_type)
    if scalar_type is not np.bool_:
        # Multiply the cast flags by random magnitudes so that non-boolean
        # dtypes are not limited to the values produced by the cast alone.
        magnitudes = (np.random.random(1000) * 100).astype(scalar_type)
        host_values = host_values * magnitudes
    sr = Series(host_values)

    logical_got = cudf.logical_not(sr).to_array()
    logical_expect = np.logical_not(host_values)
    np.testing.assert_equal(logical_got, logical_expect)

    # The ``~`` operator is checked against the same operator on the host data.
    np.testing.assert_equal((~sr).to_array(), ~host_values)
Ejemplo n.º 5
0
def test_categorical_unary_ceil():
    """Check that ``ceil`` on a categorical ``Series`` raises.

    The pandas side is exercised as ``getattr(pdsr, "ceil")`` and the side
    under test as a call to ``sr.ceil``; exception types are not compared.
    """
    categorical = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    pandas_sr = pd.Series(categorical)
    tested_sr = Series(categorical)

    expected_message = (
        "Series of dtype `category` cannot perform the operation: ceil"
    )

    assert_exceptions_equal(
        lfunc=getattr,
        rfunc=tested_sr.ceil,
        lfunc_args_and_kwargs=([pandas_sr, "ceil"],),
        check_exception_type=False,
        expected_error_message=expected_message,
    )
Ejemplo n.º 6
0
def test_categorical_empty():
    """Check that an empty categorical round-trips through ``Series``.

    Codes, categories, orderedness and the codes dtype are compared with the
    pandas series built from the same categorical.
    """
    empty_cat = pd.Categorical([])
    pandas_sr = pd.Series(empty_cat)
    tested_sr = Series(empty_cat)

    # Codes of the raw categorical versus the codes of the tested series.
    np.testing.assert_array_equal(
        empty_cat.codes, tested_sr.cat.codes.to_array()
    )

    # Accessor attributes.
    assert tuple(pandas_sr.cat.categories) == tuple(tested_sr.cat.categories)
    assert pandas_sr.cat.ordered == tested_sr.cat.ordered

    # Codes values and dtype through the pandas accessor.
    np.testing.assert_array_equal(
        pandas_sr.cat.codes.values, tested_sr.cat.codes.to_array()
    )
    np.testing.assert_array_equal(
        pandas_sr.cat.codes.dtype, tested_sr.cat.codes.dtype
    )
Ejemplo n.º 7
0
def test_categorical_binary_add():
    """Check that adding two categorical series raises on both sides.

    ``operator.add`` is applied to a pair of pandas series and to a pair of
    the ``Series`` under test through ``assert_exceptions_equal``.
    """
    categorical = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    pandas_sr = pd.Series(categorical)
    tested_sr = Series(categorical)

    expected_message = (
        "Series of dtype `category` cannot perform the operation: add"
    )

    assert_exceptions_equal(
        lfunc=operator.add,
        rfunc=operator.add,
        lfunc_args_and_kwargs=([pandas_sr, pandas_sr],),
        rfunc_args_and_kwargs=([tested_sr, tested_sr],),
        expected_error_message=expected_message,
    )
Ejemplo n.º 8
0
def test_fillna():
    """Fill the nulls of a masked column with 123 and check the result.

    The column is taken from index 8 of a ``GpuArrowReader`` built from
    ``read_data()`` and must contain at least one null.
    """
    _, schema, darr = read_data()
    reader = GpuArrowReader(schema, darr)
    nullable_col = reader[8]
    # The test is only meaningful if there is something to fill.
    assert nullable_col.null_count

    masked_sr = Series.from_masked_array(
        data=nullable_col.data,
        mask=nullable_col.null,
        null_count=nullable_col.null_count,
    )
    filled = masked_sr.fillna(123)

    # Compared against the scalar 123, as in ``assert_equal(123, array)``.
    np.testing.assert_equal(123, filled.to_array())
    assert len(filled) == len(masked_sr)
    assert filled.null_count == 0
Ejemplo n.º 9
0
def test_series_reductions(method, dtype):
    """Compare a reduction on a masked ``Series`` with NumPy on valid data.

    Parameters
    ----------
    method
        Name of the reduction method looked up on both objects.
    dtype
        Dtype the random data is cast to.
    """
    np.random.seed(0)
    values = np.random.random(100)
    is_integer = np.issubdtype(dtype, np.integer)
    if is_integer:
        # Spread the values out before the cast to an integer dtype.
        values *= 100
    valid = values > (10 if is_integer else 0.5)

    values = values.astype(dtype)
    valid_values = values[valid]
    sr = Series.from_masked_array(values, Series(valid).as_mask())

    def reduce_with(obj):
        # std/var get an explicit ddof so both objects use the same one.
        reducer = getattr(obj, method)
        extra = {"ddof": 1} if method in ["std", "var"] else {}
        return reducer(**extra)

    expect, got = reduce_with(valid_values), reduce_with(sr)
    print(expect, got)
    np.testing.assert_approx_equal(expect, got)
Ejemplo n.º 10
0
def test_typecast_on_join_indexes_matching_categorical():
    """Inner-join two frames indexed by a category and a str key column.

    Both key columns hold the same five labels, so every row is expected to
    match and the result is indexed by ``join_col``.
    """
    left_keys = Series(["a", "b", "c", "d", "e"], dtype="category")
    right_keys = Series(["a", "b", "c", "d", "e"], dtype="str")
    payload = [1, 2, 3, 4, 5]

    left = DataFrame({"join_col": left_keys, "B": payload})
    right = DataFrame({"join_col": right_keys, "B": payload})

    left = left.set_index("join_col")
    right = right.set_index("join_col")

    expected_payload = [1, 2, 3, 4, 5]
    expect = DataFrame(
        {
            "join_col": ["a", "b", "c", "d", "e"],
            "B_x": expected_payload,
            "B_y": expected_payload,
        }
    ).set_index("join_col")

    got = left.join(right, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)
Ejemplo n.º 11
0
def test_categorical_integer():
    """Check codes, null count and repr of a categorical with missing values.

    The two "_" entries are not among the declared categories; the test
    expects them to be counted as nulls by the ``Series`` under test.
    """
    categorical = pd.Categorical(
        ["a", "_", "_", "c", "a"], categories=["a", "b", "c"]
    )
    pandas_sr = pd.Series(categorical)
    tested_sr = Series(categorical)

    np.testing.assert_array_equal(
        categorical.codes, tested_sr.to_array(fillna="pandas")
    )
    assert tested_sr.null_count == 2

    # Nulls in the tested codes are filled with -1 before comparing them
    # with the pandas codes.
    np.testing.assert_array_equal(
        pandas_sr.cat.codes.values, tested_sr.cat.codes.fillna(-1).to_array()
    )
    np.testing.assert_equal(
        pandas_sr.cat.codes.dtype, tested_sr.cat.codes.dtype
    )

    # Only whitespace-separated tokens of the repr are compared.
    rendered = str(tested_sr)
    expected_text = """
0 a
1 null
2 null
3 c
4 a
dtype: category
Categories (3, object): [a, b, c]
"""
    assert rendered.split() == expected_text.split()
Ejemplo n.º 12
0
def test_categorical_set_categories():
    """Compare ``cat.set_categories`` with pandas when growing and shrinking.

    The first category list adds "d"; the second drops "c".
    """
    categorical = pd.Categorical(
        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"]
    )
    psr = pd.Series(categorical)
    sr = Series.from_categorical(categorical)

    category_sets = (
        ("a", "b", "c", "d"),  # adding a category
        ("a", "b"),  # removing a category
    )
    for categories in category_sets:
        # Each call receives its own fresh list.
        expect = psr.cat.set_categories(list(categories))
        got = sr.cat.set_categories(list(categories))
        assert_eq(expect, got)
Ejemplo n.º 13
0
def test_reflected_ops_scalar(func, dtype, obj_class):
    """Apply ``func`` to a ``Series`` or index and to the raw generated data.

    Parameters
    ----------
    func
        Callable applied to both the wrapped object and the raw data.
    dtype
        Dtype passed to ``utils.gen_rand``.
    obj_class
        When equal to "Index", the series is converted with ``as_index``
        before ``func`` is applied.
    """
    np.random.seed(12)
    host_data = utils.gen_rand(dtype, 100, low=10)

    wrap_as_index = obj_class == "Index"

    gs = Series(host_data)
    if wrap_as_index:
        gs = as_index(gs)

    gs_result = func(gs)

    if wrap_as_index:
        # NOTE(review): this converts the input, not ``gs_result``, and the
        # converted value is not read again below -- confirm the intent.
        gs = Series(gs)

    # Reference result computed on the raw data.
    host_result = func(host_data)

    np.testing.assert_allclose(host_result, gs_result.to_array())
Ejemplo n.º 14
0
def test_to_from_pandas_nulls(data, nulls):
    """Round-trip a pandas series with NaT values through ``Series``.

    Parameters
    ----------
    data
        Source values; a copy is wrapped in a pandas series.
    nulls
        "some" replaces every second value with NaT, "all" replaces every
        value; any other value leaves the data untouched.
    """
    pandas_sr = pd.Series(data.copy())
    not_a_time = np.datetime64("nat", "ns")
    if nulls == "some":
        every_other = list(range(0, len(pandas_sr), 2))
        pandas_sr[every_other] = not_a_time
    elif nulls == "all":
        pandas_sr[:] = not_a_time

    round_tripped = Series.from_pandas(pandas_sr).to_pandas()

    assert_eq(pandas_sr, round_tripped)
Ejemplo n.º 15
0
def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
    """Inner-merge two frames whose integer key columns differ in dtype.

    Parameters
    ----------
    dtype_l, dtype_r
        Dtypes of the left and right key columns; each must be accepted by
        ``np.dtype``.

    The merged key column is expected to have the NumPy promotion of the two
    dtypes, and only the keys 1 and 2 are present on both sides.
    """
    other_data = ["a", "b", "c"]

    join_data_l = Series([1, 2, 3], dtype=dtype_l)
    join_data_r = Series([1, 2, 4], dtype=dtype_r)

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    # np.find_common_type is deprecated (NumPy 1.25) and removed in NumPy 2.0;
    # np.promote_types yields the same common dtype for two integer dtypes.
    exp_dtype = np.promote_types(np.dtype(dtype_l), np.dtype(dtype_r))

    exp_join_data = [1, 2]
    exp_other_data = ["a", "b"]
    exp_join_col = Series(exp_join_data, dtype=exp_dtype)

    expect = DataFrame({
        "join_col": exp_join_col,
        "B_x": exp_other_data,
        "B_y": exp_other_data,
    })

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_eq(expect, got)
Ejemplo n.º 16
0
def test_series_where(data_dtype, fill_value):
    """Compare ``Series.where`` with pandas for three boolean conditions.

    Parameters
    ----------
    data_dtype
        Dtype of the pandas series built from ``range(10)``.
    fill_value
        Replacement passed as the second argument of ``where``.
    """
    psr = pd.Series(list(range(10)), dtype=data_dtype)
    sr = Series.from_pandas(psr)

    # Each condition is built separately for the pandas and tested series.
    conditions = (
        lambda s: s > 0,
        lambda s: s < 0,
        lambda s: s == 0,
    )
    for condition in conditions:
        expect = psr.where(condition(psr), fill_value)
        got = sr.where(condition(sr), fill_value)
        assert_eq(expect, got)
Ejemplo n.º 17
0
def test_series_nsmallest(data, n):
    """Compare ``Series.nsmallest`` with pandas, including an invalid ``keep``.

    The original docstring notes that this indirectly tests
    ``Series.sort_values()``.
    """
    sr = Series(data)
    psr = pd.Series(data)

    assert_eq(sr.nsmallest(n), psr.nsmallest(n))

    # With keep="last" both results are sorted by index before comparison.
    got_last = sr.nsmallest(n, keep="last").sort_index()
    expect_last = psr.nsmallest(n, keep="last").sort_index()
    assert_eq(got_last, expect_last)

    # An unknown ``keep`` value must raise on both sides.
    assert_exceptions_equal(
        lfunc=psr.nsmallest,
        rfunc=sr.nsmallest,
        lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}),
        rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}),
        expected_error_message='keep must be either "first", "last"',
    )
Ejemplo n.º 18
0
def test_string_wrap(data, width):
    """Compare ``str.wrap`` with pandas for the given width.

    pandas is called with explicit wrapping options; the ``Series`` under
    test is called with ``width`` only.
    """
    gs = Series(data)
    ps = pd.Series(data)

    got = gs.str.wrap(width=width)

    pandas_options = {
        "break_long_words": False,
        "expand_tabs": False,
        "replace_whitespace": True,
        "drop_whitespace": True,
        "break_on_hyphens": False,
    }
    expect = ps.str.wrap(width=width, **pandas_options)

    assert_eq(got, expect)
Ejemplo n.º 19
0
def test_onehot_generic_index():
    """One-hot encode a column whose series carries a ``GenericIndex``.

    The column holds random integers in [0, 4); one indicator column per
    value is expected next to the original column.
    """
    np.random.seed(0)
    n_rows = 33
    index_values = np.random.randint(low=0, high=100, size=n_rows)
    df = DataFrame()
    values = np.random.randint(low=0, high=4, size=n_rows)
    df["fo"] = Series(values, index=GenericIndex(index_values))

    encoded = df.one_hot_encoding(
        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
    )

    assert set(encoded.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}
    # Each indicator column must equal the mask for its category value.
    for category in (0, 1, 2, 3):
        indicator = getattr(encoded, "fo_" + str(category))
        np.testing.assert_array_equal(
            values == category, indicator.to_array()
        )
Ejemplo n.º 20
0
def test_series_median(dtype, num_na):
    """Compare ``Series.median`` with pandas on masked data.

    Parameters
    ----------
    dtype
        Dtype the random data is cast to.
    num_na
        Positions below this index are masked out.
    """
    np.random.seed(0)
    values = np.random.random(100)
    if np.issubdtype(dtype, np.integer):
        values *= 100
    valid = np.arange(100) >= num_na

    values = values.astype(dtype)
    sr = Series.from_masked_array(values, Series(valid).as_mask())
    ps = pd.Series(values[valid], dtype=dtype)

    actual = sr.median(skipna=True)
    desired = ps.median(skipna=True)
    print(actual, desired)
    np.testing.assert_approx_equal(actual, desired)

    # skipna=False is only checked for floating dtypes: per the original
    # note, converting integer data with nulls to pandas is not supported
    # yet (e.g. via pd.Int64Dtype).
    if np.issubdtype(dtype, np.floating):
        ps = sr.to_pandas()
        actual = sr.median(skipna=False)
        desired = ps.median(skipna=False)
        np.testing.assert_approx_equal(actual, desired)
Ejemplo n.º 21
0
def test_series_with_nulls_where(fill_value):
    """Compare ``Series.where`` with pandas on data that starts with nulls.

    The input is three ``None`` values followed by ``range(5)``.
    """
    psr = pd.Series([None] * 3 + list(range(5)))
    sr = Series.from_pandas(psr)

    def check(select):
        # ``select`` builds the boolean condition for either series type.
        expect = psr.where(select(psr), fill_value)
        got = sr.where(select(sr), fill_value)
        assert_eq(expect, got)

    check(lambda s: s > 0)
    check(lambda s: s < 0)
    check(lambda s: s == 0)
Ejemplo n.º 22
0
def test_string_groupby_key_index():
    """Group by a string column and compare ``count()`` with pandas.

    Dtypes are not compared (``check_dtype=False``).
    """
    keys = ["a", "b", "c", "d", "e"]
    payload = [1, 2, 3, 4, 5]

    # pandas reference frame
    pdf = pd.DataFrame()
    pdf["a"] = pd.Series(keys, dtype="str")
    pdf["b"] = payload

    # frame under test
    gdf = DataFrame()
    gdf["a"] = Series(keys, dtype="str")
    gdf["b"] = payload

    expect = pdf.groupby("a").count()
    got = gdf.groupby("a").count()

    assert_eq(expect, got, check_dtype=False)
Ejemplo n.º 23
0
def test_series_fillna_numerical(data_dtype, fill_dtype, fill_type, null_value,
                                 inplace):
    """Fill nulls in a numeric ``Series`` with a scalar or a series.

    Parameters
    ----------
    data_dtype
        Dtype of the series under test and of the expected array.
    fill_dtype
        Not read inside this function.
    fill_type
        "scalar" or "series"; selects how the fill value is generated.
    null_value
        Value placed at positions 2 and 4 of the input.
    inplace
        Forwarded to ``fillna``; when truthy the original series is checked.
    """
    # TODO: These tests should use Pandas' nullable int type
    # when we support a recent enough version of Pandas
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html

    if fill_type == "scalar":
        fill_value = np.random.randint(0, 5)
        filled_at_2 = fill_value
        filled_at_4 = fill_value
    elif fill_type == "series":
        random_fill = np.random.randint(0, 5, (5, ))
        fill_value = pd.Series(random_fill, dtype=data_dtype)
        filled_at_2 = fill_value[2]
        filled_at_4 = fill_value[4]
    expect = np.array([0, 1, filled_at_2, 2, filled_at_4], dtype=data_dtype)

    sr = Series([0, 1, null_value, 2, null_value], dtype=data_dtype)
    result = sr.fillna(fill_value, inplace=inplace)

    # With inplace=True the filled data lives in ``sr`` itself.
    checked = sr if inplace else result

    np.testing.assert_equal(expect, checked.to_array())
Ejemplo n.º 24
0
def test_string_split(data, pat, n, expand, expand_raise):
    """Compare ``str.split`` with pandas, optionally expecting an error.

    Parameters
    ----------
    data, pat, n, expand
        Input strings and the arguments forwarded to ``str.split``.
    expand_raise
        Passed to ``raise_builder`` together with ``NotImplementedError`` to
        build the context manager the calls run under.
    """
    unsupported_inputs = (["a b", " c ", "   d", "e   ", "f"], )
    if data in unsupported_inputs and pat is None:
        pytest.xfail("None pattern split algorithm not implemented yet")

    ps = pd.Series(data, dtype="str")
    gs = Series(data, dtype="str")

    expectation = raise_builder([expand_raise], NotImplementedError)

    split_kwargs = {"pat": pat, "n": n, "expand": expand}
    with expectation:
        expect = ps.str.split(**split_kwargs)
        got = gs.str.split(**split_kwargs)

        assert_eq(expect, got)
Ejemplo n.º 25
0
def test_datetime_scalar_timeunit_cast(timeunit):
    """Build series and frame columns from a datetime64 scalar of a given unit.

    Parameters
    ----------
    timeunit
        Unit passed as the second argument of ``np.datetime64``.
    """
    scalar = np.datetime64("2016-11-20", timeunit)

    # A series built directly from the scalar.
    assert_eq(pd.Series(scalar), Series(scalar))

    # The scalar assigned as a column next to an integer column.
    gdf = DataFrame()
    gdf["a"] = np.arange(5)
    gdf["b"] = scalar

    pdf = pd.DataFrame()
    pdf["a"] = np.arange(5)
    pdf["b"] = scalar

    assert_eq(pdf, gdf)
Ejemplo n.º 26
0
def test_string_replace_with_backrefs(find, replace):
    """Compare ``str.replace_with_backrefs`` with pandas regex replace.

    Parameters
    ----------
    find
        Pattern passed to both implementations.
    replace
        Replacement passed to both implementations.
    """
    samples = [
        "A543",
        "Z756",
        "",
        None,
        "tést-string",
        "two-thréé four-fivé",
        "abcd-éfgh",
        "tést-string-again",
    ]
    pandas_sr = pd.Series(samples)
    tested_sr = Series(samples)

    got = tested_sr.str.replace_with_backrefs(find, replace)
    expected = pandas_sr.str.replace(find, replace, regex=True)

    assert_eq(got, expected)
Ejemplo n.º 27
0
def test_sum_masked(nelem):
    """Compare ``Series.sum`` on masked float64 data with a NumPy sum.

    Parameters
    ----------
    nelem
        Number of elements to generate.
    """
    dtype = np.float64
    data = gen_rand(dtype, nelem)

    mask = utils.random_bitmask(nelem)
    # One entry per element, truncated to ``nelem``.
    bitmask = utils.expand_bits_to_bytes(mask)[:nelem]
    null_count = utils.count_zero(bitmask)

    sr = Series.from_masked_array(data, mask, null_count)
    got = sr.sum()

    # Reference: sum only the entries whose mask entry is truthy.
    keep = np.asarray(bitmask, dtype=np.bool_)[: data.size]
    expect = data[keep].sum()

    if dtype == np.float32:
        significant = 4
    else:
        significant = 6
    np.testing.assert_approx_equal(expect, got, significant=significant)
Ejemplo n.º 28
0
def test_series_nsmallest(data, n):
    """Compare ``Series.nsmallest`` with pandas and check an invalid ``keep``.

    The original docstring notes that this indirectly tests
    ``Series.sort_values()``.
    """
    sr = Series(data)
    psr = pd.Series(data)

    assert_eq(sr.nsmallest(n), psr.nsmallest(n))

    got_last = sr.nsmallest(n, keep="last")
    expect_last = psr.nsmallest(n, keep="last")
    assert_eq(got_last, expect_last)

    # Only the Series under test is checked for the invalid ``keep`` value.
    with pytest.raises(ValueError) as bad_keep:
        sr.nsmallest(3, keep="what")
    assert bad_keep.match('keep must be either "first", "last"')
Ejemplo n.º 29
0
def test_fillna_categorical(psr, fill_value, inplace):
    """Compare ``fillna`` on a categorical series with pandas.

    Parameters
    ----------
    psr
        pandas series converted with ``Series.from_pandas``.
    fill_value
        Scalar or pandas series; a pandas series is converted with
        ``cudf.from_pandas`` before being passed to the tested ``fillna``.
    inplace
        Forwarded to both ``fillna`` calls.
    """
    gsr = Series.from_pandas(psr)

    fill_value_cudf = (
        cudf.from_pandas(fill_value)
        if isinstance(fill_value, pd.Series)
        else fill_value
    )

    expected = psr.fillna(fill_value, inplace=inplace)
    got = gsr.fillna(fill_value_cudf, inplace=inplace)

    if inplace:
        # In-place calls modify the inputs, so compare those instead.
        expected, got = psr, gsr

    assert_eq(expected, got)
Ejemplo n.º 30
0
def test_strings_rsplit(data, n, expand):
    """Compare ``str.rsplit`` with pandas for three separators.

    The default separator is compared through
    ``pd.testing.assert_frame_equal`` on reset-index frames; "," and "-" are
    compared with ``assert_eq``.
    """
    gs = Series(data)
    ps = pd.Series(data)

    # Default separator: index type is not compared.
    expect_default = ps.str.rsplit(n=n, expand=expand).reset_index()
    got_default = gs.str.rsplit(n=n, expand=expand).to_pandas().reset_index()
    pd.testing.assert_frame_equal(
        expect_default,
        got_default,
        check_index_type=False,
    )

    # Explicit separators.
    for separator in (",", "-"):
        assert_eq(
            ps.str.rsplit(separator, n=n, expand=expand),
            gs.str.rsplit(separator, n=n, expand=expand),
        )