コード例 #1
0
ファイル: test_factorize.py プロジェクト: vyasr/cudf
def test_factorize_index_obj(ncats, nelem):
    """Check ``Index.factorize`` on an index of random int32 values.

    ``ncats`` and ``nelem`` are accepted but never read by the body
    (presumably pytest parameters -- confirm against the decorators);
    the data is always 10 values drawn from ``{0, 1}``.
    """
    df = DataFrame()
    np.random.seed(0)

    # initialize data frame
    df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32)
    df = df.set_index("cats")

    uvals, labels = df.index.factorize()
    np.testing.assert_array_equal(labels.values.get(), sorted(set(arr)))
    # Use the public ``cp.ndarray`` name: the ``cp.core.core`` module path
    # is private and was renamed in later CuPy releases.
    assert isinstance(uvals, cp.ndarray)
    assert isinstance(labels, Index)

    # Map every unique label to its position, then encode the raw input by
    # hand and compare with the codes returned by ``factorize``.
    encoder = {labels[idx]: idx for idx in range(len(labels))}
    handcoded = [encoder[v] for v in arr]
    np.testing.assert_array_equal(uvals.get(), handcoded)
コード例 #2
0
ファイル: test_groupby.py プロジェクト: sperlingxx/cudf
def test_groupby_column_numeral():
    """Group by and select columns whose labels are numbers.

    The same "groupby then sum" comparison between pandas and cudf is run
    first with integer column labels and then with float column labels.
    """
    label_pairs = (
        # (value column label, key column label)
        (0, 1),
        (0.5, 1.5),
    )
    for value_label, key_label in label_pairs:
        pdf = pd.DataFrame(
            {value_label: [1.0, 2.0, 3.0], key_label: [1, 2, 3]}
        )
        gdf = DataFrame.from_pandas(pdf)

        expected = pdf.groupby(key_label)[value_label].sum()
        actual = gdf.groupby(key_label)[value_label].sum()
        assert_groupby_results_equal(expected, actual)
コード例 #3
0
ファイル: test_joining.py プロジェクト: wphicks/cudf
def test_index_join(lhs, rhs, how, level):
    """Compare ``Index.join`` results between pandas and cudf.

    ``lhs`` / ``rhs`` select the column(s) turned into the left and right
    index, and ``how`` / ``level`` are forwarded to ``join``. Both results
    are converted to frames and sorted by ``lhs`` before comparison.
    """
    left_pd = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]})
    right_pd = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4]})
    left_gd = DataFrame.from_pandas(left_pd)
    right_gd = DataFrame.from_pandas(right_pd)

    def joined_frame(left_frame, right_frame):
        # Build both indexes, join them, and normalise the row order.
        left_index = left_frame.set_index(lhs).index
        right_index = right_frame.set_index(rhs).index
        joined = left_index.join(right_index, level=level, how=how)
        as_frame = joined.to_frame(index=False)
        return as_frame.sort_values(by=lhs).reset_index(drop=True)

    expected = joined_frame(left_pd, right_pd)
    got = joined_frame(left_gd, right_gd)

    assert_eq(expected, got)
コード例 #4
0
def test_cat_series_binop_error():
    """Adding a categorical series and a numeric series must fail.

    Both operand orders are checked; only the error message (not the
    exception type) is compared.
    """
    df = DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))

    categorical = df["a"]
    numerical = df["b"]

    scenarios = (
        # lhs is a categorical
        (
            (categorical, numerical),
            "Series of dtype `category` cannot "
            "perform the operation: add",
        ),
        # lhs is a numerical
        (
            (numerical, categorical),
            "'add' operator not supported",
        ),
    )
    for operands, message in scenarios:
        assert_exceptions_equal(
            lfunc=operator.add,
            rfunc=operator.add,
            lfunc_args_and_kwargs=(list(operands), ),
            rfunc_args_and_kwargs=(list(operands), ),
            check_exception_type=False,
            expected_error_message=message,
        )
コード例 #5
0
ファイル: test_groupby.py プロジェクト: sperlingxx/cudf
def test_groupby_column_name():
    """Select a column by name from a groupby and compare aggregations.

    Each aggregation is run on the cudf and pandas groupby objects; the
    count / idxmin / idxmax comparisons skip the dtype check.
    """
    pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)
    cudf_groups = gdf.groupby("yy")
    pandas_groups = pdf.groupby("yy")

    strict = {}
    relaxed = {"check_dtype": False}
    aggregations = (
        ("sum", strict),
        ("count", relaxed),
        ("min", strict),
        ("max", strict),
        ("idxmin", relaxed),
        ("idxmax", relaxed),
        ("mean", strict),
    )
    for agg_name, compare_kwargs in aggregations:
        got = getattr(cudf_groups["xx"], agg_name)()
        expected = getattr(pandas_groups["xx"], agg_name)()
        assert_groupby_results_equal(expected, got, **compare_kwargs)
コード例 #6
0
def test_tile(nulls, num_cols, num_rows, dtype, count):
    """Compare ``DataFrame.tile(count)`` with ``count`` concatenated copies.

    A pandas frame with ``num_cols`` random columns of ``num_rows`` rows is
    built, optionally with half of each column set to NaN, and the cudf
    ``tile`` result is compared with ``pd.concat([pdf] * count)``.
    """

    if dtype not in ["float32", "float64"] and nulls in ["some"]:
        # The reason is passed positionally: the ``msg=`` keyword of
        # ``pytest.skip`` was deprecated in favour of ``reason=`` and later
        # removed, while the positional form works on every version.
        pytest.skip("nulls not supported in dtype: " + dtype)

    pdf = pd.DataFrame(dtype=dtype)
    for i in range(num_cols):
        colname = str(i)
        data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype(
            dtype
        )

        if nulls == "some":
            # Null out a random half of the rows in this column.
            idx = np.random.choice(
                num_rows, size=int(num_rows / 2), replace=False
            )
            data[idx] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)

    got = gdf.tile(count)
    expect = pd.DataFrame(pd.concat([pdf] * count))

    assert_eq(expect, got)
コード例 #7
0
ファイル: test_groupby.py プロジェクト: sperlingxx/cudf
def test_groupby_iterate_groups():
    """Iterating a two-key groupby yields groups with constant key values.

    For every group produced by the iteration, each key column must hold
    a single repeated value.
    """
    np.random.seed(0)
    nelem = 20
    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    key_columns = ("key1", "key2")
    for _, group in df.groupby(list(key_columns)):
        group_pd = group.to_pandas()
        for column in key_columns:
            values = group_pd[column].values
            # Every entry must equal the first one (broadcast comparison).
            np.testing.assert_array_equal(values[0], values)
コード例 #8
0
def test_dataframe_multi_column_nulls(num_cols, num_rows, dtype, nulls,
                                      ascending, na_position):
    """Sort by several columns containing NaNs and compare with pandas.

    Three random columns named "a", "b", "c" are always created; only the
    first ``num_cols`` of them are used as sort keys. ``nulls`` selects
    whether a quarter of each column ("some") or the whole column ("all")
    is overwritten with NaN.
    """

    from string import ascii_lowercase

    np.random.seed(0)
    sort_keys = list(ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()

    for colname in ascii_lowercase[:3]:
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "all":
            data[:] = np.nan
        elif nulls == "some":
            if num_rows > 0:
                idx = np.random.choice(num_rows,
                                       size=int(num_rows / 4),
                                       replace=False)
            else:
                # Nothing to pick from: use an empty integer index.
                idx = np.array([], dtype="int64")
            data[idx] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)

    sort_options = {"ascending": ascending, "na_position": na_position}
    got = gdf.sort_values(sort_keys, **sort_options)
    expect = pdf.sort_values(sort_keys, **sort_options)

    assert_eq(got[sort_keys].reset_index(drop=True),
              expect[sort_keys].reset_index(drop=True))
コード例 #9
0
def test_interleave_columns(nulls, num_cols, num_rows, dtype):
    """Compare ``DataFrame.interleave_columns`` with a row-major flatten.

    For the "category" dtype the call is expected to raise ``ValueError``;
    otherwise the result must equal the pandas data flattened row by row.
    """

    if dtype not in ["float32", "float64"] and nulls in ["some"]:
        # The reason is passed positionally: the ``msg=`` keyword of
        # ``pytest.skip`` was deprecated in favour of ``reason=`` and later
        # removed, while the positional form works on every version.
        pytest.skip("nulls not supported in dtype: " + dtype)

    pdf = pd.DataFrame(dtype=dtype)
    for i in range(num_cols):
        colname = str(i)
        data = pd.Series(np.random.randint(0, 26, num_rows)).astype(dtype)

        if nulls == "some":
            # Null out a random half of the rows in this column.
            idx = np.random.choice(num_rows,
                                   size=int(num_rows / 2),
                                   replace=False)
            data[idx] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)

    if dtype == "category":
        with pytest.raises(ValueError):
            assert gdf.interleave_columns()
    else:
        got = gdf.interleave_columns()

        # Flattening the 2-D values row by row interleaves the columns.
        expect = pd.Series(np.vstack(pdf.to_numpy()).reshape(
            (-1, ))).astype(dtype)

        assert_eq(expect, got)
コード例 #10
0
def test_df_stack(nulls, num_cols, num_rows, dtype):
    """Compare ``DataFrame.stack`` between cudf and pandas.

    When the pandas result has only unnamed index levels, the levels are
    renamed to ``0, 1, ...`` before the comparison.
    """
    if dtype not in ["float32", "float64"] and nulls in ["some"]:
        # The reason is passed positionally: the ``msg=`` keyword of
        # ``pytest.skip`` was deprecated in favour of ``reason=`` and later
        # removed, while the positional form works on every version.
        pytest.skip("nulls not supported in dtype: " + dtype)

    pdf = pd.DataFrame()
    for i in range(num_cols):
        colname = str(i)
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            # Null out a random half of the rows in this column.
            idx = np.random.choice(num_rows,
                                   size=int(num_rows / 2),
                                   replace=False)
            data[idx] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)

    got = gdf.stack()

    expect = pdf.stack()
    if {None} == set(expect.index.names):
        expect.rename_axis(list(range(0, len(expect.index.names))),
                           inplace=True)

    assert_eq(expect, got)
コード例 #11
0
ファイル: test_onehot.py プロジェクト: zivzone/cudf
def test_onehot_random():
    """One-hot encode random integers and check every indicator column.

    The column for category ``val`` must equal the boolean mask
    ``src == val``.
    """
    low, high, size = 10, 17, 10
    src = np.random.randint(low=low, high=high, size=size)
    df = DataFrame()
    df["src"] = src

    encoded = df.one_hot_encoding(column="src",
                                  prefix="out_",
                                  cats=tuple(range(low, high)))
    # Drop the first column ("src") and keep only the indicator columns.
    indicator_matrix = encoded.as_matrix(columns=encoded.columns[1:])

    for colidx, val in enumerate(range(low, high)):
        np.testing.assert_equal(indicator_matrix[:, colidx], src == val)
コード例 #12
0
def test_fillna_dataframe(df, value, inplace):
    """Compare ``DataFrame.fillna`` between pandas and cudf.

    ``value`` is used as-is for pandas; for cudf any pandas Series or
    DataFrame (including Series stored as dict values) is first converted
    with ``cudf.from_pandas``.
    """
    pdf = df.copy(deep=True)
    gdf = DataFrame.from_pandas(pdf)

    if isinstance(value, (pd.Series, pd.DataFrame)):
        fill_value_cudf = cudf.from_pandas(value)
    elif isinstance(value, dict):
        fill_value_cudf = {
            key: (cudf.from_pandas(item)
                  if isinstance(item, pd.Series) else item)
            for key, item in value.items()
        }
    else:
        fill_value_cudf = value

    expect = pdf.fillna(value, inplace=inplace)
    got = gdf.fillna(fill_value_cudf, inplace=inplace)

    if inplace:
        # In-place calls are checked through the mutated frames themselves.
        expect, got = pdf, gdf

    assert_eq(expect, got)
コード例 #13
0
ファイル: test_onehot.py プロジェクト: zivzone/cudf
def test_onehot_generic_index():
    """One-hot encode a column whose series carries a ``GenericIndex``.

    The output must contain the source column plus one indicator column
    per category 0..3, each matching ``values == category``.
    """
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    values = np.random.randint(low=0, high=4, size=size)

    df = DataFrame()
    df["fo"] = Series(values, index=GenericIndex(indices))
    out = df.one_hot_encoding("fo",
                              cats=df.fo.unique(),
                              prefix="fo",
                              dtype=np.int32)

    categories = range(4)
    expected_columns = {"fo"} | {"fo_{}".format(cat) for cat in categories}
    assert set(out.columns) == expected_columns

    for cat in categories:
        indicator = getattr(out, "fo_{}".format(cat))
        np.testing.assert_array_equal(values == cat, indicator.to_array())
コード例 #14
0
ファイル: test_replace.py プロジェクト: zivzone/cudf
def test_fillna_dataframe(fill_type, inplace):
    """Fill nulls with a scalar, a series, or a per-column dict.

    ``fill_type`` picks "scalar", "series", or (any other value) the dict
    form. The cudf result is compared with a pandas-derived expectation.
    """
    pdf = pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]})
    gdf = DataFrame.from_pandas(pdf)

    if fill_type == "scalar":
        fill_value_pd = fill_value_cudf = 5
    elif fill_type == "series":
        fill_value_pd = pd.Series([3, 4, 5])
        fill_value_cudf = Series.from_pandas(fill_value_pd)
    else:
        column_b_fill = pd.Series([3, 4, 5])
        fill_value_pd = {"a": 5, "b": column_b_fill}
        fill_value_cudf = {"a": 5, "b": Series.from_pandas(column_b_fill)}

    if not isinstance(fill_value_pd, pd.Series):
        expect = pdf.fillna(fill_value_pd)
    else:
        # Filling a pandas DataFrame with a Series does not work, see
        # https://github.com/pandas-dev/pandas/issues/27197
        # so the expectation is assembled column by column instead.
        expect = pd.DataFrame()
        for col in pdf.columns:
            expect[col] = pdf[col].fillna(fill_value_pd)

    got = gdf.fillna(fill_value_cudf, inplace=inplace)
    if inplace:
        got = gdf

    assert_eq(expect, got)
コード例 #15
0
ファイル: test_datetime.py プロジェクト: zhuohuwu0603/cudf
def test_datetime_array_timeunit_cast(dtype):
    """Build series and frames from a datetime array cast to ``dtype``.

    The same NumPy array feeds both libraries; the series and the
    two-column frames must compare equal.
    """
    dates = (
        "2016-11-20",
        "2020-11-20",
        "2019-11-20",
        "1918-11-20",
        "2118-11-20",
    )
    testdata = np.array([np.datetime64(day) for day in dates], dtype=dtype)

    assert_eq(pd.Series(testdata), Series(testdata))

    gdf = DataFrame()
    pdf = pd.DataFrame()
    for frame in (gdf, pdf):
        frame["a"] = np.arange(5)
        frame["b"] = testdata
    assert_eq(pdf, gdf)
コード例 #16
0
ファイル: test_groupby.py プロジェクト: sperlingxx/cudf
def test_groupby_quantile(interpolation, q):
    """Compare groupby quantiles of column "y" between pandas and cudf."""
    raw_data = {
        "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9],
        "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2],
    }
    # Pandas>0.25 casts NaN in quantile operations as a float64, so the
    # missing values are filled with zeros up front.
    pdf = pd.DataFrame(raw_data).fillna(0)
    gdf = DataFrame.from_pandas(pdf)

    def quantile_of_y(frame):
        result = frame.groupby("x").quantile(q, interpolation=interpolation)
        # Temporary workaround: the index is dropped because things such as
        # the index name are not yet provided by the python bindings.
        return result["y"].reset_index(drop=True)

    pdresult = quantile_of_y(pdf)
    gdresult = quantile_of_y(gdf)

    known_mismatch = q == 0.5 and interpolation == "nearest"
    if known_mismatch:
        pytest.xfail(
            "Pandas NaN Rounding will fail nearest interpolation at 0.5")

    assert_groupby_results_equal(pdresult, gdresult)
コード例 #17
0
def test_categorical_dataframe_slice_copy():
    """A copied row slice of a categorical frame must match pandas."""
    categories = pd.Series(["a", "b", "z"], dtype="category")
    pdf = pd.DataFrame({"g": categories})
    gdf = DataFrame.from_pandas(pdf)

    tail = slice(1, None)
    expected = pdf[tail].copy()
    got = gdf[tail].copy()

    assert_eq(expected, got)
コード例 #18
0
ファイル: test_groupby.py プロジェクト: williamBlazing/cudf
def test_groupby_column_name():
    """Sum a column selected by name from a groupby; compare with pandas."""
    pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)

    got = gdf.groupby("yy")["xx"].sum()
    expected = pdf.groupby("yy")["xx"].sum()

    assert_eq(expected, got)
コード例 #19
0
ファイル: test_pickling.py プロジェクト: vuule/cudf
def test_pickle_dataframe_categorical():
    """Round-trip a frame with a categorical and a float column.

    The frame is handed to ``check_serialization``.
    """
    np.random.seed(0)

    df = DataFrame()
    # ``pd.Categorical`` expects list-like values. A bare string is a scalar
    # (treated as a single value or rejected, depending on the pandas
    # version), so split it into one category value per character.
    df["keys"] = pd.Categorical(list("aaabababac"))
    df["vals"] = np.random.random(len(df))

    check_serialization(df)
コード例 #20
0
ファイル: test_pickling.py プロジェクト: vuule/cudf
def test_pickle_dataframe_numeric():
    """Round-trip a frame with two float columns.

    The frame is handed to ``check_serialization``.
    """
    np.random.seed(0)
    nelem = 10
    columns = (
        ("keys", np.arange(nelem, dtype=np.float64)),
        ("vals", np.random.random(nelem)),
    )

    df = DataFrame()
    for name, data in columns:
        df[name] = data

    check_serialization(df)
コード例 #21
0
ファイル: test_string.py プロジェクト: trevorsm7/cudf
def test_string_slice():
    """``str.slice(0, 2)`` must return a Series that matches pandas."""
    words = ("hello", "world")
    gdf = DataFrame({"a": list(words)})
    pdf = pd.DataFrame({"a": list(words)})

    got = gdf.a.str.slice(0, 2)
    expected = pdf.a.str.slice(0, 2)

    assert isinstance(got, Series)
    assert_eq(expected, got)
コード例 #22
0
def test_groupby_cats():
    """Group by a categorical column and verify the per-category mean.

    Each grouped mean is checked against the mean computed by hand from
    the host copies of the two columns.
    """
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    host_cats = df["cats"].values_host
    host_vals = df["vals"].to_array()

    means = df.groupby(["cats"], as_index=False).mean()
    mean_vals = means["vals"]
    mean_cats = means["cats"]

    for row in range(len(mean_vals)):
        selected = host_vals[host_cats == mean_cats[row]]
        np.testing.assert_almost_equal(mean_vals[row], selected.mean())
コード例 #23
0
ファイル: test_factorize.py プロジェクト: vyasr/cudf
def test_factorize_series_index():
    """``Series.factorize`` must match pandas with and without an index.

    The check runs once on the frame with its default index and once
    after "col2" has been made the index.
    """
    df = DataFrame()
    df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
    df["col2"] = [
        2992443.0, 2992447.0, 2992466.0, 2992440.0, 2992441.0,
        2992442.0, 2992444.0, 2992445.0, 2992446.0, 2992448.0,
    ]

    def check_col1_factorize(frame):
        # Element 0 of the factorize result (device array, hence ``get``).
        assert_eq(
            frame.col1.factorize()[0].get(),
            frame.to_pandas().col1.factorize()[0],
        )
        # Element 1 of the factorize result, compared as host values.
        assert_eq(
            frame.col1.factorize()[1].to_pandas().values,
            frame.to_pandas().col1.factorize()[1].values,
        )

    check_col1_factorize(df)
    check_col1_factorize(df.set_index("col2"))
コード例 #24
0
ファイル: test_sorting.py プロジェクト: trevorsm7/cudf
def test_dataframe_sort_values_sliced(nelem, sliceobj):
    """Sorting a column of a sliced frame must match pandas."""
    np.random.seed(0)
    pdf = pd.DataFrame()
    pdf["a"] = np.random.random(nelem)
    gdf = DataFrame.from_pandas(pdf)

    expect = pdf[sliceobj]["a"].sort_values()
    got = gdf[sliceobj]["a"].sort_values()

    matches = got.to_pandas() == expect
    assert matches.all()
コード例 #25
0
def test_groupby_level_zero(agg):
    """Group on index level 0 and apply the aggregation named ``agg``.

    The dtype check is skipped only for the "count" aggregation.
    """
    pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[0, 1, 1])
    gdf = DataFrame.from_pandas(pdf)

    pandas_agg = getattr(pdf.groupby(level=0), agg)
    cudf_agg = getattr(gdf.groupby(level=0), agg)
    expected = pandas_agg()
    got = cudf_agg()

    assert_eq(expected, got, check_dtype=(agg != "count"))
コード例 #26
0
ファイル: test_groupby.py プロジェクト: williamBlazing/cudf
def test_groupby_cats(method):
    """Group by a categorical column using ``method``; verify the means.

    Each (category, mean) pair is printed and then checked against the
    mean computed by hand from host copies of the columns.
    """
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    host_cats = np.asarray(list(df["cats"]))
    host_vals = df["vals"].to_array()

    means = df.groupby(["cats"], method=method, as_index=False).mean()
    mean_vals = means["vals"]
    mean_cats = means["cats"]

    for category, mean_value in zip(mean_cats, mean_vals):
        print(category, mean_value)
        selected = host_vals[host_cats == category]
        np.testing.assert_almost_equal(mean_value, selected.mean())
コード例 #27
0
ファイル: test_groupby.py プロジェクト: williamBlazing/cudf
def test_groupby_as_df():
    """``groupby(...).as_df()`` segments must hold constant key values.

    ``as_df`` returns a frame plus segment start offsets; every slice
    between consecutive offsets is checked to have a single value in each
    key column.
    """
    np.random.seed(0)
    nelem = 20
    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    grouped_df, segs = df.groupby(["key1", "key2"], method="cudf").as_df()

    # Pair each start offset with the next one; the last segment is open.
    segment_stops = list(segs[1:]) + [None]
    for start, stop in zip(segs, segment_stops):
        group_pd = grouped_df[start:stop].to_pandas()
        for key in ("key1", "key2"):
            values = group_pd[key].values
            np.testing.assert_array_equal(values[0], values)
コード例 #28
0
ファイル: test_groupby.py プロジェクト: sperlingxx/cudf
def test_groupby_level_zero(agg):
    """Group on index level 0 and apply the aggregation named ``agg``.

    The dtype check is skipped for aggregations in ``_index_type_aggs``.
    """
    pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[2, 5, 5])
    gdf = DataFrame.from_pandas(pdf)

    pandas_agg = getattr(pdf.groupby(level=0), agg)
    cudf_agg = getattr(gdf.groupby(level=0), agg)
    expected = pandas_agg()
    got = cudf_agg()

    strict_dtype = agg not in _index_type_aggs
    assert_groupby_results_equal(expected, got, check_dtype=strict_dtype)
コード例 #29
0
def test_numpy_non_contiguious():
    """``DataFrame.from_records`` must accept a non-contiguous field."""
    recdtype = np.dtype([("index", np.int64), ("a", np.int32)])
    rec = np.recarray(10, dtype=recdtype)
    rec.index = np.arange(30, 40)

    every_other = np.arange(20, dtype=np.int32)[::2]
    rec.a = every_other
    # Guard: the field view of the record array is not C-contiguous.
    assert rec.a.flags["C_CONTIGUOUS"] is False

    gdf = DataFrame.from_records(rec, index="index")
    assert_eq(every_other, gdf["a"].values)
コード例 #30
0
ファイル: test_string.py プロジェクト: trevorsm7/cudf
def test_string_groupby_key(str_data, num_keys):
    """Group by ``num_keys`` identical string columns and count.

    The key columns are labelled ``0 .. num_keys - 1`` and all hold
    ``str_data``; results are sorted by column 0 before comparison and the
    dtype check is skipped.
    """
    other_data = [1, 2, 3, 4, 5][:len(str_data)]
    key_labels = range(num_keys)

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for label in key_labels:
        pdf[label] = pd.Series(str_data, dtype="str")
        gdf[label] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    def sorted_counts(frame):
        counts = frame.groupby(list(key_labels), as_index=False).count()
        return counts.sort_values([0]).reset_index(drop=True)

    expect = sorted_counts(pdf)
    got = sorted_counts(gdf)

    assert_eq(expect, got, check_dtype=False)