コード例 #1
0
ファイル: test_rank.py プロジェクト: vyasr/cudf
def test_series_rank_combinations(elem, dtype):
    """Check ``rank(method="first")`` on a column against pandas.

    ``elem`` is an iterable of iterables; it is flattened into a float64
    array and cast to ``dtype`` before being ranked by both libraries.
    """
    np.random.seed(0)
    flattened = np.fromiter(chain.from_iterable(elem), np.float64)
    values = flattened.astype(dtype)

    # Frame under test.
    gdf = DataFrame()
    gdf["a"] = values
    got = gdf["a"].rank(method="first")

    # pandas reference built from the very same host array.
    pdf = pd.DataFrame()
    pdf["a"] = values
    expected = pdf["a"].rank(method="first")

    assert_eq(expected, got.to_pandas())
コード例 #2
0
def test_df_set_index_from_series():
    """Indexing by a Series keeps all columns and supports label slicing."""
    frame = DataFrame()
    frame["a"] = list(range(10))
    frame["b"] = list(range(0, 20, 2))

    # set_index(Series): "b" is used as the index but remains a column too.
    indexed = frame.set_index(frame["b"])
    assert list(indexed.columns) == ["a", "b"]

    # The index holds even numbers, so labels 2..6 select 2, 4 and 6.
    window = indexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]
コード例 #3
0
def test_dataframe_nsmallest(nelem, n):
    """``nsmallest(n, "a")`` must return the rows with the smallest "a"."""
    np.random.seed(0)
    frame = DataFrame()
    col_a = np.random.random(nelem)
    frame["a"] = col_a
    col_b = np.random.random(nelem)
    frame["b"] = col_b
    result = frame.nsmallest(n, "a")

    # Sorting by the negated values is a descending sort: the smallest
    # entries end up in the tail, and reversing that tail lists them in
    # ascending order.
    descending = np.argsort(-col_a)
    expected_rows = descending[-n:][::-1]
    assert_eq(result["a"].to_array(), col_a[expected_rows])
    assert_eq(result["b"].to_array(), col_b[expected_rows])
    assert_eq(result.index.values, expected_rows)
コード例 #4
0
def test_query_env_changing():
    """The same query string must follow a rebinding of the local ``c``."""
    frame = DataFrame()
    values = np.arange(100)
    frame["a"] = values
    expr = "a < @c"

    # The expression refers to the local named ``c`` (do not rename it);
    # the second pass re-runs the identical string after ``c`` has changed.
    for c in (10, 50):
        got = frame.query(expr)
        expected = values[values < c]
        np.testing.assert_array_equal(expected, got["a"].to_array())
コード例 #5
0
def test_typecast_on_join_no_float_round():
    """Left merge of int8 keys with float32 keys must not round the floats.

    The right-hand keys 4.01 and 4.99 are expected not to match the
    left-hand keys 4 and 5, so those two rows carry nulls in ``B_y`` and
    the resulting key column is float32.
    """
    payload = ["a", "b", "c", "d", "e"]

    left_keys = Series([1, 2, 3, 4, 5], dtype="int8")
    right_keys = Series([1, 2, 3, 4.01, 4.99], dtype="float32")

    left = DataFrame({"join_col": left_keys, "B": payload})
    right = DataFrame({"join_col": right_keys, "B": payload})

    expect = DataFrame(
        {
            "join_col": Series([1, 2, 3, 4, 5], dtype="float32"),
            "B_x": ["a", "b", "c", "d", "e"],
            "B_y": ["a", "b", "c", None, None],
        }
    )

    got = left.merge(right, on="join_col", how="left")

    assert_eq(expect, got)
コード例 #6
0
ファイル: test_joining.py プロジェクト: wphicks/cudf
def test_index_join_exception_cases():
    """``Index.join`` must raise for MultiIndex pairs and a list ``level``."""
    left = DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]})
    right = DataFrame(
        {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]}
    )
    how = "outer"

    # Case 1: both sides are indexed by two columns -> TypeError expected.
    two_level_left = left.set_index(["a", "b"]).index
    two_level_right = right.set_index(["a", "c"]).index
    with pytest.raises(TypeError):
        two_level_left.join(two_level_right, level="a", how=how)

    # Case 2: ``level`` passed as a list rather than an int or scalar
    # -> ValueError expected.
    two_level_left = left.set_index(["a", "b"]).index
    one_level_right = right.set_index(["a"]).index
    with pytest.raises(ValueError):
        two_level_left.join(one_level_right, level=["a"], how=how)
コード例 #7
0
ファイル: onehotencoder_mg.py プロジェクト: st071300/cuML
 def _check_input_fit(self, X, is_categories=False):
     """Normalise the input of ``fit`` within the multi-gpu model.

     A dask array or ``cp.ndarray`` is recorded as input type 'array',
     transposed when ``is_categories`` is true, and then converted: a
     ``cp.ndarray`` is wrapped in a ``DataFrame`` while a dask array goes
     through ``to_dask_cudf``. Any other input is recorded as 'df' and
     returned unchanged.
     """
     array_types = (dask.array.core.Array, cp.ndarray)
     if not isinstance(X, array_types):
         self._set_input_type('df')
         return X

     self._set_input_type('array')
     data = X.transpose() if is_categories else X
     if isinstance(data, cp.ndarray):
         return DataFrame(data)
     return to_dask_cudf(data, client=self.client)
コード例 #8
0
def test_df_cat_set_index():
    """Using a categorical column as the index must agree with pandas."""
    frame = DataFrame()
    frame["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    frame["b"] = np.arange(len(frame))
    got = frame.set_index("a")

    # Reference: the same operation performed on the pandas copy.
    expect = frame.to_pandas().set_index("a")

    # Same columns and same index labels, checked both as lists and arrays.
    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    # The remaining data column must be carried over unchanged.
    np.testing.assert_array_equal(expect["b"].values, got["b"].to_array())
コード例 #9
0
def test_groupby_agg_decimal(num_groups, nelem_per_group, func):
    """Compare groupby aggregation ``func`` on Decimal columns with pandas.

    Two columns of unique random values, rounded to two decimal digits, are
    converted to ``Decimal`` and grouped by a cyclic integer key.
    """
    # Digits kept after and before the decimal point, respectively.
    decimal_digits = 2
    whole_digits = 2

    scale = 10 ** whole_digits
    nelem = num_groups * nelem_per_group

    def unique_rounded_sample():
        # Random values in [0, scale), rounded and de-duplicated.
        return np.unique((np.random.rand(nelem) * scale).round(decimal_digits))

    # The unique is necessary because otherwise if there are duplicates idxmin
    # and idxmax may return different results than pandas (see
    # https://github.com/rapidsai/cudf/issues/7756). This is not relevant to
    # the current version of the test, because idxmin and idxmax simply don't
    # work with pandas Series composed of Decimal objects (see
    # https://github.com/pandas-dev/pandas/issues/40685). However, if that is
    # ever enabled, then this issue will crop up again so we may as well have
    # it fixed now.
    x = unique_rounded_sample()
    y = unique_rounded_sample()

    # De-duplication can shorten the two arrays differently; trim both to
    # the shorter length so they line up row by row.
    total_elements = min(x.size, y.size)
    x = x[:total_elements]
    y = y[:total_elements]

    # Note that this filtering can lead to one group with fewer elements, but
    # that shouldn't be a problem and is probably useful to test.
    group_ids = np.tile(np.arange(num_groups), nelem_per_group)
    idx_col = group_ids[:total_elements]

    # Going through ``str`` builds each Decimal from the float's printed
    # representation.
    decimal_x = pd.Series([Decimal(str(value)) for value in x])
    decimal_y = pd.Series([Decimal(str(value)) for value in y])

    pdf = pd.DataFrame({"idx": idx_col, "x": decimal_x, "y": decimal_y})
    gdf = DataFrame(
        {
            "idx": idx_col,
            "x": cudf.Series(decimal_x),
            "y": cudf.Series(decimal_y),
        }
    )

    expect_df = pdf.groupby("idx", sort=True).agg(func)

    # Runtime versions below 11000 (presumably CUDA 11.0) are expected to
    # reject the aggregation with a RuntimeError.
    if rmm._cuda.gpu.runtimeGetVersion() < 11000:
        with pytest.raises(RuntimeError):
            gdf.groupby("idx", sort=True).agg(func)
        return

    got_df = gdf.groupby("idx", sort=True).agg(func)
    for column in ("x", "y"):
        assert_eq(expect_df[column], got_df[column], check_dtype=False)
コード例 #10
0
ファイル: test_index.py プロジェクト: vuule/cudf
def test_df_set_index_from_name():
    """Indexing by column name moves that column out of the columns."""
    frame = DataFrame()
    frame["a"] = list(range(10))
    frame["b"] = list(range(0, 20, 2))

    # set_index(column_name): "b" becomes the index, leaving only "a".
    indexed = frame.set_index("b")
    print(indexed)
    assert list(indexed.columns) == ["a"]

    # The index holds even numbers, so labels 2..6 select 2, 4 and 6.
    window = indexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]
コード例 #11
0
def test_dataframe_merge_order():
    """A left merge on two keys must produce the same frame as pandas."""
    # Column name -> values, in insertion order, for each side of the merge.
    left_columns = (("id", [10, 11]), ("timestamp", [1, 2]), ("a", [3, 4]))
    right_columns = (("id", [4, 5]), ("a", [7, 8]))

    def fill(frame, columns):
        # Assign the columns one at a time, each from a fresh list.
        for name, values in columns:
            frame[name] = list(values)
        return frame

    gdf1 = fill(DataFrame(), left_columns)
    gdf2 = fill(DataFrame(), right_columns)
    gdf = gdf1.merge(gdf2, how="left", on=["id", "a"], method="hash")

    df1 = fill(pd.DataFrame(), left_columns)
    df2 = fill(pd.DataFrame(), right_columns)
    df = df1.merge(df2, how="left", on=["id", "a"])

    assert_eq(gdf, df)
コード例 #12
0
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """Sorting by all columns with ``ignore_index`` must match pandas."""
    columns = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gdf = DataFrame(columns).set_index(index)
    pdf = gdf.to_pandas()

    # Sort each frame by every column it still has after ``set_index``.
    options = {"ignore_index": ignore_index}
    expect = pdf.sort_values(list(pdf.columns), **options)
    got = gdf.sort_values(gdf.columns, **options)

    assert_eq(expect, got)
コード例 #13
0
ファイル: test_pickling.py プロジェクト: woodworker2017/cudf
def test_sizeof_dataframe():
    """``sys.getsizeof`` must cover the data and track the pickled size."""
    np.random.seed(0)
    frame = DataFrame()
    nelem = 1000
    host_keys = np.arange(nelem, dtype=np.float64)
    frame["keys"] = host_keys
    host_vals = np.random.random(nelem)
    frame["vals"] = host_vals

    # The reported size can never be smaller than the raw column payload.
    reported = sys.getsizeof(frame)
    assert reported >= host_keys.nbytes + host_vals.nbytes

    # The pickled size should agree with the reported size to two
    # significant digits.
    pickled = pickle.dumps(frame, protocol=pickle.HIGHEST_PROTOCOL)
    np.testing.assert_approx_equal(reported, len(pickled), significant=2)
コード例 #14
0
ファイル: test_joining.py プロジェクト: zhuohuwu0603/cudf
def test_typecast_on_join_indexes_matching_categorical():
    """Inner index join of a category index with a str index of equal labels."""
    labels = ["a", "b", "c", "d", "e"]
    payload = [1, 2, 3, 4, 5]

    left = DataFrame(
        {"join_col": Series(list(labels), dtype="category"), "B": payload}
    ).set_index("join_col")
    right = DataFrame(
        {"join_col": Series(list(labels), dtype="str"), "B": payload}
    ).set_index("join_col")

    # Every label matches, so both payload columns survive intact under
    # their suffixed names.
    expected_payload = [1, 2, 3, 4, 5]
    expect = DataFrame(
        {
            "join_col": ["a", "b", "c", "d", "e"],
            "B_x": expected_payload,
            "B_y": expected_payload,
        }
    ).set_index("join_col")

    got = left.join(right, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)
コード例 #15
0
ファイル: test_factorize.py プロジェクト: vyasr/cudf
def test_factorize_series_obj(ncats, nelem):
    """``factorize`` must return device codes and an ``Index`` of labels.

    NOTE(review): ``ncats`` and ``nelem`` are not used by the body; the data
    is always 10 draws from ``randint(2)``.
    """
    df = DataFrame()
    np.random.seed(0)

    # Host copy of the column is kept to hand-compute the expected codes.
    arr = np.random.randint(2, size=10, dtype=np.int32)
    df["cats"] = arr

    uvals, labels = df["cats"].factorize()
    np.testing.assert_array_equal(labels.to_array(), sorted(set(arr)))
    assert isinstance(uvals, cp.core.core.ndarray)
    assert isinstance(labels, Index)

    # Map every label to its position, then encode the host data by hand.
    encoder = {labels[pos]: pos for pos in range(len(labels))}
    handcoded = [encoder[value] for value in arr]
    np.testing.assert_array_equal(uvals.get(), handcoded)
コード例 #16
0
def test_factorize(ncats, nelem):
    """``factorize`` must return codes and labels, both as ``Series``.

    NOTE(review): ``ncats`` and ``nelem`` are not used by the body; the data
    is always 10 draws from ``randint(2)``.
    """
    df = DataFrame()
    np.random.seed(0)

    # Host copy of the column is kept to hand-compute the expected codes.
    arr = np.random.randint(2, size=10, dtype=np.int32)
    df["cats"] = arr

    uvals, labels = df["cats"].factorize()
    np.testing.assert_array_equal(labels.to_array(), sorted(set(arr)))
    assert isinstance(uvals, Series)
    assert isinstance(labels, Series)

    # Map every label to its position, then encode the host data by hand.
    encoder = {label: pos for pos, label in enumerate(labels)}
    handcoded = [encoder[value] for value in arr]
    np.testing.assert_array_equal(uvals.to_array(), handcoded)
コード例 #17
0
ファイル: test_joining.py プロジェクト: sriramch/cudf
def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
    """Inner merge on integer keys of two dtypes yields their common dtype."""
    payload = ["a", "b", "c"]

    left = DataFrame(
        {"join_col": Series([1, 2, 3], dtype=dtype_l), "B": payload}
    )
    right = DataFrame(
        {"join_col": Series([1, 2, 4], dtype=dtype_r), "B": payload}
    )

    # The key column of the result is expected in the dtype NumPy considers
    # common to both inputs.
    input_dtypes = [np.dtype(dtype_l), np.dtype(dtype_r)]
    exp_dtype = np.find_common_type([], input_dtypes)

    # Only keys 1 and 2 are present on both sides.
    matched_payload = ["a", "b"]
    expect = DataFrame(
        {
            "join_col": Series([1, 2], dtype=exp_dtype),
            "B_x": matched_payload,
            "B_y": matched_payload,
        }
    )

    got = left.merge(right, on="join_col", how="inner")

    assert_eq(expect, got)
コード例 #18
0
ファイル: test_pickling.py プロジェクト: vuule/cudf
def test_sizeof_dataframe():
    """``sys.getsizeof`` must cover the data and not exceed the pickle size."""
    np.random.seed(0)
    frame = DataFrame()
    nelem = 1000
    host_keys = np.arange(nelem, dtype=np.float64)
    frame["keys"] = host_keys
    host_vals = np.random.random(nelem)
    frame["vals"] = host_vals

    # The reported size can never be smaller than the raw column payload.
    reported = sys.getsizeof(frame)
    assert reported >= host_keys.nbytes + host_vals.nbytes

    # Pickling must emit at least as many bytes as the reported size.
    pickled = pickle.dumps(frame, protocol=pickle.HIGHEST_PROTOCOL)
    assert len(pickled) >= reported
コード例 #19
0
ファイル: test_onehot.py プロジェクト: vyasr/cudf
def test_onehot_generic_index():
    """One-hot encoding must work on a column carrying a ``GenericIndex``."""
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    df = DataFrame()
    values = np.random.randint(low=0, high=4, size=size)
    df["fo"] = Series(values, index=GenericIndex(indices))

    out = df.one_hot_encoding(
        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
    )

    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}
    # Each indicator column must flag exactly the rows equal to its category.
    for category in range(4):
        indicator = getattr(out, "fo_{}".format(category))
        np.testing.assert_array_equal(
            values == category, indicator.to_array()
        )
コード例 #20
0
ファイル: test_string.py プロジェクト: trevorsm7/cudf
def test_string_groupby_key_index():
    """Grouping by a string column and counting must match pandas."""
    keys = ["a", "b", "c", "d", "e"]
    payload = [1, 2, 3, 4, 5]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    # Build both frames column by column from the same host data.
    pdf["a"] = pd.Series(keys, dtype="str")
    gdf["a"] = Series(keys, dtype="str")
    pdf["b"] = payload
    gdf["b"] = payload

    assert_eq(
        pdf.groupby("a").count(),
        gdf.groupby("a").count(),
        check_dtype=False,
    )
コード例 #21
0
def test_groupby_iterate_groups():
    """Iterating a two-key groupby must yield groups with constant keys."""
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    key_columns = ("key1", "key2")
    for _, group in df.groupby(["key1", "key2"]):
        host_group = group.to_pandas()
        for key in key_columns:
            column = host_group[key].values
            # Every row of a group must carry the group's first key value.
            np.testing.assert_array_equal(column[0], column)
コード例 #22
0
ファイル: test_onehot.py プロジェクト: zivzone/cudf
def test_onehot_random():
    """One-hot columns of random data must equal the per-value masks."""
    df = DataFrame()
    low, high, size = 10, 17, 10
    src = np.random.randint(low=low, high=high, size=size)
    df["src"] = src

    encoded = df.one_hot_encoding(
        column="src", prefix="out_", cats=tuple(range(10, 17))
    )
    # Drop the first column ("src" itself) and keep only the indicators.
    mat = encoded.as_matrix(columns=encoded.columns[1:])

    # Indicator column k corresponds to the value ``low + k``.
    for colidx, val in enumerate(range(low, high)):
        np.testing.assert_equal(mat[:, colidx], src == val)
コード例 #23
0
ファイル: test_datetime.py プロジェクト: zhuohuwu0603/cudf
def test_datetime_scalar_timeunit_cast(timeunit):
    """A ``np.datetime64`` scalar of ``timeunit`` must behave as in pandas.

    The scalar is checked both as the sole content of a Series and when
    assigned as a DataFrame column next to a five-element column.
    """
    scalar = np.datetime64("2016-11-20", timeunit)

    # Series built from the bare scalar.
    got_series = Series(scalar)
    expected_series = pd.Series(scalar)
    assert_eq(expected_series, got_series)

    # Scalar assigned as a whole column.
    gdf = DataFrame()
    gdf["a"] = np.arange(5)
    gdf["b"] = scalar

    pdf = pd.DataFrame()
    pdf["a"] = np.arange(5)
    pdf["b"] = scalar

    assert_eq(pdf, gdf)
コード例 #24
0
def test_cat_series_binop_error():
    """Adding a categorical and a numerical Series must raise ``TypeError``."""
    df = DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))

    categorical = df["a"]
    numerical = df["b"]

    # Categorical on the left-hand side.
    with pytest.raises(TypeError) as excinfo:
        categorical + numerical
    excinfo.match(
        "Series of dtype `category` cannot perform the operation: add")

    # Numerical on the left-hand side produces a different message.
    with pytest.raises(TypeError) as excinfo:
        numerical + categorical
    excinfo.match("'add' operator not supported")
コード例 #25
0
def test_groupby_cats():
    """Group means over a categorical key must match host-side means."""
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    # Host copies used to compute the reference means.
    host_cats = df["cats"].values_host
    host_vals = df["vals"].to_array()

    means = df.groupby(["cats"], as_index=False).mean()
    mean_vals = means["vals"]
    mean_cats = means["cats"]

    for row in range(len(mean_vals)):
        # Reference mean over the host rows belonging to this category.
        in_group = host_cats == mean_cats[row]
        expect = host_vals[in_group].mean()
        np.testing.assert_almost_equal(mean_vals[row], expect)
コード例 #26
0
ファイル: test_query.py プロジェクト: vyasr/cudf
def test_query_ref_env(data, fn):
    """Check ``DataFrame.query`` against a mask from a reference function.

    ``data`` unpacks to ``(nelem, seed)`` and ``fn`` unpacks to
    ``(expect_fn, query_expr)``; both come from a parametrization that is
    not visible here.
    """
    # prepare
    nelem, seed = data
    expect_fn, query_expr = fn
    np.random.seed(seed)
    df = DataFrame()
    df["a"] = aa = np.arange(nelem)
    df["b"] = bb = np.random.random(nelem) * nelem
    # NOTE(review): presumably ``query_expr`` refers to these locals by name
    # (e.g. ``@c`` / ``@d``) -- confirm against the parametrization before
    # renaming any local in this function.
    c = 2.3
    d = 1.2
    # Expected row mask, computed by the reference function on the host
    # arrays with the same ``c`` and ``d``.
    expect_mask = expect_fn(aa, bb, c, d)
    print(expect_mask)
    df2 = df.query(query_expr)
    # check: same number of rows, and the same values in both columns
    assert len(df2) == np.count_nonzero(expect_mask)
    np.testing.assert_array_almost_equal(df2["a"].to_array(), aa[expect_mask])
    np.testing.assert_array_almost_equal(df2["b"].to_array(), bb[expect_mask])
コード例 #27
0
ファイル: test_groupby.py プロジェクト: williamBlazing/cudf
def test_groupby_cats(method):
    """Group means over a categorical key must match host-side means."""
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    # Host copies used to compute the reference means.
    host_cats = np.asarray(list(df["cats"]))
    host_vals = df["vals"].to_array()

    means = df.groupby(["cats"], method=method, as_index=False).mean()
    mean_vals = means["vals"]
    mean_cats = means["cats"]

    for category, mean_val in zip(mean_cats, mean_vals):
        print(category, mean_val)
        # Reference mean over the host rows belonging to this category.
        in_group = host_cats == category
        np.testing.assert_almost_equal(mean_val, host_vals[in_group].mean())
コード例 #28
0
ファイル: test_groupby.py プロジェクト: williamBlazing/cudf
def test_groupby_as_df():
    """Segments returned by ``as_df`` must delimit groups of constant keys."""
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    grouped_df, segs = df.groupby(["key1", "key2"], method="cudf").as_df()

    # Each segment start is paired with the next start; the last segment is
    # open-ended (``None``) and so runs to the end of the frame.
    stops = list(segs[1:]) + [None]
    for start, stop in zip(segs, stops):
        host_group = grouped_df[start:stop].to_pandas()
        for key in ("key1", "key2"):
            column = host_group[key].values
            # Every row of a segment must carry the segment's first key.
            np.testing.assert_array_equal(column[0], column)
コード例 #29
0
ファイル: test_onehot.py プロジェクト: zivzone/cudf
def test_get_dummies_prefix_sep(prefix, prefix_sep):
    """``cudf.get_dummies`` must honour ``prefix``/``prefix_sep`` like pandas."""
    data = {
        "first": ["1", "2", "3"],
        "second": ["abc", "def", "ghi"],
        "third": ["ji", "ji", "ji"],
    }

    gdf = DataFrame(data)
    pdf = pd.DataFrame(data)

    # Identical keyword arguments are forwarded to both implementations.
    options = {"prefix": prefix, "prefix_sep": prefix_sep}
    encoded_expected = pd.get_dummies(pdf, **options)
    encoded_actual = cudf.get_dummies(gdf, **options)

    utils.assert_eq(encoded_expected, encoded_actual, check_dtype=False)
コード例 #30
0
ファイル: test_string.py プロジェクト: trevorsm7/cudf
def test_string_groupby_key(str_data, num_keys):
    """Counting over ``num_keys`` identical string keys must match pandas."""
    other_data = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    # Key columns are named 0 .. num_keys - 1 and all hold ``str_data``.
    key_names = range(num_keys)
    for key in key_names:
        pdf[key] = pd.Series(str_data, dtype="str")
        gdf[key] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    expect = pdf.groupby(list(key_names), as_index=False).count()
    got = gdf.groupby(list(key_names), as_index=False).count()

    # Sort both results by the first key and reset the index before
    # comparing them.
    expect = expect.sort_values([0]).reset_index(drop=True)
    got = got.sort_values([0]).reset_index(drop=True)

    assert_eq(expect, got, check_dtype=False)