Python DataFrame.DataFrameの例

プログラミング言語: Python

名前空間/パッケージ名: cudf.core

クラス/型: DataFrame

メソッド/関数: DataFrame

hotexamples.comのコード掲載数: 30

Pythonのcudf.core.DataFrame.DataFrameは、cudf（GPU DataFrame）ライブラリの一部であり、PandasのDataFrameオブジェクトに類似したデータ構造を提供します。cudfは、GPU上で高速なデータ処理を実現するために設計されており、大規模なデータセットに対しても効率的な操作を行うことができます。DataFrameオブジェクトは、テーブル形式のデータを保持し、異なるデータ型や欠損値をサポートします。さらに、DataFrameオブジェクトには、データのフィルタリング、集計、結合、変換など、さまざまな操作を実行するための便利なメソッドや関数が備わっています。

Python DataFrame.DataFrame - 30件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのcudf.core.DataFrame.DataFrameの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

from_pandas(30)

DataFrame(30)

to_pandas(18)

set_index(16)

merge(15)

groupby(12)

index(5)

_from_table(4)

one_hot_encoding(4)

query(3)

_concat(3)

join(3)

label_encoding(3)

from_records(3)

columns(3)

head(2)

from_gpu_matrix(2)

scatter_by_map(2)

sort_values(2)

to_records(2)

nlargest(1)

nsmallest(1)

from_arrow(1)

copy(1)

replace(1)

sort_index(1)

to_arrow(1)

clip(1)

コード例 #1

ファイルを表示

ファイル: test_rank.py プロジェクト: vyasr/cudf

def test_series_rank_combinations(elem, dtype):
    """Compare cudf and pandas ``rank(method="first")`` on identical data.

    ``elem`` is an iterable of iterables; it is flattened into a float64
    array and then cast to ``dtype`` before being ranked by both libraries.
    """
    np.random.seed(0)
    flattened = chain.from_iterable(elem)
    host_values = np.fromiter(flattened, np.float64).astype(dtype)

    # GPU side
    gpu_frame = DataFrame()
    gpu_frame["a"] = host_values
    gpu_ranks = gpu_frame["a"].rank(method="first")

    # pandas reference
    cpu_frame = pd.DataFrame()
    cpu_frame["a"] = host_values
    cpu_ranks = cpu_frame["a"].rank(method="first")

    assert_eq(cpu_ranks, gpu_ranks.to_pandas())

コード例 #2

ファイルを表示

def test_df_set_index_from_series():
    """Set the index from a Series and slice the result by label."""
    frame = DataFrame()
    frame["a"] = list(range(10))
    frame["b"] = list(range(0, 20, 2))

    # The index is given as a Series (not a column name); the assertion
    # below expects both "a" and "b" to remain as columns.
    indexed = frame.set_index(frame["b"])
    assert list(indexed.columns) == ["a", "b"]

    # Label slice over the even-valued index: 2, 4 and 6 are expected.
    window = indexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]

コード例 #3

ファイルを表示

def test_dataframe_nsmallest(nelem, n):
    """Check ``DataFrame.nsmallest`` against a NumPy argsort reference."""
    np.random.seed(0)
    col_a = np.random.random(nelem)
    col_b = np.random.random(nelem)

    frame = DataFrame()
    frame["a"] = col_a
    frame["b"] = col_b
    result = frame.nsmallest(n, "a")

    # Argsort of the negated column orders positions from largest to
    # smallest; the reversed tail slice is the reference row order.
    descending = np.argsort(-col_a)
    expected_rows = descending[-n:][::-1]

    assert_eq(result["a"].to_array(), col_a[expected_rows])
    assert_eq(result["b"].to_array(), col_b[expected_rows])
    assert_eq(result.index.values, expected_rows)

コード例 #4

ファイルを表示

def test_query_env_changing():
    """Run the same query string twice with a different value bound to ``c``."""
    values = np.arange(100)
    frame = DataFrame()
    frame["a"] = values
    condition = "a < @c"

    # The loop variable must stay named ``c``: the query string refers to it
    # as ``@c``. Two different thresholds are tried with the same expression.
    for c in (10, 50):
        filtered = frame.query(condition)
        np.testing.assert_array_equal(
            values[values < c], filtered["a"].to_array()
        )

コード例 #5

ファイルを表示

def test_typecast_on_join_no_float_round():
    """Left-merge an int8 key column against a float32 key column.

    The right-hand keys 4.01 and 4.99 are expected to leave the left rows
    4 and 5 unmatched (``None`` in ``B_y``).
    """
    labels = ["a", "b", "c", "d", "e"]

    left_keys = Series([1, 2, 3, 4, 5], dtype="int8")
    right_keys = Series([1, 2, 3, 4.01, 4.99], dtype="float32")

    left = DataFrame({"join_col": left_keys, "B": labels})
    right = DataFrame({"join_col": right_keys, "B": labels})

    # Expected result: key column as float32, right-side labels missing
    # for the last two rows.
    expected_keys = Series([1, 2, 3, 4, 5], dtype="float32")
    expected = DataFrame(
        {
            "join_col": expected_keys,
            "B_x": ["a", "b", "c", "d", "e"],
            "B_y": ["a", "b", "c", None, None],
        }
    )

    actual = left.merge(right, on="join_col", how="left")

    assert_eq(expected, actual)

コード例 #6

ファイルを表示

ファイル: test_joining.py プロジェクト: wphicks/cudf

def test_index_join_exception_cases():
    """Check the two error paths of ``Index.join`` exercised here."""
    left_frame = DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]})
    right_frame = DataFrame(
        {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]}
    )
    left_keys = ["a", "b"]
    join_kind = "outer"

    # Case 1: joining two MultiIndex objects is expected to raise TypeError.
    left_index = left_frame.set_index(left_keys).index
    right_index = right_frame.set_index(["a", "c"]).index
    with pytest.raises(TypeError):
        left_index.join(right_index, level="a", how=join_kind)

    # Case 2: a list is not an acceptable ``level`` (int or scalar expected),
    # so ValueError is expected.
    left_index = left_frame.set_index(left_keys).index
    right_index = right_frame.set_index(["a"]).index
    with pytest.raises(ValueError):
        left_index.join(right_index, level=["a"], how=join_kind)

コード例 #7

ファイルを表示

ファイル: onehotencoder_mg.py プロジェクト: st071300/cuML

 def _check_input_fit(self, X, is_categories=False):
     """Normalize the input of ``fit`` for the multi-GPU model.

     Dask arrays and CuPy arrays are recorded as input type ``'array'``,
     transposed when ``is_categories`` is true, and converted (CuPy arrays
     via ``DataFrame``, everything else via ``to_dask_cudf``). Any other
     input is recorded as ``'df'`` and returned unchanged.
     """
     # Guard clause: anything that is not an array is passed through as-is.
     if not isinstance(X, (dask.array.core.Array, cp.ndarray)):
         self._set_input_type('df')
         return X

     self._set_input_type('array')
     data = X.transpose() if is_categories else X
     if isinstance(data, cp.ndarray):
         return DataFrame(data)
     return to_dask_cudf(data, client=self.client)

コード例 #8

ファイルを表示

ファイル: test_categorical.py プロジェクト: woodworker2017/cudf

def test_df_cat_set_index():
    """Use a categorical column as the index and compare with pandas."""
    frame = DataFrame()
    frame["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    frame["b"] = np.arange(len(frame))

    actual = frame.set_index("a")
    expected = frame.to_pandas().set_index("a")

    # Columns and index labels must agree between the two libraries.
    assert list(expected.columns) == list(actual.columns)
    expected_labels = expected.index.values
    assert list(expected_labels) == list(actual.index.values)
    np.testing.assert_array_equal(expected.index.values, actual.index.values)

    # The remaining data column must be unchanged.
    np.testing.assert_array_equal(
        expected["b"].values, actual["b"].to_array()
    )

コード例 #9

ファイルを表示

def test_groupby_agg_decimal(num_groups, nelem_per_group, func):
    """Aggregate Decimal columns with ``groupby(...).agg(func)``.

    The cudf result is compared with pandas. When the CUDA runtime version
    reported by rmm is below 11000 the aggregation is expected to raise
    ``RuntimeError`` instead.
    """
    # Digits kept after and before the decimal point, respectively.
    fraction_digits = 2
    integer_digits = 2

    magnitude = 10 ** integer_digits
    requested = num_groups * nelem_per_group

    # Values are de-duplicated because idxmin/idxmax may disagree with pandas
    # when duplicates are present (see
    # https://github.com/rapidsai/cudf/issues/7756). That does not affect the
    # test today, since idxmin and idxmax do not work on pandas Series of
    # Decimal objects (see https://github.com/pandas-dev/pandas/issues/40685),
    # but it would resurface if that were ever enabled.
    x = np.unique(
        (np.random.rand(requested) * magnitude).round(fraction_digits)
    )
    y = np.unique(
        (np.random.rand(requested) * magnitude).round(fraction_digits)
    )

    # De-duplication can leave the two arrays with different lengths; trim
    # both to the shorter one.
    usable = min(x.size, y.size)
    x = x[:usable]
    y = y[:usable]

    # Trimming may leave one group with fewer elements, which is acceptable
    # and probably worth testing.
    group_ids = np.tile(np.arange(num_groups), nelem_per_group)[:usable]

    decimal_x = pd.Series([Decimal(str(value)) for value in x])
    decimal_y = pd.Series([Decimal(str(value)) for value in y])

    cpu_frame = pd.DataFrame(
        {"idx": group_ids, "x": decimal_x, "y": decimal_y}
    )
    gpu_frame = DataFrame(
        {
            "idx": group_ids,
            "x": cudf.Series(decimal_x),
            "y": cudf.Series(decimal_y),
        }
    )

    expected = cpu_frame.groupby("idx", sort=True).agg(func)

    if rmm._cuda.gpu.runtimeGetVersion() < 11000:
        with pytest.raises(RuntimeError):
            gpu_frame.groupby("idx", sort=True).agg(func)
        return

    actual = gpu_frame.groupby("idx", sort=True).agg(func)
    for column in ("x", "y"):
        assert_eq(expected[column], actual[column], check_dtype=False)

コード例 #10

ファイルを表示

ファイル: test_index.py プロジェクト: vuule/cudf

def test_df_set_index_from_name():
    """Set the index by column name and slice the result by label."""
    frame = DataFrame()
    frame["a"] = list(range(10))
    frame["b"] = list(range(0, 20, 2))

    indexed = frame.set_index("b")
    print(indexed)
    # "b" became the index, so only "a" is left as a column.
    assert list(indexed.columns) == ["a"]

    # Label slice over the even-valued index: 2, 4 and 6 are expected.
    window = indexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]

コード例 #11

ファイルを表示

def test_dataframe_merge_order():
    """Left-merge on two keys and compare the cudf result with pandas."""
    left_columns = (("id", [10, 11]), ("timestamp", [1, 2]), ("a", [3, 4]))
    right_columns = (("id", [4, 5]), ("a", [7, 8]))

    # cudf side: columns are added one by one, in the listed order.
    gpu_left = DataFrame()
    gpu_right = DataFrame()
    for column, values in left_columns:
        gpu_left[column] = list(values)
    for column, values in right_columns:
        gpu_right[column] = list(values)
    gpu_merged = gpu_left.merge(
        gpu_right, how="left", on=["id", "a"], method="hash"
    )

    # pandas reference built from the same data.
    cpu_left = pd.DataFrame()
    cpu_right = pd.DataFrame()
    for column, values in left_columns:
        cpu_left[column] = list(values)
    for column, values in right_columns:
        cpu_right[column] = list(values)
    cpu_merged = cpu_left.merge(cpu_right, how="left", on=["id", "a"])

    assert_eq(gpu_merged, cpu_merged)

コード例 #12

ファイルを表示

def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """Sort by every column with ``ignore_index`` and compare with pandas."""
    columns = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gpu_frame = DataFrame(columns).set_index(index)
    cpu_frame = gpu_frame.to_pandas()

    # Whatever columns remain after set_index are used as the sort keys.
    expected = cpu_frame.sort_values(
        list(cpu_frame.columns), ignore_index=ignore_index
    )
    actual = gpu_frame.sort_values(
        gpu_frame.columns, ignore_index=ignore_index
    )

    assert_eq(expected, actual)

コード例 #13

ファイルを表示

ファイル: test_pickling.py プロジェクト: woodworker2017/cudf

def test_sizeof_dataframe():
    """Check ``sys.getsizeof`` of a DataFrame against its raw and pickled size."""
    np.random.seed(0)
    row_count = 1000
    host_keys = np.arange(row_count, dtype=np.float64)
    host_vals = np.random.random(row_count)

    frame = DataFrame()
    frame["keys"] = host_keys
    frame["vals"] = host_vals

    # The reported size must cover at least the raw column buffers.
    reported = sys.getsizeof(frame)
    assert reported >= host_keys.nbytes + host_vals.nbytes

    # The pickled size should match the reported size to two significant digits.
    payload = pickle.dumps(frame, protocol=pickle.HIGHEST_PROTOCOL)
    np.testing.assert_approx_equal(reported, len(payload), significant=2)

コード例 #14

ファイルを表示

ファイル: test_joining.py プロジェクト: zhuohuwu0603/cudf

def test_typecast_on_join_indexes_matching_categorical():
    """Inner-join a categorical index with a string index holding the same labels."""
    numbers = [1, 2, 3, 4, 5]
    categorical_keys = Series(["a", "b", "c", "d", "e"], dtype="category")
    string_keys = Series(["a", "b", "c", "d", "e"], dtype="str")

    left = DataFrame({"join_col": categorical_keys, "B": numbers})
    right = DataFrame({"join_col": string_keys, "B": numbers})
    left = left.set_index("join_col")
    right = right.set_index("join_col")

    # Every key matches, so all five rows are expected on both sides.
    expected_numbers = [1, 2, 3, 4, 5]
    expected = DataFrame(
        {
            "join_col": ["a", "b", "c", "d", "e"],
            "B_x": expected_numbers,
            "B_y": expected_numbers,
        }
    ).set_index("join_col")

    actual = left.join(right, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expected, actual)

コード例 #15

ファイルを表示

ファイル: test_factorize.py プロジェクト: vyasr/cudf

def test_factorize_series_obj(ncats, nelem):
    """Check the types and contents returned by ``Series.factorize``."""
    frame = DataFrame()
    np.random.seed(0)

    # Ten random values drawn from {0, 1}.
    source = np.random.randint(2, size=10, dtype=np.int32)
    frame["cats"] = source

    codes, uniques = frame["cats"].factorize()
    np.testing.assert_array_equal(uniques.to_array(), sorted(set(source)))
    assert isinstance(codes, cp.core.core.ndarray)
    assert isinstance(uniques, Index)

    # Rebuild the codes by hand from the unique labels and compare.
    lookup = {uniques[pos]: pos for pos in range(len(uniques))}
    expected_codes = [lookup[value] for value in source]
    np.testing.assert_array_equal(codes.get(), expected_codes)

コード例 #16

ファイルを表示

def test_factorize(ncats, nelem):
    """Check ``Series.factorize`` when both results are Series."""
    frame = DataFrame()
    np.random.seed(0)

    # Ten random values drawn from {0, 1}.
    source = np.random.randint(2, size=10, dtype=np.int32)
    frame["cats"] = source

    codes, uniques = frame["cats"].factorize()
    np.testing.assert_array_equal(uniques.to_array(), sorted(set(source)))
    assert isinstance(codes, Series)
    assert isinstance(uniques, Series)

    # Rebuild the codes by hand from the unique labels and compare.
    lookup = {label: position for position, label in enumerate(uniques)}
    expected_codes = [lookup[value] for value in source]
    np.testing.assert_array_equal(codes.to_array(), expected_codes)

コード例 #17

ファイルを表示

ファイル: test_joining.py プロジェクト: sriramch/cudf

def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
    """Inner-merge two integer key columns of possibly different dtypes.

    The merged key column is expected to have the dtype NumPy reports as
    common to ``dtype_l`` and ``dtype_r``.
    """
    labels = ["a", "b", "c"]

    left = DataFrame({"join_col": Series([1, 2, 3], dtype=dtype_l), "B": labels})
    right = DataFrame({"join_col": Series([1, 2, 4], dtype=dtype_r), "B": labels})

    common_dtype = np.find_common_type(
        [], [np.dtype(dtype_l), np.dtype(dtype_r)]
    )

    # Only keys 1 and 2 occur on both sides.
    matched_labels = ["a", "b"]
    expected = DataFrame(
        {
            "join_col": Series([1, 2], dtype=common_dtype),
            "B_x": matched_labels,
            "B_y": matched_labels,
        }
    )

    actual = left.merge(right, on="join_col", how="inner")

    assert_eq(expected, actual)

コード例 #18

ファイルを表示

ファイル: test_pickling.py プロジェクト: vuule/cudf

def test_sizeof_dataframe():
    """Check ``sys.getsizeof`` of a DataFrame against its raw and pickled size."""
    np.random.seed(0)
    row_count = 1000
    host_keys = np.arange(row_count, dtype=np.float64)
    host_vals = np.random.random(row_count)

    frame = DataFrame()
    frame["keys"] = host_keys
    frame["vals"] = host_vals

    # The reported size must cover at least the raw column buffers.
    reported = sys.getsizeof(frame)
    assert reported >= host_keys.nbytes + host_vals.nbytes

    # The pickle must contain at least as many bytes as the reported size.
    payload = pickle.dumps(frame, protocol=pickle.HIGHEST_PROTOCOL)
    assert len(payload) >= reported

コード例 #19

ファイルを表示

ファイル: test_onehot.py プロジェクト: vyasr/cudf

def test_onehot_generic_index():
    """One-hot encode a column whose Series carries a ``GenericIndex``."""
    np.random.seed(0)
    count = 33
    index_labels = np.random.randint(low=0, high=100, size=count)
    codes = np.random.randint(low=0, high=4, size=count)

    frame = DataFrame()
    frame["fo"] = Series(codes, index=GenericIndex(index_labels))
    encoded = frame.one_hot_encoding(
        "fo", cats=frame.fo.unique(), prefix="fo", dtype=np.int32
    )

    assert set(encoded.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}
    # Each indicator column must mark exactly the rows holding its category.
    for category in range(4):
        indicator = getattr(encoded, "fo_%d" % category)
        np.testing.assert_array_equal(codes == category, indicator.to_array())

コード例 #20

ファイルを表示

ファイル: test_string.py プロジェクト: trevorsm7/cudf

def test_string_groupby_key_index():
    """Group by a string column and compare ``count()`` with pandas."""
    keys = ["a", "b", "c", "d", "e"]
    numbers = [1, 2, 3, 4, 5]

    cpu_frame = pd.DataFrame()
    gpu_frame = DataFrame()

    # Build both frames column by column from the same data.
    cpu_frame["a"] = pd.Series(keys, dtype="str")
    gpu_frame["a"] = Series(keys, dtype="str")
    cpu_frame["b"] = numbers
    gpu_frame["b"] = numbers

    expected = cpu_frame.groupby("a").count()
    actual = gpu_frame.groupby("a").count()

    assert_eq(expected, actual, check_dtype=False)

コード例 #21

ファイルを表示

def test_groupby_iterate_groups():
    """Iterate over groups and check each group has constant key columns."""
    np.random.seed(0)
    frame = DataFrame()
    row_count = 20
    frame["key1"] = np.random.randint(0, 3, row_count)
    frame["key2"] = np.random.randint(0, 2, row_count)
    frame["val1"] = np.random.random(row_count)
    frame["val2"] = np.random.random(row_count)

    def check_constant(values):
        # Every entry must equal the first one.
        np.testing.assert_array_equal(values[0], values)

    for _, group in frame.groupby(["key1", "key2"]):
        host_group = group.to_pandas()
        for key in ("key1", "key2"):
            check_constant(host_group[key].values)

コード例 #22

ファイルを表示

ファイル: test_onehot.py プロジェクト: zivzone/cudf

def test_onehot_random():
    """One-hot encode random integers in [10, 17) and check each column."""
    frame = DataFrame()
    low = 10
    high = 17
    size = 10
    source = np.random.randint(low=low, high=high, size=size)
    frame["src"] = source

    encoded = frame.one_hot_encoding(
        column="src", prefix="out_", cats=tuple(range(10, 17))
    )
    # Drop the first column so that only the indicator columns remain.
    matrix = encoded.as_matrix(columns=encoded.columns[1:])

    # Column k of the matrix corresponds to category low + k.
    for offset, category in enumerate(range(low, high)):
        np.testing.assert_equal(matrix[:, offset], source == category)

コード例 #23

ファイルを表示

ファイル: test_datetime.py プロジェクト: zhuohuwu0603/cudf

def test_datetime_scalar_timeunit_cast(timeunit):
    """Build Series and DataFrame columns from a ``datetime64`` scalar."""
    stamp = np.datetime64("2016-11-20", timeunit)

    # Series built directly from the scalar.
    gpu_series = Series(stamp)
    cpu_series = pd.Series(stamp)
    assert_eq(cpu_series, gpu_series)

    # Scalar assigned as a whole column next to a five-row column.
    gpu_frame = DataFrame()
    gpu_frame["a"] = np.arange(5)
    gpu_frame["b"] = stamp

    cpu_frame = pd.DataFrame()
    cpu_frame["a"] = np.arange(5)
    cpu_frame["b"] = stamp

    assert_eq(cpu_frame, gpu_frame)

コード例 #24

ファイルを表示

def test_cat_series_binop_error():
    """Adding a categorical Series and a numeric Series must raise TypeError."""
    frame = DataFrame()
    frame["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    frame["b"] = np.arange(len(frame))

    categorical = frame["a"]
    numeric = frame["b"]

    # Categorical operand on the left.
    with pytest.raises(TypeError) as caught:
        categorical + numeric
    caught.match(
        "Series of dtype `category` cannot perform the operation: add")

    # Numeric operand on the left.
    with pytest.raises(TypeError) as caught:
        numeric + categorical
    caught.match("'add' operator not supported")

コード例 #25

ファイルを表示

def test_groupby_cats():
    """Group by a categorical column and verify each group's mean by hand."""
    frame = DataFrame()
    frame["cats"] = pd.Categorical(list("aabaacaab"))
    frame["vals"] = np.random.random(len(frame))

    host_cats = frame["cats"].values_host
    host_vals = frame["vals"].to_array()

    means = frame.groupby(["cats"], as_index=False).mean()
    mean_vals = means["vals"]
    mean_cats = means["cats"]

    # Recompute each group's mean from the host copies and compare.
    for row in range(len(mean_vals)):
        selected = host_vals[host_cats == mean_cats[row]]
        np.testing.assert_almost_equal(mean_vals[row], selected.mean())

コード例 #26

ファイルを表示

ファイル: test_query.py プロジェクト: vyasr/cudf

def test_query_ref_env(data, fn):
    """Check that ``DataFrame.query`` keeps the same rows as a reference mask.

    ``data`` unpacks to ``(nelem, seed)`` and ``fn`` unpacks to
    ``(expect_fn, query_expr)``. ``expect_fn`` is called with the host
    arrays and the two constants to produce the reference mask;
    ``query_expr`` is the expression handed to ``df.query``.
    """
    # prepare
    nelem, seed = data
    expect_fn, query_expr = fn
    np.random.seed(seed)
    df = DataFrame()
    df["a"] = aa = np.arange(nelem)
    df["b"] = bb = np.random.random(nelem) * nelem
    # NOTE(review): c and d are only passed to expect_fn in this function;
    # presumably query_expr also refers to them as @c / @d through the
    # calling scope, so these locals should not be renamed - confirm against
    # the test parametrization.
    c = 2.3
    d = 1.2
    # reference mask computed on the host arrays
    expect_mask = expect_fn(aa, bb, c, d)
    print(expect_mask)
    df2 = df.query(query_expr)
    # check: same number of rows, and the same values in both columns
    assert len(df2) == np.count_nonzero(expect_mask)
    np.testing.assert_array_almost_equal(df2["a"].to_array(), aa[expect_mask])
    np.testing.assert_array_almost_equal(df2["b"].to_array(), bb[expect_mask])

コード例 #27

ファイルを表示

ファイル: test_groupby.py プロジェクト: williamBlazing/cudf

def test_groupby_cats(method):
    """Group by a categorical column using ``method`` and verify each mean."""
    frame = DataFrame()
    frame["cats"] = pd.Categorical(list("aabaacaab"))
    frame["vals"] = np.random.random(len(frame))

    host_cats = np.asarray(list(frame["cats"]))
    host_vals = frame["vals"].to_array()

    means = frame.groupby(["cats"], method=method, as_index=False).mean()
    mean_vals = means["vals"]
    mean_cats = means["cats"]

    # Recompute each group's mean from the host copies and compare.
    for category, group_mean in zip(mean_cats, mean_vals):
        print(category, group_mean)
        reference = host_vals[host_cats == category].mean()
        np.testing.assert_almost_equal(group_mean, reference)

コード例 #28

ファイルを表示

ファイル: test_groupby.py プロジェクト: williamBlazing/cudf

def test_groupby_as_df():
    """Check that every segment returned by ``as_df`` has constant keys."""
    np.random.seed(0)
    frame = DataFrame()
    row_count = 20
    frame["key1"] = np.random.randint(0, 3, row_count)
    frame["key2"] = np.random.randint(0, 2, row_count)
    frame["val1"] = np.random.random(row_count)
    frame["val2"] = np.random.random(row_count)

    def check_constant(values):
        # Every entry must equal the first one.
        np.testing.assert_array_equal(values[0], values)

    grouped_frame, starts = frame.groupby(
        ["key1", "key2"], method="cudf"
    ).as_df()

    # Pair each segment start with the next start; the final segment is
    # open-ended (stop of None).
    stops = list(starts[1:]) + [None]
    for begin, end in zip(starts, stops):
        host_group = grouped_frame[begin:end].to_pandas()
        for key in ("key1", "key2"):
            check_constant(host_group[key].values)

コード例 #29

ファイルを表示

ファイル: test_onehot.py プロジェクト: zivzone/cudf

def test_get_dummies_prefix_sep(prefix, prefix_sep):
    """Compare ``cudf.get_dummies`` with ``pd.get_dummies`` for a prefix/separator pair."""
    columns = {
        "first": ["1", "2", "3"],
        "second": ["abc", "def", "ghi"],
        "third": ["ji", "ji", "ji"],
    }
    gpu_frame = DataFrame(columns)
    cpu_frame = pd.DataFrame(columns)

    # Both libraries receive the same keyword arguments.
    options = {"prefix": prefix, "prefix_sep": prefix_sep}
    expected = pd.get_dummies(cpu_frame, **options)
    actual = cudf.get_dummies(gpu_frame, **options)

    utils.assert_eq(expected, actual, check_dtype=False)

コード例 #30

ファイルを表示

ファイル: test_string.py プロジェクト: trevorsm7/cudf

def test_string_groupby_key(str_data, num_keys):
    """Group by ``num_keys`` identical string columns and compare ``count()`` with pandas."""
    numbers = [1, 2, 3, 4, 5][:len(str_data)]
    key_names = list(range(num_keys))

    cpu_frame = pd.DataFrame()
    gpu_frame = DataFrame()
    # Every key column holds the same string data and is named by its position.
    for key in key_names:
        cpu_frame[key] = pd.Series(str_data, dtype="str")
        gpu_frame[key] = Series(str_data, dtype="str")
    cpu_frame["a"] = numbers
    gpu_frame["a"] = numbers

    expected = cpu_frame.groupby(list(key_names), as_index=False).count()
    actual = gpu_frame.groupby(list(key_names), as_index=False).count()

    # Sort both results by the first key and reset the index before comparing.
    expected = expected.sort_values([0]).reset_index(drop=True)
    actual = actual.sort_values([0]).reset_index(drop=True)

    assert_eq(expected, actual, check_dtype=False)