コード例 #1
0
ファイル: test_from_cudf.py プロジェクト: runPenguin/xgboost
def test_cudf_training_with_sklearn():
    """Fit ``XGBClassifier`` on cuDF inputs and check both classes are predicted.

    Labels are passed once as a cuDF DataFrame and once as a cuDF Series;
    sample weights and base margin are passed as cuDF DataFrames.
    """
    from cudf import DataFrame as df
    from cudf import Series as ss
    import pandas as pd

    n_rows = 50
    np.random.seed(1)
    # The random draws stay in this order so the seeded data is unchanged.
    features = pd.DataFrame(np.random.randn(n_rows, 10))
    labels = pd.DataFrame((np.random.randn(n_rows) > 0).astype(np.int8))
    weights = np.random.random(n_rows) + 1.0
    gpu_weights = df.from_pandas(pd.DataFrame(weights))
    base_margin = np.random.random(n_rows)
    gpu_base_margin = df.from_pandas(pd.DataFrame(base_margin))

    gpu_features = df.from_pandas(features)
    label_frame = df.from_pandas(labels)
    label_series = ss(data=labels.iloc[:, 0])

    both_classes = np.array([0, 1])
    for gpu_labels in (label_frame, label_series):
        classifier = xgb.XGBClassifier(gpu_id=0, tree_method='gpu_hist')
        classifier.fit(
            gpu_features,
            gpu_labels,
            sample_weight=gpu_weights,
            base_margin=gpu_base_margin,
            eval_set=[(gpu_features, gpu_labels)],
        )
        predicted = classifier.predict(gpu_features)
        # Both class labels must show up among the predictions.
        assert np.array_equal(np.unique(predicted), both_classes)
コード例 #2
0
ファイル: test_from_cudf.py プロジェクト: runPenguin/xgboost
def _test_cudf_training(DMatrixT):
    """Check that training on cuDF data reproduces the host-data RMSE history.

    ``DMatrixT`` is the matrix constructor used for the cuDF inputs; the
    reference run always uses ``xgb.DMatrix`` built from the pandas/NumPy
    versions of the same data.
    """
    from cudf import DataFrame as df
    import pandas as pd

    n_rows = 50
    np.random.seed(1)
    # The random draws stay in this order so the seeded data is unchanged.
    features = pd.DataFrame(np.random.randn(n_rows, 10))
    targets = pd.DataFrame(np.random.randn(n_rows))
    weights = np.random.random(n_rows) + 1.0
    gpu_weights = df.from_pandas(pd.DataFrame(weights))
    base_margin = np.random.random(n_rows)
    gpu_base_margin = df.from_pandas(pd.DataFrame(base_margin))

    params = {'gpu_id': 0, 'tree_method': 'gpu_hist'}

    # Run 1: all inputs supplied as cuDF objects.
    history_gpu = {}
    matrix_gpu = DMatrixT(
        df.from_pandas(features),
        df.from_pandas(targets),
        weight=gpu_weights,
        base_margin=gpu_base_margin,
    )
    xgb.train(
        params,
        matrix_gpu,
        evals=[(matrix_gpu, "train")],
        evals_result=history_gpu,
    )

    # Run 2: same data supplied as pandas/NumPy objects.
    history_host = {}
    matrix_host = xgb.DMatrix(
        features, targets, weight=weights, base_margin=base_margin
    )
    xgb.train(
        params,
        matrix_host,
        evals=[(matrix_host, "train")],
        evals_result=history_host,
    )

    rmse_gpu = history_gpu["train"]["rmse"]
    rmse_host = history_host["train"]["rmse"]
    assert np.array_equal(rmse_gpu, rmse_host)
コード例 #3
0
ファイル: test_indexing.py プロジェクト: wenxiang-Li/cudf
def test_dataframe_column_name_indexing():
    """Exercise column selection by string and integer labels on a cuDF frame."""
    gdf = DataFrame()
    values = np.asarray(range(10), dtype=np.int32)
    gdf["a"] = values
    gdf[1] = values
    for label in ("a", 1):
        np.testing.assert_equal(
            gdf[label].to_array(), np.asarray(range(10), dtype=np.int32)
        )

    # Mixed string/integer column labels, built in pandas and converted.
    nelem = 10
    pdf = pd.DataFrame()
    pdf["key1"] = np.random.randint(0, 5, nelem)
    pdf["key2"] = np.random.randint(0, 3, nelem)
    pdf[1] = np.arange(1, 1 + nelem)
    pdf[2] = np.random.random(nelem)
    gdf = DataFrame.from_pandas(pdf)

    assert_eq(gdf[gdf.columns], gdf)
    assert_eq(gdf[gdf.columns[:1]], gdf[["key1"]])

    # Every non-empty combination of columns must select the same data.
    for width in range(1, len(pdf.columns) + 1):
        for picked in combinations(pdf.columns, width):
            labels = list(picked)
            assert pdf[labels].equals(gdf[labels].to_pandas())

    # Only numeric column labels.
    numeric_pdf = pd.DataFrame()
    for col in range(10):
        numeric_pdf[col] = range(nelem)
    numeric_gdf = DataFrame.from_pandas(numeric_pdf)
    assert_eq(numeric_gdf, numeric_pdf)

    assert_eq(numeric_gdf[numeric_gdf.columns], numeric_gdf)
    assert_eq(numeric_gdf[numeric_gdf.columns[:3]], numeric_gdf[[0, 1, 2]])
コード例 #4
0
    def test_cudf_training(self):
        """Train on cuDF inputs and on host inputs and compare RMSE histories.

        The cuDF run passes ``{'gpu_id': 0}`` as parameters; the host run
        passes an empty parameter dict.
        """
        from cudf import DataFrame as df
        import pandas as pd

        n_rows = 50
        features = pd.DataFrame(np.random.randn(n_rows, 10))
        targets = pd.DataFrame(np.random.randn(n_rows))
        weights = np.random.random(n_rows)
        gpu_weights = df.from_pandas(pd.DataFrame(weights))
        base_margin = np.random.random(n_rows)
        gpu_base_margin = df.from_pandas(pd.DataFrame(base_margin))

        # Run 1: all inputs supplied as cuDF objects.
        history_gpu = {}
        matrix_gpu = xgb.DMatrix(
            df.from_pandas(features),
            df.from_pandas(targets),
            weight=gpu_weights,
            base_margin=gpu_base_margin,
        )
        xgb.train(
            {'gpu_id': 0},
            matrix_gpu,
            evals=[(matrix_gpu, "train")],
            evals_result=history_gpu,
        )

        # Run 2: same data supplied as pandas/NumPy objects.
        history_host = {}
        matrix_host = xgb.DMatrix(
            features, targets, weight=weights, base_margin=base_margin
        )
        xgb.train(
            {},
            matrix_host,
            evals=[(matrix_host, "train")],
            evals_result=history_host,
        )

        rmse_gpu = history_gpu["train"]["rmse"]
        rmse_host = history_host["train"]["rmse"]
        assert np.array_equal(rmse_gpu, rmse_host)
コード例 #5
0
ファイル: test_sorting.py プロジェクト: TravisHester/cudf
def test_dataframe_multi_column_nulls(
    num_cols, num_rows, dtype, nulls, ascending, na_position
):
    """Compare multi-column ``sort_values`` with NaNs between cuDF and pandas.

    Three columns ("a", "b", "c") are generated; the first ``num_cols``
    letters are used as the sort keys. ``nulls`` selects whether a random
    quarter of each column ("some") or the whole column ("all") is set to
    NaN.
    """
    np.random.seed(0)
    sort_keys = list(string.ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()

    for colname in string.ascii_lowercase[:3]:
        column = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "all":
            column[:] = np.nan
        elif nulls == "some":
            if num_rows > 0:
                null_rows = np.random.choice(
                    num_rows, size=int(num_rows / 4), replace=False
                )
            else:
                # Nothing to sample from; use an empty integer index.
                null_rows = np.array([], dtype="int64")
            column[null_rows] = np.nan
        pdf[colname] = column

    gdf = DataFrame.from_pandas(pdf)

    sort_kwargs = {"ascending": ascending, "na_position": na_position}
    got = gdf.sort_values(sort_keys, **sort_kwargs)
    expect = pdf.sort_values(sort_keys, **sort_kwargs)

    # Only the key columns are compared, with the index discarded.
    got_keys = got[sort_keys].reset_index(drop=True)
    expect_keys = expect[sort_keys].reset_index(drop=True)
    assert_eq(got_keys, expect_keys)
コード例 #6
0
ファイル: test_indexing.py プロジェクト: wphicks/cudf
def test_dataframe_loc_mask(mask, arg):
    """Check ``.loc[mask, arg]`` gives the same result in pandas and cuDF."""
    columns = {
        "a": ["a", "b", "c", "d", "e"],
        "b": ["f", "g", "h", "i", "j"],
    }
    pdf = pd.DataFrame(columns)
    gdf = DataFrame.from_pandas(pdf)

    expected = pdf.loc[mask, arg]
    actual = gdf.loc[mask, arg]
    assert_eq(expected, actual)
コード例 #7
0
def generate_inputs_from_categories(categories=None,
                                    n_samples=10,
                                    seed=5060,
                                    as_array=False):
    """Sample ``n_samples`` rows from per-column category lists.

    When ``categories`` is None a default mapping with the columns
    'strings' and 'integers' is used; with ``as_array`` the 'strings'
    column holds integers instead of text.

    Returns a pair ``(inputs, ary)`` where ``ary`` is the result of
    ``from_df_to_array`` on the sampled pandas frame and ``inputs`` is
    either ``cp.array(ary)`` (``as_array=True``) or a cuDF DataFrame built
    from the same pandas frame.
    """
    if categories is None:
        if as_array:
            default_strings = list(range(1000, 4000, 3))
        else:
            default_strings = ['Foo', 'Bar', 'Baz']
        categories = {
            'strings': default_strings,
            'integers': list(range(1000)),
        }

    rng = np.random.RandomState(seed)
    sampled = {}
    for name, choices in categories.items():
        sampled[name] = rng.choice(choices, n_samples)
    pandas_df = pd.DataFrame(sampled)

    ary = from_df_to_array(pandas_df)
    if as_array:
        return cp.array(ary), ary
    return DataFrame.from_pandas(pandas_df), ary
コード例 #8
0
    def test_cudf_metainfo(self):
        """Check meta info set from cuDF objects matches info set from NumPy.

        One ``DMatrix`` gets its fields from host arrays; the other gets
        the same values through ``set_interface_info``, first as cuDF
        DataFrames and then as cuDF Series.
        """
        from cudf import DataFrame as df
        import pandas as pd

        n = 100
        X = np.random.random((n, 2))
        dmat_cudf = xgb.DMatrix(X)
        dmat = xgb.DMatrix(X)
        floats = np.random.random(n)
        uints = np.array([4, 2, 8]).astype("uint32")
        cudf_floats = df.from_pandas(pd.DataFrame(floats))
        cudf_uints = df.from_pandas(pd.DataFrame(uints))

        float_fields = ('weight', 'label', 'base_margin')

        def assert_info_matches():
            # Compare every field that was set on both matrices.
            for field in float_fields:
                assert np.array_equal(dmat.get_float_info(field),
                                      dmat_cudf.get_float_info(field))
            assert np.array_equal(dmat.get_uint_info('group_ptr'),
                                  dmat_cudf.get_uint_info('group_ptr'))

        # Reference values, set from host arrays.
        for field in float_fields:
            dmat.set_float_info(field, floats)
        dmat.set_uint_info('group', uints)

        # Test setting info with cudf DataFrame
        for field in float_fields:
            dmat_cudf.set_interface_info(field, cudf_floats)
        dmat_cudf.set_interface_info('group', cudf_uints)
        assert_info_matches()

        # Test setting info with cudf Series
        float_series = cudf_floats[cudf_floats.columns[0]]
        uint_series = cudf_uints[cudf_uints.columns[0]]
        for field in float_fields:
            dmat_cudf.set_interface_info(field, float_series)
        dmat_cudf.set_interface_info('group', uint_series)
        assert_info_matches()
コード例 #9
0
ファイル: test_indexing.py プロジェクト: wenxiang-Li/cudf
def test_dataframe_apply_boolean_mask():
    """Boolean-list row selection matches between pandas and cuDF."""
    columns = {
        "a": [0, 1, 2, 3],
        "b": [0.1, 0.2, None, 0.3],
        "c": ["a", None, "b", "c"],
    }
    pdf = pd.DataFrame(columns)
    gdf = DataFrame.from_pandas(pdf)

    keep_rows = [True, False, True, False]
    assert_eq(pdf[keep_rows], gdf[keep_rows])
コード例 #10
0
ファイル: test_indexing.py プロジェクト: wenxiang-Li/cudf
def test_dataframe_boolean_mask_with_None():
    """Boolean-list masking matches pandas; a mask Series with None raises."""
    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    gdf = DataFrame.from_pandas(pdf)

    keep_rows = [True, False, True, False]
    expected = pdf[keep_rows]
    actual = gdf[keep_rows]
    assert_eq(expected, actual)

    # A mask Series containing None is expected to raise ValueError.
    with pytest.raises(ValueError):
        gdf[Series([True, False, None, False])]
コード例 #11
0
ファイル: test_sorting.py プロジェクト: TravisHester/cudf
def test_dataframe_sort_values_sliced(nelem, sliceobj):
    """Sorting column "a" of a sliced frame matches between cuDF and pandas."""
    np.random.seed(0)
    pdf = pd.DataFrame()
    pdf["a"] = np.random.random(nelem)

    sliced_pdf = pdf[sliceobj]
    expect = sliced_pdf["a"].sort_values()

    gdf = DataFrame.from_pandas(pdf)
    sliced_gdf = gdf[sliceobj]
    got = sliced_gdf["a"].sort_values()

    elementwise_equal = got.to_pandas() == expect
    assert elementwise_equal.all()
コード例 #12
0
ファイル: test_query.py プロジェクト: rongou/cudf
def test_query_empty_frames():
    """Querying an empty frame gives the same result in cuDF and pandas."""
    expr = "a > 2"
    empty_pdf = pd.DataFrame({"a": [], "b": []})
    empty_gdf = DataFrame.from_pandas(empty_pdf)

    # Run the same expression against both frames.
    queried_gdf = empty_gdf.query(expr)
    got = queried_gdf.to_pandas()
    expect = empty_pdf.query(expr)

    assert_eq(got, expect)
コード例 #13
0
ファイル: test_onehot.py プロジェクト: rongou/cudf
def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na):
    """Compare ``get_dummies`` with ``dummy_na`` between pandas and cuDF."""
    pdf = pd.DataFrame({"a": [0, 1, np.nan]})
    gdf = DataFrame.from_pandas(pdf, nan_as_null=nan_as_null)

    dummy_kwargs = {"dummy_na": dummy_na, "columns": ["a"]}
    expected = pd.get_dummies(pdf, **dummy_kwargs)
    got = cudf.get_dummies(gdf, **dummy_kwargs)

    if dummy_na and nan_as_null:
        # Rename cuDF's "a_null" column to "a_nan" and reorder the columns
        # to follow the pandas result before comparing.
        renamed = got.rename(columns={"a_null": "a_nan"})
        got = renamed[expected.columns]

    utils.assert_eq(expected, got)
コード例 #14
0
ファイル: test_indexing.py プロジェクト: wenxiang-Li/cudf
def test_sliced_indexing():
    """``.loc`` with a sliced index object matches between pandas and cuDF."""
    n_rows = 150
    columns = {
        "a": list(range(4, 4 + n_rows)),
        "b": list(range(n_rows)),
    }
    pdf = pd.DataFrame(columns)
    gdf = DataFrame.from_pandas(pdf)

    # Use column "a" as the index on both sides.
    pdf = pdf.set_index("a")
    gdf = gdf.set_index("a")

    # Select with the first 75 index entries of each frame.
    first_pidx = pdf.index[:75]
    first_gidx = gdf.index[:75]
    assert_eq(pdf.loc[first_pidx], gdf.loc[first_gidx])
コード例 #15
0
ファイル: test_pandas_interop.py プロジェクト: rongou/cudf
def test_from_pandas_ex1():
    """``from_pandas`` keeps column names and values, including a missing one."""
    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    gdf = DataFrame.from_pandas(pdf)

    assert tuple(gdf.columns) == tuple(pdf.columns)
    assert np.all(gdf["a"].to_numpy() == pdf["a"])

    b_matches = gdf["b"].to_numpy(na_value=np.nan) == pdf["b"]
    # The 3rd element is False because (nan == nan) is False.
    assert np.all(b_matches == [True, True, False, True])

    # The missing value is NaN on both sides.
    gdf_b = gdf["b"].to_numpy(na_value=np.nan)
    assert np.isnan(gdf_b[2])
    assert np.isnan(pdf["b"][2])
コード例 #16
0
ファイル: test_sorting.py プロジェクト: TravisHester/cudf
def test_dataframe_nsmallest_sliced(counts, sliceobj):
    """``nsmallest`` on a sliced frame matches between cuDF and pandas.

    ``counts`` unpacks to ``(nelem, n)``: the number of rows to generate
    and the number of smallest rows to request.
    """
    nelem, n = counts
    np.random.seed(0)
    pdf = pd.DataFrame()
    pdf["a"] = np.random.random(nelem)
    pdf["b"] = np.random.random(nelem)

    sliced_pdf = pdf[sliceobj]
    expect = sliced_pdf.nsmallest(n, "a")

    gdf = DataFrame.from_pandas(pdf)
    sliced_gdf = gdf[sliceobj]
    got = sliced_gdf.nsmallest(n, "a")

    # Reduce over rows, then over columns.
    elementwise_equal = got.to_pandas() == expect
    assert elementwise_equal.all().all()
コード例 #17
0
ファイル: test_pandas_interop.py プロジェクト: rongou/cudf
def test_from_pandas_with_index():
    """``from_pandas`` carries a custom index over along with the columns."""
    custom_index = np.asarray([4, 3, 2, 1])
    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    pdf = pdf.set_index(custom_index)
    gdf = DataFrame.from_pandas(pdf)

    # Columns, one at a time.
    assert_eq(gdf.a, pdf.a)
    assert_eq(gdf.b, pdf.b)
    # Index values.
    assert_eq(gdf.index.values, pdf.index.values)
    # Whole frames.
    assert_eq(gdf, pdf)
コード例 #18
0
ファイル: test_indexing.py プロジェクト: trevorsm7/cudf
def test_dataframe_loc(scalar, step):
    """Compare label-based ``.loc`` selection between cuDF and pandas.

    ``scalar`` is used as a single row label and ``step`` as the stride of
    the row slices.
    """
    size = 123
    pdf = pd.DataFrame(
        {
            "a": np.random.randint(low=0, high=100, size=size),
            "b": np.random.random(size).astype(np.float32),
            "c": np.random.random(size).astype(np.float64),
            "d": np.random.random(size).astype(np.float64),
        }
    )
    gdf = DataFrame.from_pandas(pdf)

    # Scalar label
    assert_eq(gdf.loc[scalar], pdf.loc[scalar])

    # Full slice
    every_row = slice(None)
    assert_eq(gdf.loc[every_row, "c"], pdf.loc[every_row, "c"])

    first, last = 110, 122
    stepped_rows = slice(first, last, step)
    plain_rows = slice(first, last)

    reordered = ["c", "d", "a"]
    assert_eq(
        gdf.loc[stepped_rows, reordered], pdf.loc[stepped_rows, reordered]
    )

    pair = ["c", "d"]
    assert_eq(gdf.loc[plain_rows, pair], pdf.loc[plain_rows, pair])

    # Slicing on columns:
    a_to_c = slice("a", "c")
    assert_eq(gdf.loc[stepped_rows, a_to_c], pdf.loc[stepped_rows, a_to_c])

    # Slicing of size 1:
    one_row = slice(first, first)
    assert_eq(gdf.loc[one_row, "a"], pdf.loc[one_row, "a"])

    # TODO: Pandas changes the dtype here when it shouldn't
    only_a = slice("a", "a")
    assert_eq(
        gdf.loc[first, only_a], pdf.loc[first, only_a], check_dtype=False
    )

    # Make int64 index
    offset = 50
    tail_rows = slice(offset, None)
    gdf_tail = gdf[tail_rows]
    pdf_tail = pdf[tail_rows]
    late_rows = slice(117, 122)
    assert_eq(
        gdf_tail.loc[late_rows, ["c", "d", "a"]],
        pdf_tail.loc[late_rows, ["c", "d", "a"]],
    )
コード例 #19
0
ファイル: test_query.py プロジェクト: rongou/cudf
def test_query_with_index_keyword(query, a_val, b_val, c_val):
    """Compare ``DataFrame.query(query)`` between cuDF and pandas.

    The same fixed three-column frame is queried on both sides and the
    results are compared with ``assert_eq``.

    NOTE(review): ``a_val``, ``b_val`` and ``c_val`` are accepted but never
    used in this body.
    """
    pdf = pd.DataFrame({
        "a": [1, None, 3, 4, 5],
        "b": [5, 4, 3, 2, 1],
        "c": [12, 15, 17, 19, 27],
    })
    # NOTE(review): the result of ``set_index`` is discarded and no
    # ``inplace=True`` is passed, so ``pdf`` keeps its default index here.
    # If the test is meant to query a frame indexed by "a", this should
    # probably be ``pdf = pdf.set_index("a")`` -- confirm against the
    # parametrized queries before changing it.
    pdf.set_index("a")

    gdf = DataFrame.from_pandas(pdf)

    out = gdf.query(query)
    expect = pdf.query(query)

    assert_eq(out, expect)
コード例 #20
0
ファイル: test_pandas_interop.py プロジェクト: rongou/cudf
def test_from_pandas():
    """``from_pandas`` preserves column names, dtypes and lengths."""
    pdf = pd.DataFrame()
    pdf["a"] = np.arange(10, dtype=np.int32)
    pdf["b"] = np.arange(10, 20, dtype=np.float64)

    gdf = DataFrame.from_pandas(pdf)

    assert tuple(gdf.columns) == tuple(pdf.columns)

    checked_columns = ("a", "b")
    for name in checked_columns:
        assert gdf[name].dtype == pdf[name].dtype

    for name in checked_columns:
        assert len(gdf[name]) == len(pdf[name])
コード例 #21
0
def test_dataframe_multi_column_nulls_multiple_ascending(
        ascending, na_position):
    """Sorting by ["a", "b"] with missing values matches pandas in cuDF."""
    columns = {
        "a": [3, 1, None, 2, 2, None, 1],
        "b": [1, 2, 3, 4, 5, 6, 7],
    }
    pdf = pd.DataFrame(columns)
    gdf = DataFrame.from_pandas(pdf)

    # Same keyword arguments for both libraries.
    sort_kwargs = {
        "by": ["a", "b"],
        "ascending": ascending,
        "na_position": na_position,
    }
    expect = pdf.sort_values(**sort_kwargs)
    actual = gdf.sort_values(**sort_kwargs)

    assert_eq(actual, expect)
コード例 #22
0
def test_rank_error_arguments(pdf):
    """Invalid ``rank`` arguments are checked on both pandas and cuDF.

    Two invalid keyword sets are tried on column "col1": an unknown
    ``method`` and an unknown ``na_option``. Each is handed to
    ``assert_exceptions_equal`` together with the pandas and cuDF ``rank``
    methods.
    """
    gdf = DataFrame.from_pandas(pdf)

    invalid_kwargs = (
        {
            "method": "randomname",
            "na_option": "keep",
            "ascending": True,
            "pct": True,
        },
        {
            "method": "first",
            "na_option": "randomname",
            "ascending": True,
            "pct": True,
        },
    )

    for bad_kwargs in invalid_kwargs:
        # Each side gets its own list and dict, as in separate literals.
        assert_exceptions_equal(
            lfunc=pdf["col1"].rank,
            rfunc=gdf["col1"].rank,
            lfunc_args_and_kwargs=([], dict(bad_kwargs)),
            rfunc_args_and_kwargs=([], dict(bad_kwargs)),
        )
コード例 #23
0
def test_dataframe_multi_column(num_cols, num_rows, dtype, ascending,
                                na_position):
    """Compare multi-column ``sort_values`` between cuDF and pandas.

    Five columns ("a" to "e") are generated; the first ``num_cols``
    letters are used as the sort keys.
    """
    np.random.seed(0)
    sort_keys = list(string.ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()

    for colname in string.ascii_lowercase[:5]:
        pdf[colname] = np.random.randint(0, 26, num_rows).astype(dtype)

    gdf = DataFrame.from_pandas(pdf)

    sort_kwargs = {"ascending": ascending, "na_position": na_position}
    got = gdf.sort_values(sort_keys, **sort_kwargs)
    expect = pdf.sort_values(sort_keys, **sort_kwargs)

    # Only the key columns are compared, with the index discarded.
    got_keys = got[sort_keys].reset_index(drop=True)
    expect_keys = expect[sort_keys].reset_index(drop=True)
    assert_eq(got_keys, expect_keys)
コード例 #24
0
ファイル: test_query.py プロジェクト: rongou/cudf
def test_query_splitted_combine():
    """Querying two halves and concatenating equals querying the whole frame."""
    np.random.seed(0)
    pdf = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10),
        "y": np.random.normal(size=10)
    })
    gdf = DataFrame.from_pandas(pdf)
    expr = "x > 2"

    # Split the cuDF frame, query each half, then stitch the results.
    first_half, second_half = gdf[:5], gdf[5:]
    filtered_halves = [first_half.query(expr), second_half.query(expr)]
    got = cudf.concat(filtered_halves).to_pandas()

    # Reference: the same query on the unsplit frame.
    expect = gdf.query(expr).to_pandas()
    assert_eq(got, expect, check_index_type=True)
コード例 #25
0
def test_issue_165():
    """Date equality works for datetime, Timestamp and datetime64 values.

    For each representation of the same start date, both ``query`` and a
    direct ``==`` comparison are checked against pandas and must match at
    least one row.
    """
    start_date = dt.datetime.strptime("2000-10-21", "%Y-%m-%d")
    dates = [start_date + dt.timedelta(days=offset) for offset in range(6)]

    df_pandas = pd.DataFrame()
    df_pandas["dates"] = dates
    df_pandas["num"] = [1, 2, 3, 4, 5, 6]
    df_cudf = DataFrame.from_pandas(df_pandas)

    # The query strings refer to these locals by name through '@', so the
    # names must exist in this function's scope and must not be renamed.
    start_date_ts = pd.Timestamp(start_date)
    start_date_np = np.datetime64(start_date_ts, "ns")

    cases = (
        ("dates==@start_date", start_date),
        ("dates==@start_date_ts", start_date_ts),
        ("dates==@start_date_np", start_date_np),
    )
    # A plain loop keeps the query calls in this function's own frame.
    for expr, needle in cases:
        base = df_pandas.query(expr)
        test = df_cudf.query(expr)
        assert_eq(base, test)
        assert len(test) > 0

        mask = df_cudf.dates == needle
        base_mask = df_pandas.dates == needle
        assert_eq(mask, base_mask, check_names=False)
        assert mask.to_pandas().sum() > 0
コード例 #26
0
ファイル: test_rank.py プロジェクト: rongou/cudf
def test_rank_all_arguments(pdf, dtype, ascending, method, na_option, pct,
                            numeric_only):
    """Compare ``rank`` between cuDF and pandas for one argument combination.

    Series ranks are checked for "col1" and "col2". The frame-level
    expectation is assembled column by column from pandas Series ranks.
    """
    if method == "first" and dtype == "O":
        # not supported by pandas
        return

    pdf = pdf.copy(deep=True)  # for parallel pytest
    if numeric_only:
        # Add a string column alongside the numeric ones.
        pdf["str"] = np.array(
            ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"])
    gdf = DataFrame.from_pandas(pdf)

    kwargs = {
        "method": method,
        "na_option": na_option,
        "ascending": ascending,
        "pct": pct,
        "numeric_only": numeric_only,
    }

    # Series
    for name in ("col1", "col2"):
        assert_eq(gdf[name].rank(**kwargs), pdf[name].rank(**kwargs))

    if numeric_only:
        pandas_str_rank = pdf["str"].rank(**kwargs)
        cudf_str_rank = gdf["str"].rank(**kwargs)
        assert pandas_str_rank.empty == cudf_str_rank.empty
        expected = pdf.select_dtypes(include=np.number)
    else:
        expected = pdf.copy(deep=True)

    # TODO: Remove per column iteration once the
    # following issue is fixed :
    # https://github.com/pandas-dev/pandas/issues/43310
    for name in expected.columns:
        expected[name] = pdf[name].rank(**kwargs)

    actual = gdf.rank(**kwargs)
    assert_eq(expected, actual)
コード例 #27
0
def test_rank_all_arguments(pdf, dtype, ascending, method, na_option, pct,
                            numeric_only):
    """Compare ``rank`` between cuDF and pandas for one argument combination.

    Series ranks are checked for "col1" and "col2". The frame-level
    comparison is expected to pass for only one argument combination and
    to raise ``AssertionError`` otherwise (see the linked pandas issue).
    """
    if method == "first" and dtype == "O":
        # not supported by pandas
        return

    pdf = pdf.copy(deep=True)  # for parallel pytest
    if numeric_only:
        # Add a string column alongside the numeric ones.
        pdf["str"] = np.array(
            ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"])
    gdf = DataFrame.from_pandas(pdf)

    kwargs = {
        "method": method,
        "na_option": na_option,
        "ascending": ascending,
        "pct": pct,
        "numeric_only": numeric_only,
    }

    # Series
    for name in ("col1", "col2"):
        assert_eq(gdf[name].rank(**kwargs), pdf[name].rank(**kwargs))

    if numeric_only:
        pandas_str_rank = pdf["str"].rank(**kwargs)
        cudf_str_rank = gdf["str"].rank(**kwargs)
        assert pandas_str_rank.empty == cudf_str_rank.empty

    # TODO: https://github.com/pandas-dev/pandas/issues/32593
    # Dataframe (bug in pandas)
    frames_expected_to_match = (na_option == "top" and method == "first"
                                and not dtype == "O" and ascending)
    if frames_expected_to_match:
        assert_eq(gdf.rank(**kwargs), pdf.rank(**kwargs))
    else:
        with pytest.raises(AssertionError, match="values are different"):
            assert_eq(gdf.rank(**kwargs), pdf.rank(**kwargs))
コード例 #28
0
ファイル: multiindex.py プロジェクト: zhuohuwu0603/cudf
    def __init__(self,
                 levels=None,
                 codes=None,
                 labels=None,
                 names=None,
                 **kwargs):
        """Build the index either from ``source_data`` or from levels/codes.

        Parameters
        ----------
        levels : sequence, optional
            One collection of values per level; each is wrapped in a
            ``Series``.
        codes : DataFrame or sequence of sequences, optional
            Per-level codes. Used as-is when it is a DataFrame, otherwise
            each entry becomes an int64 column.
        labels : optional
            Deprecated alias for ``codes``; emits a ``FutureWarning`` and
            is used only when ``codes`` is falsy.
        names : optional
            Level names. Also used as the column names of the code frame
            unless more than one entry is None.
        **kwargs
            ``source_data`` (a cuDF or pandas DataFrame) switches to the
            early-return path below; ``nan_as_null`` is forwarded to
            ``DataFrame.from_pandas`` on that path.

        Raises
        ------
        ValueError
            If ``levels`` is empty, or the number of levels and codes
            differ.
        TypeError
            If ``codes`` is neither a DataFrame nor a sequence of
            sequences.
        """
        # Imported here rather than at module level
        # (presumably to avoid a circular import -- TODO confirm).
        from cudf.core.series import Series
        from cudf import DataFrame

        super().__init__()

        self._name = None

        column_names = []
        # NOTE(review): truthiness test; an array-like ``labels`` may not
        # support ``bool()`` -- confirm the accepted input types.
        if labels:
            warnings.warn(
                "the 'labels' keyword is deprecated, use 'codes' "
                "instead",
                FutureWarning,
            )
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if "source_data" in kwargs:
            # Shallow copy so that resetting the index and renaming the
            # columns below do not touch the caller's frame object.
            source_data = kwargs["source_data"].copy(deep=False)
            source_data.reset_index(drop=True, inplace=True)

            if isinstance(source_data, pd.DataFrame):
                nan_as_null = kwargs.get("nan_as_null", None)
                source_data = DataFrame.from_pandas(source_data,
                                                    nan_as_null=nan_as_null)
            names = names if names is not None else source_data._data.names
            # if names are unique
            # try using those as the source_data column names:
            if len(dict.fromkeys(names)) == len(names):
                source_data.columns = names
            self._data = source_data._data
            self.names = names
            # ``codes`` and ``levels`` are stored as passed (possibly None).
            self._codes = codes
            self._levels = levels
            return

        # name setup
        if isinstance(
                names,
            (
                Sequence,
                pd.core.indexes.frozen.FrozenNDArray,
                pd.core.indexes.frozen.FrozenList,
            ),
        ):
            # With more than one unnamed level, use positions as the
            # column names instead.
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        # NOTE(review): ``levels`` defaults to None, in which case
        # ``len(levels)`` raises TypeError rather than the ValueError below.
        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        if not isinstance(codes, DataFrame) and not isinstance(
                codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)):
            raise TypeError("Codes is not a Sequence of sequences")

        if isinstance(codes, DataFrame):
            self._codes = codes
        elif len(levels) == len(codes):
            self._codes = DataFrame()
            # NOTE(review): the loop variable rebinds ``codes``. The
            # iterator was created from the original object, so the loop
            # still visits every level, but ``codes`` no longer refers to
            # the argument afterwards.
            for i, codes in enumerate(codes):
                # Falsy names (None, 0, "") fall back to the position.
                name = column_names[i] or i
                codes = column.as_column(codes)
                self._codes[name] = codes.astype(np.int64)
        else:
            raise ValueError("MultiIndex has unequal number of levels and "
                             "codes and is inconsistent!")

        self._levels = [Series(level) for level in levels]
        self._validate_levels_and_codes(self._levels, self._codes)

        # Build the source data: for every level, gather the level values
        # selected by that level's codes.
        source_data = DataFrame()
        for i, name in enumerate(self._codes.columns):
            codes = as_index(self._codes[name]._column)
            if -1 in self._codes[name].values:
                # Must account for null(s) in _source_data column
                # The level frame gets a leading None row and an index
                # that starts at -1.
                level = DataFrame(
                    {name: [None] + list(self._levels[i])},
                    index=range(-1, len(self._levels[i])),
                )
            else:
                level = DataFrame({name: self._levels[i]})

            # NOTE(review): this import runs on every loop iteration.
            import cudf._lib as libcudf

            source_data[name] = libcudf.copying.gather(
                level, codes._data.columns[0])._data[name]

        self._data = source_data._data
        self.names = names
コード例 #29
0
ファイル: test_indexing.py プロジェクト: wenxiang-Li/cudf
def test_dataframe_loc_duplicate_index_scalar():
    """``.loc`` with a scalar label that occurs twice in the index matches pandas."""
    index_with_duplicates = [1, 2, 1, 4, 2]
    pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=index_with_duplicates)
    gdf = DataFrame.from_pandas(pdf)

    expected = pdf.loc[2]
    actual = gdf.loc[2]
    assert_eq(expected, actual)
コード例 #30
0
ファイル: test_indexing.py プロジェクト: trevorsm7/cudf
def test_dataframe_boolean_mask_with_None():
    """Boolean-list masking of a frame with a missing value matches pandas."""
    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    gdf = DataFrame.from_pandas(pdf)

    keep_rows = [True, False, True, False]
    expected = pdf[keep_rows]
    actual = gdf[keep_rows]
    assert_eq(expected, actual)