Example 1
def test_masked_encode(client):
    n_workers = len(client.has_what())
    df = cudf.DataFrame({
        "filter_col": [1, 1, 2, 3, 1, 1, 1, 1, 6, 5],
        "cat_col": ['a', 'b', 'c', 'd', 'a', 'a', 'a', 'c', 'b', 'c']
    })
    ddf = dask_cudf.from_cudf(df, npartitions=n_workers)

    ddf_filter = ddf[ddf["filter_col"] == 1]
    filter_encoded = LabelEncoder().fit_transform(ddf_filter["cat_col"])
    ddf_filter = ddf_filter.assign(filter_encoded=filter_encoded.values)

    encoded_filter = LabelEncoder().fit_transform(ddf["cat_col"])
    ddf = ddf.assign(encoded_filter=encoded_filter.values)

    ddf = ddf[ddf.filter_col == 1]

    assert (ddf.encoded_filter == ddf_filter.filter_encoded).compute().all()
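Example 1 receives a client fixture and sizes the partition count from client.has_what(). A minimal sketch, assuming dask_cuda and dask.distributed are installed, of the kind of fixture such a test typically relies on (the fixture name and cluster options here are illustrative, not taken from the original suite):

import pytest
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

@pytest.fixture
def client():
    # One worker per visible GPU; has_what() then reports one entry per worker.
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)
    yield client
    client.close()
    cluster.close()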
Example 2
def _prep_training_data(c,
                        X_train,
                        partitions_per_worker,
                        reverse_order=False):
    workers = c.has_what().keys()

    if reverse_order:
        workers = list(workers)[::-1]

    n_partitions = partitions_per_worker * len(workers)

    X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))

    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)
    X_train_df, = dask_utils.persist_across_workers(c, [X_train_df],
                                                    workers=list(workers))

    return X_train_df
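A hedged usage sketch for _prep_training_data: it expects a live distributed client plus a host-side feature matrix, and returns a dask_cudf DataFrame persisted across the workers. The cluster setup and array shape below are assumptions for illustration only (the helper itself also needs cudf, pandas, dask_cudf and cuml's dask_utils imported, as in its original module):

import numpy as np
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

with LocalCUDACluster(threads_per_worker=1) as cluster, Client(cluster) as c:
    X_train = np.random.rand(1000, 20)
    X_train_df = _prep_training_data(c, X_train, partitions_per_worker=2)
    # npartitions equals partitions_per_worker * number of workers
    print(X_train_df.npartitions)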
Example 3
def test_mixing_series_frame_error():
    nelem = 20

    df = gd.DataFrame()
    df["x"] = np.arange(nelem)
    df["y"] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_cudf(df, npartitions=5)

    delay_frame = ddf.to_delayed()
    delay_series = ddf.x.to_delayed()
    combined = dgd.from_delayed(delay_frame + delay_series)

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"Expected partition of type `DataFrame` but got `Series`")
Example 4
def test_groupby_std(func):
    pdf = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10000),
        "y": np.random.normal(size=10000),
    })

    gdf = cudf.DataFrame.from_pandas(pdf)

    ddf = dask_cudf.from_cudf(gdf, npartitions=5)

    a = func(gdf.to_pandas())
    b = func(ddf).compute().to_pandas()

    a.index.name = None
    a.name = None
    b.index.name = None

    dd.assert_eq(a, b)
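The bare func argument suggests the test is parameterized upstream. A plausible decorator, written here as an assumption rather than a copy of the original source, would pass callables that work on both the pandas and the dask_cudf frame:

import pytest

@pytest.mark.parametrize(
    "func",
    [
        lambda df: df.groupby("x").std(),
        lambda df: df.groupby("x").y.std(),
    ],
)
def test_groupby_std(func):
    ...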
Example 5
def test_dataframe_assign_col():
    df = cudf.DataFrame(list(range(100)))
    pdf = pd.DataFrame(list(range(100)))

    ddf = dgd.from_cudf(df, npartitions=4)
    ddf["fold"] = 0
    ddf["fold"] = ddf["fold"].map_partitions(
        lambda cudf_df: cp.random.randint(0, 4, len(cudf_df))
    )

    pddf = dd.from_pandas(pdf, npartitions=4)
    pddf["fold"] = 0
    pddf["fold"] = pddf["fold"].map_partitions(
        lambda p_df: np.random.randint(0, 4, len(p_df))
    )

    dd.assert_eq(ddf[0], pddf[0])
    dd.assert_eq(len(ddf["fold"]), len(pddf["fold"]))
Example 6
def test_query():
    np.random.seed(0)

    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10),
        "y": np.random.normal(size=10)
    })
    gdf = gd.DataFrame.from_pandas(df)
    expr = "x > 2"

    assert_frame_equal(gdf.query(expr).to_pandas(), df.query(expr))

    queried = dgd.from_cudf(gdf, npartitions=2).query(expr)

    got = queried.compute().to_pandas()
    expect = gdf.query(expr).to_pandas()

    assert_frame_equal(got, expect)
Example 7
def test_set_index_w_series():
    with dask.config.set(scheduler="single-threaded"):
        nelem = 20
        np.random.seed(0)
        df = pd.DataFrame({
            "x":
            100 + np.random.randint(0, nelem // 2, size=nelem),
            "y":
            np.random.normal(size=nelem),
        })
        expect = df.set_index(df.x).sort_index()

        dgf = dgd.from_cudf(gd.DataFrame.from_pandas(df), npartitions=4)
        res = dgf.set_index(dgf.x)  # sort by default
        got = res.compute().to_pandas()

        assert set(expect.columns) == set(got.columns)
        assert_frame_equal_by_index_group(expect, got)
Example 8
def test_target_encode_multi(tmpdir, npartitions):

    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({
        "cat": cat_1,
        "cat2": cat_2,
        "num": num_1,
        "num_2": num_2
    })
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]
    te_features = cat_groups >> ops.TargetEncoding(["num", "num_2"],
                                                   out_path=str(tmpdir),
                                                   kfold=1,
                                                   p_smooth=5,
                                                   out_dtype="float32")

    workflow = nvt.Workflow(te_features)

    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values,
              df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0],
                        num_1.mean(),
                        abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0],
                        num_2.mean(),
                        abs_tol=1e-3)
Example 9
def create_tables(bc, dir_data_lc, fileSchemaType, **kwargs):
    ext = get_extension(fileSchemaType)

    tables = kwargs.get('tables', tpchTables)
    bool_orders_index = kwargs.get('bool_orders_index', -1)

    dir_data_lc = dir_data_lc + "tpch/"

    for i, table in enumerate(tables):
        # using wildcard, note the _ after the table name (it will avoid collisions)
        table_files = ("%s/%s_[0-9]*.%s") % (dir_data_lc, table, ext)
        t = None
        if fileSchemaType == DataType.CSV:
            bool_orders_flag = False

            if i == bool_orders_index:
                bool_orders_flag = True

            dtypes = get_dtypes(table, bool_orders_flag)
            col_names = get_column_names(table, bool_orders_flag)
            t = bc.create_table(table,
                                table_files,
                                delimiter='|',
                                dtype=dtypes,
                                names=col_names)
        elif fileSchemaType == DataType.CUDF:
            bool_column = bool_orders_index != -1
            gdf = read_data(table, dir_data_lc, bool_column)
            t = bc.create_table(table, gdf)
        elif fileSchemaType == DataType.DASK_CUDF:
            bool_column = bool_orders_index != -1
            gdf = read_data(table, dir_data_lc, bool_column)
            nRals = Settings.data['RunSettings']['nRals']
            num_partitions = nRals
            ds = dask_cudf.from_cudf(gdf, npartitions=num_partitions)
            t = bc.create_table(table, ds)
        # elif fileSchemaType == DataType.DASK_CUDF:
        #     bool_column = bool_orders_index != -1
        #     table_files = ("%s/%s_[0-9]*.%s") % (dir_data_lc, table, 'parquet')
        #     dask_df = dask_cudf.read_parquet(table_files)
        #     dask_df = bc.unify_partitions(dask_df)
        #     t = bc.create_table(table, dask_df)
        else:
            t = bc.create_table(table, table_files)
Example 10
def test_target_encode_multi(tmpdir, npartitions):

    cat_1 = np.asarray(["baaaa"] * 12)
    cat_2 = np.asarray(["baaaa"] * 6 + ["bbaaa"] * 3 + ["bcaaa"] * 3)
    num_1 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4])
    num_2 = np.asarray([1, 1, 2, 2, 2, 1, 1, 5, 4, 4, 4, 4]) * 2
    df = cudf.DataFrame({"cat": cat_1, "cat2": cat_2, "num": num_1, "num_2": num_2})
    df = dask_cudf.from_cudf(df, npartitions=npartitions)

    cat_names = ["cat", "cat2"]
    cont_names = ["num", "num_2"]
    label_name = []
    processor = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=label_name)

    cat_groups = ["cat", "cat2", ["cat", "cat2"]]

    processor.add_preprocess(
        ops.TargetEncoding(
            cat_groups,
            ["num", "num_2"],  # cont_target
            out_path=str(tmpdir),
            kfold=1,
            p_smooth=5,
            out_dtype="float32",
        )
    )
    processor.finalize()
    processor.apply(nvt.Dataset(df), output_format=None)
    df_out = processor.get_ddf().compute(scheduler="synchronous")

    assert "TE_cat_cat2_num" in df_out.columns
    assert "TE_cat_num" in df_out.columns
    assert "TE_cat2_num" in df_out.columns
    assert "TE_cat_cat2_num_2" in df_out.columns
    assert "TE_cat_num_2" in df_out.columns
    assert "TE_cat2_num_2" in df_out.columns

    assert_eq(df_out["TE_cat2_num"].values, df_out["TE_cat_cat2_num"].values)
    assert_eq(df_out["TE_cat2_num_2"].values, df_out["TE_cat_cat2_num_2"].values)
    assert df_out["TE_cat_num"].iloc[0] != df_out["TE_cat2_num"].iloc[0]
    assert df_out["TE_cat_num_2"].iloc[0] != df_out["TE_cat2_num_2"].iloc[0]
    assert math.isclose(df_out["TE_cat_num"].iloc[0], num_1.mean(), abs_tol=1e-4)
    assert math.isclose(df_out["TE_cat_num_2"].iloc[0], num_2.mean(), abs_tol=1e-3)
Example 11
    def __init__(self,
                 input,
                 fileType,
                 files=None,
                 datasource=[],
                 calcite_to_file_indices=None,
                 num_row_groups=None,
                 args={},
                 convert_gdf_to_dask=False,
                 convert_gdf_to_dask_partitions=1,
                 client=None,
                 uri_values=[],
                 in_file=[],
                 force_conversion=False):
        self.fileType = fileType
        if fileType == DataType.ARROW:
            if force_conversion:
                #converts to cudf for querying
                self.input = cudf.DataFrame.from_arrow(input)
                self.fileType = DataType.CUDF
            else:
                self.input = cudf.DataFrame.from_arrow(
                    input.schema.empty_table())
                self.arrow_table = input
        else:
            self.input = input

        self.calcite_to_file_indices = calcite_to_file_indices
        self.files = files

        self.datasource = datasource
        self.num_row_groups = num_row_groups

        self.args = args
        if fileType == DataType.CUDF or fileType == DataType.DASK_CUDF:
            if (convert_gdf_to_dask
                    and isinstance(self.input, cudf.DataFrame)):
                self.input = dask_cudf.from_cudf(
                    self.input, npartitions=convert_gdf_to_dask_partitions)
            if (isinstance(self.input, dask_cudf.core.DataFrame)):
                self.dask_mapping = getNodePartitions(self.input, client)
        self.uri_values = uri_values
        self.in_file = in_file
Example 12
def test_groupby_multiindex_reset_index(npartitions):
    df = cudf.DataFrame({
        "a": [1, 1, 2, 3, 4],
        "b": [5, 2, 1, 2, 5],
        "c": [1, 2, 2, 3, 5]
    })
    ddf = dask_cudf.from_cudf(df, npartitions=npartitions)
    pddf = dd.from_pandas(df.to_pandas(), npartitions=npartitions)
    gr = ddf.groupby(["a", "c"]).agg({"b": ["count"]}).reset_index()
    pr = pddf.groupby(["a", "c"]).agg({"b": ["count"]}).reset_index()

    # CuDF uses "int32" for count. Pandas uses "int64"
    gr_out = gr.compute().sort_values(by=["a", "c"]).reset_index(drop=True)
    gr_out[("b", "count")] = gr_out[("b", "count")].astype("int64")

    dd.assert_eq(
        gr_out,
        pr.compute().sort_values(by=["a", "c"]).reset_index(drop=True),
    )
Example 13
def test_multicolumn_groupby():
    import cudf, dask_cudf

    tmp_df = cudf.DataFrame()
    tmp_df['id'] = [0, 0, 1, 2, 2, 2]
    tmp_df['val1'] = [0, 1, 0, 0, 1, 2]
    tmp_df['val2'] = [9, 9, 9, 9, 9, 9]

    ddf = dask_cudf.from_cudf(tmp_df, npartitions=2)

    actual = ddf.groupby(['id', 'val1']).count().compute()

    # FIXME: this is not idiomatic cudf!
    expectedVals = [1, 1, 1, 1, 1, 1]
    expected = cudf.DataFrame()
    expected['val'] = expectedVals

    assert False not in (
        expected.to_pandas().values == actual.to_pandas().values)
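The FIXME in Example 13 flags the hand-built expected frame as non-idiomatic. A sketch of a more conventional check, comparing against the pandas groupby result instead (my assumption, not part of the original test; the astype accounts for the int32 vs int64 count dtype difference noted in Example 12):

import pandas as pd

pdf = tmp_df.to_pandas()
expected_pd = pdf.groupby(['id', 'val1']).count()
got_pd = actual.to_pandas().astype('int64').sort_index()
pd.testing.assert_frame_equal(got_pd, expected_pd.sort_index())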
Example 14
def get_clusters(client, ml_input_df):
    import dask_cudf

    ml_tasks = [
        delayed(train_clustering_model)(df, N_CLUSTERS, CLUSTER_ITERATIONS,
                                        N_ITER)
        for df in ml_input_df.to_delayed()
    ]
    results_dict = client.compute(*ml_tasks, sync=True)

    output = ml_input_df.index.to_frame().reset_index(drop=True)

    labels_final = dask_cudf.from_cudf(results_dict["cid_labels"],
                                       npartitions=output.npartitions)
    output["label"] = labels_final.reset_index()[0]

    # Based on CDH6.1 q25-result formatting
    results_dict["cid_labels"] = output
    return results_dict
Example 15
def test_get_dummies_large():
    gdf = cudf.datasets.randomdata(
        nrows=200000,
        dtypes={
            "C": int,
            "first": "category",
            "b": float,
            "second": "category",
        },
    )
    df = gdf.to_pandas()
    ddf = dd.from_pandas(df, npartitions=25)
    dd.assert_eq(dd.get_dummies(ddf).compute(), pd.get_dummies(df))
    gddf = dask_cudf.from_cudf(gdf, npartitions=25)
    dd.assert_eq(
        dd.get_dummies(ddf).compute(),
        dd.get_dummies(gddf).compute(),
        check_dtype=False,
    )
Example 16
def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples):
    X, ary = generate_inputs_from_categories(n_samples=n_samples,
                                             as_array=as_array)
    if as_array:
        dX = da.from_array(X)
    else:
        dX = dask_cudf.from_cudf(X, npartitions=1)

    enc = OneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    sk_enc = SkOneHotEncoder(sparse=sparse, drop=drop, categories='auto')
    ohe = enc.fit_transform(dX)
    ref = sk_enc.fit_transform(ary)
    if sparse:
        cp.testing.assert_array_equal(ohe.compute().toarray(), ref.toarray())
    else:
        cp.testing.assert_array_equal(ohe.compute(), ref)

    inv_ohe = enc.inverse_transform(ohe)
    assert_inverse_equal(inv_ohe.compute(), dX.compute())
Example 17
def test_groupby_reset_index_names():
    df = cudf.datasets.randomdata(nrows=10,
                                  dtypes={
                                      "a": str,
                                      "b": int,
                                      "c": int
                                  })
    pdf = df.to_pandas()

    gddf = dask_cudf.from_cudf(df, 2)
    pddf = dd.from_pandas(pdf, 2)

    g_res = gddf.groupby("a", sort=True).sum()
    p_res = pddf.groupby("a", sort=True).sum()

    got = g_res.reset_index().compute().sort_values(["a", "b", "c"])
    expect = p_res.reset_index().compute().sort_values(["a", "b", "c"])

    dd.assert_eq(got, expect)
Example 18
def test_target_encode(tmpdir, cat_groups, kfold, fold_seed, cpu):
    df = cudf.DataFrame({
        "Author": list(string.ascii_uppercase),
        "Engaging-User": list(string.ascii_lowercase),
        "Cost": range(26),
        "Post": [0, 1] * 13,
    })
    if cpu:
        df = dd.from_pandas(df.to_pandas(), npartitions=3)
    else:
        df = dask_cudf.from_cudf(df, npartitions=3)

    cont_names = ["Cost"]
    te_features = cat_groups >> ops.TargetEncoding(
        cont_names,
        out_path=str(tmpdir),
        kfold=kfold,
        out_dtype="float32",
        fold_seed=fold_seed,
        drop_folds=False,  # Keep folds to validate
    )

    cont_features = cont_names >> ops.FillMissing() >> ops.Clip(
        min_value=0) >> ops.LogOp()
    workflow = nvt.Workflow(te_features + cont_features +
                            ["Author", "Engaging-User"])
    df_out = workflow.fit_transform(
        nvt.Dataset(df)).to_ddf().compute(scheduler="synchronous")

    if kfold > 1:
        # Cat columns are unique.
        # Make sure __fold__ mapping is correct
        if cat_groups == "Author":
            name = "__fold___Author"
            cols = ["__fold__", "Author"]
        else:
            name = "__fold___Author_Engaging-User"
            cols = ["__fold__", "Author", "Engaging-User"]
        check = cudf.io.read_parquet(te_features.op.stats[name])
        check = check[cols].sort_values(cols).reset_index(drop=True)
        df_out_check = df_out[cols].sort_values(cols).reset_index(drop=True)
        assert_eq(check, df_out_check)
Example 19
def test_groupby_split_out(split_out, column):
    df = pd.DataFrame({
        "a":
        np.arange(8),
        "b": [1, 0, 0, 2, 1, 1, 2, 0],
        "c": [0, 1] * 4,
        "d": ["dog", "cat", "cat", "dog", "dog", "dog", "cat", "bird"],
    })
    df["e"] = df["d"].astype("category")
    gdf = cudf.from_pandas(df)

    ddf = dd.from_pandas(df, npartitions=3)
    gddf = dask_cudf.from_cudf(gdf, npartitions=3)

    ddf_result = (ddf.groupby(column).a.mean(
        split_out=split_out).compute().sort_values().dropna())
    gddf_result = (gddf.groupby(column).a.mean(
        split_out=split_out).compute().sort_values())

    dd.assert_eq(gddf_result, ddf_result, check_index=False)
Example 20
def test_sort_values_binned():
    np.random.seed(43)
    nelem = 100
    nparts = 5
    by = 'a'
    df = gd.DataFrame()
    df['a'] = np.random.randint(1, 5, nelem)
    ddf = dgd.from_cudf(df, npartitions=nparts)

    parts = ddf.sort_values_binned(by=by).to_delayed()
    part_uniques = []
    for i, p in enumerate(parts):
        part = dask.compute(p)[0]
        part_uniques.append(set(part.a.unique()))

    # Partitions do not have intersecting keys
    for i in range(len(part_uniques)):
        for j in range(i + 1, len(part_uniques)):
            assert not (part_uniques[i] & part_uniques[j]), \
                    "should have empty intersection"
Example 21
def test_workflow_transform_ddf_dtypes():
    # Initial Dataset
    df = cudf.datasets.timeseries().reset_index()
    ddf = dask_cudf.from_cudf(df, npartitions=2)
    dataset = Dataset(ddf)

    # Create and Execute Workflow
    cols = ["name", "x", "y", "timestamp"]
    cat_cols = ["id"] >> ops.Normalize()
    workflow = Workflow(cols + cat_cols)
    workflow.fit(dataset)
    transformed_ddf = workflow.transform(dataset).to_ddf()

    # no transforms on the pass through cols, should have original dtypes
    for col in cols:
        assert_eq(ddf.dtypes[col], transformed_ddf.dtypes[col])

    # Followup dask-cudf sorting used to throw an exception because of dtype issues,
    # check that it works now
    transformed_ddf.sort_values(["id", "timestamp"]).compute()
Example 22
def test_groupby_reset_index_string_name():
    df = cudf.DataFrame({"value": range(5), "key": ["a", "a", "b", "a", "c"]})
    pdf = df.to_pandas()

    gddf = dask_cudf.from_cudf(df, npartitions=1)
    pddf = dd.from_pandas(pdf, npartitions=1)

    g_res = (gddf.groupby(["key"]).agg({
        "value": "mean"
    }).reset_index(drop=False))
    p_res = (pddf.groupby(["key"]).agg({
        "value": "mean"
    }).reset_index(drop=False))

    got = g_res.compute().sort_values(["key", "value"]).reset_index(drop=True)
    expect = (p_res.compute().sort_values(["key",
                                           "value"]).reset_index(drop=True))

    dd.assert_eq(got, expect)
    assert len(g_res) == len(p_res)
Example 23
def test_to_sp_dask_array(input_type, nrows, ncols, cluster):

    c = Client(cluster)

    try:

        from cuml.dask.common import to_sp_dask_array

        a = cp.sparse.random(nrows, ncols, format='csr', dtype=cp.float32)
        if input_type == "dask_dataframe":
            df = cudf.DataFrame.from_gpu_matrix(a.todense())
            inp = dask_cudf.from_cudf(df, npartitions=2)
        elif input_type == "dask_array":
            inp = dask.array.from_array(a.todense().get())
        elif input_type == "dataframe":
            inp = cudf.DataFrame.from_gpu_matrix(a.todense())
        elif input_type == "scipysparse":
            inp = a.get()
        elif input_type == "cupysparse":
            inp = a
        elif input_type == "numpy":
            inp = a.get().todense()
        elif input_type == "cupy":
            inp = a.todense()

        arr = to_sp_dask_array(inp, c)
        arr.compute_chunk_sizes()

        assert arr.shape == (nrows, ncols)

        # We can't call compute directly on this array when it has
        # multiple partitions, so we manually concatenate the pieces.
        parts = c.sync(extract_arr_partitions, arr)
        local_parts = cp.vstack([part[1].result().todense()
                                 for part in parts]).get()

        assert array_equal(a.todense().get(), local_parts)

    finally:
        c.close()
Example 24
def test_roundtrip_from_dask_partitioned(tmpdir, parts, daskcudf, metadata):
    tmpdir = str(tmpdir)

    df = pd.DataFrame()
    df["year"] = [2018, 2019, 2019, 2019, 2020, 2021]
    df["month"] = [1, 2, 3, 3, 3, 2]
    df["day"] = [1, 1, 1, 2, 2, 1]
    df["data"] = [0, 0, 0, 0, 0, 0]
    df.index.name = "index"
    if daskcudf:
        ddf2 = dask_cudf.from_cudf(cudf.from_pandas(df), npartitions=2)
        ddf2.to_parquet(tmpdir,
                        write_metadata_file=metadata,
                        partition_on=parts)
    else:
        ddf2 = dd.from_pandas(df, npartitions=2)
        ddf2.to_parquet(
            tmpdir,
            engine="pyarrow",
            write_metadata_file=metadata,
            partition_on=parts,
        )
    df_read = dd.read_parquet(tmpdir, engine="pyarrow")
    gdf_read = dask_cudf.read_parquet(tmpdir)

    # TODO: Avoid column selection after `CudfEngine`
    # can be aligned with dask/dask#6534
    columns = list(df_read.columns)
    assert set(df_read.columns) == set(gdf_read.columns)
    dd.assert_eq(
        df_read.compute(scheduler=dask.get)[columns],
        gdf_read.compute(scheduler=dask.get)[columns],
    )

    assert gdf_read.index.name == "index"

    # Check that we don't have uuid4 file names
    for _, _, files in os.walk(tmpdir):
        for fn in files:
            if not fn.startswith("_"):
                assert "part" in fn
Example 25
def test_mnmg():
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)
    n_workers = len(client.scheduler_info()['workers'])

    # Create and populate a GPU DataFrame
    df_float = cudf.DataFrame()
    df_float['0'] = [1.0, 2.0, 5.0]
    df_float['1'] = [4.0, 2.0, 1.0]
    df_float['2'] = [4., 2, 1]

    ddf_float = dask_cudf.from_cudf(df_float, npartitions=2*n_workers)

    X = ddf_float[ddf_float.columns.difference(['2'])]
    y = ddf_float['2']
    mod = LinearRegression()
    mod = mod.fit(X, y)

    actual_output = str(mod.predict(X).compute().values)
    expected_output = '[4. 2. 1.]'
    assert actual_output == expected_output
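Asserting on the stringified array works for this three-row example, but a numeric comparison is less brittle; a possible variant (an assumption, not from the original test):

import cupy as cp
import numpy as np

preds = mod.predict(X).compute().values    # device array of predictions
np.testing.assert_allclose(cp.asnumpy(preds), [4.0, 2.0, 1.0], rtol=1e-5, atol=1e-5)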
Example 26
def test_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=n),
        "y": np.random.normal(size=n)
    })

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Combine with .append
    head = frags[0]
    tail = frags[1:]

    appended = dgd.from_cudf(head, npartitions=1)
    for each in tail:
        appended = appended.append(each)

    assert_frame_equal(df, appended.compute().to_pandas())
Example 27
def test_series_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=n),
        "y": np.random.normal(size=n)
    })

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    frags = [df.x for df in frags]

    appending = dgd.from_cudf(frags[0], npartitions=1)
    for frag in frags[1:]:
        appending = appending.append(frag)

    appended = appending.compute().to_pandas()
    assert isinstance(appended, pd.Series)
    np.testing.assert_array_equal(appended, df.x)
Example 28
def test_make_meta_backends(index):

    dtypes = ["int8", "int32", "int64", "float64"]
    df = cudf.DataFrame(
        {dt: np.arange(start=0, stop=3, dtype=dt) for dt in dtypes}
    )
    df["strings"] = ["cat", "dog", "fish"]
    df["cats"] = df["strings"].astype("category")
    df["time_s"] = np.array(
        ["2018-10-07", "2018-10-08", "2018-10-09"], dtype="datetime64[s]"
    )
    df["time_ms"] = df["time_s"].astype("datetime64[ms]")
    df["time_ns"] = df["time_s"].astype("datetime64[ns]")
    df = df.set_index(index)
    ddf = dgd.from_cudf(df, npartitions=1)

    # Check "empty" metadata types
    dd.assert_eq(ddf._meta.dtypes, df.dtypes)

    # Check "non-empty" metadata types
    dd.assert_eq(ddf._meta.dtypes, ddf._meta_nonempty.dtypes)
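A small illustrative aside on what _meta holds (assumption-level sketch, separate from the test above): a dask_cudf collection carries an empty cudf DataFrame with the same column dtypes, which is exactly what the assertions compare against.

import cudf
import dask_cudf

gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)
print(len(ddf._meta))      # 0 rows: metadata only
print(ddf._meta.dtypes)    # matches gdf.dtypes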
Example 29
def test_groupby_apply():
    np.random.seed(0)

    nelem = 100
    xs = _gen_uniform_keys(nelem)
    ys = _gen_uniform_keys(nelem)
    df = pd.DataFrame({
        'x': xs,
        'y': ys,
        'idx': np.arange(nelem),
        'v1': np.random.normal(size=nelem),
        'v2': np.random.normal(size=nelem)
    })

    gdf = gd.DataFrame.from_pandas(df)
    dgf = dgd.from_cudf(gdf, npartitions=2)

    def transform(df):
        df['out1'] = df.y * (df.v1 + df.v2)
        return df

    grouped = dgf.groupby(by=['x', 'y']).apply(transform)

    # Compute with dask
    dgd_grouped = grouped.compute().to_pandas()
    binning = {}
    for _, row in dgd_grouped.iterrows():
        binning[row.idx] = row

    # Emulate the operation with pandas
    pd_groupby = df.groupby(by=['x', 'y'], sort=True,
                            as_index=True).apply(transform)

    # Check the result
    for _, expect in pd_groupby.iterrows():
        got = binning[expect.idx]

        attrs = ['x', 'y', 'v1', 'v2', 'out1']
        for a in attrs:
            np.testing.assert_equal(getattr(got, a), getattr(expect, a))
Example 30
def test_groupby_basic_aggs(agg):
    pdf = pd.DataFrame({
        "x": np.random.randint(0, 5, size=10000),
        "y": np.random.normal(size=10000),
    })

    gdf = cudf.DataFrame.from_pandas(pdf)

    ddf = dask_cudf.from_cudf(gdf, npartitions=5)

    a = getattr(gdf.groupby("x"), agg)().to_pandas()
    b = getattr(ddf.groupby("x"), agg)().compute().to_pandas()

    a.index.name = None
    a.name = None
    b.index.name = None
    b.name = None

    if agg == "count":
        a["y"] = a["y"].astype(np.int64)

    dd.assert_eq(a, b)