Example 1
def concat(dfs: List[DataframeLike], engine: Engine):

    if engine == Engine.PANDAS:
        return pd.concat(dfs, ignore_index=True, sort=False)

    if engine == Engine.DASK:
        import dask.dataframe
        return dask.dataframe.concat(dfs).reset_index(drop=True)

    if engine == Engine.CUDF:
        import cudf
        try:
            return cudf.concat(dfs, ignore_index=True)
        except TypeError:
            logger.warning(
                'Failed to concat, likely due to a column type mismatch; '
                'consider converting offending columns to strings. Column dtypes:'
            )
            for df in dfs:
                logger.warning('df dtypes :: %s', df.dtypes)
            raise

    if engine == Engine.DASK_CUDF:
        import dask_cudf
        return dask_cudf.concat(dfs)

    raise NotImplementedError('Unknown engine')
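
As a quick smoke test, the dispatcher above can be exercised on the CPU-only pandas branch. Everything below except the call to concat is an assumption: the real module defines its own Engine enum, logger, and DataframeLike alias (before the function, so the annotations resolve).

import logging
from enum import Enum
from typing import Any, List

import pandas as pd

logger = logging.getLogger(__name__)
DataframeLike = Any  # stand-in for the module's real type alias

class Engine(Enum):
    PANDAS = 'pandas'
    DASK = 'dask'
    CUDF = 'cudf'
    DASK_CUDF = 'dask_cudf'

dfs = [pd.DataFrame({'a': [1, 2]}), pd.DataFrame({'a': [3, 4]})]
out = concat(dfs, Engine.PANDAS)
assert list(out['a']) == [1, 2, 3, 4]  # ignore_index=True renumbers rows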
Example 2
    def test_empty_partition(self,
                             local_cuda_cluster: LocalCUDACluster) -> None:
        import dask_cudf
        import cudf
        import cupy
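        # Assumed module-level aliases, not shown in this snippet:
        #   import xgboost as xgb; from xgboost import dask as dxgb
        #   from dask.distributed import Client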
        with Client(local_cuda_cluster) as client:
            mult = 100
            df = cudf.DataFrame({
                "a": [1, 2, 3, 4, 5.1] * mult,
                "b": [10, 15, 29.3, 30, 31] * mult,
                "y": [10, 20, 30, 40., 50] * mult,
            })
            parameters = {"tree_method": "gpu_hist", "debug_synchronize": True}

            empty = df.iloc[:0]
            ddf = dask_cudf.concat(
                [dask_cudf.from_cudf(empty, npartitions=1)] +
                [dask_cudf.from_cudf(df, npartitions=3)] +
                [dask_cudf.from_cudf(df, npartitions=3)])
            X = ddf[ddf.columns.difference(["y"])]
            y = ddf[["y"]]
            dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
            bst_empty = xgb.dask.train(client,
                                       parameters,
                                       dtrain,
                                       evals=[(dtrain, "train")])
            predt_empty = dxgb.predict(client, bst_empty, X).compute().values

            ddf = dask_cudf.concat([dask_cudf.from_cudf(df, npartitions=3)] +
                                   [dask_cudf.from_cudf(df, npartitions=3)])
            X = ddf[ddf.columns.difference(["y"])]
            y = ddf[["y"]]
            dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
            bst = xgb.dask.train(client,
                                 parameters,
                                 dtrain,
                                 evals=[(dtrain, "train")])
            predt = dxgb.predict(client, bst, X).compute().values

            cupy.testing.assert_allclose(predt, predt_empty)
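
The essential construction in this test is a dask_cudf frame whose leading partition holds zero rows. A minimal sketch of just that step, assuming a machine with cudf and dask_cudf installed:

import cudf
import dask_cudf

df = cudf.DataFrame({"a": [1, 2, 3]})
empty = df.iloc[:0]  # zero rows, same schema
ddf = dask_cudf.concat([
    dask_cudf.from_cudf(empty, npartitions=1),
    dask_cudf.from_cudf(df, npartitions=3),
])
# First entry is 0: the empty partition that XGBoost's dask API must tolerate.
print(ddf.map_partitions(len).compute())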
Example 3
def concat(dfs: List[DataframeLike], engine: Engine, debug=False):

    if debug and len(dfs) > 1:
        df0 = dfs[0]
        for c in df0:
            logger.debug('checking df0: %s :: %s', c, df0[c].dtype)
            for df_i in dfs[1:]:
                if c not in df_i:
                    logger.warning('missing df0[%s]::%s in df_i', c,
                                   df0[c].dtype)
                    continue  # avoid a KeyError on the dtype check below
                if df0[c].dtype != df_i[c].dtype:
                    logger.warning(
                        'mismatching df0[c]::%s vs df_i[c]::%s for %s',
                        df0[c].dtype, df_i[c].dtype, c)
        for df_i in dfs[1:]:
            for c in df_i:
                logger.debug('checking df_i: %s', c)
                if c not in df0:
                    logger.warning('missing df_i[%s]::%s in df0', c,
                                   df_i[c].dtype)
        logger.debug('all checked!')

    if engine == Engine.PANDAS:
        return pd.concat(dfs, ignore_index=True, sort=False)

    if engine == Engine.DASK:
        import dask.dataframe
        return dask.dataframe.concat(dfs).reset_index(drop=True)

    if engine == Engine.CUDF:
        import cudf
        try:
            return cudf.concat(dfs, ignore_index=True)
        except TypeError:
            logger.warning(
                'Failed to concat, likely due to a column type mismatch; '
                'consider converting offending columns to strings. Column dtypes:'
            )
            for df in dfs:
                logger.warning('df dtypes :: %s', df.dtypes)
            raise

    if engine == Engine.DASK_CUDF:
        import dask_cudf
        return dask_cudf.concat(dfs)

    raise NotImplementedError('Unknown engine')
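
A hypothetical call that exercises the debug branch above (Engine and logger as in the earlier sketch): two pandas frames with a dtype mismatch on 'a' and a column 'b' present only in the first.

import logging
import pandas as pd

logging.basicConfig(level=logging.DEBUG)
dfs = [
    pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']}),
    pd.DataFrame({'a': [1.5, 2.5]}),
]
# Logs a dtype-mismatch warning for 'a' and a missing-column warning for 'b',
# then pd.concat fills the missing 'b' values with NaN.
out = concat(dfs, Engine.PANDAS, debug=True)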
Example 4
def test_concat():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=n),
        "y": np.random.normal(size=n)
    })

    gdf = cudf.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Combine with concat
    # Materialize to pandas so assert_frame_equal can compare
    concated = dgd.concat(frags).compute().to_pandas()
    assert_frame_equal(df, concated)
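
The _fragmented_gdf helper is not shown on this page; a plausible sketch, consistent with how examples 4 and 5 use the fragments (each a single-partition dask_cudf frame that dgd.concat can stitch back together):

import dask_cudf as dgd

def _fragmented_gdf(df, nsplit):
    # Slice the cudf frame into nsplit contiguous pieces and wrap each as a
    # single-partition dask_cudf frame.
    subdivsize = len(df) // nsplit
    starts = [i * subdivsize for i in range(nsplit)]
    ends = starts[1:] + [None]
    return [dgd.from_cudf(df[s:e], npartitions=1)
            for s, e in zip(starts, ends)]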
Example 5
def test_series_concat():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=n),
        "y": np.random.normal(size=n)
    })

    gdf = cudf.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    frags = [frag.x for frag in frags]

    concated = dgd.concat(frags).compute().to_pandas()
    assert isinstance(concated, pd.Series)
    np.testing.assert_array_equal(concated, df.x)
Example 6
def main(client):
    import dask_cudf

    ss_ddf, ws_ddf, datedim_ddf = read_tables()
    datedim_ddf = datedim_ddf.map_partitions(convert_datestring_to_days)
    min_date = np.datetime64(Q25_DATE, "D").astype(int)
    # Filter by date
    valid_dates_ddf = datedim_ddf[
        datedim_ddf["d_date"] > min_date].reset_index(drop=True)

    f_ss_ddf = ss_ddf[ss_ddf["ss_customer_sk"].notnull()].reset_index(
        drop=True)
    f_ws_ddf = ws_ddf[ws_ddf["ws_bill_customer_sk"].notnull()].reset_index(
        drop=True)

    # Merge
    ss_merged_df = f_ss_ddf.merge(valid_dates_ddf,
                                  left_on="ss_sold_date_sk",
                                  right_on="d_date_sk",
                                  how="inner")
    ws_merged_df = f_ws_ddf.merge(valid_dates_ddf,
                                  left_on="ws_sold_date_sk",
                                  right_on="d_date_sk",
                                  how="inner")

    # Roll up store sales
    agg_store_sales_ddf = ss_merged_df.groupby("ss_customer_sk").agg(
        {"ss_sold_date_sk": "max", "ss_net_paid": "sum"})

    agg_store_sales_ddf["frequency"] = agg_count_distinct(
        ss_merged_df, "ss_customer_sk", "ss_ticket_number",
        client=client)  # Simulate count distinct

    # Same rollup, just different columns for web sales
    agg_web_sales_ddf = ws_merged_df.groupby("ws_bill_customer_sk").agg(
        {"ws_sold_date_sk": "max", "ws_net_paid": "sum"})

    agg_web_sales_ddf["frequency"] = agg_count_distinct(
        ws_merged_df, "ws_bill_customer_sk", "ws_order_number",
        client=client)  # Simulate count distinct

    agg_store_sales_ddf = agg_store_sales_ddf.reset_index()
    agg_web_sales_ddf = agg_web_sales_ddf.reset_index()

    shared_columns = ["cid", "most_recent_date", "amount", "frequency"]
    agg_store_sales_ddf.columns = shared_columns
    agg_web_sales_ddf.columns = shared_columns
    agg_sales_ddf = dask_cudf.concat([agg_store_sales_ddf, agg_web_sales_ddf])

    cluster_input_ddf = agg_sales_ddf.groupby("cid").agg({
        "most_recent_date": "max",
        "frequency": "sum",
        "amount": "sum"
    })

    cluster_input_ddf["recency"] = (37621 -
                                    cluster_input_ddf["most_recent_date"]) < 60

    # Reorder to match reference examples
    cluster_input_ddf = cluster_input_ddf[["recency", "frequency", "amount"]]

    # Prepare df for KMeans clustering
    cluster_input_ddf["recency"] = cluster_input_ddf["recency"].astype("int64")
    cluster_input_ddf["amount"] = cluster_input_ddf["amount"].astype("float64")

    cluster_input_ddf = cluster_input_ddf.persist()

    results_dict = get_clusters(client=client, ml_input_df=cluster_input_ddf)
    return results_dict
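
agg_count_distinct comes from the benchmark's shared utilities and is not shown here. A hypothetical sketch of the "simulate count distinct" it names: drop duplicate (key, value) pairs, then count per key.

def agg_count_distinct(ddf, group_col, counted_col, client=None):
    # Hypothetical: after deduplicating (group, value) pairs, a plain count
    # per group equals COUNT(DISTINCT counted_col) ... GROUP BY group_col.
    # The client argument only mirrors the call sites above; this sketch
    # does not need it.
    deduped = ddf[[group_col, counted_col]].drop_duplicates()
    return deduped.groupby(group_col)[counted_col].count()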
Example 7
    def concat(self, dfs, **kwargs):
        return dask_cudf.concat(dfs, **kwargs)
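
This thin wrapper forwards keyword arguments untouched, so callers can pass any dask.dataframe.concat option through it. A hypothetical end-to-end use (the host class name is invented; a GPU with cudf/dask_cudf is required):

import cudf
import dask_cudf

class GPUBackend:
    def concat(self, dfs, **kwargs):
        return dask_cudf.concat(dfs, **kwargs)

backend = GPUBackend()
left = dask_cudf.from_cudf(cudf.DataFrame({"a": [1, 2]}), npartitions=1)
right = dask_cudf.from_cudf(cudf.DataFrame({"a": [3, 4]}), npartitions=1)
print(backend.concat([left, right], interleave_partitions=True).compute())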