    grouped_df = grouped_df[
        grouped_df["cnt"] > q01_viewed_together_count
    ].reset_index(drop=True)
    ### 2017 rows after filtering at sf-100
    ### should scale up to sf-100k
    grouped_df = grouped_df.repartition(npartitions=1).persist()

    ## converting to strings because of issue
    # https://github.com/rapidsai/gpu-bdb/issues/36
    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("str")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("str")

    grouped_df = grouped_df.map_partitions(
        lambda df: df.sort_values(
            by=["cnt", "item_sk_1", "item_sk_2"],
            ascending=[False, True, True],
        )
    )
    grouped_df = grouped_df.reset_index(drop=True)
    ### below is just 100 rows, so it should fit in a single-GPU `cudf` context
    grouped_df = grouped_df.head(q01_limit)

    ### casting back to int to ensure the same values
    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("int32")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("int32")

    return grouped_df


if __name__ == "__main__":
    from bdb_tools.cluster_startup import attach_to_cluster

    import cudf
    import dask_cudf

    config = gpubdb_argparser()
    client, bc = attach_to_cluster(config)
    run_query(config=config, client=client, query_func=main)
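
# --- Editor's sketch (not part of the benchmark query) ----------------------
# A minimal, hedged illustration of the pattern used above: repartitioning to
# a single partition makes the per-partition sort_values a global sort, and
# head(n) then pulls the top rows back as a plain cudf DataFrame. The column
# names and data below are invented for this example.

import cudf
import dask_cudf

demo = cudf.DataFrame({"item_sk_1": [7, 8, 9], "cnt": [1, 3, 2]})
# two partitions collapsed to one, so the partition-local sort is global
ddf = dask_cudf.from_cudf(demo, npartitions=2).repartition(npartitions=1)
top = ddf.map_partitions(
    lambda part: part.sort_values(by=["cnt"], ascending=False)
).head(2)  # cudf DataFrame holding the two largest counts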
            WHERE wcs_item_sk = {q02_item_sk}
        )
        SELECT sd.wcs_item_sk AS item_sk_1,
            COUNT(sd.wcs_item_sk) AS cnt
        FROM session_df sd
        INNER JOIN item_df id
            ON sd.wcs_user_sk = id.wcs_user_sk
            AND sd.session_id = id.session_id
            AND sd.wcs_item_sk <> {q02_item_sk}
        GROUP BY sd.wcs_item_sk
        ORDER BY cnt DESC
        LIMIT {q02_limit}
    """
    result = bc.sql(last_query)

    result["item_sk_2"] = q02_item_sk
    result_order = ["item_sk_1", "item_sk_2", "cnt"]
    result = result[result_order]

    del session_df
    bc.drop_table("session_df")
    return result


if __name__ == "__main__":
    config = gpubdb_argparser()
    client, bc = attach_to_cluster(config, create_blazing_context=True)
    run_query(
        config=config,
        client=client,
        query_func=main,
        blazing_context=bc,
    )
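
# --- Editor's sketch (not part of the benchmark query) ----------------------
# A small, self-contained illustration of the BlazingContext round trip the
# query above relies on: register a cudf DataFrame as a SQL table, run a
# query against it, then drop the table. The table name, column, and data are
# invented for this example.

import cudf
from blazingsql import BlazingContext

bc = BlazingContext()
demo = cudf.DataFrame({"wcs_item_sk": [1, 2, 2]})
bc.create_table("demo_sessions", demo)
out = bc.sql(
    "SELECT wcs_item_sk, COUNT(*) AS cnt FROM demo_sessions GROUP BY wcs_item_sk"
)
bc.drop_table("demo_sessions")  # free the registration once done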