Example #1
            WHERE wcs_item_sk = {q02_item_sk}
        )
        SELECT sd.wcs_item_sk AS item_sk_1,
            COUNT(sd.wcs_item_sk) AS cnt
        FROM session_df sd
        INNER JOIN item_df id
        ON sd.wcs_user_sk = id.wcs_user_sk
        AND sd.session_id = id.session_id
        AND sd.wcs_item_sk <> {q02_item_sk}
        GROUP BY sd.wcs_item_sk
        ORDER BY cnt DESC
        LIMIT {q02_limit}
    """
    result = bc.sql(last_query)
    # Add the probed item as a constant column and order columns for output.
    result["item_sk_2"] = q02_item_sk
    result_order = ["item_sk_1", "item_sk_2", "cnt"]
    result = result[result_order]

    # Release the sessionized frame and unregister its BlazingSQL table.
    del session_df
    bc.drop_table("session_df")
    return result


if __name__ == "__main__":
    config = gpubdb_argparser()
    client, bc = attach_to_cluster(config, create_blazing_context=True)
    run_query(config=config,
              client=client,
              query_func=main,
              blazing_context=bc)
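
The query above assumes `session_df` was registered as a table on the BlazingContext earlier in `main`, since `bc.sql(last_query)` resolves `FROM session_df` against registered tables and the cleanup path calls `bc.drop_table("session_df")`. Below is a minimal sketch of that register/run/unregister pairing; `create_table`, `sql`, and `drop_table` are real BlazingContext methods, but the helper name is hypothetical and the sessionization step itself is not shown:

def run_registered_query(bc, session_df, last_query):
    # Hypothetical helper, not part of gpu-bdb. Register the dask_cudf
    # frame under the name the SQL text references.
    bc.create_table("session_df", session_df)
    try:
        return bc.sql(last_query)
    finally:
        # Always unregister, mirroring the cleanup in the example above.
        bc.drop_table("session_df")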
Example #2
    # Keep only item pairs viewed together more than the threshold count.
    grouped_df = grouped_df[
        grouped_df["cnt"] > q01_viewed_together_count].reset_index(drop=True)

    ### 2017 rows after filtering at sf-100;
    ### should scale up to sf-100k
    grouped_df = grouped_df.repartition(npartitions=1).persist()
    ## convert the item columns to strings to work around
    # https://github.com/rapidsai/gpu-bdb/issues/36

    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("str")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("str")
    # With a single partition, the partition-local sort is a global sort.
    grouped_df = grouped_df.map_partitions(lambda df: df.sort_values(
        by=["cnt", "item_sk_1", "item_sk_2"], ascending=[False, True, True]))
    grouped_df = grouped_df.reset_index(drop=True)
    ### head() below returns just 100 rows, so the result fits in a single `cudf` DataFrame
    grouped_df = grouped_df.head(q01_limit)
    ### cast back to int so the output keeps the original integer values
    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("int32")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("int32")
    return grouped_df


if __name__ == "__main__":
    from bdb_tools.cluster_startup import attach_to_cluster
    import cudf
    import dask_cudf

    config = gpubdb_argparser()
    client, bc = attach_to_cluster(config)
    run_query(config=config, client=client, query_func=main)
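
The subtle step in Example #2 is the sort: `map_partitions` sorts each partition independently, so the earlier `repartition(npartitions=1)` is what makes the partition-local sort globally correct. A small self-contained sketch of that pattern, with illustrative column names and data rather than the benchmark tables:

import cudf
import dask_cudf

df = cudf.DataFrame({
    "item_sk_1": [3, 1, 2, 1],
    "item_sk_2": [7, 9, 8, 6],
    "cnt": [5, 5, 9, 2],
})
ddf = dask_cudf.from_cudf(df, npartitions=2)

# Collapse to one partition so the per-partition sort is a global sort.
ddf = ddf.repartition(npartitions=1)
ddf = ddf.map_partitions(
    lambda d: d.sort_values(
        by=["cnt", "item_sk_1", "item_sk_2"], ascending=[False, True, True]
    )
)

# head() pulls the top rows back as a single-GPU cudf DataFrame.
print(ddf.head(3))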
Example #3
    # Run BlazingSQL Queries
    if include_blazing and len(bsql_qnums) > 0:
        print("Blazing Queries")
        for qnum, q_func in bsql_queries.items():
            print(qnum)

            qpath = f"{base_path}/queries/q{qnum}/"
            os.chdir(qpath)
            # Record which query is currently running.
            if os.path.exists("current_query_num.txt"):
                os.remove("current_query_num.txt")
            with open("current_query_num.txt", "w") as fp:
                fp.write(qnum)

            for r in range(N_REPEATS):
                run_query(
                    config=config,
                    client=client,
                    query_func=q_func,
                    blazing_context=bc,
                )
                # Collect garbage on every worker, on the scheduler, and in
                # this process so memory freed by one run is released before
                # the next run is timed.
                client.run(gc.collect)
                client.run_on_scheduler(gc.collect)
                gc.collect()
                time.sleep(3)

    # Run Pure Dask Queries
    if len(dask_qnums) > 0:
        print("Pure Dask Queries")
        for qnum, q_func in dask_queries.items():
            print(qnum)

            qpath = f"{base_path}/queries/q{qnum}/"
            os.chdir(qpath)
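
Example #3 leans on a standard `dask.distributed` cleanup idiom between timed runs: `client.run(gc.collect)` executes `gc.collect` on every worker and `client.run_on_scheduler(gc.collect)` runs it on the scheduler. A minimal standalone sketch of that idiom, using a `LocalCluster` in place of the benchmark cluster that `attach_to_cluster` would provide, with the actual query call left as a comment:

import gc
import time

from dask.distributed import Client, LocalCluster

if __name__ == "__main__":
    cluster = LocalCluster(n_workers=2, threads_per_worker=1)
    client = Client(cluster)

    N_REPEATS = 3
    for r in range(N_REPEATS):
        # run_query(...) would execute the benchmark query here.

        # Force collection on workers, scheduler, and the driver so memory
        # freed by one run is released before the next run starts.
        client.run(gc.collect)
        client.run_on_scheduler(gc.collect)
        gc.collect()
        time.sleep(3)

    client.close()
    cluster.close()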