# Shared imports assumed by the query implementations below. Helpers such as
# read_tables, benchmark, hash_merge, etl_wcs, reduction_function,
# convert_datestring_to_datetime / convert_datestring_to_days,
# get_before_after_sales, and the q* date constants come from the
# benchmark's shared utility modules.
import glob

import numpy as np

import cudf
from dask import delayed
from dask.distributed import wait


def main(client):
    (web_sales_df, web_returns_df, date_dim_df, item_df,
     warehouse_df) = read_tables()

    warehouse_df["w_state_code"] = warehouse_df[["w_state"
                                                 ]].categorize()["w_state"]
    item_df["i_item_id_code"] = item_df[["i_item_id"
                                         ]].categorize()["i_item_id"]

    ## persisting as we need these for the category-length calculation
    ## and to prevent duplicate reading downstream
    warehouse_df = warehouse_df.persist()
    item_df = item_df.persist()

    ## casting down because of dtype inconsistency in cudf/dask due to cat columns
    ### https://github.com/rapidsai/cudf/issues/4093
    wh_df_codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(warehouse_df["w_state_code"].compute().cat.categories))
    warehouse_df["w_state_code"] = warehouse_df[
        "w_state_code"].cat.codes.astype(wh_df_codes_min_signed_type)

    unique_states = warehouse_df[["w_state_code", "w_state"]].drop_duplicates()
    warehouse_df = warehouse_df[["w_state_code", "w_warehouse_sk"]]

    ## casting down because of dtype inconsistency in cudf/dask due to cat columns
    ### https://github.com/rapidsai/cudf/issues/4093
    item_df_codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(item_df["i_item_id_code"].compute().cat.categories))
    item_df["i_item_id_code"] = item_df["i_item_id_code"].cat.codes.astype(
        item_df_codes_min_signed_type)

    unique_items = item_df[["i_item_id_code", "i_item_id"]].drop_duplicates()
    item_df = item_df[["i_item_id_code", "i_item_sk"]]

    # JOIN date_dim d ON a1.ws_sold_date_sk = d.d_date_sk
    # AND unix_timestamp(d.d_date, 'yyyy-MM-dd') >= unix_timestamp('${hiveconf:q16_date}', 'yyyy-MM-dd') - 30*24*60*60 --subtract 30 days in seconds
    # AND unix_timestamp(d.d_date, 'yyyy-MM-dd') <= unix_timestamp('${hiveconf:q16_date}', 'yyyy-MM-dd') + 30*24*60*60 --add 30 days in seconds

    ##todo: remove below
    date_dim_cov_df = date_dim_df.map_partitions(
        convert_datestring_to_datetime)
    q16_timestamp = np.datetime64(q16_date, "D").astype(int)

    filtered_date_df = date_dim_cov_df.query(
        f"d_date >= {q16_timestamp - 30} and d_date <= {q16_timestamp + 30}",
        meta=date_dim_cov_df._meta,
    ).reset_index(drop=True)

    web_sales_df = web_sales_df.merge(
        filtered_date_df,
        left_on=["ws_sold_date_sk"],
        right_on=["d_date_sk"],
        how="inner",
    )
    cols_2_keep = [
        "ws_order_number",
        "ws_item_sk",
        "ws_warehouse_sk",
        "ws_sales_price",
        "d_date",
    ]
    web_sales_df = web_sales_df[cols_2_keep]
    web_sales_df = web_sales_df.persist()
    wait(web_sales_df)

    # SELECT *
    # FROM web_sales ws
    # LEFT OUTER JOIN web_returns wr ON (ws.ws_order_number = wr.wr_order_number
    #     AND ws.ws_item_sk = wr.wr_item_sk)
    # ) a1
    web_sales_web_returns_join = hash_merge(
        lhs=web_sales_df,
        rhs=web_returns_df,
        left_on=["ws_order_number", "ws_item_sk"],
        right_on=["wr_order_number", "wr_item_sk"],
        how="left",
    )
    cols_2_keep = [
        "ws_item_sk",
        "ws_warehouse_sk",
        "ws_sales_price",
        "wr_refunded_cash",
        "d_date",
    ]
    web_sales_web_returns_join = web_sales_web_returns_join[cols_2_keep]
    web_sales_web_returns_join = web_sales_web_returns_join.persist()
    wait(web_sales_web_returns_join)
    del web_sales_df

    # JOIN item i ON a1.ws_item_sk = i.i_item_sk
    web_sales_web_returns_item_join = web_sales_web_returns_join.merge(
        item_df, left_on=["ws_item_sk"], right_on=["i_item_sk"], how="inner")
    cols_2_keep = [
        "ws_warehouse_sk",
        "ws_sales_price",
        "wr_refunded_cash",
        "i_item_id_code",
        "d_date",
    ]
    web_sales_web_returns_item_join = web_sales_web_returns_item_join[
        cols_2_keep]

    # JOIN warehouse w ON a1.ws_warehouse_sk = w.w_warehouse_sk
    web_sales_web_returns_item_warehouse_join = (
        web_sales_web_returns_item_join.merge(
            warehouse_df,
            left_on=["ws_warehouse_sk"],
            right_on=["w_warehouse_sk"],
            how="inner",
        ))

    merged_df = web_sales_web_returns_item_warehouse_join[[
        "ws_sales_price",
        "wr_refunded_cash",
        "i_item_id_code",
        "w_state_code",
        "d_date",
    ]]
    merged_df["sales"] = (
        web_sales_web_returns_item_warehouse_join["ws_sales_price"].fillna(0)
        -
        web_sales_web_returns_item_warehouse_join["wr_refunded_cash"].fillna(
            0))

    sales_df = merged_df[["i_item_id_code", "w_state_code", "d_date", "sales"]]
    sales_before_after_df = sales_df.map_partitions(get_before_after_sales,
                                                    q16_timestamp)
    cols_2_keep = [
        "i_item_id_code", "w_state_code", "sales_before", "sales_after"
    ]
    sales_before_after_df = sales_before_after_df[cols_2_keep]

    ## group by logic
    group_cols = ["w_state_code", "i_item_id_code"]
    agg_df = sales_before_after_df.groupby(group_cols).agg({
        "sales_before": "sum",
        "sales_after": "sum",
    })
    agg_df = agg_df.reset_index(drop=False)
    agg_df = agg_df.loc[:99].persist()

    agg_df = agg_df.reset_index(drop=False)
    agg_df.columns = [
        "sorted_grp_index",
        "w_state_code",
        "i_item_id_code",
        "sales_before",
        "sales_after",
    ]
    agg_df = agg_df.merge(unique_states, how="left", on="w_state_code")[[
        "sorted_grp_index", "w_state", "i_item_id_code", "sales_before",
        "sales_after"
    ]]
    agg_df = agg_df.merge(unique_items, how="left", on="i_item_id_code")[[
        "sorted_grp_index", "w_state", "i_item_id", "sales_before",
        "sales_after"
    ]]
    agg_df = agg_df.sort_values(by=["sorted_grp_index"])

    ## only 100 rows, so computing is fine
    return agg_df[["w_state", "i_item_id", "sales_before",
                   "sales_after"]].compute()
def main(client, config):
    import cudf
    import dask_cudf

    (date_dim_df, web_page_df, web_sales_df) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    date_dim_cov_df = date_dim_df.map_partitions(convert_datestring_to_days)
    q08_start_dt = np.datetime64(q08_STARTDATE, "D").astype(int)
    q08_end_dt = np.datetime64(q08_ENDDATE, "D").astype(int)
    filtered_date_df = date_dim_cov_df.query(
        f"d_date >= {q08_start_dt} and d_date <= {q08_end_dt}",
        meta=date_dim_cov_df._meta,
    ).reset_index(drop=True)

    # Convert wp_type to categorical and get cat_id of review and dynamic type
    # see https://github.com/rapidsai/cudf/issues/4093 for more info
    web_page_df = web_page_df.persist()

    # map_partitions is a bit faster than ddf[col].astype('category')
    web_page_df["wp_type"] = web_page_df["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))

    cpu_categories = web_page_df["wp_type"].compute().cat.categories.to_pandas(
    )
    REVIEW_CAT_CODE = cpu_categories.get_loc("review")

    # cast to minimum viable dtype
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))

    web_page_df["wp_type_codes"] = web_page_df["wp_type"].cat.codes.astype(
        codes_min_signed_type)
    web_page_newcols = ["wp_web_page_sk", "wp_type_codes"]
    web_page_df = web_page_df[web_page_newcols]

    web_clickstream_flist = glob.glob(config["data_dir"] +
                                      "web_clickstreams/*.parquet")
    task_ls = [
        delayed(etl_wcs)(fn, filtered_date_df.to_delayed()[0],
                         web_page_df.to_delayed()[0])
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int64),
        "tstamp_inSec": np.ones(1, dtype=np.int64),
        "wcs_sales_sk": np.ones(1, dtype=np.int64),
        "wp_type_codes": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)
    merged_df = merged_df.repartition(columns=["wcs_user_sk"])

    reviewed_sales = merged_df.map_partitions(
        reduction_function,
        REVIEW_CAT_CODE,
        meta=cudf.DataFrame({"wcs_sales_sk": np.ones(1, dtype=np.int64)}),
    )
    reviewed_sales = reviewed_sales.persist()
    wait(reviewed_sales)
    del merged_df

    all_sales_in_year = filtered_date_df.merge(
        web_sales_df,
        left_on=["d_date_sk"],
        right_on=["ws_sold_date_sk"],
        how="inner")
    all_sales_in_year = all_sales_in_year[["ws_net_paid", "ws_order_number"]]
    all_sales_in_year = all_sales_in_year.persist()
    wait(all_sales_in_year)

    # note: switch to mainline
    # once https://github.com/dask/dask/pull/6066 lands
    q08_reviewed_sales = hash_merge(
        lhs=all_sales_in_year,
        rhs=reviewed_sales,
        left_on=["ws_order_number"],
        right_on=["wcs_sales_sk"],
        how="inner",
    )

    q08_reviewed_sales_sum = q08_reviewed_sales["ws_net_paid"].sum()
    q08_all_sales_sum = all_sales_in_year["ws_net_paid"].sum()
    q08_reviewed_sales_sum, q08_all_sales_sum = client.compute(
        [q08_reviewed_sales_sum, q08_all_sales_sum])
    q08_reviewed_sales_sum, q08_all_sales_sum = (
        q08_reviewed_sales_sum.result(),
        q08_all_sales_sum.result(),
    )

    no_q08_review_sales_amount = q08_all_sales_sum - q08_reviewed_sales_sum

    final_result_df = cudf.DataFrame()
    final_result_df["q08_review_sales_amount"] = [q08_reviewed_sales_sum]
    final_result_df["q08_review_sales_amount"] = final_result_df[
        "q08_review_sales_amount"].astype("int")
    final_result_df["no_q08_review_sales_amount"] = [
        no_q08_review_sales_amount
    ]
    final_result_df["no_q08_review_sales_amount"] = final_result_df[
        "no_q08_review_sales_amount"].astype("int")

    return final_result_df
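
# --------------------------------------------------------------------------
# Illustration (not part of the query): q08 submits both lazy sums in a
# single client.compute([...]) call so Dask evaluates them off one shared
# task graph instead of recomputing the upstream join twice. A minimal
# CPU-only sketch of that pattern with dask.dataframe (the query itself uses
# dask_cudf); the function name, data, and column names are hypothetical.
def _demo_shared_compute(client):
    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame({"paid": [10.0, 20.0, 30.0],
                        "reviewed": [True, False, True]})
    ddf = dd.from_pandas(pdf, npartitions=2)
    reviewed_sum = ddf[ddf["reviewed"]]["paid"].sum()  # lazy scalar
    total_sum = ddf["paid"].sum()                      # lazy scalar
    # one submission -> two futures that share the same upstream tasks
    fut_reviewed, fut_total = client.compute([reviewed_sum, total_sum])
    return fut_reviewed.result(), fut_total.result()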
def main(client, config):
    (
        store_sales_df,
        date_dim_df,
        web_sales_df,
        store_returns_df,
        store_table_df,
        item_table_df,
    ) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    # SELECT sr_item_sk, sr_customer_sk, sr_ticket_number, sr_return_quantity
    # FROM
    #     store_returns sr,
    #     date_dim d2
    # WHERE d2.d_year = ${hiveconf:q21_year}
    # AND d2.d_moy BETWEEN ${hiveconf:q21_month} AND ${hiveconf:q21_month} + 6
    #     --which were returned in the next six months
    # AND sr.sr_returned_date_sk = d2.d_date_sk
    d2 = date_dim_df.query(
        f"d_year == {q21_year} and d_moy >= {q21_month} and d_moy <= {q21_month + 6}",
        meta=date_dim_df._meta,
    ).reset_index(drop=True)
    part_sr = store_returns_df.merge(
        d2, left_on="sr_returned_date_sk", right_on="d_date_sk", how="inner")

    cols_2_keep = [
        "sr_item_sk",
        "sr_customer_sk",
        "sr_ticket_number",
        "sr_return_quantity",
    ]
    part_sr = part_sr[cols_2_keep]
    part_sr = part_sr.persist()
    wait(part_sr)

    # SELECT ws_item_sk, ws_bill_customer_sk, ws_quantity
    # FROM
    #     web_sales ws,
    #     date_dim d3
    # WHERE d3.d_year BETWEEN ${hiveconf:q21_year} AND ${hiveconf:q21_year} + 2
    #     -- in the following three years (re-purchased by the returning
    #     -- customer afterwards through the web sales channel)
    # AND ws.ws_sold_date_sk = d3.d_date_sk
    # ) part_ws
    d3 = date_dim_df.query(
        f"d_year >= {q21_year} and d_year <= {q21_year + 2}",
        meta=date_dim_df._meta)
    part_ws = web_sales_df.merge(
        d3, left_on="ws_sold_date_sk", right_on="d_date_sk", how="inner")

    cols_2_keep = ["ws_item_sk", "ws_bill_customer_sk", "ws_quantity"]
    part_ws = part_ws[cols_2_keep]
    part_ws = part_ws.persist()
    wait(part_ws)

    # part_ws ON (
    #     part_sr.sr_item_sk = part_ws.ws_item_sk
    #     AND part_sr.sr_customer_sk = part_ws.ws_bill_customer_sk
    # )
    part_ws_part_sr_m = hash_merge(
        lhs=part_sr,
        rhs=part_ws,
        left_on=["sr_item_sk", "sr_customer_sk"],
        right_on=["ws_item_sk", "ws_bill_customer_sk"],
        how="inner",
    )
    cols_2_keep = [
        "sr_item_sk",
        "sr_customer_sk",
        "sr_ticket_number",
        "sr_return_quantity",
        "ws_quantity",
    ]
    part_ws_part_sr_m = part_ws_part_sr_m[cols_2_keep]
    part_ws_part_sr_m = part_ws_part_sr_m.persist()
    wait(part_ws_part_sr_m)
    del part_sr, part_ws

    # SELECT ss_item_sk, ss_store_sk, ss_customer_sk, ss_ticket_number,
    #        ss_quantity
    # FROM
    #     store_sales ss,
    #     date_dim d1
    # WHERE d1.d_year = ${hiveconf:q21_year}
    # AND d1.d_moy = ${hiveconf:q21_month}
    # AND ss.ss_sold_date_sk = d1.d_date_sk
    # ) part_ss
    d1 = date_dim_df.query(
        f"d_year == {q21_year} and d_moy == {q21_month}",
        meta=date_dim_df._meta)
    part_ss = store_sales_df.merge(
        d1, left_on="ss_sold_date_sk", right_on="d_date_sk", how="inner")

    cols_2_keep = [
        "ss_item_sk",
        "ss_store_sk",
        "ss_customer_sk",
        "ss_ticket_number",
        "ss_quantity",
    ]
    part_ss = part_ss[cols_2_keep]

    # part_ss ON (
    #     part_ss.ss_ticket_number = part_sr.sr_ticket_number
    #     AND part_ss.ss_item_sk = part_sr.sr_item_sk
    #     AND part_ss.ss_customer_sk = part_sr.sr_customer_sk
    # )
    part_ws_part_sr_m_part_ss_join_df = hash_merge(
        lhs=part_ss,
        rhs=part_ws_part_sr_m,
        left_on=["ss_ticket_number", "ss_item_sk", "ss_customer_sk"],
        right_on=["sr_ticket_number", "sr_item_sk", "sr_customer_sk"],
        how="inner",
    )
    cols_2_keep = [
        "ss_store_sk",
        "ss_quantity",
        "sr_return_quantity",
        "ws_quantity",
        "ss_item_sk",
    ]
    part_ws_part_sr_m_part_ss_join_df = part_ws_part_sr_m_part_ss_join_df[
        cols_2_keep]

    # INNER JOIN store part_s ON (
    #     part_s.s_store_sk = part_ss.ss_store_sk
    # )
    part_ws_part_sr_m_part_ss_part_s_join_df = store_table_df.merge(
        part_ws_part_sr_m_part_ss_join_df,
        left_on="s_store_sk",
        right_on="ss_store_sk",
        how="inner",
    )
    cols_2_keep = [
        "s_store_name",
        "sr_return_quantity",
        "ss_quantity",
        "ws_quantity",
        "s_store_id",
        "ss_item_sk",
    ]
    part_ws_part_sr_m_part_ss_part_s_join_df = (
        part_ws_part_sr_m_part_ss_part_s_join_df[cols_2_keep])

    # INNER JOIN item part_i ON (
    #     part_i.i_item_sk = part_ss.ss_item_sk
    # )
    final_df = item_table_df.merge(
        part_ws_part_sr_m_part_ss_part_s_join_df,
        left_on="i_item_sk",
        right_on="ss_item_sk",
        how="inner",
    )

    # GROUP BY
    #     part_i.i_item_id,
    #     part_i.i_item_desc,
    #     part_s.s_store_id,
    #     part_s.s_store_name
    # ORDER BY
    #     part_i.i_item_id,
    #     part_i.i_item_desc,
    #     part_s.s_store_id,
    #     part_s.s_store_name
    cols_2_keep = [
        "i_item_id",
        "i_item_desc",
        "s_store_name",
        "ss_quantity",
        "sr_return_quantity",
        "ws_quantity",
        "s_store_id",
    ]
    grouped_df = final_df[cols_2_keep]
    agg_df = grouped_df.groupby(
        by=["i_item_id", "i_item_desc", "s_store_id", "s_store_name"]).agg({
            "ss_quantity": "sum",
            "sr_return_quantity": "sum",
            "ws_quantity": "sum",
        })

    agg_df = agg_df.repartition(npartitions=1).persist()
    sorted_agg_df = agg_df.reset_index().map_partitions(
        lambda df: df.sort_values(
            by=["i_item_id", "i_item_desc", "s_store_id", "s_store_name"]))
    sorted_agg_df = sorted_agg_df.head(q21_limit)
    sorted_agg_df = sorted_agg_df.rename(
        columns={
            "ss_quantity": "store_sales_quantity",
            "sr_return_quantity": "store_returns_quantity",
            "ws_quantity": "web_sales_quantity",
        })
    sorted_agg_df["i_item_desc"] = sorted_agg_df["i_item_desc"].str.strip()

    return sorted_agg_df