def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_distinct = """
        SELECT DISTINCT i_category_id, ws_order_number
        FROM web_sales ws, item i
        WHERE ws.ws_item_sk = i.i_item_sk
        AND i.i_category_id IS NOT NULL
    """
    result_distinct = bc.sql(query_distinct)

    result_distinct = result_distinct.persist()
    wait(result_distinct)
    bc.create_table('distinct_table', result_distinct)

    query = f"""
        SELECT category_id_1, category_id_2, COUNT (*) AS cnt
        FROM
        (
            SELECT CAST(t1.i_category_id as BIGINT) AS category_id_1,
                CAST(t2.i_category_id as BIGINT) AS category_id_2
            FROM distinct_table t1
            INNER JOIN distinct_table t2
            ON t1.ws_order_number = t2.ws_order_number
            WHERE t1.i_category_id < t2.i_category_id
        )
        GROUP BY category_id_1, category_id_2
        ORDER BY cnt DESC, category_id_1, category_id_2
        LIMIT {q29_limit}
    """
    result = bc.sql(query)

    bc.drop_table("distinct_table")
    return result
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = """
        WITH p AS
        (
            SELECT pr_item_sk,
                count(pr_item_sk) AS r_count,
                AVG( CAST(pr_review_rating AS DOUBLE) ) avg_rating
            FROM product_reviews
            WHERE pr_item_sk IS NOT NULL
            GROUP BY pr_item_sk
        ), s AS
        (
            SELECT ws_item_sk
            FROM web_sales ws
            INNER JOIN date_dim d ON ws.ws_sold_date_sk = d.d_date_sk
            WHERE ws_item_sk IS NOT null
            AND CAST(d.d_date AS DATE) >= DATE '2003-01-02'
            AND CAST(d.d_date AS DATE) <= DATE '2003-02-02'
            GROUP BY ws_item_sk
        )
        SELECT p.r_count AS x,
            p.avg_rating AS y
        FROM s INNER JOIN p ON p.pr_item_sk = s.ws_item_sk
    """
    result = bc.sql(query)

    sales_corr = result["x"].corr(result["y"]).compute()
    result_df = cudf.DataFrame([sales_corr])
    result_df.columns = ["corr(CAST(reviews_count AS DOUBLE), avg_rating)"]
    return result_df
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        SELECT *
        FROM
        (
            SELECT
                cat,
                ( (count(x) * SUM(xy) - SUM(x) * SUM(y)) /
                  (count(x) * SUM(xx) - SUM(x) * SUM(x)) ) AS slope,
                (SUM(y) - ((count(x) * SUM(xy) - SUM(x) * SUM(y)) /
                  (count(x) * SUM(xx) - SUM(x) * SUM(x)) ) * SUM(x)) / count(x) AS intercept
            FROM
            (
                SELECT
                    i.i_category_id AS cat,
                    s.ss_sold_date_sk AS x,
                    CAST(SUM(s.ss_net_paid) AS DOUBLE) AS y,
                    CAST(s.ss_sold_date_sk * SUM(s.ss_net_paid) AS DOUBLE) AS xy,
                    CAST(s.ss_sold_date_sk * s.ss_sold_date_sk AS DOUBLE) AS xx
                FROM store_sales s
                INNER JOIN item i ON s.ss_item_sk = i.i_item_sk
                INNER JOIN date_dim d ON s.ss_sold_date_sk = d.d_date_sk
                WHERE s.ss_store_sk = {q15_store_sk}
                AND i.i_category_id IS NOT NULL
                AND CAST(d.d_date AS DATE) >= DATE '{q15_startDate}'
                AND CAST(d.d_date AS DATE) <= DATE '{q15_endDate}'
                GROUP BY i.i_category_id, s.ss_sold_date_sk
            ) temp
            GROUP BY cat
        ) regression
        WHERE slope <= 0.0
        ORDER BY cat
    """
    result = bc.sql(query)
    return result
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_distinct = f"""
        SELECT DISTINCT ss_item_sk, ss_ticket_number
        FROM store_sales s, item i
        WHERE s.ss_item_sk = i.i_item_sk
        AND i.i_category_id IN ({q01_i_category_id_IN})
        AND s.ss_store_sk IN ({q01_ss_store_sk_IN})
    """
    result_distinct = bc.sql(query_distinct)

    bc.create_table("distinct_table", result_distinct)

    query = f"""
        SELECT item_sk_1, item_sk_2, COUNT(*) AS cnt
        FROM
        (
            SELECT CAST(t1.ss_item_sk as BIGINT) AS item_sk_1,
                CAST(t2.ss_item_sk AS BIGINT) AS item_sk_2
            FROM distinct_table t1
            INNER JOIN distinct_table t2
            ON t1.ss_ticket_number = t2.ss_ticket_number
            WHERE t1.ss_item_sk < t2.ss_item_sk
        )
        GROUP BY item_sk_1, item_sk_2
        HAVING COUNT(*) > {q01_viewed_together_count}
        ORDER BY cnt DESC, CAST(item_sk_1 AS VARCHAR), CAST(item_sk_2 AS VARCHAR)
        LIMIT {q01_limit}
    """
    result = bc.sql(query)
    return result
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        SELECT DISTINCT wcs_user_sk
        FROM
        (
            SELECT DISTINCT wcs_user_sk, wcs_click_date_sk
            FROM web_clickstreams, item
            WHERE wcs_click_date_sk BETWEEN 37134 AND 37164
            AND i_category IN ({q12_i_category_IN})
            AND wcs_item_sk = i_item_sk
            AND wcs_user_sk IS NOT NULL
            AND wcs_sales_sk IS NULL
        ) webInRange,
        (
            SELECT DISTINCT ss_customer_sk, ss_sold_date_sk
            FROM store_sales, item
            WHERE ss_sold_date_sk BETWEEN 37134 AND 37224
            AND i_category IN ({q12_i_category_IN}) -- filter given category
            AND ss_item_sk = i_item_sk
            AND ss_customer_sk IS NOT NULL
        ) storeInRange
        WHERE wcs_user_sk = ss_customer_sk
        AND wcs_click_date_sk < ss_sold_date_sk
        ORDER BY wcs_user_sk
    """
    result = bc.sql(query)
    return result
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    date = datetime.datetime(2001, 3, 16)
    start = (date + timedelta(days=-30)).strftime("%Y-%m-%d")
    end = (date + timedelta(days=30)).strftime("%Y-%m-%d")
    mid = date.strftime("%Y-%m-%d")

    date_query = f"""
        SELECT d_date_sk
        FROM date_dim
        WHERE CAST(d_date as DATE) IN (DATE '{start}', DATE '{mid}', DATE '{end}')
        ORDER BY CAST(d_date as date) ASC
    """
    dates = bc.sql(date_query)

    cpu_dates = dates["d_date_sk"].compute().to_pandas()
    cpu_dates.index = list(range(0, cpu_dates.shape[0]))

    last_query = f"""
        SELECT w_state, i_item_id,
            SUM
            (
                CASE WHEN ws_sold_date_sk < {str(cpu_dates[1])}
                THEN ws_sales_price - COALESCE(wr_refunded_cash,0)
                ELSE 0.0 END
            ) AS sales_before,
            SUM
            (
                CASE WHEN ws_sold_date_sk >= {str(cpu_dates[1])}
                THEN ws_sales_price - COALESCE(wr_refunded_cash,0)
                ELSE 0.0 END
            ) AS sales_after
        FROM
        (
            SELECT ws_item_sk,
                ws_warehouse_sk,
                ws_sold_date_sk,
                ws_sales_price,
                wr_refunded_cash
            FROM web_sales ws
            LEFT OUTER JOIN web_returns wr ON
            (
                ws.ws_order_number = wr.wr_order_number
                AND ws.ws_item_sk = wr.wr_item_sk
            )
            WHERE ws_sold_date_sk BETWEEN {str(cpu_dates[0])} AND {str(cpu_dates[2])}
        ) a1
        JOIN item i ON a1.ws_item_sk = i.i_item_sk
        JOIN warehouse w ON a1.ws_warehouse_sk = w.w_warehouse_sk
        GROUP BY w_state, i_item_id
        ORDER BY w_state, i_item_id
        LIMIT 100
    """
    result = bc.sql(last_query)
    return result
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_1 = """
        SELECT i_item_sk,
            CAST(i_category_id AS TINYINT) AS i_category_id
        FROM item
    """
    item_df = bc.sql(query_1)

    item_df = item_df.persist()
    wait(item_df)
    bc.create_table("item_df", item_df)

    query_2 = """
        SELECT wcs_user_sk,
            (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec,
            i_category_id
        FROM web_clickstreams wcs, item_df i
        WHERE wcs.wcs_item_sk = i.i_item_sk
        AND i.i_category_id IS NOT NULL
        AND wcs.wcs_user_sk IS NOT NULL
        ORDER BY wcs.wcs_user_sk, tstamp_inSec, i_category_id
    """
    merged_df = bc.sql(query_2)
    bc.drop_table("item_df")
    del item_df

    distinct_session_df = merged_df.map_partitions(
        get_distinct_sessions,
        keep_cols=["wcs_user_sk", "i_category_id"],
        time_out=q30_session_timeout_inSec)
    del merged_df

    pair_df = distinct_session_df.map_partitions(
        get_pairs,
        pair_col="i_category_id",
        output_col_1="category_id_1",
        output_col_2="category_id_2")
    del distinct_session_df

    pair_df = pair_df.persist()
    wait(pair_df)
    bc.create_table('pair_df', pair_df)

    last_query = f"""
        SELECT CAST(category_id_1 AS BIGINT) AS category_id_1,
            CAST(category_id_2 AS BIGINT) AS category_id_2,
            COUNT(category_id_2) AS cnt
        FROM pair_df
        GROUP BY category_id_1, category_id_2
        ORDER BY cnt desc
        LIMIT {q30_limit}
    """
    result = bc.sql(last_query)
    bc.drop_table("pair_df")
    return result
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_web_page = """
        SELECT wp_type, wp_web_page_sk
        FROM web_page_wo_categorical
    """
    wp = bc.sql(query_web_page)

    # Convert wp_type to categorical and get cat_id of review and dynamic type
    wp["wp_type"] = wp["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = wp["wp_type"].compute().cat.categories.to_pandas()
    DYNAMIC_CAT_CODE = cpu_categories.get_loc("dynamic")
    ORDER_CAT_CODE = cpu_categories.get_loc("order")

    ### cast to minimum viable dtype
    import cudf

    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))
    wp["wp_type_codes"] = wp["wp_type"].cat.codes.astype(codes_min_signed_type)
    wp["wp_type"] = wp["wp_type"].cat.codes.astype(codes_min_signed_type)

    cols_2_keep = ["wp_web_page_sk", "wp_type_codes"]
    wp = wp[cols_2_keep]

    wp = wp.persist()
    wait(wp)
    bc.create_table('web_page', wp)

    query = """
        SELECT
            c.wcs_user_sk,
            w.wp_type_codes,
            (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec
        FROM web_clickstreams c, web_page w
        WHERE c.wcs_web_page_sk = w.wp_web_page_sk
        AND c.wcs_web_page_sk IS NOT NULL
        AND c.wcs_user_sk IS NOT NULL
        AND c.wcs_sales_sk IS NULL --abandoned implies: no sale
        ORDER BY wcs_user_sk, tstamp_inSec
    """
    merged_df = bc.sql(query)

    keep_cols = ["wcs_user_sk", "wp_type_codes", "tstamp_inSec"]
    result_df = merged_df.map_partitions(reduction_function, keep_cols,
                                         DYNAMIC_CAT_CODE, ORDER_CAT_CODE)

    result = result_df["pagecount"].sum() / result_df["count"].sum()
    # Persist before computing to ensure scalar transfer only on compute
    result = result.persist()

    result = result.compute()
    result_df = cudf.DataFrame({"sum(pagecount)/count(*)": [result]})
    bc.drop_table("web_page")
    return result_df
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        WITH concat_table AS
        (
            (
                SELECT
                    ss_customer_sk AS cid,
                    count(distinct ss_ticket_number) AS frequency,
                    max(ss_sold_date_sk) AS most_recent_date,
                    CAST( SUM(ss_net_paid) AS DOUBLE) AS amount
                FROM store_sales ss
                JOIN date_dim d ON ss.ss_sold_date_sk = d.d_date_sk
                WHERE CAST(d.d_date AS DATE) > DATE '{q25_date}'
                AND ss_customer_sk IS NOT NULL
                GROUP BY ss_customer_sk
            ) union all
            (
                SELECT
                    ws_bill_customer_sk AS cid,
                    count(distinct ws_order_number) AS frequency,
                    max(ws_sold_date_sk) AS most_recent_date,
                    CAST( SUM(ws_net_paid) AS DOUBLE) AS amount
                FROM web_sales ws
                JOIN date_dim d ON ws.ws_sold_date_sk = d.d_date_sk
                WHERE CAST(d.d_date AS DATE) > DATE '{q25_date}'
                AND ws_bill_customer_sk IS NOT NULL
                GROUP BY ws_bill_customer_sk
            )
        )
        SELECT
            cid AS cid,
            CASE WHEN 37621 - max(most_recent_date) < 60 THEN 1.0
                ELSE 0.0 END AS recency, -- 37621 == 2003-01-02
            CAST( SUM(frequency) AS BIGINT) AS frequency, --total frequency
            CAST( SUM(amount) AS DOUBLE) AS amount --total amount
        FROM concat_table
        GROUP BY cid
        ORDER BY cid
    """
    cluster_input_ddf = bc.sql(query)

    # Prepare df for KMeans clustering
    cluster_input_ddf["recency"] = cluster_input_ddf["recency"].astype("int64")

    cluster_input_ddf = cluster_input_ddf.repartition(npartitions=1)
    cluster_input_ddf = cluster_input_ddf.persist()
    cluster_input_ddf = cluster_input_ddf.set_index('cid')

    results_dict = get_clusters(client=client, ml_input_df=cluster_input_ddf)
    return results_dict
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_1 = """
        SELECT i_item_sk,
            CAST(i_category_id AS TINYINT) AS i_category_id
        FROM item
    """
    item_df = bc.sql(query_1)
    bc.create_table("item_df", item_df)

    query_2 = """
        SELECT CAST(w.wcs_user_sk AS INTEGER) as wcs_user_sk,
            wcs_click_date_sk * 86400 + wcs_click_time_sk AS tstamp,
            CAST(w.wcs_item_sk AS INTEGER) as wcs_item_sk,
            COALESCE(w.wcs_sales_sk, 0) as wcs_sales_sk,
            i.i_category_id
        FROM web_clickstreams AS w
        INNER JOIN item_df AS i ON w.wcs_item_sk = i.i_item_sk
        WHERE w.wcs_user_sk IS NOT NULL
        AND w.wcs_item_sk IS NOT NULL
        ORDER BY w.wcs_user_sk
    """
    merged_df = bc.sql(query_2)

    query_3 = f"""
        SELECT i_item_sk, i_category_id
        FROM item_df
        WHERE i_category_id IN ({q03_purchased_item_category_IN})
    """
    item_df_filtered = bc.sql(query_3)

    product_view_results = merged_df.map_partitions(
        apply_find_items_viewed, item_mappings=item_df_filtered
    )
    del merged_df

    bc.create_table('product_result', product_view_results)

    last_query = f"""
        SELECT CAST({q03_purchased_item_IN} AS BIGINT) AS purchased_item,
            i_item_sk AS lastviewed_item,
            COUNT(i_item_sk) AS cnt
        FROM product_result
        GROUP BY i_item_sk
        ORDER BY purchased_item, cnt desc, lastviewed_item
        LIMIT {q03_limit}
    """
    result = bc.sql(last_query)
    return result
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_1 = """
        SELECT CAST(wcs_user_sk AS INTEGER) AS wcs_user_sk,
            CAST(wcs_item_sk AS INTEGER) AS wcs_item_sk,
            (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec
        FROM web_clickstreams
        WHERE wcs_item_sk IS NOT NULL
        AND wcs_user_sk IS NOT NULL
        ORDER BY wcs_user_sk
    """
    wcs_result = bc.sql(query_1)

    session_df = wcs_result.map_partitions(
        get_distinct_sessions,
        keep_cols=["wcs_user_sk", "wcs_item_sk"],
        time_out=q02_session_timeout_inSec,
    )
    del wcs_result

    session_df = session_df.persist()
    wait(session_df)
    bc.create_table('session_df', session_df)

    last_query = f"""
        WITH item_df AS (
            SELECT wcs_user_sk, session_id
            FROM session_df
            WHERE wcs_item_sk = {q02_item_sk}
        )
        SELECT sd.wcs_item_sk as item_sk_1,
            count(sd.wcs_item_sk) as cnt
        FROM session_df sd
        INNER JOIN item_df id
        ON sd.wcs_user_sk = id.wcs_user_sk
        AND sd.session_id = id.session_id
        AND sd.wcs_item_sk <> {q02_item_sk}
        GROUP BY sd.wcs_item_sk
        ORDER BY cnt desc
        LIMIT {q02_limit}
    """
    result = bc.sql(last_query)

    result["item_sk_2"] = q02_item_sk
    result_order = ["item_sk_1", "item_sk_2", "cnt"]
    result = result[result_order]

    del session_df
    bc.drop_table("session_df")
    return result
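# Note: `get_distinct_sessions` used above (and in the q30 flow earlier in this
# section) is a shared helper defined elsewhere in this repo; its exact
# implementation is not shown here. The sketch below only illustrates the
# sessionization it is assumed to perform on each sorted partition: start a new
# session whenever the user changes or the gap between consecutive clicks
# exceeds `time_out`, then deduplicate on the kept columns. The name
# `get_distinct_sessions_sketch` is hypothetical.
def get_distinct_sessions_sketch(df, keep_cols, time_out):
    # partitions arrive sorted by wcs_user_sk and tstamp_inSec (see query_1)
    user_change = df["wcs_user_sk"].diff().fillna(1) != 0
    session_timed_out = df["tstamp_inSec"].diff().fillna(0) > time_out
    # a new session starts on either condition; cumulative sum labels sessions
    df["session_id"] = (user_change | session_timed_out).astype("int32").cumsum()
    return df[keep_cols + ["session_id"]].drop_duplicates()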
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = """
        WITH temp_table as
        (
            SELECT
                i_item_sk,
                imp_sk,
                (imp_competitor_price - i_current_price) / i_current_price AS price_change,
                imp_start_date,
                (imp_end_date - imp_start_date) AS no_days_comp_price
            FROM item i, item_marketprices imp
            WHERE i.i_item_sk = imp.imp_item_sk
            AND i.i_item_sk = 10000
            ORDER BY i_item_sk, imp_sk, imp_start_date
        )
        SELECT ws_item_sk,
            -- avg ( (current_ss_quant + current_ws_quant - prev_ss_quant - prev_ws_quant)
            --       / ((prev_ss_quant + prev_ws_quant) * ws.price_change) ) -- single node
            sum( (current_ss_quant + current_ws_quant - prev_ss_quant - prev_ws_quant)
                 / (prev_ss_quant * ws.price_change + prev_ws_quant * ws.price_change) )
            / count( (current_ss_quant + current_ws_quant - prev_ss_quant - prev_ws_quant)
                 / ((prev_ss_quant + prev_ws_quant) * ws.price_change) )
            AS cross_price_elasticity
        FROM
        (
            SELECT ws_item_sk,
                imp_sk,
                price_change,
                SUM( CASE WHEN ( (ws_sold_date_sk >= c.imp_start_date)
                    AND (ws_sold_date_sk < (c.imp_start_date + c.no_days_comp_price)))
                    THEN ws_quantity ELSE 0 END ) AS current_ws_quant,
                SUM( CASE WHEN ( (ws_sold_date_sk >= (c.imp_start_date - c.no_days_comp_price))
                    AND (ws_sold_date_sk < c.imp_start_date))
                    THEN ws_quantity ELSE 0 END ) AS prev_ws_quant
            FROM web_sales ws
            JOIN temp_table c ON ws.ws_item_sk = c.i_item_sk
            GROUP BY ws_item_sk, imp_sk, price_change
        ) ws
        JOIN
        (
            SELECT ss_item_sk,
                imp_sk,
                price_change,
                SUM( CASE WHEN ((ss_sold_date_sk >= c.imp_start_date)
                    AND (ss_sold_date_sk < (c.imp_start_date + c.no_days_comp_price)))
                    THEN ss_quantity ELSE 0 END) AS current_ss_quant,
                SUM( CASE WHEN ((ss_sold_date_sk >= (c.imp_start_date - c.no_days_comp_price))
                    AND (ss_sold_date_sk < c.imp_start_date))
                    THEN ss_quantity ELSE 0 END) AS prev_ss_quant
            FROM store_sales ss
            JOIN temp_table c ON c.i_item_sk = ss.ss_item_sk
            GROUP BY ss_item_sk, imp_sk, price_change
        ) ss
        ON (ws.ws_item_sk = ss.ss_item_sk and ws.imp_sk = ss.imp_sk)
        GROUP BY ws.ws_item_sk
    """
    result = bc.sql(query)
    return result
def main(client, config):
    q_st = time.time()
    product_reviews_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    product_reviews_df = product_reviews_df[
        product_reviews_df["pr_review_content"].notnull()
    ]

    # 90% train/test split
    train_data, test_data = product_reviews_df.random_split([0.9, 0.10])
    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)
    del product_reviews_df

    final_data, acc, prec, cmat = post_etl_processing(
        client=client, train_data=train_data, test_data=test_data
    )

    payload = {
        "df": final_data,
        "acc": acc,
        "prec": prec,
        "cmat": cmat,
        "output_type": "supervised",
    }
    return payload
def main(client, config):
    import cudf

    model_path = os.path.join(config["data_dir"], "../../q27_model_dir")

    product_reviews_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    product_reviews_df = product_reviews_df[
        product_reviews_df.pr_item_sk == q27_pr_item_sk].persist()

    meta_d = {
        "review_sk": np.ones(1, dtype=np.int64),
        "item_sk": np.ones(1, dtype=np.int64),
        "company_name": "",
        "review_sentence": "",
    }
    meta_df = cudf.DataFrame(meta_d)

    output_df = product_reviews_df.map_partitions(run_single_part_workflow,
                                                  model_path,
                                                  meta=meta_df)
    output_df = output_df.persist()
    wait(output_df)

    client.run(del_model_attribute)
    return output_df
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    # 10% of the data
    query1 = """
        SELECT
            pr_review_sk,
            pr_review_rating,
            pr_review_content
        FROM product_reviews
        WHERE mod(pr_review_sk, 10) IN (0)
        AND pr_review_content IS NOT NULL
        -- in the near future we want to use ORDER BY again
        --ORDER BY pr_review_sk
    """
    test_data = bc.sql(query1)
    # in the near future we want to reuse ORDER BY instead of bc.partition()
    test_data = bc.partition(test_data, by=["pr_review_sk"])

    # 90% of the data
    query2 = """
        SELECT
            pr_review_sk,
            pr_review_rating,
            pr_review_content
        FROM product_reviews
        WHERE mod(pr_review_sk, 10) IN (1,2,3,4,5,6,7,8,9)
        AND pr_review_content IS NOT NULL
        --ORDER BY pr_review_sk
    """
    train_data = bc.sql(query2)
    # in the near future we want to reuse ORDER BY instead of bc.partition()
    train_data = bc.partition(train_data, by=["pr_review_sk"])

    final_data, acc, prec, cmat = post_etl_processing(
        client=client, train_data=train_data, test_data=test_data)

    payload = {
        "df": final_data,
        "acc": acc,
        "prec": prec,
        "cmat": cmat,
        "output_type": "supervised",
    }
    return payload
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_date = f"""
        select min(d_date_sk) as min_d_date_sk,
            max(d_date_sk) as max_d_date_sk
        from date_dim
        where d_year = {q17_year}
        and d_moy = {q17_month}
    """
    dates_result = bc.sql(query_date).compute()

    min_date_sk_val = dates_result["min_d_date_sk"][0]
    max_date_sk_val = dates_result["max_d_date_sk"][0]

    query = f"""
        SELECT sum(promotional) as promotional,
            sum(total) as total,
            CASE WHEN sum(total) > 0.0 THEN (100.0 * sum(promotional)) / sum(total)
                ELSE 0.0 END as promo_percent
        FROM
        (
            SELECT p_channel_email,
                p_channel_dmail,
                p_channel_tv,
                SUM( CAST(ss_ext_sales_price AS DOUBLE) ) total,
                CASE WHEN (p_channel_dmail = 'Y' OR p_channel_email = 'Y' OR p_channel_tv = 'Y')
                    THEN SUM(CAST(ss_ext_sales_price AS DOUBLE)) ELSE 0 END as promotional
            FROM store_sales ss
            INNER JOIN promotion p ON ss.ss_promo_sk = p.p_promo_sk
            inner join item i on ss.ss_item_sk = i.i_item_sk
            inner join store s on ss.ss_store_sk = s.s_store_sk
            inner join customer c on c.c_customer_sk = ss.ss_customer_sk
            inner join customer_address ca on c.c_current_addr_sk = ca.ca_address_sk
            WHERE i.i_category IN ({q17_i_category_IN})
            AND s.s_gmt_offset = {q17_gmt_offset}
            AND ca.ca_gmt_offset = {q17_gmt_offset}
            AND ss.ss_sold_date_sk >= {min_date_sk_val}
            AND ss.ss_sold_date_sk <= {max_date_sk_val}
            GROUP BY p_channel_email, p_channel_dmail, p_channel_tv
        ) sum_promotional
        -- we don't need an 'ON' join condition; the result is just two numbers.
    """
    result = bc.sql(query)
    return result
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        SELECT
            w_warehouse_name,
            i_item_id,
            SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00',
                    CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 < 0
                THEN inv_quantity_on_hand ELSE 0 END) AS inv_before,
            SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00',
                    CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 >= 0
                THEN inv_quantity_on_hand ELSE 0 END) AS inv_after
        FROM inventory inv,
            item i,
            warehouse w,
            date_dim d
        WHERE i_current_price BETWEEN {q22_i_current_price_min} AND {q22_i_current_price_max}
        AND i_item_sk = inv_item_sk
        AND inv_warehouse_sk = w_warehouse_sk
        AND inv_date_sk = d_date_sk
        AND timestampdiff(DAY, timestamp '{q22_date} 00:00:00',
            CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 >= -30
        AND timestampdiff(DAY, timestamp '{q22_date} 00:00:00',
            CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 <= 30
        GROUP BY w_warehouse_name, i_item_id
        HAVING SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date}',
                CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 < 0
            THEN inv_quantity_on_hand ELSE 0 END) > 0
        AND
        (
            CAST( SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00',
                    CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 >= 0
                THEN inv_quantity_on_hand ELSE 0 END) AS DOUBLE)
            /
            CAST( SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00',
                    CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 < 0
                THEN inv_quantity_on_hand ELSE 0 END) AS DOUBLE)
            >= 0.666667
        )
        AND
        (
            CAST( SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00',
                    CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 >= 0
                THEN inv_quantity_on_hand ELSE 0 END) AS DOUBLE)
            /
            CAST( SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00',
                    CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 < 0
                THEN inv_quantity_on_hand ELSE 0 END) AS DOUBLE)
            <= 1.50
        )
        ORDER BY w_warehouse_name, i_item_id
        LIMIT 100
    """
    result = bc.sql(query)
    return result
def main(client, config):
    import cudf

    wp, wcs_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    ### downcasting the column, in line with q03
    wcs_df["wcs_user_sk"] = wcs_df["wcs_user_sk"].astype("int32")

    f_wcs_df = wcs_df[wcs_df["wcs_web_page_sk"].notnull()
                      & wcs_df["wcs_user_sk"].notnull()
                      & wcs_df["wcs_sales_sk"].isnull()].reset_index(drop=True)

    f_wcs_df["tstamp_inSec"] = (f_wcs_df["wcs_click_date_sk"] * 24 * 60 * 60 +
                                f_wcs_df["wcs_click_time_sk"])
    keep_cols = ["wcs_user_sk", "tstamp_inSec", "wcs_web_page_sk"]
    f_wcs_df = f_wcs_df[keep_cols]

    f_wcs_df = f_wcs_df.repartition(columns=["wcs_user_sk"])

    # Convert wp_type to categorical and get cat_id of review and dynamic type
    wp["wp_type"] = wp["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = wp["wp_type"].compute().cat.categories.to_pandas()
    DYNAMIC_CAT_CODE = cpu_categories.get_loc("dynamic")
    ORDER_CAT_CODE = cpu_categories.get_loc("order")

    ### cast to minimum viable dtype
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))
    wp["wp_type_codes"] = wp["wp_type"].cat.codes.astype(codes_min_signed_type)

    cols_2_keep = ["wp_web_page_sk", "wp_type_codes"]
    wp = wp[cols_2_keep]

    # Continue remaining workflow with wp_type as category codes
    merged_df = f_wcs_df.merge(wp,
                               left_on="wcs_web_page_sk",
                               right_on="wp_web_page_sk",
                               how="inner")
    merged_df = merged_df[["wcs_user_sk", "tstamp_inSec", "wp_type_codes"]]

    keep_cols = ["wcs_user_sk", "wp_type_codes", "tstamp_inSec"]
    result_df = merged_df.map_partitions(reduction_function, keep_cols,
                                         DYNAMIC_CAT_CODE, ORDER_CAT_CODE)

    result = result_df["pagecount"].sum() / result_df["count"].sum()
    # Persist before computing to ensure scalar transfer only on compute
    result = result.persist()

    result = result.compute()
    result_df = cudf.DataFrame({"sum(pagecount)/count(*)": [result]})
    return result_df
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = """
        WITH temp_table as
        (
            SELECT k.i_item_sk
            FROM item k,
            (
                SELECT i_category,
                    SUM(j.i_current_price) / COUNT(j.i_current_price) * 1.2 AS avg_price
                FROM item j
                GROUP BY j.i_category
            ) avgCategoryPrice
            WHERE avgCategoryPrice.i_category = k.i_category
            AND k.i_current_price > avgCategoryPrice.avg_price
        )
        SELECT ca_state, COUNT(*) AS cnt
        FROM customer_address a,
            customer c,
            store_sales s,
            temp_table highPriceItems
        WHERE a.ca_address_sk = c.c_current_addr_sk
        AND c.c_customer_sk = s.ss_customer_sk
        AND ca_state IS NOT NULL
        AND ss_item_sk = highPriceItems.i_item_sk
        AND s.ss_sold_date_sk IN
        (
            SELECT d_date_sk
            FROM date_dim
            WHERE d_year = 2004
            AND d_moy = 7
        )
        GROUP BY ca_state
        HAVING COUNT(*) >= 10
        ORDER BY cnt DESC, ca_state
        LIMIT 10
    """
    result = bc.sql(query)
    return result
def main(client, config):
    item_df, ws_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    ### setting index on ws_order_number
    ws_df = ws_df.shuffle(on=["ws_order_number"])

    ### at sf-100k we will have at most 17M rows; 17M rows with 2 columns
    ### in a single partition is very reasonable
    item_df = item_df.repartition(npartitions=1)

    # SELECT DISTINCT i_category_id, ws_order_number
    # FROM web_sales ws, item i
    # WHERE ws.ws_item_sk = i.i_item_sk
    # AND i.i_category_id IS NOT NULL
    f_item_df = item_df[item_df["i_category_id"].notnull()]

    ### doing the below to retain the `ws_order_number` partition boundary after merge
    ws_item_join = ws_df.merge(f_item_df,
                               left_on=["ws_item_sk"],
                               right_on=["i_item_sk"])
    ws_item_join = ws_item_join[["i_category_id", "ws_order_number"]]
    ws_item_join = ws_item_join.map_partitions(lambda df: df.drop_duplicates())

    ### do pair inner join
    ### pair_df = get_pairs(ws_item_join)
    ### because of setting the index we can do it in map_partitions;
    ### this can have better memory and scaling properties at larger scale factors
    pair_df = ws_item_join.map_partitions(get_pairs)

    # SELECT category_id_1, category_id_2, COUNT (*) AS cnt
    # FROM (
    #   ...
    # )
    # GROUP BY category_id_1, category_id_2
    # ORDER BY cnt DESC, category_id_1, category_id_2
    # LIMIT {q29_limit}
    grouped_df = pair_df.groupby(["category_id_1",
                                  "category_id_2"]).size().persist()

    ### 36 rows after filtration at sf-100
    ### should scale till sf-100k
    grouped_df = grouped_df.reset_index().compute()
    grouped_df.columns = ["category_id_1", "category_id_2", "cnt"]
    grouped_df["category_id_1"] = grouped_df["category_id_1"]
    grouped_df["category_id_2"] = grouped_df["category_id_2"]
    grouped_df = grouped_df.sort_values(
        by=["cnt", "category_id_1", "category_id_2"],
        ascending=[False, True, True]).reset_index(drop=True)

    grouped_df = grouped_df.head(q29_limit)
    return grouped_df
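# Note: `get_pairs` is another shared helper from this repo (also used by the
# q01 and q30 flows in this section); its real signature and defaults are not
# shown here. The sketch below is a minimal illustration, assuming it
# self-joins each partition on a basket/order column and keeps each unordered
# pair once. The function name and parameter defaults are assumptions for
# illustration only.
def get_pairs_sketch(df,
                     merge_col="ws_order_number",
                     pair_col="i_category_id",
                     output_col_1="category_id_1",
                     output_col_2="category_id_2"):
    # self-merge on the order column to enumerate co-occurring values
    pairs = df.merge(df, on=merge_col, suffixes=("_1", "_2"))
    # keep each unordered pair once (mirrors the `t1 < t2` predicate in the SQL versions)
    pairs = pairs[pairs[pair_col + "_1"] < pairs[pair_col + "_2"]]
    pairs = pairs.rename(columns={pair_col + "_1": output_col_1,
                                  pair_col + "_2": output_col_2})
    return pairs[[output_col_1, output_col_2]].reset_index(drop=True)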
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        SELECT
            ss.ss_customer_sk AS cid,
            CAST( count(CASE WHEN i.i_class_id=1  THEN 1 ELSE NULL END) AS DOUBLE ) AS id1,
            CAST( count(CASE WHEN i.i_class_id=2  THEN 1 ELSE NULL END) AS DOUBLE ) AS id2,
            CAST( count(CASE WHEN i.i_class_id=3  THEN 1 ELSE NULL END) AS DOUBLE ) AS id3,
            CAST( count(CASE WHEN i.i_class_id=4  THEN 1 ELSE NULL END) AS DOUBLE ) AS id4,
            CAST( count(CASE WHEN i.i_class_id=5  THEN 1 ELSE NULL END) AS DOUBLE ) AS id5,
            CAST( count(CASE WHEN i.i_class_id=6  THEN 1 ELSE NULL END) AS DOUBLE ) AS id6,
            CAST( count(CASE WHEN i.i_class_id=7  THEN 1 ELSE NULL END) AS DOUBLE ) AS id7,
            CAST( count(CASE WHEN i.i_class_id=8  THEN 1 ELSE NULL END) AS DOUBLE ) AS id8,
            CAST( count(CASE WHEN i.i_class_id=9  THEN 1 ELSE NULL END) AS DOUBLE ) AS id9,
            CAST( count(CASE WHEN i.i_class_id=10 THEN 1 ELSE NULL END) AS DOUBLE ) AS id10,
            CAST( count(CASE WHEN i.i_class_id=11 THEN 1 ELSE NULL END) AS DOUBLE ) AS id11,
            CAST( count(CASE WHEN i.i_class_id=12 THEN 1 ELSE NULL END) AS DOUBLE ) AS id12,
            CAST( count(CASE WHEN i.i_class_id=13 THEN 1 ELSE NULL END) AS DOUBLE ) AS id13,
            CAST( count(CASE WHEN i.i_class_id=14 THEN 1 ELSE NULL END) AS DOUBLE ) AS id14,
            CAST( count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS DOUBLE ) AS id15
        FROM store_sales ss
        INNER JOIN item i
        ON
        (
            ss.ss_item_sk = i.i_item_sk
            AND i.i_category IN ('{q26_i_category_IN}')
            AND ss.ss_customer_sk IS NOT NULL
        )
        GROUP BY ss.ss_customer_sk
        HAVING count(ss.ss_item_sk) > {q26_count_ss_item_sk}
        ORDER BY cid
    """
    result = bc.sql(query)

    result = result.repartition(npartitions=1)
    result_ml = result.set_index('cid')

    ml_result_dict = get_clusters(client=client, kmeans_input_df=result_ml)
    return ml_result_dict
def main(client, config):
    wcs_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    ### filter nulls
    # SELECT
    #   wcs_user_sk,
    #   wcs_item_sk,
    #   (wcs_click_date_sk * 24 * 60 * 60 + wcs_click_time_sk) AS tstamp_inSec
    # FROM web_clickstreams
    # WHERE wcs_item_sk IS NOT NULL
    # AND wcs_user_sk IS NOT NULL
    f_wcs_df = wcs_df.map_partitions(pre_repartition_task)
    f_wcs_df = f_wcs_df.shuffle(on=["wcs_user_sk"])

    ### Main Query
    # SELECT
    #   item_sk_1, ${hiveconf:q02_item_sk} AS item_sk_2, COUNT (*) AS cnt
    # FROM
    # (
    # )
    # GROUP BY item_sk_1
    # ORDER BY
    #   cnt DESC,
    #   item_sk_1
    # LIMIT ${hiveconf:q02_limit};
    # q02_limit=30
    grouped_df = f_wcs_df.map_partitions(reduction_function,
                                         q02_session_timeout_inSec)

    items_value_counts = grouped_df.groupby(["i_item_sk"]).cnt.sum()
    items_value_counts = items_value_counts.map_partitions(
        lambda ser: ser.sort_values(ascending=False))

    ### final calculation on 30 values
    result_df = items_value_counts.reset_index(drop=False)
    result_df.columns = ["item_sk_1", "cnt"]
    result_df = result_df.head(q02_limit)
    result_df["item_sk_2"] = q02_item_sk
    result_order = ["item_sk_1", "item_sk_2", "cnt"]
    result_df = result_df[result_order]
    return result_df
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = """
        SELECT CASE WHEN pmc > 0.0 THEN CAST (amc AS DOUBLE) / CAST (pmc AS DOUBLE)
            ELSE -1.0 END AS am_pm_ratio
        FROM
        (
            SELECT SUM(amc1) AS amc, SUM(pmc1) AS pmc
            FROM
            (
                SELECT
                    CASE WHEN t_hour BETWEEN 7 AND 8 THEN COUNT(1) ELSE 0 END AS amc1,
                    CASE WHEN t_hour BETWEEN 19 AND 20 THEN COUNT(1) ELSE 0 END AS pmc1
                FROM web_sales ws
                JOIN household_demographics hd ON
                    (hd.hd_demo_sk = ws.ws_ship_hdemo_sk and hd.hd_dep_count = 5)
                JOIN web_page wp ON
                    (wp.wp_web_page_sk = ws.ws_web_page_sk and wp.wp_char_count BETWEEN 5000 AND 6000)
                JOIN time_dim td ON
                    (td.t_time_sk = ws.ws_sold_time_sk and td.t_hour IN (7,8,19,20))
                GROUP BY t_hour
            ) cnt_am_pm
        ) sum_am_pm
    """
    result = bc.sql(query)
    return result
def main(client, config):
    import cudf

    ss_ddf, items_ddf = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    items_filtered = items_ddf[items_ddf.i_category == Q26_CATEGORY].reset_index(
        drop=True)
    items_filtered = items_filtered[["i_item_sk", "i_class_id"]]

    f_ss_ddf = ss_ddf[ss_ddf["ss_customer_sk"].notnull()].reset_index(
        drop=True)
    merged_ddf = f_ss_ddf.merge(items_filtered,
                                left_on="ss_item_sk",
                                right_on="i_item_sk",
                                how="inner")
    keep_cols = ["ss_customer_sk", "i_class_id"]
    merged_ddf = merged_ddf[keep_cols]

    # One-Hot-Encode i_class_id
    merged_ddf = merged_ddf.map_partitions(
        cudf.DataFrame.one_hot_encoding,
        column="i_class_id",
        prefix="id",
        cats=[i for i in range(1, 16)],
        prefix_sep="",
        dtype="float32",
    )
    merged_ddf["total"] = 1.0  # Will keep track of total count

    all_categories = ["total"] + ["id%d" % i for i in range(1, 16)]

    # Aggregate using agg to get sorted ss_customer_sk
    agg_dict = dict.fromkeys(all_categories, "sum")
    rollup_ddf = merged_ddf.groupby("ss_customer_sk").agg(agg_dict)
    rollup_ddf = rollup_ddf[rollup_ddf.total > Q26_ITEM_COUNT][
        all_categories[1:]]

    # Prepare data for KMeans clustering
    rollup_ddf = rollup_ddf.astype("float64")
    kmeans_input_df = rollup_ddf.persist()

    results_dict = get_clusters(client=client, kmeans_input_df=kmeans_input_df)
    return results_dict
def main(client, config):
    date_df, inv_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    expr = (
        f"d_year == {q23_year} and (d_moy >= {q23_month} and d_moy <= {q23_month + 1})"
    )
    selected_dates_df = date_df.query(expr)

    merged_inv_dates = inv_df.merge(selected_dates_df,
                                    left_on="inv_date_sk",
                                    right_on="d_date_sk",
                                    how="inner")

    n_workers = len(client.scheduler_info()["workers"])
    iteration1_df = get_iteration1(merged_inv_dates, n_workers)

    # Select only the columns we are interested in
    iteration1_df = iteration1_df[[
        "inv_warehouse_sk", "inv_item_sk", "d_moy", "qty_cov"
    ]].repartition(npartitions=1)
    # iteration1_df has 40k rows at sf-100

    expr_1 = f"d_moy == {q23_month}"
    inv1_df = iteration1_df.query(expr_1)
    # inv1_df has 13k rows at sf-100

    expr_2 = f"d_moy == {q23_month + 1}"
    inv2_df = iteration1_df.query(expr_2)
    # 31k rows at sf-100

    result_df = inv1_df.merge(inv2_df, on=["inv_warehouse_sk", "inv_item_sk"])
    result_df = result_df.rename(
        columns={
            "d_moy_x": "d_moy",
            "d_moy_y": "inv2_d_moy",
            "qty_cov_x": "cov",
            "qty_cov_y": "inv2_cov",
        })

    result_df = result_df.persist()
    result_df = result_df.sort_values(by=["inv_warehouse_sk", "inv_item_sk"])
    result_df = result_df.reset_index(drop=True)
    result_df = result_df.persist()
    wait(result_df)
    return result_df
def main(client, config):
    ws_df, item_df, imp_df, ss_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    ## helper table
    item_imp_join_df = get_helper_query_table(imp_df, item_df)

    r_ss = get_ss(ss_df, item_imp_join_df)
    r_ws = get_ws(ws_df, item_imp_join_df)

    result_df = r_ws.merge(
        r_ss,
        left_on=["ws_item_sk", "imp_sk"],
        right_on=["ss_item_sk", "imp_sk"],
        how="inner",
        suffixes=("ws", "ss"),
    )

    result_df["cross_price_elasticity"] = (result_df["current_ss_quant"] +
                                           result_df["current_ws_quant"] -
                                           result_df["prev_ss_quant"] -
                                           result_df["prev_ws_quant"])
    result_df["cross_price_elasticity"] = result_df["cross_price_elasticity"] / (
        (result_df["prev_ss_quant"] + result_df["prev_ws_quant"]) *
        result_df["price_change"])

    final_cols_2_keep = ["ws_item_sk", "cross_price_elasticity"]
    result_df = result_df[final_cols_2_keep]
    result_df = result_df.groupby(["ws_item_sk"]).agg(
        {"cross_price_elasticity": "mean"})
    result_df = result_df.reset_index(drop=False)

    wait(result_df)
    return result_df
def main(client, config):
    item_df, ss_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    # SELECT DISTINCT ss_item_sk, ss_ticket_number
    # FROM store_sales s, item i
    # -- Only products in certain categories sold in specific stores are considered
    # WHERE s.ss_item_sk = i.i_item_sk
    # AND i.i_category_id IN ({q01_i_category_id_IN})
    # AND s.ss_store_sk IN ({q01_ss_store_sk_IN})
    f_ss_df = ss_df.loc[ss_df["ss_store_sk"].isin(q01_ss_store_sk_IN)][
        ["ss_item_sk", "ss_ticket_number"]
    ].reset_index(drop=True)
    f_item_df = item_df.loc[item_df["i_category_id"].isin(q01_i_category_id_IN)][
        ["i_item_sk"]
    ].reset_index(drop=True)

    ss_item_join = f_item_df.merge(
        f_ss_df, left_on=["i_item_sk"], right_on=["ss_item_sk"]
    )
    ss_item_join = ss_item_join[["ss_item_sk", "ss_ticket_number"]]

    ## keep to a single partition
    ## We only have 41,910,265 rows in the dataframe at sf-10k and don't need to split_out.
    ss_item_join = ss_item_join.drop_duplicates()

    ### do pair inner join
    pair_df = get_pairs(ss_item_join)

    # SELECT item_sk_1, item_sk_2, COUNT(*) AS cnt
    # FROM
    # (
    #   ...
    # )
    # GROUP BY item_sk_1, item_sk_2
    # -- 'frequently'
    # HAVING cnt > {q01_viewed_together_count}
    # ORDER BY cnt DESC, item_sk_1, item_sk_2
    grouped_df = (
        pair_df.groupby(["item_sk_1", "item_sk_2"])
        .size()
        .reset_index()
        .rename(columns={0: "cnt"})
    )
    grouped_df = grouped_df[grouped_df["cnt"] > q01_viewed_together_count].reset_index(
        drop=True
    )

    ### 2017 rows after filtration at sf-100
    ### should scale till sf-100k
    grouped_df = grouped_df.repartition(npartitions=1).persist()

    ## converting to strings because of issue
    ## https://github.com/rapidsai/tpcx-bb/issues/36
    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("str")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("str")
    grouped_df = grouped_df.map_partitions(
        lambda df: df.sort_values(
            by=["cnt", "item_sk_1", "item_sk_2"], ascending=[False, True, True]
        )
    )
    grouped_df = grouped_df.reset_index(drop=True)

    ### below is just 100 rows so it should fit in a `cudf` context
    grouped_df = grouped_df.head(q01_limit)

    ### writing back to int to ensure same values
    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("int32")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("int32")

    return grouped_df
def main(client, config):
    import dask_cudf
    import cudf

    item_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    wcs_tstamp_min = get_wcs_minima(config)

    item_df["i_item_sk"] = item_df["i_item_sk"].astype("int32")
    item_df["i_category_id"] = item_df["i_category_id"].astype("int8")

    # we eventually will only care about these categories, so we can filter now
    item_df_filtered = item_df.loc[item_df.i_category_id.isin(
        q03_purchased_item_category_IN)].reset_index(drop=True)

    # The main idea is that we don't fuse the filtration task with the reading task yet;
    # doing so causes more memory pressure as we try to read the whole thing (and spill it)
    # at once and only then do the filtration.
    ### The PR below has the dashboard snapshot which makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141

    web_clickstream_flist = glob.glob(
        os.path.join(config["data_dir"], "web_clickstreams/*.parquet"))
    task_ls = [
        delayed(pre_repartition_task)(fn,
                                      item_df.to_delayed()[0],
                                      wcs_tstamp_min)
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int32),
        "tstamp": np.ones(1, dtype=np.int32),
        "wcs_item_sk": np.ones(1, dtype=np.int32),
        "wcs_sales_sk": np.ones(1, dtype=np.int32),
        "i_category_id": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)
    merged_df = merged_df.shuffle(on="wcs_user_sk")

    meta_d = {
        "i_item_sk": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
        "cnt": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
    }
    meta_df = cudf.DataFrame(meta_d)

    grouped_df = merged_df.map_partitions(reduction_function,
                                          item_df_filtered.to_delayed()[0],
                                          meta=meta_df)

    ### todo: check if this has any impact on stability
    grouped_df = grouped_df.persist(priority=10000)
    ### todo: remove this later after more testing
    wait(grouped_df)
    print("---" * 20)
    print("grouping complete ={}".format(len(grouped_df)))

    grouped_df = grouped_df.groupby(["i_item_sk"]).sum(split_every=2).reset_index()
    grouped_df.columns = ["i_item_sk", "cnt"]
    result_df = grouped_df.map_partitions(
        lambda df: df.sort_values(by=["cnt"], ascending=False))

    result_df.columns = ["lastviewed_item", "cnt"]
    result_df["purchased_item"] = q03_purchased_item_IN
    cols_order = ["purchased_item", "lastviewed_item", "cnt"]
    result_df = result_df[cols_order]
    result_df = result_df.persist()
    ### todo: remove this later after more testing
    wait(result_df)
    print(len(result_df))
    result_df = result_df.head(q03_limit)
    print("result complete")
    print("---" * 20)
    return result_df
def main(client, config):
    import cudf
    import dask_cudf

    (date_dim_df, web_page_df, web_sales_df) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    date_dim_cov_df = date_dim_df.map_partitions(convert_datestring_to_days)
    q08_start_dt = np.datetime64(q08_STARTDATE, "D").astype(int)
    q08_end_dt = np.datetime64(q08_ENDDATE, "D").astype(int)
    filtered_date_df = date_dim_cov_df.query(
        f"d_date >= {q08_start_dt} and d_date <= {q08_end_dt}",
        meta=date_dim_cov_df._meta,
    ).reset_index(drop=True)

    # Convert wp_type to categorical and get cat_id of review and dynamic type
    # see https://github.com/rapidsai/cudf/issues/4093 for more info
    web_page_df = web_page_df.persist()

    # map_partitions is a bit faster than ddf[col].astype('category')
    web_page_df["wp_type"] = web_page_df["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = web_page_df["wp_type"].compute().cat.categories.to_pandas()
    REVIEW_CAT_CODE = cpu_categories.get_loc("review")

    # cast to minimum viable dtype
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))
    web_page_df["wp_type_codes"] = web_page_df["wp_type"].cat.codes.astype(
        codes_min_signed_type)
    web_page_newcols = ["wp_web_page_sk", "wp_type_codes"]
    web_page_df = web_page_df[web_page_newcols]

    web_clickstream_flist = glob.glob(config["data_dir"] +
                                      "web_clickstreams/*.parquet")
    task_ls = [
        delayed(etl_wcs)(fn,
                         filtered_date_df.to_delayed()[0],
                         web_page_df.to_delayed()[0])
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int64),
        "tstamp_inSec": np.ones(1, dtype=np.int64),
        "wcs_sales_sk": np.ones(1, dtype=np.int64),
        "wp_type_codes": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)
    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.repartition(columns=["wcs_user_sk"])
    reviewed_sales = merged_df.map_partitions(
        reduction_function,
        REVIEW_CAT_CODE,
        meta=cudf.DataFrame({"wcs_sales_sk": np.ones(1, dtype=np.int64)}),
    )
    reviewed_sales = reviewed_sales.persist()
    wait(reviewed_sales)
    del merged_df

    all_sales_in_year = filtered_date_df.merge(web_sales_df,
                                               left_on=["d_date_sk"],
                                               right_on=["ws_sold_date_sk"],
                                               how="inner")
    all_sales_in_year = all_sales_in_year[["ws_net_paid", "ws_order_number"]]

    all_sales_in_year = all_sales_in_year.persist()
    wait(all_sales_in_year)

    # note: switch to mainline
    # once https://github.com/dask/dask/pull/6066
    # lands
    q08_reviewed_sales = hash_merge(
        lhs=all_sales_in_year,
        rhs=reviewed_sales,
        left_on=["ws_order_number"],
        right_on=["wcs_sales_sk"],
        how="inner",
    )

    q08_reviewed_sales_sum = q08_reviewed_sales["ws_net_paid"].sum()
    q08_all_sales_sum = all_sales_in_year["ws_net_paid"].sum()

    q08_reviewed_sales_sum, q08_all_sales_sum = client.compute(
        [q08_reviewed_sales_sum, q08_all_sales_sum])
    q08_reviewed_sales_sum, q08_all_sales_sum = (
        q08_reviewed_sales_sum.result(),
        q08_all_sales_sum.result(),
    )

    no_q08_review_sales_amount = q08_all_sales_sum - q08_reviewed_sales_sum

    final_result_df = cudf.DataFrame()
    final_result_df["q08_review_sales_amount"] = [q08_reviewed_sales_sum]
    final_result_df["q08_review_sales_amount"] = final_result_df[
        "q08_review_sales_amount"].astype("int")
    final_result_df["no_q08_review_sales_amount"] = [no_q08_review_sales_amount]
    final_result_df["no_q08_review_sales_amount"] = final_result_df[
        "no_q08_review_sales_amount"].astype("int")

    return final_result_df
def main(client, config):
    import cudf
    import dask_cudf

    store_sales, date_dim, store, product_reviews = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    ### adding a wait call slows this down by 3-4 seconds, removing it for now

    ### Make TEMP_TABLE1

    # filter date table
    q18_startDate_int = np.datetime64(q18_startDate, "ms").astype(int)
    q18_endDate_int = np.datetime64(q18_endDate, "ms").astype(int)

    date_dim_filtered = date_dim.loc[
        (date_dim.d_date.astype("datetime64[ms]").astype("int") >= q18_startDate_int)
        & (date_dim.d_date.astype("datetime64[ms]").astype("int") <= q18_endDate_int)
    ].reset_index(drop=True)

    # build the regression_analysis table
    ss_date_dim_join = left_semi_join(
        store_sales,
        date_dim_filtered,
        left_on=["ss_sold_date_sk"],
        right_on=["d_date_sk"],
    )

    temp = (ss_date_dim_join.groupby(["ss_store_sk", "ss_sold_date_sk"]).agg(
        {"ss_net_paid": "sum"}).reset_index())

    temp["xx"] = temp.ss_sold_date_sk * temp.ss_sold_date_sk
    temp["xy"] = temp.ss_sold_date_sk * temp.ss_net_paid

    temp.columns = ["ss_store_sk", "x", "y", "xx", "xy"]

    regression_analysis = (temp.groupby(["ss_store_sk"]).agg({
        "x": ["count", "sum"],
        "xy": "sum",
        "y": "sum",
        "xx": "sum"
    }).reset_index(drop=False))

    regression_analysis["slope"] = (
        regression_analysis[("x", "count")] * regression_analysis[("xy", "sum")] -
        regression_analysis[("x", "sum")] * regression_analysis[("y", "sum")]
    ) / (
        regression_analysis[("x", "count")] * regression_analysis[("xx", "sum")] -
        regression_analysis[("x", "sum")] * regression_analysis[("x", "sum")]
    )
    regression_analysis = regression_analysis[["ss_store_sk", "slope"]]
    regression_analysis.columns = ["ss_store_sk", "slope"]

    regression_analysis["ss_store_sk"] = regression_analysis[
        "ss_store_sk"].astype("int32")
    store["s_store_sk"] = store["s_store_sk"].astype("int32")
    temp_table1 = store.merge(
        regression_analysis[["ss_store_sk", "slope"]]
        .query("slope <= 0")
        .reset_index(drop=True),
        left_on="s_store_sk",
        right_on="ss_store_sk",
    )
    temp_table1 = temp_table1[["s_store_sk", "s_store_name"]]

    # repartition this table to be one partition, since it's only 192 rows at SF1000
    temp_table1 = temp_table1.repartition(npartitions=1)
    temp_table1 = temp_table1.persist()

    ### Make TEMP_TABLE2
    stores_with_regression = temp_table1
    pr = product_reviews

    # known to be small. very few relevant stores (169) at SF1000
    targets = (
        stores_with_regression.s_store_name.str.lower().unique().compute().tolist()
    )
    n_targets = len(targets)

    no_nulls = pr[~pr.pr_review_content.isnull()].reset_index(drop=True)
    no_nulls["pr_review_sk"] = no_nulls["pr_review_sk"].astype("int32")

    ### persisting because no_nulls is used twice
    no_nulls = no_nulls.reset_index(drop=True).persist()

    temp_table2_meta_empty_df = cudf.DataFrame({
        "word": ["a"],
        "pr_review_sk": np.ones(1, dtype=np.int64),
        "pr_review_date": ["a"],
    }).head(0)

    ### get relevant reviews
    combined = no_nulls.map_partitions(
        find_relevant_reviews,
        targets,
        meta=temp_table2_meta_empty_df,
    )

    stores_with_regression["store_ID"] = stores_with_regression.s_store_sk.astype(
        "str").str.cat(stores_with_regression.s_store_name, sep="_")
    stores_with_regression[
        "s_store_name"] = stores_with_regression.s_store_name.str.lower()

    # Keep this commented line to illustrate that we could exactly match Spark
    # temp_table2 = temp_table2[['store_ID', 'pr_review_date', 'pr_review_content']]
    temp_table2 = combined.merge(stores_with_regression,
                                 how="inner",
                                 left_on=["word"],
                                 right_on=["s_store_name"])

    temp_table2 = temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]]
    temp_table2 = temp_table2.persist()

    ### REAL QUERY (PART THREE)
    no_nulls["pr_review_content"] = no_nulls.pr_review_content.str.replace(
        [". ", "? ", "! "], [EOL_CHAR], regex=False)
    sentences = no_nulls.map_partitions(create_sentences_from_reviews)

    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]

    # This file comes from the official TPCx-BB kit
    # We extracted it from bigbenchqueriesmr.jar
    sentiment_dir = "/".join(config["data_dir"].split("/")[:-3] +
                             ["sentiment_files"])
    with open(f"{sentiment_dir}/negativeSentiment.txt") as fh:
        negativeSentiment = list(map(str.strip, fh.readlines()))
    # dedupe for one extra record in the source file
    negativeSentiment = list(set(negativeSentiment))

    word_df = sentences.map_partitions(
        create_words_from_sentences,
        global_position_column="sentence_tokenized_global_pos",
    )
    sent_df = cudf.DataFrame({"word": negativeSentiment})
    sent_df["sentiment"] = "NEG"
    sent_df = dask_cudf.from_cudf(sent_df, npartitions=1)

    word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word")

    word_sentence_sentiment["sentence_idx_global_pos"] = word_sentence_sentiment[
        "sentence_idx_global_pos"].astype("int64")
    sentences["sentence_tokenized_global_pos"] = sentences[
        "sentence_tokenized_global_pos"].astype("int64")

    word_sentence_sentiment_with_sentence_info = word_sentence_sentiment.merge(
        sentences,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )
    temp_table2["pr_review_sk"] = temp_table2["pr_review_sk"].astype("int32")

    final = word_sentence_sentiment_with_sentence_info.merge(
        temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]],
        how="inner",
        left_on="review_idx_global_pos",
        right_on="pr_review_sk",
    )

    keepcols = ["store_ID", "pr_review_date", "sentence", "sentiment", "word"]
    final = final[keepcols]
    final.columns = [
        "s_name", "r_date", "r_sentence", "sentiment", "sentiment_word"
    ]
    final = final.persist()
    wait(final)
    final = final.sort_values(
        ["s_name", "r_date", "r_sentence", "sentiment_word"])
    final = final.persist()
    wait(final)
    print(len(final))
    return final