def q17(lineitem, part):
    """Print the "avg_yearly" figure of query 17 and the elapsed wall time.

    Line items of parts with brand "Brand#23" in a "MED BOX" container are
    kept when their quantity is below 20% of the mean quantity of *all* line
    items for the same part.  The extended prices of the kept rows are summed
    and divided by 7.0.

    Args:
        lineitem: DataFrame with L_PARTKEY, L_QUANTITY and L_EXTENDEDPRICE.
        part: DataFrame with P_PARTKEY, P_BRAND and P_CONTAINER.

    Returns:
        None. The one-row result frame and the timing are printed.
    """
    start = time.time()

    # Keys of the parts that have the requested brand and container.
    wanted = (part["P_BRAND"] == "Brand#23") & (part["P_CONTAINER"] == "MED BOX")
    part_keys = part.loc[wanted, ["P_PARTKEY"]]

    # Line items that belong to one of those parts.
    part_items = lineitem[["L_PARTKEY", "L_QUANTITY", "L_EXTENDEDPRICE"]].merge(
        part_keys, left_on="L_PARTKEY", right_on="P_PARTKEY", how="inner"
    )[["L_QUANTITY", "L_EXTENDEDPRICE", "P_PARTKEY"]]

    # Per-part quantity limit: 0.2 times the mean quantity over every line
    # item of the part (computed on the unfiltered lineitem table).
    limits = (
        lineitem[["L_PARTKEY", "L_QUANTITY"]]
        .groupby(["L_PARTKEY"], as_index=False)
        .agg(avg=pd.NamedAgg(column="L_QUANTITY", aggfunc="mean"))
    )
    limits["avg"] = 0.2 * limits["avg"]

    with_limits = part_items.merge(
        limits, left_on="P_PARTKEY", right_on="L_PARTKEY", how="inner"
    )
    below_limit = with_limits[with_limits["L_QUANTITY"] < with_limits["avg"]]

    total = pd.DataFrame(
        {"avg_yearly": [below_limit["L_EXTENDEDPRICE"].sum() / 7.0]}
    )
    print(total)
    print("Q17 Execution time (s): ", time.time() - start)
def q15(lineitem, supplier):
    """Print the supplier(s) with the highest revenue in 1996 Q1, plus timing.

    Revenue is ``L_EXTENDEDPRICE * (1 - L_DISCOUNT)`` summed per supplier over
    line items shipped on or after 1996-01-01 and before 1996-04-01.  Every
    supplier whose total equals the maximum is reported (ties are kept).

    Args:
        lineitem: DataFrame with L_SHIPDATE, L_SUPPKEY, L_EXTENDEDPRICE and
            L_DISCOUNT.  L_SHIPDATE must be comparable with ``pd.Timestamp``.
        supplier: DataFrame with S_SUPPKEY, S_NAME, S_ADDRESS and S_PHONE.

    Returns:
        None. The result frame and the timing are printed.
    """
    start = time.time()

    window_start = pd.Timestamp("1996-01-01")
    window_end = window_start + pd.DateOffset(months=3)
    ship_date = lineitem["L_SHIPDATE"]
    in_window = lineitem[(ship_date >= window_start) & (ship_date < window_end)]

    # Build the revenue column with assign() on a fresh projection instead of
    # writing into the filtered slice, which is chained assignment and raises
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    revenue_parts = in_window.loc[:, ["L_SUPPKEY"]].assign(
        REVENUE_PARTS=in_window["L_EXTENDEDPRICE"]
        * (1.0 - in_window["L_DISCOUNT"])
    )

    # `copy=False` is no longer passed to rename(): the keyword is deprecated
    # in recent pandas and has no influence on the result.
    revenue_table = (
        revenue_parts.groupby("L_SUPPKEY", as_index=False)
        .agg(TOTAL_REVENUE=pd.NamedAgg(column="REVENUE_PARTS", aggfunc="sum"))
        .rename(columns={"L_SUPPKEY": "SUPPLIER_NO"})
    )

    max_revenue = revenue_table["TOTAL_REVENUE"].max()
    top_revenue = revenue_table[revenue_table["TOTAL_REVENUE"] == max_revenue]

    supplier_info = supplier.loc[
        :, ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE"]
    ]
    total = supplier_info.merge(
        top_revenue, left_on="S_SUPPKEY", right_on="SUPPLIER_NO", how="inner"
    )
    total = total.loc[
        :, ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"]
    ]
    print(total)
    print("Q15 Execution time (s): ", time.time() - start)
def q11(partsupp, supplier, nation):
    """Print the parts holding a significant share of German stock value.

    The stock value of a part-supplier row is ``PS_SUPPLYCOST * PS_AVAILQTY``.
    Only rows whose supplier belongs to the nation named "GERMANY" are
    considered.  Values are summed per part, and parts whose sum exceeds
    0.0001 times the overall sum are printed, highest value first, followed
    by the elapsed wall time.

    Args:
        partsupp: DataFrame with PS_PARTKEY, PS_SUPPKEY, PS_SUPPLYCOST and
            PS_AVAILQTY.
        supplier: DataFrame with S_SUPPKEY and S_NATIONKEY.
        nation: DataFrame with N_NATIONKEY and N_NAME.

    Returns:
        None. The result frame and the timing are printed.
    """
    start = time.time()

    stock_value = partsupp.loc[:, ["PS_PARTKEY", "PS_SUPPKEY"]].assign(
        TOTAL_COST=partsupp["PS_SUPPLYCOST"] * partsupp["PS_AVAILQTY"]
    )
    supplier_nations = supplier.loc[:, ["S_SUPPKEY", "S_NATIONKEY"]]

    ps_supp = stock_value.merge(
        supplier_nations, left_on="PS_SUPPKEY", right_on="S_SUPPKEY", how="inner"
    )
    # The original evaluated this projection but threw the result away, so
    # the join keys were carried into the next merge.  Assign it back.
    ps_supp = ps_supp.loc[:, ["PS_PARTKEY", "S_NATIONKEY", "TOTAL_COST"]]

    german_keys = nation.loc[nation["N_NAME"] == "GERMANY", ["N_NATIONKEY"]]
    german_stock = ps_supp.merge(
        german_keys, left_on="S_NATIONKEY", right_on="N_NATIONKEY", how="inner"
    ).loc[:, ["PS_PARTKEY", "TOTAL_COST"]]

    # Cut-off: a fixed fraction of the value of all selected stock.
    threshold = german_stock["TOTAL_COST"].sum() * 0.0001

    total = german_stock.groupby(["PS_PARTKEY"], as_index=False).agg(
        VALUE=pd.NamedAgg(column="TOTAL_COST", aggfunc="sum")
    )
    total = total[total["VALUE"] > threshold]
    total = total.sort_values("VALUE", ascending=False)
    print(total)
    print("Q11 Execution time (s): ", time.time() - start)
def q13(customer, orders):
    """Print how many customers placed each number of qualifying orders.

    Orders whose O_COMMENT contains "special" followed (anywhere later, even
    across line breaks) by "requests" are ignored.  Customers are left-joined
    to the remaining orders so that customers without any order get a count
    of 0.  The output lists, for each order count C_COUNT, the number of
    customers CUSTDIST having it, sorted by CUSTDIST and then C_COUNT, both
    descending, followed by the elapsed wall time.

    Args:
        customer: DataFrame with C_CUSTKEY.
        orders: DataFrame with O_ORDERKEY, O_CUSTKEY and a string O_COMMENT.

    Returns:
        None. The result frame and the timing are printed.
    """
    start = time.time()

    # Raw string: "\S" and "\s" are invalid escapes in a normal literal.
    # The character class [\s\S] matches exactly what (\S|\s) matched but
    # has no capture group, so str.contains does not warn about match groups.
    excluded = orders["O_COMMENT"].str.contains(r"special[\s\S]*requests")
    kept_orders = orders[~excluded].loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]

    customer_orders = customer.loc[:, ["C_CUSTKEY"]].merge(
        kept_orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="left"
    )
    customer_orders = customer_orders.loc[:, ["C_CUSTKEY", "O_ORDERKEY"]]

    # count() skips the NaN order key produced by the left join, which is
    # what gives customers without orders a count of zero.
    per_customer = customer_orders.groupby(["C_CUSTKEY"], as_index=False).agg(
        C_COUNT=pd.NamedAgg(column="O_ORDERKEY", aggfunc="count")
    )

    total = per_customer.groupby(["C_COUNT"], as_index=False).size()
    total.columns = ["C_COUNT", "CUSTDIST"]
    total = total.sort_values(by=["CUSTDIST", "C_COUNT"], ascending=[False, False])
    print(total)
    print("Q13 Execution time (s): ", time.time() - start)
def q22(customer, orders):
    """Print per-country-code counts and balances of order-less customers.

    The country code is the first two characters of C_PHONE.  Customers are
    considered when their code is one of "13", "31", "23", "29", "30", "18",
    "17" and their balance is positive.  Among those, only customers whose
    balance is above the mean balance of that group and whose key never
    appears in ``orders["O_CUSTKEY"]`` are kept.  For each country code the
    number of kept customers (NUMCUST) and the sum of their balances
    (TOTACCTBAL) is printed in ascending code order, followed by the elapsed
    wall time.

    Args:
        customer: DataFrame with C_CUSTKEY, C_PHONE (string) and C_ACCTBAL.
        orders: DataFrame with O_CUSTKEY.

    Returns:
        None. The result frame and the timing are printed.
    """
    start = time.time()
    country_codes = ["13", "31", "23", "29", "30", "18", "17"]

    candidates = customer.loc[:, ["C_ACCTBAL", "C_CUSTKEY"]].assign(
        CNTRYCODE=customer["C_PHONE"].str.slice(0, 2)
    )
    # Both conditions are read from the frame being filtered rather than
    # mixing in a mask computed on the original `customer` frame.
    candidates = candidates[
        (candidates["C_ACCTBAL"] > 0.00)
        & candidates["CNTRYCODE"].isin(country_codes)
    ]

    avg_value = candidates["C_ACCTBAL"].mean()
    candidates = candidates[candidates["C_ACCTBAL"] > avg_value]

    # Anti-join: keep customers whose key does not occur in orders.  This
    # replaces the left merge + isna() filter + merge back of the original.
    has_order = candidates["C_CUSTKEY"].isin(orders["O_CUSTKEY"])
    selected = candidates.loc[~has_order, ["CNTRYCODE", "C_ACCTBAL"]]

    # One aggregation pass instead of two groupbys merged on CNTRYCODE.
    # C_ACCTBAL cannot be null here (it passed the `> avg_value` filter), so
    # counting it is the same as counting rows.
    total = selected.groupby(["CNTRYCODE"], as_index=False).agg(
        NUMCUST=pd.NamedAgg(column="C_ACCTBAL", aggfunc="count"),
        TOTACCTBAL=pd.NamedAgg(column="C_ACCTBAL", aggfunc="sum"),
    )
    total = total.sort_values(by=["CNTRYCODE"], ascending=[True])
    print(total)
    print("Q22 Execution time (s): ", time.time() - start)
def _q07_one_direction(customer_nation, supplier_nation, customer_keys,
                       order_keys, supplier_keys, shipped):
    """Return the shipped volume rows for one customer/supplier nation pair.

    Customers of ``customer_nation`` are joined to their orders, suppliers of
    ``supplier_nation`` to their line items, and the two sides are matched on
    the order key.  The result has the columns CUST_NATION, SUPP_NATION,
    L_YEAR and VOLUME.
    """
    nation_customers = customer_keys.merge(
        customer_nation, left_on="C_NATIONKEY", right_on="N_NATIONKEY", how="inner"
    )
    nation_customers = nation_customers.drop(
        columns=["C_NATIONKEY", "N_NATIONKEY"]
    ).rename(columns={"N_NAME": "CUST_NATION"})
    nation_orders = nation_customers.merge(
        order_keys, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="inner"
    ).drop(columns=["C_CUSTKEY", "O_CUSTKEY"])

    # NOTE: joining suppliers to line items first is faster than first
    # merging lineitem with the customer/order side.
    nation_suppliers = supplier_keys.merge(
        supplier_nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY", how="inner"
    )
    nation_suppliers = nation_suppliers.drop(
        columns=["S_NATIONKEY", "N_NATIONKEY"]
    ).rename(columns={"N_NAME": "SUPP_NATION"})
    nation_shipments = nation_suppliers.merge(
        shipped, left_on="S_SUPPKEY", right_on="L_SUPPKEY", how="inner"
    ).drop(columns=["S_SUPPKEY", "L_SUPPKEY"])

    return nation_orders.merge(
        nation_shipments, left_on="O_ORDERKEY", right_on="L_ORDERKEY", how="inner"
    ).drop(columns=["O_ORDERKEY", "L_ORDERKEY"])


def q07(lineitem, supplier, orders, customer, nation):
    """Print yearly shipping revenue between FRANCE and GERMANY, plus timing.

    This version is faster than q07_old. Keeping the old one for reference.

    Line items shipped on or after 1995-01-01 and before 1997-01-01 are
    considered.  A line item counts when its supplier is in one of the two
    nations and the customer of its order is in the other.  Revenue is
    ``L_EXTENDEDPRICE * (1 - L_DISCOUNT)`` summed per supplier nation,
    customer nation and ship year, printed in ascending order of those keys.

    Args:
        lineitem: DataFrame with L_ORDERKEY, L_SUPPKEY, L_SHIPDATE,
            L_EXTENDEDPRICE and L_DISCOUNT.  L_SHIPDATE is assumed to be a
            datetime64 column (required by the ``.dt`` accessor).
        supplier: DataFrame with S_SUPPKEY and S_NATIONKEY.
        orders: DataFrame with O_ORDERKEY and O_CUSTKEY.
        customer: DataFrame with C_CUSTKEY and C_NATIONKEY.
        nation: DataFrame with N_NATIONKEY and N_NAME.

    Returns:
        None. The result frame and the timing are printed.
    """
    start = time.time()

    ship_date = lineitem["L_SHIPDATE"]
    in_window = lineitem[
        (ship_date >= pd.Timestamp("1995-01-01"))
        & (ship_date < pd.Timestamp("1997-01-01"))
    ]
    # assign() builds the derived columns on a fresh projection rather than
    # writing into the filtered slice, and `.dt.year` extracts the year in
    # one vectorised step instead of a Python-level call per row.
    shipped = in_window.loc[:, ["L_ORDERKEY", "L_SUPPKEY"]].assign(
        L_YEAR=in_window["L_SHIPDATE"].dt.year,
        VOLUME=in_window["L_EXTENDEDPRICE"] * (1.0 - in_window["L_DISCOUNT"]),
    )

    supplier_keys = supplier.loc[:, ["S_SUPPKEY", "S_NATIONKEY"]]
    order_keys = orders.loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]
    customer_keys = customer.loc[:, ["C_CUSTKEY", "C_NATIONKEY"]]
    n1 = nation.loc[nation["N_NAME"] == "FRANCE", ["N_NATIONKEY", "N_NAME"]]
    n2 = nation.loc[nation["N_NAME"] == "GERMANY", ["N_NATIONKEY", "N_NAME"]]

    # Customers in nation 1 buying from suppliers in nation 2, then the
    # reverse direction; both were spelled out twice in the original.
    total1 = _q07_one_direction(
        n1, n2, customer_keys, order_keys, supplier_keys, shipped
    )
    total2 = _q07_one_direction(
        n2, n1, customer_keys, order_keys, supplier_keys, shipped
    )

    total = pd.concat([total1, total2])
    total = total.groupby(
        ["SUPP_NATION", "CUST_NATION", "L_YEAR"], as_index=False
    ).agg(REVENUE=pd.NamedAgg(column="VOLUME", aggfunc="sum"))
    total = total.sort_values(
        by=["SUPP_NATION", "CUST_NATION", "L_YEAR"], ascending=[True, True, True]
    )
    print(total)
    print("Q07 Execution time (s): ", time.time() - start)