def q10(lineitem, orders, customer, nation):
    t1 = time.time()
    date1 = pd.Timestamp("1994-11-01")
    date2 = pd.Timestamp("1995-02-01")
    osel = (orders.O_ORDERDATE >= date1) & (orders.O_ORDERDATE < date2)
    lsel = lineitem.L_RETURNFLAG == "R"
    forders = orders[osel]
    flineitem = lineitem[lsel]
    jn1 = flineitem.merge(forders, left_on="L_ORDERKEY", right_on="O_ORDERKEY")
    jn2 = jn1.merge(customer, left_on="O_CUSTKEY", right_on="C_CUSTKEY")
    jn3 = jn2.merge(nation, left_on="C_NATIONKEY", right_on="N_NATIONKEY")
    jn3["TMP"] = jn3.L_EXTENDEDPRICE * (1.0 - jn3.L_DISCOUNT)
    gb = jn3.groupby(
        [
            "C_CUSTKEY",
            "C_NAME",
            "C_ACCTBAL",
            "C_PHONE",
            "N_NAME",
            "C_ADDRESS",
            "C_COMMENT",
        ],
        as_index=False,
    )["TMP"].sum()
    total = gb.sort_values("TMP", ascending=False)
    print(total.head(20))
    print("Q10 Execution time (s): ", time.time() - t1)
def q20(lineitem, part, nation, partsupp, supplier):
    t1 = time.time()
    date1 = pd.Timestamp("1996-01-01")
    date2 = pd.Timestamp("1997-01-01")
    psel = part.P_NAME.str.startswith("azure")
    nsel = nation.N_NAME == "JORDAN"
    lsel = (lineitem.L_SHIPDATE >= date1) & (lineitem.L_SHIPDATE < date2)
    fpart = part[psel]
    fnation = nation[nsel]
    flineitem = lineitem[lsel]
    jn1 = fpart.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY")
    jn2 = jn1.merge(
        flineitem,
        left_on=["PS_PARTKEY", "PS_SUPPKEY"],
        right_on=["L_PARTKEY", "L_SUPPKEY"],
    )
    gb = jn2.groupby(["PS_PARTKEY", "PS_SUPPKEY", "PS_AVAILQTY"], as_index=False)[
        "L_QUANTITY"
    ].sum()
    gbsel = gb.PS_AVAILQTY > (0.5 * gb.L_QUANTITY)
    fgb = gb[gbsel]
    jn3 = fgb.merge(supplier, left_on="PS_SUPPKEY", right_on="S_SUPPKEY")
    jn4 = fnation.merge(jn3, left_on="N_NATIONKEY", right_on="S_NATIONKEY")
    jn4 = jn4.loc[:, ["S_NAME", "S_ADDRESS"]]
    total = jn4.sort_values("S_NAME").drop_duplicates()
    print(total)
    print("Q20 Execution time (s): ", time.time() - t1)
def q15(lineitem, supplier):
    t1 = time.time()
    lineitem_filtered = lineitem[
        (lineitem["L_SHIPDATE"] >= pd.Timestamp("1996-01-01"))
        & (lineitem["L_SHIPDATE"] < (pd.Timestamp("1996-01-01") + pd.DateOffset(months=3)))
    ]
    lineitem_filtered["REVENUE_PARTS"] = lineitem_filtered["L_EXTENDEDPRICE"] * (
        1.0 - lineitem_filtered["L_DISCOUNT"]
    )
    lineitem_filtered = lineitem_filtered.loc[:, ["L_SUPPKEY", "REVENUE_PARTS"]]
    revenue_table = (
        lineitem_filtered.groupby("L_SUPPKEY", as_index=False)
        .agg(TOTAL_REVENUE=pd.NamedAgg(column="REVENUE_PARTS", aggfunc="sum"))
        .rename(columns={"L_SUPPKEY": "SUPPLIER_NO"}, copy=False)
    )
    max_revenue = revenue_table["TOTAL_REVENUE"].max()
    revenue_table = revenue_table[revenue_table["TOTAL_REVENUE"] == max_revenue]
    supplier_filtered = supplier.loc[:, ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE"]]
    total = supplier_filtered.merge(
        revenue_table, left_on="S_SUPPKEY", right_on="SUPPLIER_NO", how="inner"
    )
    total = total.loc[:, ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"]]
    print(total)
    print("Q15 Execution time (s): ", time.time() - t1)
def q12(lineitem, orders):
    t1 = time.time()
    date1 = pd.Timestamp("1994-01-01")
    date2 = pd.Timestamp("1995-01-01")
    sel = (
        (lineitem.L_RECEIPTDATE < date2)
        & (lineitem.L_COMMITDATE < date2)
        & (lineitem.L_SHIPDATE < date2)
        & (lineitem.L_SHIPDATE < lineitem.L_COMMITDATE)
        & (lineitem.L_COMMITDATE < lineitem.L_RECEIPTDATE)
        & (lineitem.L_RECEIPTDATE >= date1)
        & ((lineitem.L_SHIPMODE == "MAIL") | (lineitem.L_SHIPMODE == "SHIP"))
    )
    flineitem = lineitem[sel]
    jn = flineitem.merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY")

    def g1(x):
        return ((x == "1-URGENT") | (x == "2-HIGH")).sum()

    def g2(x):
        return ((x != "1-URGENT") & (x != "2-HIGH")).sum()

    total = jn.groupby("L_SHIPMODE", as_index=False)["O_ORDERPRIORITY"].agg([g1, g2])
    total = total.sort_values("L_SHIPMODE")
    print(total)
    print("Q12 Execution time (s): ", time.time() - t1)
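# A minimal alternative sketch (not part of the benchmark): the same per-ship-mode
# high/low priority counts as in q12 above, computed with boolean indicator columns
# instead of the custom g1/g2 aggregators. `q12_counts_sketch` is a hypothetical
# helper; it assumes the joined frame `jn` built inside q12.
def q12_counts_sketch(jn):
    high = jn["O_ORDERPRIORITY"].isin(["1-URGENT", "2-HIGH"])
    tmp = jn.assign(
        HIGH_LINE_COUNT=high.astype(int),
        LOW_LINE_COUNT=(~high).astype(int),
    )
    return (
        tmp.groupby("L_SHIPMODE", as_index=False)[["HIGH_LINE_COUNT", "LOW_LINE_COUNT"]]
        .sum()
        .sort_values("L_SHIPMODE")
    )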
def test_mixed_type_sort():
    df = pd.DataFrame({
        "Mixed": [pd.Timestamp("2017-02-02"), np.nan, 1e10, 5, "hey"],
        1: [5, 4, "hello", 2, 1],
    })
    spreadsheet = SpreadsheetWidget(df=df)
    spreadsheet._handle_view_msg_helper({
        "type": "change_sort",
        "sort_field": "Mixed",
        "sort_ascending": True,
    })
    expected_order_mixed = [1e10, pd.Timestamp("2017-02-02"), 5, "hey", np.nan]
    sorted_order_mixed = list(spreadsheet.get_changed_df()["Mixed"])
    assert expected_order_mixed[:4] == sorted_order_mixed[:4]
    # np.nan != np.nan by definition, so check that the values at index 4
    # are both np.nan
    assert np.isnan(expected_order_mixed[4]) and np.isnan(sorted_order_mixed[4])
    # check that sorting on numeric column names works
    spreadsheet._handle_view_msg_helper({
        "type": "change_sort",
        "sort_field": 1,
        "sort_ascending": True,
    })
    expected_order_1 = [1, 2, 4, 5, "hello"]
    sorted_order_1 = list(spreadsheet.get_changed_df()[1])
    assert expected_order_1 == sorted_order_1
def test_edit_date():
    view = SpreadsheetWidget(df=create_df())
    check_edit_success(
        view,
        "Date",
        3,
        pd.Timestamp("2013-01-02 00:00:00"),
        "2013-01-02T00:00:00.000Z",
        pd.Timestamp("2013-01-16 00:00:00"),
        "2013-01-16T00:00:00.000Z",
    )
def test_squeeze():
    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 10, 11],
        "col4": [12, 13, 14, 15],
        "col5": [0, 0, 0, 0],
    }
    frame_data_2 = {"col1": [0, 1, 2, 3]}
    frame_data_3 = {
        "col1": [0],
        "col2": [4],
        "col3": [8],
        "col4": [12],
        "col5": [0],
    }
    frame_data_4 = {"col1": [2]}
    frame_data_5 = {"col1": ["string"]}
    # Different data for different cases
    pandas_df = pandas.DataFrame(frame_data).squeeze()
    modin_df = pd.DataFrame(frame_data).squeeze()
    df_equals(modin_df, pandas_df)

    pandas_df_2 = pandas.DataFrame(frame_data_2).squeeze()
    modin_df_2 = pd.DataFrame(frame_data_2).squeeze()
    df_equals(modin_df_2, pandas_df_2)

    pandas_df_3 = pandas.DataFrame(frame_data_3).squeeze()
    modin_df_3 = pd.DataFrame(frame_data_3).squeeze()
    df_equals(modin_df_3, pandas_df_3)

    pandas_df_4 = pandas.DataFrame(frame_data_4).squeeze()
    modin_df_4 = pd.DataFrame(frame_data_4).squeeze()
    df_equals(modin_df_4, pandas_df_4)

    pandas_df_5 = pandas.DataFrame(frame_data_5).squeeze()
    modin_df_5 = pd.DataFrame(frame_data_5).squeeze()
    df_equals(modin_df_5, pandas_df_5)

    data = [
        [
            pd.Timestamp("2019-01-02"),
            pd.Timestamp("2019-01-03"),
            pd.Timestamp("2019-01-04"),
            pd.Timestamp("2019-01-05"),
        ],
        [1, 1, 1, 2],
    ]
    df = pd.DataFrame(data, index=["date", "value"]).T
    pf = pandas.DataFrame(data, index=["date", "value"]).T
    df.set_index("date", inplace=True)
    pf.set_index("date", inplace=True)
    df_equals(df.iloc[0], pf.iloc[0])
def q04(lineitem, orders):
    t1 = time.time()
    date1 = pd.Timestamp("1993-11-01")
    date2 = pd.Timestamp("1993-08-01")
    lsel = lineitem.L_COMMITDATE < lineitem.L_RECEIPTDATE
    osel = (orders.O_ORDERDATE < date1) & (orders.O_ORDERDATE >= date2)
    flineitem = lineitem[lsel]
    forders = orders[osel]
    jn = forders[forders["O_ORDERKEY"].isin(flineitem["L_ORDERKEY"])]
    total = (
        jn.groupby("O_ORDERPRIORITY", as_index=False)["O_ORDERKEY"]
        .count()
        .sort_values(["O_ORDERPRIORITY"])
    )
    print(total)
    print("Q04 Execution time (s): ", time.time() - t1)
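# A hypothetical merge-based variant of the isin() semi-join used in q04 above
# (a sketch, not the benchmark's method): joining against the distinct order
# keys yields the same filtered orders.
def q04_semijoin_sketch(forders, flineitem):
    keys = flineitem.loc[:, ["L_ORDERKEY"]].drop_duplicates()
    jn = forders.merge(keys, left_on="O_ORDERKEY", right_on="L_ORDERKEY", how="inner")
    # drop the join key brought in from lineitem to match the isin() result
    return jn.drop(columns=["L_ORDERKEY"])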
def q14(lineitem, part):
    t1 = time.time()
    startDate = pd.Timestamp("1994-03-01")
    endDate = pd.Timestamp("1994-04-01")
    p_type_like = "PROMO"
    part_filtered = part.loc[:, ["P_PARTKEY", "P_TYPE"]]
    lineitem_filtered = lineitem.loc[
        :, ["L_EXTENDEDPRICE", "L_DISCOUNT", "L_SHIPDATE", "L_PARTKEY"]
    ]
    sel = (lineitem_filtered.L_SHIPDATE >= startDate) & (
        lineitem_filtered.L_SHIPDATE < endDate
    )
    flineitem = lineitem_filtered[sel]
    jn = flineitem.merge(part_filtered, left_on="L_PARTKEY", right_on="P_PARTKEY")
    jn["TMP"] = jn.L_EXTENDEDPRICE * (1.0 - jn.L_DISCOUNT)
    total = jn[jn.P_TYPE.str.startswith(p_type_like)].TMP.sum() * 100 / jn.TMP.sum()
    print(total)
    print("Q14 Execution time (s): ", time.time() - t1)
def q03(lineitem, orders, customer):
    t1 = time.time()
    date = pd.Timestamp("1995-03-04")
    lineitem_filtered = lineitem.loc[
        :, ["L_ORDERKEY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_SHIPDATE"]
    ]
    orders_filtered = orders.loc[
        :, ["O_ORDERKEY", "O_CUSTKEY", "O_ORDERDATE", "O_SHIPPRIORITY"]
    ]
    customer_filtered = customer.loc[:, ["C_MKTSEGMENT", "C_CUSTKEY"]]
    lsel = lineitem_filtered.L_SHIPDATE > date
    osel = orders_filtered.O_ORDERDATE < date
    csel = customer_filtered.C_MKTSEGMENT == "HOUSEHOLD"
    flineitem = lineitem_filtered[lsel]
    forders = orders_filtered[osel]
    fcustomer = customer_filtered[csel]
    jn1 = fcustomer.merge(forders, left_on="C_CUSTKEY", right_on="O_CUSTKEY")
    jn2 = jn1.merge(flineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY")
    jn2["TMP"] = jn2.L_EXTENDEDPRICE * (1 - jn2.L_DISCOUNT)
    total = (
        jn2.groupby(["L_ORDERKEY", "O_ORDERDATE", "O_SHIPPRIORITY"], as_index=False)[
            "TMP"
        ]
        .sum()
        .sort_values(["TMP"], ascending=False)
    )
    res = total.loc[:, ["L_ORDERKEY", "TMP", "O_ORDERDATE", "O_SHIPPRIORITY"]]
    print(res.head(10))
    print("Q03 Execution time (s): ", time.time() - t1)
def q06(lineitem):
    t1 = time.time()
    date1 = pd.Timestamp("1996-01-01")
    date2 = pd.Timestamp("1997-01-01")
    lineitem_filtered = lineitem.loc[
        :, ["L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_SHIPDATE"]
    ]
    sel = (
        (lineitem_filtered.L_SHIPDATE >= date1)
        & (lineitem_filtered.L_SHIPDATE < date2)
        & (lineitem_filtered.L_DISCOUNT >= 0.08)
        & (lineitem_filtered.L_DISCOUNT <= 0.1)
        & (lineitem_filtered.L_QUANTITY < 24)
    )
    flineitem = lineitem_filtered[sel]
    total = (flineitem.L_EXTENDEDPRICE * flineitem.L_DISCOUNT).sum()
    print(total)
    print("Q06 Execution time (s): ", time.time() - t1)
def test_add_row_button():
    widget = SpreadsheetWidget(df=create_df())
    event_history = init_event_history("row_added", widget=widget)
    widget._handle_view_msg_helper({"type": "add_row"})
    assert event_history == [{"name": "row_added", "index": 4, "source": "gui"}]
    # make sure the added row in the internal dataframe contains the
    # expected values
    added_index = event_history[0]["index"]
    expected_values = pd.Series({
        "modin_spreadsheet_unfiltered_index": 4,
        "A": 1,
        "C": 1,
        "D": 3,
        "Date": pd.Timestamp("2013-01-02 00:00:00"),
        "E": "bar",
        "F": "fox",
    })
    sort_idx = widget._df.loc[added_index].index
    assert (widget._df.loc[added_index] == expected_values[sort_idx]).all()
def test_to_datetime():
    # DataFrame input for to_datetime
    modin_df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
    pandas_df = pandas.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
    df_equals(pd.to_datetime(modin_df), pandas.to_datetime(pandas_df))

    # Series input for to_datetime
    modin_s = pd.Series(["3/11/2000", "3/12/2000", "3/13/2000"] * 1000)
    pandas_s = pandas.Series(["3/11/2000", "3/12/2000", "3/13/2000"] * 1000)
    df_equals(pd.to_datetime(modin_s), pandas.to_datetime(pandas_s))

    # Other inputs for to_datetime
    value = 1490195805
    assert pd.to_datetime(value, unit="s") == pandas.to_datetime(value, unit="s")
    value = 1490195805433502912
    assert pd.to_datetime(value, unit="ns") == pandas.to_datetime(value, unit="ns")
    value = [1, 2, 3]
    assert pd.to_datetime(value, unit="D", origin=pd.Timestamp("2000-01-01")).equals(
        pandas.to_datetime(value, unit="D", origin=pandas.Timestamp("2000-01-01"))
    )
def get_test_data():
    return {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "foo", "bar"]),
        "F": ["foo", "bar", "buzz", "fox"],
    }
def q05(lineitem, orders, customer, nation, region, supplier):
    t1 = time.time()
    date1 = pd.Timestamp("1996-01-01")
    date2 = pd.Timestamp("1997-01-01")
    rsel = region.R_NAME == "ASIA"
    osel = (orders.O_ORDERDATE >= date1) & (orders.O_ORDERDATE < date2)
    forders = orders[osel]
    fregion = region[rsel]
    jn1 = fregion.merge(nation, left_on="R_REGIONKEY", right_on="N_REGIONKEY")
    jn2 = jn1.merge(customer, left_on="N_NATIONKEY", right_on="C_NATIONKEY")
    jn3 = jn2.merge(forders, left_on="C_CUSTKEY", right_on="O_CUSTKEY")
    jn4 = jn3.merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY")
    jn5 = supplier.merge(
        jn4, left_on=["S_SUPPKEY", "S_NATIONKEY"], right_on=["L_SUPPKEY", "N_NATIONKEY"]
    )
    jn5["TMP"] = jn5.L_EXTENDEDPRICE * (1.0 - jn5.L_DISCOUNT)
    gb = jn5.groupby("N_NAME", as_index=False)["TMP"].sum()
    total = gb.sort_values("TMP", ascending=False)
    # print the result, matching the other queries
    print(total)
    print("Q05 Execution time (s): ", time.time() - t1)
def create_interval_index_df():
    td = np.cumsum(np.random.randint(1, 15 * 60, 1000))
    start = pd.Timestamp("2017-04-17")
    df = pd.DataFrame(
        [(start + pd.Timedelta(seconds=d)) for d in td],
        columns=["time"],
        dtype="M8[ns]",
    )
    df["time_bin"] = np.cumsum(np.random.randint(1, 15 * 60, 1000))
    return df
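# A hedged usage sketch for the frame above (an assumption based on the
# function's name, not code from the source): bucketing the "time_bin" column
# into interval-valued categories with pd.cut. `add_interval_column` and its
# `n_bins` parameter are hypothetical.
def add_interval_column(df, n_bins=10):
    df = df.copy()
    df["interval"] = pd.cut(df["time_bin"], bins=n_bins)
    return df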
def test_to_datetime():
    value = 1490195805
    assert pd.to_datetime(value, unit="s") == pandas.to_datetime(value, unit="s")
    value = 1490195805433502912
    assert pd.to_datetime(value, unit="ns") == pandas.to_datetime(value, unit="ns")
    value = [1, 2, 3]
    assert pd.to_datetime(value, unit="D", origin=pd.Timestamp("2000-01-01")).equals(
        pandas.to_datetime(value, unit="D", origin=pandas.Timestamp("2000-01-01"))
    )
def test_nans():
    df = pd.DataFrame([(pd.Timestamp("2017-02-02"), np.nan), (4, 2), ("foo", "bar")])
    view = SpreadsheetWidget(df=df)
    view._handle_view_msg_helper({
        "type": "change_sort",
        "sort_field": 1,
        "sort_ascending": True,
    })
    view._handle_view_msg_helper({
        "type": "show_filter_dropdown",
        "field": 1,
        "search_val": None,
    })
def q07(lineitem, supplier, orders, customer, nation):
    """This version is faster than q07_old. Keeping the old one for reference."""
    t1 = time.time()
    lineitem_filtered = lineitem[
        (lineitem["L_SHIPDATE"] >= pd.Timestamp("1995-01-01"))
        & (lineitem["L_SHIPDATE"] < pd.Timestamp("1997-01-01"))
    ]
    lineitem_filtered["L_YEAR"] = lineitem_filtered["L_SHIPDATE"].apply(lambda x: x.year)
    lineitem_filtered["VOLUME"] = lineitem_filtered["L_EXTENDEDPRICE"] * (
        1.0 - lineitem_filtered["L_DISCOUNT"]
    )
    lineitem_filtered = lineitem_filtered.loc[:, ["L_ORDERKEY", "L_SUPPKEY", "L_YEAR", "VOLUME"]]
    supplier_filtered = supplier.loc[:, ["S_SUPPKEY", "S_NATIONKEY"]]
    orders_filtered = orders.loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]
    customer_filtered = customer.loc[:, ["C_CUSTKEY", "C_NATIONKEY"]]
    n1 = nation[nation["N_NAME"] == "FRANCE"].loc[:, ["N_NATIONKEY", "N_NAME"]]
    n2 = nation[nation["N_NAME"] == "GERMANY"].loc[:, ["N_NATIONKEY", "N_NAME"]]

    # ----- do nation 1 -----
    N1_C = customer_filtered.merge(n1, left_on="C_NATIONKEY", right_on="N_NATIONKEY", how="inner")
    N1_C = N1_C.drop(columns=["C_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "CUST_NATION"})
    N1_C_O = N1_C.merge(orders_filtered, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="inner")
    N1_C_O = N1_C_O.drop(columns=["C_CUSTKEY", "O_CUSTKEY"])

    # NOTE: this is faster than first merging lineitem with N1_C_O
    N2_S = supplier_filtered.merge(n2, left_on="S_NATIONKEY", right_on="N_NATIONKEY", how="inner")
    N2_S = N2_S.drop(columns=["S_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "SUPP_NATION"})
    N2_S_L = N2_S.merge(lineitem_filtered, left_on="S_SUPPKEY", right_on="L_SUPPKEY", how="inner")
    N2_S_L = N2_S_L.drop(columns=["S_SUPPKEY", "L_SUPPKEY"])

    total1 = N1_C_O.merge(N2_S_L, left_on="O_ORDERKEY", right_on="L_ORDERKEY", how="inner")
    total1 = total1.drop(columns=["O_ORDERKEY", "L_ORDERKEY"])

    # ----- do nation 2 ----- (same as nation 1 section but with nation 2)
    N2_C = customer_filtered.merge(n2, left_on="C_NATIONKEY", right_on="N_NATIONKEY", how="inner")
    N2_C = N2_C.drop(columns=["C_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "CUST_NATION"})
    N2_C_O = N2_C.merge(orders_filtered, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="inner")
    N2_C_O = N2_C_O.drop(columns=["C_CUSTKEY", "O_CUSTKEY"])

    N1_S = supplier_filtered.merge(n1, left_on="S_NATIONKEY", right_on="N_NATIONKEY", how="inner")
    N1_S = N1_S.drop(columns=["S_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "SUPP_NATION"})
    N1_S_L = N1_S.merge(lineitem_filtered, left_on="S_SUPPKEY", right_on="L_SUPPKEY", how="inner")
    N1_S_L = N1_S_L.drop(columns=["S_SUPPKEY", "L_SUPPKEY"])

    total2 = N2_C_O.merge(N1_S_L, left_on="O_ORDERKEY", right_on="L_ORDERKEY", how="inner")
    total2 = total2.drop(columns=["O_ORDERKEY", "L_ORDERKEY"])

    # concat results
    total = pd.concat([total1, total2])
    total = total.groupby(["SUPP_NATION", "CUST_NATION", "L_YEAR"], as_index=False).agg(
        REVENUE=pd.NamedAgg(column="VOLUME", aggfunc="sum")
    )
    total = total.sort_values(
        by=["SUPP_NATION", "CUST_NATION", "L_YEAR"], ascending=[True, True, True]
    )
    print(total)
    print("Q07 Execution time (s): ", time.time() - t1)
def create_df():
    return pd.DataFrame({
        "A": 1.0,
        "Date": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "foo", "bar"]),
        "F": ["foo", "bar", "buzz", "fox"],
    })
def test_unique():
    modin_result = pd.unique([2, 1, 3, 3])
    pandas_result = pandas.unique([2, 1, 3, 3])
    assert_array_equal(modin_result, pandas_result)

    modin_result = pd.unique(pd.Series([2] + [1] * 5))
    pandas_result = pandas.unique(pandas.Series([2] + [1] * 5))
    assert_array_equal(modin_result, pandas_result)

    modin_result = pd.unique(
        pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])
    )
    pandas_result = pandas.unique(
        pandas.Series([pandas.Timestamp("20160101"), pandas.Timestamp("20160101")])
    )
    assert_array_equal(modin_result, pandas_result)

    modin_result = pd.unique(
        pd.Series(
            [
                pd.Timestamp("20160101", tz="US/Eastern"),
                pd.Timestamp("20160101", tz="US/Eastern"),
            ]
        )
    )
    pandas_result = pandas.unique(
        pandas.Series(
            [
                pandas.Timestamp("20160101", tz="US/Eastern"),
                pandas.Timestamp("20160101", tz="US/Eastern"),
            ]
        )
    )
    assert_array_equal(modin_result, pandas_result)

    modin_result = pd.unique(
        pd.Index(
            [
                pd.Timestamp("20160101", tz="US/Eastern"),
                pd.Timestamp("20160101", tz="US/Eastern"),
            ]
        )
    )
    pandas_result = pandas.unique(
        pandas.Index(
            [
                pandas.Timestamp("20160101", tz="US/Eastern"),
                pandas.Timestamp("20160101", tz="US/Eastern"),
            ]
        )
    )
    assert_array_equal(modin_result, pandas_result)

    modin_result = pd.unique(pd.Series(pd.Categorical(list("baabc"))))
    pandas_result = pandas.unique(pandas.Series(pandas.Categorical(list("baabc"))))
    assert_array_equal(modin_result, pandas_result)
def q01(lineitem):
    t1 = time.time()
    date = pd.Timestamp("1998-09-02")
    print(list(lineitem.columns.values))
    lineitem_filtered = lineitem.loc[:, [
        "L_ORDERKEY",
        "L_QUANTITY",
        "L_EXTENDEDPRICE",
        "L_DISCOUNT",
        "L_TAX",
        "L_RETURNFLAG",
        "L_LINESTATUS",
        "L_SHIPDATE",
    ]]
    sel = lineitem_filtered.L_SHIPDATE <= date
    lineitem_filtered = lineitem_filtered[sel]
    # duplicate the columns that need a mean so a single agg() call can
    # produce both sums and means
    lineitem_filtered["AVG_QTY"] = lineitem_filtered.L_QUANTITY
    lineitem_filtered["AVG_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE
    lineitem_filtered["DISC_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE * (
        1 - lineitem_filtered.L_DISCOUNT
    )
    lineitem_filtered["CHARGE"] = (
        lineitem_filtered.L_EXTENDEDPRICE
        * (1 - lineitem_filtered.L_DISCOUNT)
        * (1 + lineitem_filtered.L_TAX)
    )
    # Ray needs double square brackets
    gb = lineitem_filtered.groupby(["L_RETURNFLAG", "L_LINESTATUS"], as_index=False)[[
        "L_ORDERKEY",
        "L_QUANTITY",
        "L_EXTENDEDPRICE",
        "L_DISCOUNT",
        "AVG_QTY",
        "AVG_PRICE",
        "CHARGE",
        "DISC_PRICE",
    ]]
    total = gb.agg({
        "L_QUANTITY": "sum",
        "L_EXTENDEDPRICE": "sum",
        "DISC_PRICE": "sum",
        "CHARGE": "sum",
        "AVG_QTY": "mean",
        "AVG_PRICE": "mean",
        "L_DISCOUNT": "mean",
        "L_ORDERKEY": "count",
    })
    total = total.sort_values(["L_RETURNFLAG", "L_LINESTATUS"])
    print(total)
    print("Q01 Execution time (s): ", time.time() - t1)
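# A minimal sketch (not part of the benchmark) of the same aggregation via
# named aggregation, which avoids pre-duplicating L_QUANTITY/L_EXTENDEDPRICE
# into AVG_* columns. `q01_agg_sketch` is a hypothetical helper; it assumes the
# prepared lineitem_filtered frame from q01 above (DISC_PRICE and CHARGE
# already computed).
def q01_agg_sketch(lineitem_filtered):
    return lineitem_filtered.groupby(
        ["L_RETURNFLAG", "L_LINESTATUS"], as_index=False
    ).agg(
        SUM_QTY=pd.NamedAgg(column="L_QUANTITY", aggfunc="sum"),
        SUM_BASE_PRICE=pd.NamedAgg(column="L_EXTENDEDPRICE", aggfunc="sum"),
        SUM_DISC_PRICE=pd.NamedAgg(column="DISC_PRICE", aggfunc="sum"),
        SUM_CHARGE=pd.NamedAgg(column="CHARGE", aggfunc="sum"),
        AVG_QTY=pd.NamedAgg(column="L_QUANTITY", aggfunc="mean"),
        AVG_PRICE=pd.NamedAgg(column="L_EXTENDEDPRICE", aggfunc="mean"),
        AVG_DISC=pd.NamedAgg(column="L_DISCOUNT", aggfunc="mean"),
        COUNT_ORDER=pd.NamedAgg(column="L_ORDERKEY", aggfunc="count"),
    )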
def q08(part, lineitem, supplier, orders, customer, nation, region):
    t1 = time.time()
    part_filtered = part[part["P_TYPE"] == "ECONOMY ANODIZED STEEL"]
    part_filtered = part_filtered.loc[:, ["P_PARTKEY"]]
    lineitem_filtered = lineitem.loc[:, ["L_PARTKEY", "L_SUPPKEY", "L_ORDERKEY"]]
    lineitem_filtered["VOLUME"] = lineitem["L_EXTENDEDPRICE"] * (1.0 - lineitem["L_DISCOUNT"])
    total = part_filtered.merge(
        lineitem_filtered, left_on="P_PARTKEY", right_on="L_PARTKEY", how="inner"
    )
    total = total.loc[:, ["L_SUPPKEY", "L_ORDERKEY", "VOLUME"]]
    supplier_filtered = supplier.loc[:, ["S_SUPPKEY", "S_NATIONKEY"]]
    total = total.merge(
        supplier_filtered, left_on="L_SUPPKEY", right_on="S_SUPPKEY", how="inner"
    )
    total = total.loc[:, ["L_ORDERKEY", "VOLUME", "S_NATIONKEY"]]
    orders_filtered = orders[
        (orders["O_ORDERDATE"] >= pd.Timestamp("1995-01-01"))
        & (orders["O_ORDERDATE"] < pd.Timestamp("1997-01-01"))
    ]
    orders_filtered["O_YEAR"] = orders_filtered["O_ORDERDATE"].apply(lambda x: x.year)
    orders_filtered = orders_filtered.loc[:, ["O_ORDERKEY", "O_CUSTKEY", "O_YEAR"]]
    total = total.merge(
        orders_filtered, left_on="L_ORDERKEY", right_on="O_ORDERKEY", how="inner"
    )
    total = total.loc[:, ["VOLUME", "S_NATIONKEY", "O_CUSTKEY", "O_YEAR"]]
    customer_filtered = customer.loc[:, ["C_CUSTKEY", "C_NATIONKEY"]]
    total = total.merge(
        customer_filtered, left_on="O_CUSTKEY", right_on="C_CUSTKEY", how="inner"
    )
    total = total.loc[:, ["VOLUME", "S_NATIONKEY", "O_YEAR", "C_NATIONKEY"]]
    n1_filtered = nation.loc[:, ["N_NATIONKEY", "N_REGIONKEY"]]
    n2_filtered = nation.loc[:, ["N_NATIONKEY", "N_NAME"]].rename(
        columns={"N_NAME": "NATION"}
    )
    total = total.merge(
        n1_filtered, left_on="C_NATIONKEY", right_on="N_NATIONKEY", how="inner"
    )
    total = total.loc[:, ["VOLUME", "S_NATIONKEY", "O_YEAR", "N_REGIONKEY"]]
    total = total.merge(
        n2_filtered, left_on="S_NATIONKEY", right_on="N_NATIONKEY", how="inner"
    )
    total = total.loc[:, ["VOLUME", "O_YEAR", "N_REGIONKEY", "NATION"]]
    region_filtered = region[region["R_NAME"] == "AMERICA"]
    region_filtered = region_filtered.loc[:, ["R_REGIONKEY"]]
    total = total.merge(
        region_filtered, left_on="N_REGIONKEY", right_on="R_REGIONKEY", how="inner"
    )
    total = total.loc[:, ["VOLUME", "O_YEAR", "NATION"]]

    def udf(df):
        denominator = df["VOLUME"].sum()
        df = df[df["NATION"] == "BRAZIL"]
        numerator = df["VOLUME"].sum()
        return numerator / denominator

    # Modin returns an empty column with as_index=False, so group with the
    # default index and reset it afterwards
    total = total.groupby("O_YEAR").apply(udf).reset_index()
    total.columns = ["O_YEAR", "MKT_SHARE"]
    total = total.sort_values(by=["O_YEAR"], ascending=[True])
    print(total)
    print("Q08 Execution time (s): ", time.time() - t1)
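# A hypothetical vectorized alternative to the groupby-apply in q08 above (a
# sketch, not the benchmark's method): zero out non-BRAZIL volume, then take
# the ratio of two per-year sums. `q08_mkt_share_sketch` assumes the final
# `total` frame from q08 with VOLUME, O_YEAR, and NATION columns.
def q08_mkt_share_sketch(total):
    tmp = total.assign(
        BRAZIL_VOLUME=total["VOLUME"].where(total["NATION"] == "BRAZIL", 0.0)
    )
    gb = tmp.groupby("O_YEAR", as_index=False)[["VOLUME", "BRAZIL_VOLUME"]].sum()
    gb["MKT_SHARE"] = gb["BRAZIL_VOLUME"] / gb["VOLUME"]
    return gb.loc[:, ["O_YEAR", "MKT_SHARE"]].sort_values("O_YEAR")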
def test_merge_asof_merge_options():
    modin_quotes = pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2016-05-25 13:30:00.023"),
                pd.Timestamp("2016-05-25 13:30:00.023"),
                pd.Timestamp("2016-05-25 13:30:00.030"),
                pd.Timestamp("2016-05-25 13:30:00.041"),
                pd.Timestamp("2016-05-25 13:30:00.048"),
                pd.Timestamp("2016-05-25 13:30:00.049"),
                pd.Timestamp("2016-05-25 13:30:00.072"),
                pd.Timestamp("2016-05-25 13:30:00.075"),
            ],
            "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
            "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
            "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
        }
    )
    modin_trades = pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2016-05-25 13:30:00.023"),
                pd.Timestamp("2016-05-25 13:30:00.038"),
                pd.Timestamp("2016-05-25 13:30:00.048"),
                pd.Timestamp("2016-05-25 13:30:00.048"),
                pd.Timestamp("2016-05-25 13:30:00.048"),
            ],
            "ticker2": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
            "price": [51.95, 51.95, 720.77, 720.92, 98.0],
            "quantity": [75, 155, 100, 100, 100],
        }
    )
    pandas_quotes, pandas_trades = to_pandas(modin_quotes), to_pandas(modin_trades)

    # left_by + right_by
    df_equals(
        pandas.merge_asof(
            pandas_quotes,
            pandas_trades,
            on="time",
            left_by="ticker",
            right_by="ticker2",
        ),
        pd.merge_asof(
            modin_quotes,
            modin_trades,
            on="time",
            left_by="ticker",
            right_by="ticker2",
        ),
    )

    # Just by:
    pandas_trades["ticker"] = pandas_trades["ticker2"]
    modin_trades["ticker"] = modin_trades["ticker2"]
    df_equals(
        pandas.merge_asof(pandas_quotes, pandas_trades, on="time", by="ticker"),
        pd.merge_asof(modin_quotes, modin_trades, on="time", by="ticker"),
    )

    # Tolerance
    df_equals(
        pandas.merge_asof(
            pandas_quotes,
            pandas_trades,
            on="time",
            by="ticker",
            tolerance=pd.Timedelta("2ms"),
        ),
        pd.merge_asof(
            modin_quotes,
            modin_trades,
            on="time",
            by="ticker",
            tolerance=pd.Timedelta("2ms"),
        ),
    )

    # Direction
    df_equals(
        pandas.merge_asof(
            pandas_quotes,
            pandas_trades,
            on="time",
            by="ticker",
            direction="forward",
        ),
        pd.merge_asof(
            modin_quotes,
            modin_trades,
            on="time",
            by="ticker",
            direction="forward",
        ),
    )

    # Allow exact matches
    df_equals(
        pandas.merge_asof(
            pandas_quotes,
            pandas_trades,
            on="time",
            by="ticker",
            tolerance=pd.Timedelta("10ms"),
            allow_exact_matches=False,
        ),
        pd.merge_asof(
            modin_quotes,
            modin_trades,
            on="time",
            by="ticker",
            tolerance=pd.Timedelta("10ms"),
            allow_exact_matches=False,
        ),
    )