def group_partkey_avg_quantity_5_op(name, query_plan):
    """with lineitem_part_avg_group as (
           select avg(l_quantity) from part_lineitem_join group by l_partkey
       )

    :param name:
    :param query_plan:
    :return:
    """

    def groupby_fn(df):
        df['l_quantity'] = df['l_quantity'].astype(np.float)
        grouped = df.groupby('l_partkey')
        # Emit partial sums and counts (rather than a mean) so a downstream reduce
        # can combine partitions and still compute an exact average.
        agg_df = pd.DataFrame({
            'sum_l_quantity_computed00': grouped['l_quantity'].sum(),
            'cnt_l_quantity_computed00': grouped['l_quantity'].size()
        })
        return agg_df.reset_index()

    return Group(
        ['l_partkey'],
        [
            # avg(l_quantity)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_[5]))
        ],
        name, query_plan, False, groupby_fn)
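# The operator above deliberately emits per-partition sums and counts rather than a mean,
# so a downstream reduce can combine partials from several shards and still recover an
# exact average. A minimal pandas sketch of that reduce step; part_a and part_b are
# hypothetical partial frames carrying the columns produced by groupby_fn.
import pandas as pd

part_a = pd.DataFrame({'l_partkey': [1, 2],
                       'sum_l_quantity_computed00': [10.0, 6.0],
                       'cnt_l_quantity_computed00': [4, 2]})
part_b = pd.DataFrame({'l_partkey': [1],
                       'sum_l_quantity_computed00': [5.0],
                       'cnt_l_quantity_computed00': [1]})

# Re-sum the partials per key, then derive the exact average from summed sums and counts.
combined = pd.concat([part_a, part_b]).groupby('l_partkey', as_index=False).sum()
combined['avg_l_quantity_computed00'] = (combined['sum_l_quantity_computed00'] /
                                         combined['cnt_l_quantity_computed00'])
# l_partkey 1 -> (10 + 5) / (4 + 1) = 3.0, l_partkey 2 -> 6 / 2 = 3.0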
def groupby_reduce_returnflag_linestatus_operator_def(name, query_plan, use_pandas):
    # Reduce over partial results: partial sums and counts (including sum_charge) are
    # re-summed; partial averages are averaged.
    agger = OrderedDict([
        ('sum_qty', np.sum),
        ('sum_base_price', np.sum),
        ('sum_disc_price', np.sum),
        ('sum_charge', np.sum),
        ('avg_qty', np.mean),
        ('avg_price', np.mean),
        ('avg_disc', np.mean),
        ('count_order', np.sum),
    ])

    def fn(df):
        grouped = df.groupby(['l_returnflag', 'l_linestatus'], as_index=False)  # type: DataFrameGroupBy
        agg_df = grouped.agg(agger)
        return agg_df

    return Group(
        ['l_returnflag', 'l_linestatus'] if use_pandas else ['_0', '_1'],
        [
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[2]), 'sum_qty'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[3]), 'sum_base_price'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[4]), 'sum_disc_price'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[5]), 'sum_charge'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[6]), 'avg_qty'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[7]), 'avg_price'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[8]), 'avg_disc'),
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[9]), '__count'),
        ],
        name, query_plan, False, fn)
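# For intuition: once partial results from several shards are concatenated, the agger
# mapping is applied per column by DataFrameGroupBy.agg. Partial sums and counts are
# re-summed, while partial averages are averaged, which is exact only when every shard
# contributes the same number of rows per group. A self-contained sketch with made-up
# partial rows for a single (l_returnflag, l_linestatus) group:
from collections import OrderedDict

import numpy as np
import pandas as pd

partials = pd.DataFrame({
    'l_returnflag': ['A', 'A'], 'l_linestatus': ['F', 'F'],
    'sum_qty': [100.0, 50.0], 'avg_qty': [25.0, 25.0], 'count_order': [4, 2],
})

reduce_agger = OrderedDict([('sum_qty', np.sum), ('avg_qty', np.mean), ('count_order', np.sum)])
reduced = partials.groupby(['l_returnflag', 'l_linestatus'], as_index=False).agg(reduce_agger)
# sum_qty == 150.0, avg_qty == 25.0, count_order == 6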
def test_group_count():
    """Tests a group by query with a count aggregate

    :return: None
    """

    num_rows = 0

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, count(s_suppkey) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False))

    g = query_plan.add_operator(
        Group(
            ['_3'],
            [
                AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0'])  # count(s_suppkey)
            ],
            'g', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    for _ in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names
    assert len(c.tuples()) == 25 + 1

    nation_24 = filter(
        lambda t: IndexedTuple.build(t, field_names)['_0'] == '24',
        c.tuples())[0]
    assert nation_24[1] == 393

    assert num_rows == 25 + 1

    # Write the metrics
    query_plan.print_metrics()
def test_group_sum():
    """Tests a group by query with a sum aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts', query_plan, False))

    g = query_plan.add_operator(
        Group(['_3'],
              [
                  AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']))
              ],
              'g', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t_))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names
    assert len(c.tuples()) == 25 + 1

    nation_24 = filter(
        lambda t_: IndexedTuple.build(t_, field_names)['_0'] == '24',
        c.tuples())[0]
    assert round(nation_24[1], 2) == 1833872.56

    # Write the metrics
    query_plan.print_metrics()
def test_group_empty():
    """Executes a group where no records are returned. We test this as it's somewhat
    peculiar with S3 Select, in so much as S3 does not return column names when
    selecting data, meaning that, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'ts', query_plan, False))

    g = query_plan.add_operator(
        Group(['_3'],
              [
                  AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']))
              ],
              'g', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names
    assert len(c.tuples()) == 0 + 1

    # Write the metrics
    query_plan.print_metrics()
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts,
        customer_sharded, order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result,
        customer_filter_sql=None, order_filter_sql=None, lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Bloom Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(lambda p: query_plan.add_operator(
        SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_),
                     "select "
                     "  c_custkey "
                     "from "
                     "  S3Object "
                     "where "
                     "  c_mktsegment = 'BUILDING' "
                     "  {} "
                     "  {} "
                     .format(
                         ' and ' + customer_filter_sql if customer_filter_sql is not None else '',
                         get_sql_suffix('customer', customer_parts, p, customer_sharded, add_where=False)),
                     format_, use_pandas, secure, use_native,
                     'customer_scan' + '_{}'.format(p), query_plan, False)),
                        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0'], axis=1)
        df.rename(columns={'_0': 'c_custkey'}, inplace=True)
        return df

    customer_project = map(lambda p: query_plan.add_operator(
        Project([], 'customer_project' + '_{}'.format(p), query_plan, False, customer_project_fn)),
                           range(0, customer_parts))

    customer_bloom_create = query_plan.add_operator(
        BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False, fp_rate))

    customer_map = map(lambda p: query_plan.add_operator(
        Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
                       range(0, customer_parts))

    order_scan = map(lambda p: query_plan.add_operator(
        SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_),
                             "select "
                             "  o_custkey, o_orderkey, o_orderdate, o_shippriority "
                             "from "
                             "  S3Object "
                             "where "
                             "  cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) "
                             "  {} "
                             "  {} "
                             .format(
                                 ' and ' + order_filter_sql if order_filter_sql is not None else '',
                                 get_sql_suffix('orders', order_parts, p, order_sharded, add_where=False)),
                             'o_custkey', format_, use_pandas, secure, use_native,
                             'order_scan' + '_{}'.format(p), query_plan, False)),
                     range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1)
        df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'},
                  inplace=True)
        return df

    # One projection per orders partition. (The original range used customer_parts,
    # which only works when customer_parts == order_parts.)
    order_project = map(lambda p: query_plan.add_operator(
        Project([], 'order_project' + '_{}'.format(p), query_plan, False, order_project_fn)),
                        range(0, order_parts))

    order_map_1 = map(lambda p: query_plan.add_operator(
        Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
                      range(0, order_parts))

    customer_order_join_build = map(lambda p: query_plan.add_operator(
        HashJoinBuild('c_custkey', 'customer_order_join_build' + '_' + str(p), query_plan, False)),
                                    range(0, other_parts))

    customer_order_join_probe = map(lambda p: query_plan.add_operator(
        HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                      'customer_order_join_probe' + '_' + str(p), query_plan, False)),
                                    range(0, other_parts))

    order_bloom_create = query_plan.add_operator(
        BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False, fp_rate))

    lineitem_scan = map(lambda p: query_plan.add_operator(
        SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                             "select "
                             "  l_orderkey, l_extendedprice, l_discount "
                             "from "
                             "  S3Object "
                             "where "
                             "  cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) "
                             "  {} "
                             "  {} "
                             .format(
                                 ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '',
                                 get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded, add_where=False)),
                             'l_orderkey', format_, use_pandas, secure, use_native,
                             'lineitem_scan' + '_{}'.format(p), query_plan, False)),
                        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2'], axis=1)
        df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'}, inplace=True)
        return df

    lineitem_project = map(lambda p: query_plan.add_operator(
        Project([], 'lineitem_project' + '_{}'.format(p), query_plan, False, lineitem_project_fn)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p: query_plan.add_operator(
        Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    order_map_2 = map(lambda p: query_plan.add_operator(
        Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
                      range(0, other_parts))

    customer_order_lineitem_join_build = map(lambda p: query_plan.add_operator(
        HashJoinBuild('o_orderkey', 'customer_order_lineitem_join_build' + '_' + str(p), query_plan, False)),
                                             range(0, other_parts))

    customer_order_lineitem_join_probe = map(lambda p: query_plan.add_operator(
        HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                      'customer_order_lineitem_join_probe' + '_' + str(p), query_plan, False)),
                                             range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(np.float)
        df['l_discount'] = df['l_discount'].astype(np.float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(lambda p: query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],
            [
                # sum(l_extendedprice * (1 - l_discount)) as revenue
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group' + '_{}'.format(p), query_plan, False, groupby_fn)),
                range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],
            [
                # sum(l_extendedprice * (1 - l_discount)) as revenue
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group_reduce', query_plan, False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')],
            use_pandas, 'top', query_plan, False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_map)
    connect_many_to_one(customer_project, customer_bloom_create)
    connect_one_to_many(customer_bloom_create, order_scan)
    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_map_1)
    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)
    # connect_many_to_one(customer_order_join_probe, collate)
    connect_many_to_one(order_project, order_bloom_create)
    connect_one_to_many(order_bloom_create, lineitem_scan)
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)
    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)
    # connect_many_to_one(customer_order_lineitem_join_probe, collate)
    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)
    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1
    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
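# A hypothetical invocation of the TPC-H Q3 bloom-join plan above. The shard counts,
# false-positive rate, scale factor, format constant and expected tuples are all
# placeholders (assumptions), not values taken from the benchmark.
expected = [
    ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue'],
    # ... the ten expected result tuples would go here ...
]

run(parallel=True, use_pandas=True, secure=False, use_native=False,
    buffer_size=0, format_='csv',
    customer_parts=32, order_parts=32, lineitem_parts=32,
    customer_sharded=True, order_sharded=True, lineitem_sharded=True,
    other_parts=32, fp_rate=0.01, sf=1, expected_result=expected)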
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files):
    """

    :return: None
    """

    secure = False
    use_native = False

    print('')
    print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}".format(group_fields, agg_fields))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    def fn(df):
        df.columns = [
            'G0', 'G1', 'G2', 'G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9',
            'F0', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9',
        ]
        df = df.filter(items=group_fields + agg_fields, axis=1)
        return df

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(files.format(p), use_pandas, secure, use_native,
                      'scan_{}'.format(p), query_plan, False, fn=fn)),
        range(0, table_parts))

    # Groupby
    def groupby_fn(df):
        df[agg_fields] = df[agg_fields].astype(np.float)
        grouped = df.groupby(group_fields)
        agg_df = pd.DataFrame({f: grouped[f].sum() for n, f in enumerate(agg_fields)})
        return agg_df.reset_index()

    groupby = map(
        lambda p: query_plan.add_operator(
            Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)),
        range(0, table_parts))

    # inlined
    map(lambda p: p.set_async(False), groupby)

    groupby_reduce = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # profile_path = '../benchmark-output/groupby/'
    # scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    # project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    # groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    # groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    connect_many_to_many(scan, groupby)
    connect_many_to_one(groupby, groupby_reduce)
    connect_one_to_one(groupby_reduce, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
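# A hypothetical way to drive the baseline group-by benchmark above. The field names
# follow the G0..G9 / F0..F9 columns assigned in fn, and the shard path pattern reuses
# the one that appears in the two-phase variant below; the part count is an assumption.
run(group_fields=['G0'], agg_fields=['F0', 'F1'],
    parallel=True, use_pandas=True, buffer_size=0, table_parts=10,
    files='groupby_benchmark/shards-10GB/groupby_data_{}.csv')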
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files, format_):
    """

    :return: None
    """

    secure = False
    use_native = False

    print('')
    print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}".format(group_fields, agg_fields))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    ##########################
    ## Phase 1. Find out group names
    ##########################

    # Scan
    scan_phase1 = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(files.format(p),
                         "select {} from S3Object;".format(','.join(group_fields)),
                         format_, use_pandas, secure, use_native,
                         'scan_phase1_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Project
    def project_fn(df):
        df.columns = group_fields
        return df

    project_exprs = [
        ProjectExpression(lambda t_: t_['_{}'.format(n)], v)
        for n, v in enumerate(group_fields)
    ]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)),
        range(0, table_parts))

    # Groupby
    def groupby_fn(df):
        return df.drop_duplicates()

    groupby = map(
        lambda p: query_plan.add_operator(
            Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)),
        range(0, table_parts))

    groupby_reduce = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn))

    # GroupbyFilterBuild
    agg_exprs = [('SUM', 'CAST({} AS float)'.format(agg_field)) for agg_field in agg_fields]
    groupby_filter_build = query_plan.add_operator(
        GroupbyFilterBuild(group_fields, agg_fields, agg_exprs,
                           'groupby_filter_build', query_plan, False))

    ##########################
    ## Phase 2. Perform aggregation at S3.
    ##########################

    # Scan
    scan_phase2 = map(
        lambda p: query_plan.add_operator(
            SQLTableScan('groupby_benchmark/shards-10GB/groupby_data_{}.csv'.format(p),
                         "", format_, use_pandas, secure, use_native,
                         'scan_phase2_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    groupby_decoder = map(
        lambda p: query_plan.add_operator(
            GroupbyDecoder(agg_fields, 'groupby_decoder_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    def groupby_fn_phase2(df):
        # print df
        df[agg_fields] = df[agg_fields].astype(np.float)
        grouped = df.groupby(group_fields)
        agg_df = pd.DataFrame({f: grouped[f].sum() for n, f in enumerate(agg_fields)})
        return agg_df.reset_index()

    groupby_reduce_phase2 = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce_phase2', query_plan, False, groupby_fn_phase2))

    # scan_phase1[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase1_0" + ".prof"))
    # scan_phase2[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase2_0" + ".prof"))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # phase 1
    map(lambda (p, o): o.connect(project[p]), enumerate(scan_phase1))
    map(lambda (p, o): o.connect(groupby[p]), enumerate(project))
    map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby))
    groupby_reduce.connect(groupby_filter_build)

    # phase 2
    map(lambda (p, o): groupby_filter_build.connect(o, 0), enumerate(scan_phase2))
    map(lambda (p, o): groupby_filter_build.connect(o, 1), enumerate(groupby_decoder))
    map(lambda (p, o): o.connect(groupby_decoder[p]), enumerate(scan_phase2))
    map(lambda (p, o): o.connect(groupby_reduce_phase2), enumerate(groupby_decoder))
    # map(lambda (p, o): groupby_reduce.connect(o), enumerate(groupby_decoder))
    groupby_reduce_phase2.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size, table_parts, files, format_):
    """

    :return: None
    """

    secure = False
    use_native = False

    print('')
    print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}".format(group_fields, agg_fields))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(files.format(p),
                         "select {} from S3Object;".format(','.join(group_fields + agg_fields)),
                         format_, use_pandas, secure, use_native,
                         'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Project
    def project_fn(df):
        df.columns = group_fields + agg_fields
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_{}'.format(n)], v)
                     for n, v in enumerate(group_fields)] \
        + [ProjectExpression(lambda t_: t_['_{}'.format(n + len(group_fields))], v)
           for n, v in enumerate(agg_fields)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)),
        range(0, table_parts))

    # Groupby
    def groupby_fn(df):
        df[agg_fields] = df[agg_fields].astype(np.float)
        grouped = df.groupby(group_fields)
        agg_df = pd.DataFrame({f: grouped[f].sum() for n, f in enumerate(agg_fields)})
        return agg_df.reset_index()

    groupby = map(
        lambda p: query_plan.add_operator(
            Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False, groupby_fn)),
        range(0, table_parts))

    groupby_reduce = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce', query_plan, False, groupby_fn))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(groupby[p]), enumerate(project))
    map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby))
    groupby_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
def groupby_returnflag_linestatus_operator_def(name, query_plan):
    def fn(df):
        convert_types(df)
        compute_expressions(df)
        grouped = group(df)
        agg_df = aggregate(grouped)
        return agg_df

    def aggregate(grouped):
        agg_df = pd.DataFrame({
            'sum_qty': grouped['l_quantity'].sum(),
            'avg_qty': grouped['l_quantity'].mean(),
            'count_order': grouped['l_quantity'].size(),
            'sum_base_price': grouped['l_extendedprice'].sum(),
            'avg_price': grouped['l_extendedprice'].mean(),
            'avg_disc': grouped['l_discount'].mean(),
            'sum_disc_price': grouped['c0'].sum(),
            'sum_charge': grouped['c1'].sum()
        })
        return agg_df.reset_index()

    def group(df):
        return df.groupby(['l_returnflag', 'l_linestatus'])

    def compute_expressions(df):
        # c0 = l_extendedprice * (1 - l_discount)
        df['c0'] = df['l_extendedprice'].mul(df['l_discount'].rsub(1))
        # c1 = l_extendedprice * (1 - l_discount) * (1 + l_tax)
        df['c1'] = df['l_extendedprice'].mul(df['l_discount'].rsub(1)).mul(df['l_tax'].radd(1))

    def convert_types(df):
        df['l_quantity'] = df['l_quantity'].astype(np.int)
        df[['l_extendedprice', 'l_discount', 'l_tax']] = \
            df[['l_extendedprice', 'l_discount', 'l_tax']].astype(np.float)

    return Group(
        [
            'l_returnflag',
            'l_linestatus'
        ],
        [
            # Field indices:
            #   0             1                  2             3        4               5               6
            # ['l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate']

            # sum(l_quantity)
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[0]), 'sum_qty'),

            # sum(l_extendedprice) as sum_base_price
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_[1]), 'sum_base_price'),

            # sum(l_extendedprice * (1 - l_discount)) as sum_disc_price
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_[1]) * (1 - float(t_[2])), 'sum_disc_price'),

            # sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_[1]) * (1 - float(t_[2])) * (1 + float(t_[3])), 'sum_charge'),

            # avg(l_quantity)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_[0]), 'avg_qty'),

            # avg(l_extendedprice)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_[1]), 'avg_price'),

            # avg(l_discount)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_[2]), 'avg_disc'),

            # count(*) as count_order
            AggregateExpression(AggregateExpression.COUNT, lambda t_: t_[4], 'count_order')
        ],
        name, query_plan, False, fn)
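# The rsub/radd chains in compute_expressions above are vectorised forms of the TPC-H Q1
# expressions. A tiny sanity-check sketch on made-up values:
import numpy as np
import pandas as pd

toy = pd.DataFrame({'l_extendedprice': [100.0], 'l_discount': [0.05], 'l_tax': [0.10]})

# l_extendedprice * (1 - l_discount) and l_extendedprice * (1 - l_discount) * (1 + l_tax)
c0 = toy['l_extendedprice'].mul(toy['l_discount'].rsub(1))
c1 = toy['l_extendedprice'].mul(toy['l_discount'].rsub(1)).mul(toy['l_tax'].radd(1))

assert np.isclose(c0.iloc[0], 95.0)
assert np.isclose(c1.iloc[0], 104.5)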
def test_group_baseline():
    """TPC-H Q1

    :return: None
    """

    # Query plan
    # select
    #   l_returnflag,
    #   l_linestatus,
    #   sum(l_quantity) as sum_qty,
    #   sum(l_extendedprice) as sum_base_price,
    #   sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
    #   sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
    #   avg(l_quantity) as avg_qty,
    #   avg(l_extendedprice) as avg_price,
    #   avg(l_discount) as avg_disc,
    #   count(*) as count_order
    # from
    #   lineitem
    # where
    #   l_shipdate <= date '1992-04-01' - interval '[DELTA]' day(3)
    # group by
    #   l_returnflag,
    #   l_linestatus
    # order by
    #   l_returnflag,
    #   l_linestatus;

    query_plan = QueryPlan()

    delta_days = 60  # TODO: This is supposed to be randomized I think
    shipped_date = datetime.strptime('1992-04-01', '%Y-%m-%d') - timedelta(days=delta_days)

    ts = query_plan.add_operator(
        SQLTableScan("lineitem.csv",
                     "select * from S3Object "
                     "where cast(l_shipdate as timestamp) <= cast(\'{}\' as timestamp)"
                     .format(shipped_date.strftime('%Y-%m-%d')),
                     False, 'lineitem', query_plan, False))

    g = query_plan.add_operator(Group(
        [
            '_8',  # l_returnflag
            '_9'   # l_linestatus
        ],
        [
            # sum(l_quantity)
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_4'])),
            # sum(l_extendedprice) as sum_base_price
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5'])),
            # sum(l_extendedprice * (1 - l_discount)) as sum_disc_price
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']) * (1 - float(t_['_6']))),
            # sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']) * (1 - float(t_['_6'])) * (1 + float(t_['_7']))),
            # avg(l_quantity)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_4'])),
            # avg(l_extendedprice)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_5'])),
            # avg(l_discount)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_6'])),
            # count(*) as count_order
            AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0'])
        ],
        'lineitem_grouped', query_plan, False))

    s = query_plan.add_operator(Sort(
        [
            SortExpression('_0', str, 'ASC'),
            SortExpression('_1', str, 'ASC')
        ],
        'lineitem_group_sorted', query_plan, False))

    c = query_plan.add_operator(Collate('collation', query_plan, False))

    ts.connect(g)
    g.connect(s)
    s.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9']

    assert c.tuples()[0] == field_names
    assert len(c.tuples()) == 2 + 1

    # These have been verified in Postgres
    assert c.tuples()[1] == ["A", "F", 129850, 194216048.19000033, 184525343.78730044, 191943492.96455324,
                             25.445816186556925, 38059.19031746035, 0.050005878894768374, 5103]
    assert c.tuples()[2] == ["R", "F", 129740, 193438367.33999985, 183701990.7670003, 191045646.36937532,
                             25.509241053873353, 38033.49731419579, 0.05061541486433399, 5086]

    # Write the metrics
    query_plan.print_metrics()