def test_join_topk(): """Tests a top k with a join :return: None """ limit = 5 query_plan = QueryPlan() # Query plan ts1 = query_plan.add_operator(SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts1', query_plan, False)) ts1_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'ts1_project', query_plan, False)) ts2 = query_plan.add_operator(SQLTableScan('nation.csv', 'select * from S3Object;', False, 'ts2', query_plan, False)) ts2_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'ts2_project', query_plan, False)) j = query_plan.add_operator(HashJoin(JoinExpression('s_nationkey', 'n_nationkey'), 'j', query_plan, False)) t = query_plan.add_operator(Limit(limit, 't', query_plan, False)) c = query_plan.add_operator(Collate('c', query_plan, False)) ts1.connect(ts1_project) ts2.connect(ts2_project) j.connect_left_producer(ts1_project) j.connect_right_producer(ts2_project) j.connect(t) t.connect(c) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in c.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) c.print_tuples() field_names = ['s_nationkey', 'n_nationkey'] assert len(c.tuples()) == limit + 1 assert c.tuples()[0] == field_names num_rows = 0 for t in c.tuples(): num_rows += 1 # Assert that the nation_key in table 1 has been joined with the record in table 2 with the same nation_key if num_rows > 1: lt = IndexedTuple.build(t, field_names) assert lt['s_nationkey'] == lt['n_nationkey'] # Write the metrics query_plan.print_metrics()
def join_l_partkey_p_partkey_op(name, query_plan): """with part_lineitem_join_avg_group_join as ( select * from part_lineitem_join, lineitem_part_avg_group_project where p_partkey = l_partkey ) :return: """ return HashJoin(JoinExpression('l_partkey', 'p_partkey'), name, query_plan, False)
def join_p_partkey_l_partkey_op(name, query_plan): """with part_lineitem_join as (select * from part_scan, lineitem_scan where p_partkey = l_partkey) :param query_plan: :param name: :return: """ return HashJoin(JoinExpression('p_partkey', 'l_partkey'), name, query_plan, False)
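# Hedged usage sketch (not part of the original source): the two factory functions above return
# un-wired HashJoin operators, so the caller is expected to add them to a QueryPlan and connect
# producers and a consumer explicitly. The function name, operator names and the part/lineitem
# producers below are illustrative assumptions; the wiring calls mirror the HashJoin usage in the
# tests in this file (connect_left_producer / connect_right_producer / connect).
def example_wire_part_lineitem_join(query_plan, part_producer, lineitem_producer):
    join = query_plan.add_operator(join_p_partkey_l_partkey_op('part_lineitem_join', query_plan))
    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    join.connect_left_producer(part_producer)       # left side supplies p_partkey (build side)
    join.connect_right_producer(lineitem_producer)  # right side supplies l_partkey (probe side)
    join.connect(collate)

    return collate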
def test_join_empty(): """Executes a join where no records are returned. We test this as it's somewhat peculiar with S3 Select, insofar as S3 does not return column names when selecting data, meaning that, unlike with a traditional DBMS, no field-names tuple should be present in the results. :return: None """ query_plan = QueryPlan() # Query plan supplier_scan = query_plan.add_operator( SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False, 'supplier_scan', query_plan, False)) supplier_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'supplier_project', query_plan, False)) nation_scan = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object limit 0;', False, 'nation_scan', query_plan, False)) nation_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'nation_project', query_plan, False)) supplier_nation_join = query_plan.add_operator( HashJoin(JoinExpression('s_nationkey', 'n_nationkey'), 'supplier_nation_join', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) supplier_scan.connect(supplier_project) nation_scan.connect(nation_project) supplier_nation_join.connect_left_producer(supplier_project) supplier_nation_join.connect_right_producer(nation_project) supplier_nation_join.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results # num_rows = 0 # for t in collate.tuples(): # num_rows += 1 # print("{}:{}".format(num_rows, t)) assert len(collate.tuples()) == 0 # Write the metrics query_plan.print_metrics()
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded): """ :return: None """ print('') print("TPCH Q14 Semi Join") print("------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) part_scan_1 = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_partkey_where_brand12_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_table_scan_1' + '_' + str(p), query_plan)), range(0, part_parts)) part_1_project = map( lambda p: query_plan.add_operator( tpch_q14.project_p_partkey_operator_def( 'part_1_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_bloom_create = map( lambda p: query_plan.add_operator( tpch_q14.bloom_create_p_partkey_operator_def( 'part_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan_1 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_lineitem_partkey_where_shipdate_operator_def( min_shipped_date, max_shipped_date, lineitem_sharded, p, use_pandas, secure, use_native, 'lineitem_scan_1' + '_' + str( p), query_plan)), range(0, lineitem_parts)) lineitem_1_project = map( lambda p: query_plan.add_operator( tpch_q14.project_l_partkey_operator_def( 'lineitem_1_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_1_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_1_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_1_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_1_probe' + '_' + str( p), query_plan, False)), range(0, part_parts)) join_bloom_create = map( lambda p: query_plan.add_operator( tpch_q14.bloom_create_l_partkey_operator_def( 'join_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) part_scan_2 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_part_partkey_type_brand12_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_table_scan_2' + '_' + str(p), query_plan)), range(0, part_parts)) part_2_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_type_operator_def( 'part_2_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan_2 = map( lambda p: query_plan.add_operator( tpch_q14.bloom_scan_lineitem_where_shipdate_operator_def( min_shipped_date, max_shipped_date, lineitem_sharded, p, use_pandas, secure, use_native, 'lineitem_scan_2' + '_' + str( p), query_plan)), range(0, lineitem_parts)) lineitem_2_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def( 'lineitem_2_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_2_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_2_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_2_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_2_probe' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_aggregate = map( lambda p: query_plan.add_operator( 
tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Connect the operators map(lambda (p, o): o.connect(part_1_project[p]), enumerate(part_scan_1)) map(lambda (p, o): o.connect(part_bloom_create[p]), enumerate(part_1_project)) map( lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(part_bloom_create)), enumerate(lineitem_scan_1)) map(lambda (p, o): o.connect(part_lineitem_join_1_build[p]), enumerate(part_1_project)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(part_lineitem_join_1_build)), enumerate(part_lineitem_join_1_probe)) map(lambda (p, o): o.connect(lineitem_1_project[p]), enumerate(lineitem_scan_1)) map( lambda (p, o): part_lineitem_join_1_probe[p % part_parts]. connect_tuple_producer(o), enumerate(lineitem_1_project)) map(lambda (p, o): o.connect(join_bloom_create[p]), enumerate(part_lineitem_join_1_probe)) map( lambda (p, o): map(lambda (bp, bo): o.connect(bo), enumerate(part_scan_2)), enumerate(join_bloom_create)) map( lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(join_bloom_create)), enumerate(lineitem_scan_2)) map(lambda (p, o): o.connect(part_2_project[p]), enumerate(part_scan_2)) map(lambda (p, o): o.connect(part_lineitem_join_2_build[p]), enumerate(part_2_project)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(part_lineitem_join_2_build)), enumerate(part_lineitem_join_2_probe)) map(lambda (p, o): o.connect(lineitem_2_project[p]), enumerate(lineitem_scan_2)) map( lambda (p, o): part_lineitem_join_2_probe[p % part_parts]. connect_tuple_producer(o), enumerate(lineitem_2_project)) map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(part_lineitem_join_2_probe)) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate)) aggregate_reduce.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print('') query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 15.4488836202 elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_almost_equal(float(tuples[1][0]), 15.0901165263)
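# Note on the fan-in above (an editorial illustration, not original code): lineitem shards are
# routed to the part-side probe operators round-robin via `p % part_parts`. For example, with
# lineitem_parts = 4 and part_parts = 2, lineitem projections 0..3 feed probes 0, 1, 0, 1:
#
#   [p % 2 for p in range(4)]  ->  [0, 1, 0, 1]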
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """The baseline test uses hash joins with no projection and no filtering pushed down to s3. This works by: 1. Scanning part and filtering on brand and container 2. It then scans lineitem 3. It then joins the two tables (essentially filtering out lineitems that don't include parts that we filtered out in step 1) 4. It then computes the average of l_quantity from the joined table and groups the results by partkey 5. It then joins these computed averages with the joined table in step 3 6. It then filters out any rows where l_quantity is less than the computed average TODO: There are a few ways this can be done; the above is just one. :return: None """ print('') print("TPCH Q17 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map( lambda p: query_plan.add_operator( tpch_q17.sql_scan_part_select_all_where_brand_and_container_op( part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q17.sql_scan_lineitem_select_all_where_partkey_op( lineitem_parts != 1, p, lineitem_parts, use_pandas, 'lineitem_scan' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q17.project_partkey_brand_container_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_filter = map( lambda p: query_plan.add_operator( tpch_q17.filter_brand_container_op('part_filter' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) lineitem_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q17.
project_lineitem_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) # part_lineitem_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join', query_plan)), # range(0, lineitem_parts)) part_lineitem_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_lineitem_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str( p), query_plan, True)), range(0, part_parts)) lineitem_part_avg_group = map( lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_5_op( 'lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_part_avg_group_project = map( lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, part_parts)) # part_lineitem_join_avg_group_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_l_partkey_p_partkey_op( # 'part_lineitem_join_avg_group_join', query_plan)), # range(0, lineitem_parts)) part_lineitem_join_avg_group_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild( 'l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_avg_group_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression('l_partkey', 'p_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str( p), query_plan, True)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q17.filter_lineitem_quantity_op( 'lineitem_filter' + '_' + str(p), query_plan)), range(0, part_parts)) extendedprice_sum_aggregate = map( lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], 'aggregate_reduce', query_plan, False)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q17.collate_op('collate', query_plan)) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter)) # lineitem_scan.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project)) # part_lineitem_join.connect_left_producer(part_project) # part_lineitem_join.connect_right_producer(lineitem_project) # part_lineitem_join.connect(lineitem_part_avg_group) map( lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) map( lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(lineitem_map)) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), 
enumerate(part_lineitem_join_probe)) # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), # enumerate(part_lineitem_join_avg_group_join_build)), # enumerate(part_lineitem_join_avg_group_join_probe)) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p]. connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map( lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts]. connect_tuple_producer(o), enumerate(part_lineitem_join_probe)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 4632.1085714286
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate, expected_result, format_): """ :return: None """ print('') print("TPCH Q19 Bloom Join") print("-------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Define the operators lineitem_bloom_use = \ map(lambda p: query_plan.add_operator( tpch_q19.bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op( lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native, 'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)), range(0, lineitem_parts)) part_bloom_create = query_plan.add_operator( tpch_q19.bloom_create_partkey_op('part_bloom_create', query_plan, fp_rate)) part_scan = map( lambda p: query_plan.add_operator( tpch_q19. sql_scan_part_partkey_brand_size_container_where_filtered_op( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) lineitem_project = \ map(lambda p: query_plan.add_operator( tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_filtered_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False) ), range(0, lineitem_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q19.project_partkey_brand_size_container_filtered_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) # part_bloom_create_map = map(lambda p: # query_plan.add_operator( # Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, False)), # range(0, part_parts)) # lineitem_part_join = map(lambda p: # query_plan.add_operator(tpch_q19.join_op(query_plan)), # range(0, part_parts)) lineitem_part_join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'lineitem_part_join_build' + '_' + str( p), query_plan, False)), range(0, other_parts)) lineitem_part_join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'lineitem_part_join_probe' + '_' + str( p), query_plan, False)), range(0, other_parts)) filter_op = map( lambda p: query_plan.add_operator( tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)), range(0, other_parts)) aggregate = map( lambda p: query_plan.add_operator( tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan, use_pandas)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( tpch_q19.aggregate_project_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q19.collate_op('collate', query_plan)) map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), part_map) map(lambda o: o.set_async(False), filter_op) map(lambda o: o.set_async(False), aggregate) aggregate_project.set_async(False) # Connect 
the operators # part_scan.connect(part_project) connect_many_to_many(part_scan, part_project) connect_many_to_many(part_project, part_map) # part_project.connect(part_bloom_create) connect_many_to_one(part_project, part_bloom_create) # part_bloom_create.connect(lineitem_bloom_use) connect_one_to_many(part_bloom_create, lineitem_bloom_use) # lineitem_bloom_use.connect(lineitem_project) connect_many_to_many(lineitem_bloom_use, lineitem_project) connect_many_to_many(lineitem_project, lineitem_map) # lineitem_part_join.connect_left_producer(lineitem_project) connect_all_to_all(part_map, lineitem_part_join_build) # lineitem_part_join.connect_right_producer(part_project) connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe) connect_all_to_all(lineitem_map, lineitem_part_join_probe) # lineitem_part_join.connect(filter_op) connect_many_to_many(lineitem_part_join_probe, filter_op) # filter_op.connect(aggregate) connect_many_to_many(filter_op, aggregate) # aggregate.connect(aggregate_project) connect_many_to_one(aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) # aggregate_project.connect(collate) connect_one_to_one(aggregate_project, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("fp_rate: {}".format(fp_rate)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, expected_result, format_): """ :return: None """ print('') print("TPCH Q14 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_lineitem_operator_def( lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native, 'lineitem_scan' + '_' + str( p), query_plan, sf, format_)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q14. project_partkey_extendedprice_discount_shipdate_operator_def( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_operator_def( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_brand_type_operator_def( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_shipdate_operator_def( min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map( lambda p: query_plan.add_operator( Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False) ), range(0, lineitem_parts)) part_map = map( lambda p: query_plan.add_operator( Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) # part_filter = map(lambda p: # query_plan.add_operator( # tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)), # range(0, part_parts)) join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_aggregate = map( lambda p: query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() sum2_ = df['_1'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_], '_1': [sum2_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_filter) map(lambda o: o.set_async(False), part_map) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: 
o.set_async(False), part_aggregate) aggregate_project.set_async(False) # Connect the operators connect_many_to_many(lineitem_scan, lineitem_project) connect_many_to_many(lineitem_project, lineitem_filter) connect_many_to_many(part_scan, part_project) connect_many_to_many(part_project, part_map) connect_all_to_all(part_map, join_build) connect_many_to_many(join_build, join_probe) connect_many_to_many(lineitem_filter, lineitem_map) connect_all_to_all(lineitem_map, join_probe) connect_many_to_many(join_probe, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(lineitem_parts)) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 15.4488836202 elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
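# The connect_* helpers used in the plans above come from the s3filter plan-wiring utilities.
# Their exact implementation is not shown here; the sketch below is an editorial assumption of
# what the names imply, based on how the earlier functions in this file wire operators by hand
# with map/enumerate. (For HashJoinBuild -> HashJoinProbe pairs the real helpers presumably call
# connect_build_producer / connect_tuple_producer instead of plain connect, as the hand-wired
# plans do.)
def _connect_many_to_many_sketch(producers, consumers):
    # producer i feeds consumer i; the two lists are assumed to be the same length
    map(lambda (p, o): o.connect(consumers[p]), enumerate(producers))

def _connect_all_to_all_sketch(producers, consumers):
    # every producer feeds every consumer (used around the hash-partitioning Map operators)
    map(lambda o: map(lambda c: o.connect(c), consumers), producers)

def _connect_many_to_one_sketch(producers, consumer):
    # all producers fan in to a single operator, e.g. the aggregate_reduce step
    map(lambda o: o.connect(consumer), producers)

def _connect_one_to_one_sketch(producer, consumer):
    producer.connect(consumer)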
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """ :return: None """ print('') print("TPCH Q14 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # This date is chosen because it triggers the filter to filter out 1 of the rows in the root data set. date = '1996-03-13' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) lineitem_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_lineitem_extra_filtered_operator_def( lineitem_parts != 1, p, use_pandas, 'lineitem_scan' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_project = map( lambda p: query_plan.add_operator( tpch_q14. project_partkey_extendedprice_discount_shipdate_operator_def( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map( lambda p: query_plan.add_operator( tpch_q14.sql_scan_part_operator_def(part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) part_project = map( lambda p: query_plan.add_operator( tpch_q14.project_partkey_brand_type_operator_def( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_shipdate_operator_def( min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_filter = map( lambda p: query_plan.add_operator( tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)), range(0, part_parts)) join_build = map( lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) join_probe = map( lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_aggregate = map( lambda p: query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def( 'part_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) collate = query_plan.add_operator( tpch_q14.collate_operator_def('collate', query_plan)) # Connect the operators map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(lineitem_project)) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(join_build[p]), enumerate(part_filter)) map( lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), enumerate(join_build)), enumerate(join_probe)) map(lambda (p, o): join_probe[p % part_parts].connect_tuple_producer(o), enumerate(lineitem_filter)) map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(join_probe)) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate)) aggregate_reduce.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: 
{}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 33.4262326420
def run(parallel, buffer_size): """Tests a join with sharded operators and separate build and probe operators :return: None """ query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan parts = 2 collate = query_plan.add_operator(Collate('collate', query_plan, False)) merge = query_plan.add_operator(Merge('merge', query_plan, False)) hash_join_build_ops = [] hash_join_probe_ops = [] for p in range(1, parts + 1): r_region_key_lower = math.ceil((5.0 / float(parts)) * (p - 1)) r_region_key_upper = math.ceil((5.0 / float(parts)) * p) region_scan = query_plan.add_operator( SQLTableScan('region.csv', 'select * ' 'from S3Object ' 'where cast(r_regionkey as int) >= {} and cast(r_regionkey as int) < {};' .format(r_region_key_lower, r_region_key_upper), False, 'region_scan' + '_' + str(p), query_plan, False)) region_project = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_0'], 'r_regionkey'), ProjectExpression(lambda t_: t_['_1'], 'r_name') ], 'region_project' + '_' + str(p), query_plan, False)) n_nation_key_lower = math.ceil((25.0 / float(parts)) * (p - 1)) n_nation_key_upper = math.ceil((25.0 / float(parts)) * p) nation_scan = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object ' 'where cast(n_nationkey as int) >= {} and cast(n_nationkey as int) < {};' .format(n_nation_key_lower, n_nation_key_upper), False, 'nation_scan' + '_' + str(p), query_plan, False)) nation_project = query_plan.add_operator( Project([ ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'), ProjectExpression(lambda t_: t_['_1'], 'n_name'), ProjectExpression(lambda t_: t_['_2'], 'n_regionkey') ], 'nation_project' + '_' + str(p), query_plan, False)) region_hash_join_build = query_plan.add_operator( HashJoinBuild('r_regionkey', 'region_hash_join_build' + '_' + str(p), query_plan, False)) hash_join_build_ops.append(region_hash_join_build) region_nation_join_probe = query_plan.add_operator( HashJoinProbe(JoinExpression('r_regionkey', 'n_regionkey'), 'region_nation_join_probe' + '_' + str(p), query_plan, False)) hash_join_probe_ops.append(region_nation_join_probe) region_scan.connect(region_project) nation_scan.connect(nation_project) region_project.connect(region_hash_join_build) region_nation_join_probe.connect_tuple_producer(nation_project) region_nation_join_probe.connect(merge) for probe_op in hash_join_probe_ops: for build_op in hash_join_build_ops: probe_op.connect_build_producer(build_op) merge.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['r_regionkey', 'r_name', 'n_nationkey', 'n_name', 'n_regionkey'] assert len(tuples) == 25 + 1 assert tuples[0] == field_names num_rows = 0 for t in tuples: num_rows += 1 # Assert that each region row has been joined with a nation row having the same region key if num_rows > 1: lt = IndexedTuple.build(t, field_names) assert lt['r_regionkey'] == lt['n_regionkey']
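# How the key ranges in the function above shard the scans (editorial note, not original code):
# with parts = 2, math.ceil((5.0 / parts) * (p - 1)) .. math.ceil((5.0 / parts) * p) splits the
# 5 region keys into [0, 3) and [3, 5), and the analogous 25.0-based expression splits the 25
# nation keys into [0, 13) and [13, 25), so every key lands in exactly one shard:
#
#   p = 1: regions [0, 3),  nations [0, 13)
#   p = 2: regions [3, 5),  nations [13, 25)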
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts, customer_sharded, order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None, order_filter_sql=None, lineitem_filter_sql=None): """ :return: None """ print('') print("TPCH Q3 Bloom Join") print("------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) customer_scan = map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_), "select " " c_custkey " "from " " S3Object " "where " " c_mktsegment = 'BUILDING' " " {} " " {} " .format( ' and ' + customer_filter_sql if customer_filter_sql is not None else '', get_sql_suffix('customer', customer_parts, p, customer_sharded, add_where=False)), format_, use_pandas, secure, use_native, 'customer_scan' + '_{}'.format(p), query_plan, False)), range(0, customer_parts)) def customer_project_fn(df): df = df.filter(items=['_0'], axis=1) df.rename(columns={'_0': 'c_custkey'}, inplace=True) return df customer_project = map(lambda p: query_plan.add_operator( Project([], 'customer_project' + '_{}'.format(p), query_plan, False, customer_project_fn)), range(0, customer_parts)) customer_bloom_create = query_plan.add_operator( BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False, fp_rate)) customer_map = map(lambda p: query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)), range(0, customer_parts)) order_scan = map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_), "select " " o_custkey, o_orderkey, o_orderdate, o_shippriority " "from " " S3Object " "where " " cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) " " {} " " {} " .format( ' and ' + order_filter_sql if order_filter_sql is not None else '', get_sql_suffix('orders', order_parts, p, order_sharded, add_where=False)), 'o_custkey', format_, use_pandas, secure, use_native, 'order_scan' + '_{}'.format(p), query_plan, False)), range(0, order_parts)) def order_project_fn(df): df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1) df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'}, inplace=True) return df order_project = map(lambda p: query_plan.add_operator( Project([], 'order_project' + '_{}'.format(p), query_plan, False, order_project_fn)), range(0, customer_parts)) order_map_1 = map(lambda p: query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)), range(0, order_parts)) customer_order_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('c_custkey', 'customer_order_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'), 'customer_order_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) order_bloom_create = query_plan.add_operator( BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False, fp_rate)) lineitem_scan = map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_), "select " " l_orderkey, l_extendedprice, l_discount " "from " " S3Object " "where " " cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) " " {} " " {} " .format( ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '', 
get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded, add_where=False)), 'l_orderkey', format_, use_pandas, secure, use_native, 'lineitem_scan' + '_{}'.format(p), query_plan, False)), range(0, lineitem_parts)) def lineitem_project_fn(df): df = df.filter(items=['_0', '_1', '_2'], axis=1) df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'}, inplace=True) return df lineitem_project = map(lambda p: query_plan.add_operator( Project([], 'lineitem_project' + '_{}'.format(p), query_plan, False, lineitem_project_fn)), range(0, lineitem_parts)) lineitem_map = map(lambda p: query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)), range(0, lineitem_parts)) order_map_2 = map(lambda p: query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_lineitem_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('o_orderkey', 'customer_order_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_lineitem_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'), 'customer_order_lineitem_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) def groupby_fn(df): df['l_extendedprice'] = df['l_extendedprice'].astype(np.float) df['l_discount'] = df['l_discount'].astype(np.float) df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount']) grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority']) agg_df = grouped['revenue'].sum() return agg_df.reset_index() group = map(lambda p: query_plan.add_operator( Group( ['l_orderkey', 'o_orderdate', 'o_shippriority'], # l_partkey [ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount']))) ], 'group' + '_{}'.format(p), query_plan, False, groupby_fn)), range(0, other_parts)) def group_reduce_fn(df): grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority']) agg_df = grouped['revenue'].sum() return agg_df.reset_index() group_reduce = query_plan.add_operator( Group( ['l_orderkey', 'o_orderdate', 'o_shippriority'], # l_partkey [ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount']))) ], 'group_reduce', query_plan, False, group_reduce_fn)) top = query_plan.add_operator( Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas, 'top', query_plan, False)) collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), customer_project) map(lambda o: o.set_async(False), order_project) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), customer_map) map(lambda o: o.set_async(False), order_map_1) map(lambda o: o.set_async(False), order_map_2) # Connect the operators connect_many_to_many(customer_scan, customer_project) connect_many_to_many(customer_project, customer_map) connect_many_to_one(customer_project, customer_bloom_create) connect_one_to_many(customer_bloom_create, order_scan) connect_many_to_many(order_scan, order_project) connect_many_to_many(order_project, order_map_1) connect_all_to_all(customer_map, customer_order_join_build) connect_many_to_many(customer_order_join_build, customer_order_join_probe) connect_all_to_all(order_map_1, customer_order_join_probe) # 
connect_many_to_one(customer_order_join_probe, collate) connect_many_to_one(order_project, order_bloom_create) connect_one_to_many(order_bloom_create, lineitem_scan) connect_many_to_many(lineitem_scan, lineitem_project) connect_many_to_many(lineitem_project, lineitem_map) connect_many_to_many(customer_order_join_probe, order_map_2) connect_all_to_all(order_map_2, customer_order_lineitem_join_build) connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe) connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe) # connect_many_to_one(customer_order_lineitem_join_probe, collate) connect_many_to_many(customer_order_lineitem_join_probe, group) # connect_many_to_one(group, collate) connect_many_to_one(group, group_reduce) connect_one_to_one(group_reduce, top) connect_one_to_one(top, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("customer_parts: {}".format(customer_parts)) print("order_parts: {}".format(order_parts)) print("lineitem_parts: {}".format(lineitem_parts)) print("customer_sharded: {}".format(customer_sharded)) print("order_sharded: {}".format(order_sharded)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("other_parts: {}".format(other_parts)) print("fp_rate: {}".format(fp_rate)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue'] assert len(tuples) == 10 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL test_util.assert_tuples(expected_result, tuples)
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts, sharded): """ :return: None """ print('') print("TPCH Q19 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Define the operators lineitem_scan = map(lambda p: query_plan.add_operator( tpch_q19.sql_scan_lineitem_select_all_where_partkey_op(sharded, p, lineitem_parts, use_pandas, 'lineitem_scan' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_scan = map(lambda p: query_plan.add_operator( tpch_q19.sql_scan_part_select_all_where_partkey_op(sharded, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_project = map(lambda p: query_plan.add_operator( tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map(lambda p: query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) part_project = map(lambda p: query_plan.add_operator( tpch_q19.project_partkey_brand_size_container_op('part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map(lambda p: query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) # lineitem_part_join = map(lambda p: # query_plan.add_operator(tpch_q19.join_op(query_plan)), # range(0, part_parts)) lineitem_part_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'lineitem_partjoin_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) lineitem_part_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'lineitem_part_join_probe' + '_' + str(p), query_plan, True)), range(0, part_parts)) filter_op = map(lambda p: query_plan.add_operator(tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate = map(lambda p: query_plan.add_operator(tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], 'aggregate_reduce', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q19.aggregate_project_def('aggregate_project', query_plan)) collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan)) # Connect the operators # lineitem_scan.connect(lineitem_project) connect_many_to_many(lineitem_scan, lineitem_project) connect_all_to_all(lineitem_project, lineitem_map) # part_scan.connect(part_project) connect_many_to_many(part_scan, part_project) connect_many_to_many(part_project, part_map) # lineitem_part_join.connect_left_producer(lineitem_project) connect_all_to_all(part_map, lineitem_part_join_build) # lineitem_part_join.connect_right_producer(part_project) connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe) connect_many_to_many(lineitem_map, lineitem_part_join_probe) # lineitem_part_join.connect(filter_op) connect_many_to_many(lineitem_part_join_probe, filter_op) # filter_op.connect(aggregate) connect_many_to_many(filter_op, aggregate) # aggregate.connect(aggregate_project) connect_many_to_one(aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) # aggregate_project.connect(collate) connect_one_to_one(aggregate_project, collate) # Write the plan graph 
query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['revenue'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL numpy.testing.assert_almost_equal(float(tuples[1][0]), 92403.0667)
def query_plan(settings): # type: (SyntheticBloomJoinSettings) -> QueryPlan """ :return: None """ query_plan = QueryPlan(is_async=settings.parallel, buffer_size=settings.buffer_size) def scan_A_fn(df): df.columns = settings.table_A_field_names return df # Define the operators scan_A = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_A_field_names), settings.table_A_filter_sql, get_sql_suffix(settings.table_A_key, settings.table_A_parts, p, settings.table_A_sharded)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_A_{}'.format(p), query_plan, False, fn=scan_A_fn)), range(0, settings.table_A_parts)) """ field_names_map_A = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)], settings.table_A_field_names)) def project_fn_A(df): df.rename(columns=field_names_map_A, inplace=True) return df project_A = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()], 'project_A_{}'.format(p), query_plan, True, project_fn_A)), range(0, settings.table_A_parts)) """ bloom_create_a = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_A_AB_join_key, 'bloom_create_a_{}'. format(p), query_plan, False)), range(0, settings.table_A_parts)) def scan_B_fn(df): df.columns = settings.table_B_field_names return df scan_B = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_B_field_names), settings.table_B_filter_sql, get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=False)), settings.table_B_AB_join_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_B_{}'.format(p), query_plan, False, fn=scan_B_fn)), range(0, settings.table_B_parts)) """ field_names_map_B = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names)) def project_fn_B(df): df.rename(columns=field_names_map_B, inplace=True) return df project_B = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()], 'project_B_{}'.format(p), query_plan, True, project_fn_B)), range(0, settings.table_B_parts)) """ def scan_C_fn(df): df.columns = settings.table_C_field_names return df scan_C = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_C_field_names), settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.table_C_BC_join_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_C_{}'.format(p), query_plan, False, fn=scan_C_fn)), range(0, settings.table_C_parts)) """ field_names_map_C = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names)) def project_fn_C(df): df.rename(columns=field_names_map_C, inplace=True) return df project_C = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in 
field_names_map_C.iteritems()], 'project_C_{}'.format(p), query_plan, True, project_fn_C)), range(0, settings.table_C_parts)) """ map_A_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) map_B_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_B_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_C_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) join_build_A_B = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) join_probe_A_B = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_A_AB_join_key, settings. table_B_AB_join_key), 'join_probe_A_B_{}' .format(p), query_plan, False)), range(0, settings.table_B_parts)) bloom_create_ab = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_B_BC_join_key, 'bloom_create_ab_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) join_build_AB_C = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'. format(p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_AB_C = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_BC_join_key, settings. table_C_BC_join_key), 'join_probe_AB_C_{}' .format(p), query_plan, False)), range(0, settings.table_C_parts)) def part_aggregate_fn(df): sum_ = df[settings.table_C_detail_field_name].astype(float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_C_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_C_parts)) def aggregate_reduce_fn(df): sum_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( Project([ProjectExpression(lambda t: t['_0'], 'total_balance')], 'aggregate_project', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # Connect the operators connect_many_to_many(scan_A, map_A_to_B) #connect_many_to_many(scan_A, project_A) #connect_many_to_many(project_A, map_A_to_B) connect_all_to_all(map_A_to_B, join_build_A_B) connect_many_to_many(join_build_A_B, join_probe_A_B) #connect_many_to_many(project_A, bloom_create_a) connect_many_to_many(scan_A, bloom_create_a) connect_all_to_all(bloom_create_a, scan_B) connect_many_to_many(scan_B, map_B_to_B) #connect_many_to_many(scan_B, project_B) #connect_many_to_many(project_B, map_B_to_B) connect_all_to_all(map_B_to_B, join_probe_A_B) connect_many_to_many(join_probe_A_B, bloom_create_ab) connect_all_to_all(bloom_create_ab, scan_C) connect_many_to_many(join_build_AB_C, join_probe_AB_C) 
connect_many_to_many(join_probe_A_B, map_B_to_C) connect_all_to_all(map_B_to_C, join_build_AB_C) connect_many_to_many(scan_C, map_C_to_C) #connect_many_to_many(scan_C, project_C) #connect_many_to_many(project_C, map_C_to_C) connect_all_to_all(map_C_to_C, join_probe_AB_C) connect_many_to_many(join_probe_AB_C, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) return query_plan
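# Illustrative sketch: the plan above is wired with connect_one_to_one,
# connect_many_to_one, connect_many_to_many and connect_all_to_all. Their real
# implementations live elsewhere in the package; the stand-ins below only
# illustrate the fan-in/fan-out shapes the names suggest, expressed in terms of
# a generic producer.connect(consumer) call (an assumption; the real helpers
# presumably dispatch to the appropriate connect_* method of each operator).

def _sketch_connect_one_to_one(producer, consumer):
    producer.connect(consumer)

def _sketch_connect_many_to_one(producers, consumer):
    for producer in producers:
        producer.connect(consumer)

def _sketch_connect_many_to_many(producers, consumers):
    # Pairwise wiring: partition p feeds partition p.
    for producer, consumer in zip(producers, consumers):
        producer.connect(consumer)

def _sketch_connect_all_to_all(producers, consumers):
    # Full exchange: every upstream partition feeds every downstream partition.
    for producer in producers:
        for consumer in consumers:
            producer.connect(consumer)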
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts): """ :return: None """ print('') print("TPCH Q17 Bloom Join") print("-------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map(lambda p: query_plan.add_operator( tpch_q17.sql_scan_select_partkey_where_brand_container_op( part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' + str(p), query_plan)), range(0, part_parts)) part_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_bloom_create_map = map(lambda p: query_plan.add_operator( Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) part_lineitem_join_build_map = map(lambda p: query_plan.add_operator( Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan, True)), range(0, part_parts)) part_bloom_create = map(lambda p: query_plan.add_operator( tpch_q17.bloom_create_partkey_op('part_bloom_create' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_bloom_use = map(lambda p: query_plan.add_operator( tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_where_partkey_bloom_partkey_op( part_parts != 1, p, part_parts, use_pandas, 'lineitem_bloom_use' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_project = map(lambda p: query_plan.add_operator( tpch_q17.project_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) part_lineitem_join_probe_map = map(lambda p: query_plan.add_operator( Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan, True)), range(0, lineitem_parts)) # part_lineitem_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join' + '_' + str(p), query_plan)), # range(0, part_parts)) part_lineitem_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, part_parts)) part_lineitem_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str(p), query_plan, True)), range(0, part_parts)) lineitem_part_avg_group = map(lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_op('lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, part_parts)) lineitem_part_avg_group_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, part_parts)) # part_lineitem_join_avg_group_join = map(lambda p: # query_plan.add_operator( # tpch_q17.join_l_partkey_p_partkey_op( # 'part_lineitem_join_avg_group_join' + '_' + str(p), query_plan)), # range(0, part_parts)) part_lineitem_join_avg_group_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str( p), query_plan, False)), range(0, part_parts)) part_lineitem_join_avg_group_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('l_partkey', 'p_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str( p), query_plan, True)), range(0, part_parts)) lineitem_filter = map(lambda p: query_plan.add_operator( tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)), range(0, part_parts)) 
extendedprice_sum_aggregate = map(lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, part_parts)) aggregate_reduce = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], 'aggregate_reduce', query_plan, False)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan)) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project)) # part_project.connect(part_bloom_create) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)), enumerate(part_bloom_create_map)) # part_bloom_create.connect(lineitem_bloom_use) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)), enumerate(part_bloom_create)) # lineitem_bloom_use.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use)) # part_lineitem_join.connect_left_producer(part_project) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_lineitem_join_build_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) # part_lineitem_join.connect_right_producer(lineitem_project) map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project)) map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(part_lineitem_join_probe_map)) # part_lineitem_join.connect(lineitem_part_avg_group) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_tuple_producer(o), enumerate(part_lineitem_join_probe)) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') 
print('use_pandas: {}'.format(use_pandas)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL assert round(float(tuples[1][0]), 10) == 4632.1085714286
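# Illustrative sketch: the connection code above leans on two Python 2 idioms,
# an eager map() used purely for its side effects and tuple-parameter lambdas
# such as lambda (p, o): ..., which were removed by PEP 3113. The
# self-contained function below shows the same "pair producer p with consumer
# p" pattern in syntax that also runs on Python 3; the _Op class is a stand-in,
# not part of this package.

def _sketch_pairwise_connect():
    class _Op(object):
        def __init__(self, name):
            self.name, self.consumers = name, []

        def connect(self, consumer):
            self.consumers.append(consumer)

    producers = [_Op('scan_{}'.format(p)) for p in range(3)]
    consumers = [_Op('project_{}'.format(p)) for p in range(3)]

    # Python 2 original style:
    #     map(lambda (p, o): o.connect(consumers[p]), enumerate(producers))
    # Portable equivalent:
    for p, producer in enumerate(producers):
        producer.connect(consumers[p])

    return [(o.name, [c.name for c in o.consumers]) for o in producers]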
def run(parallel, buffer_size, parts): """ :return: None """ print('') print("TPCH Q14 Partitioned Bloom Join") print("-------------------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997]. date = '1993-01-01' min_shipped_date = datetime.strptime(date, '%Y-%m-%d') max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30) collate = query_plan.add_operator(tpch_q14.collate_operator_def('collate', query_plan)) sum_aggregate = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])), AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1'])) ], 'sum_aggregate', query_plan, False)) aggregate_project = query_plan.add_operator( tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan)) hash_join_build_ops = [] hash_join_probe_ops = [] part_bloom_create_ops = [] line_item_scan_ops = [] for p in range(0, parts): part_scan = query_plan.add_operator( tpch_q14.sql_scan_part_partkey_type_part_where_brand12_sharded_operator_def(p, parts, 'part_scan' + '_' + str(p), query_plan)) part_scan_project = query_plan.add_operator( tpch_q14.project_partkey_type_operator_def('part_scan_project' + '_' + str(p), query_plan)) part_bloom_create = query_plan.add_operator( tpch_q14.bloom_create_p_partkey_operator_def('part_bloom_create' + '_' + str(p), query_plan)) part_bloom_create_ops.append(part_bloom_create) lineitem_scan = query_plan.add_operator( tpch_q14.bloom_scan_lineitem_where_shipdate_sharded_operator_def(min_shipped_date, max_shipped_date, p, parts, 'lineitem_scan' + '_' + str(p), query_plan)) line_item_scan_ops.append(lineitem_scan) lineitem_scan_project = query_plan.add_operator( tpch_q14.project_partkey_extendedprice_discount_operator_def('lineitem_scan_project' + '_' + str(p), query_plan)) join_build = query_plan.add_operator( HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False)) hash_join_build_ops.append(join_build) join_probe = query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p), query_plan, False)) hash_join_probe_ops.append(join_probe) aggregate = query_plan.add_operator( tpch_q14.aggregate_promo_revenue_operator_def('aggregate' + '_' + str(p), query_plan)) part_scan.connect(part_scan_project) lineitem_scan.connect(lineitem_scan_project) part_scan_project.connect(part_bloom_create) # part_bloom_create.connect(lineitem_scan) part_scan_project.connect(join_build) join_probe.connect(aggregate) # aggregate.connect(aggregate_project) aggregate.connect(sum_aggregate) join_probe.connect_tuple_producer(lineitem_scan_project) # join_probe.connect(merge) for bloom_create_op in part_bloom_create_ops: for scan_op in line_item_scan_ops: bloom_create_op.connect(scan_op) for probe_op in hash_join_probe_ops: for build_op in hash_join_build_ops: probe_op.connect_build_producer(build_op) sum_aggregate.connect(aggregate_project) aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print("parts: {}".format(parts)) print('') query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(parts)) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['promo_revenue'] assert len(tuples) 
== 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL # assert tuples[1] == [15.090116526324298] assert round(float(tuples[1][0]), 10) == 15.0901165263
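# Illustrative sketch: sum_aggregate carries two SUM expressions because TPC-H
# Q14 reports promo_revenue as a ratio of two sums (promotional revenue over
# total revenue, scaled by 100). The actual projection is performed by
# tpch_q14.project_promo_revenue_operator_def; the function below only
# illustrates that final arithmetic, with made-up inputs.

def _sketch_promo_revenue(promo_sum, total_sum):
    # 100 * promo / total, per the Q14 definition.
    return 100.0 * promo_sum / total_sum

# _sketch_promo_revenue(327.5, 2170.0) -> roughly 15.09 (illustrative values only)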
def join_op(query_plan): return HashJoin(JoinExpression('l_partkey', 'p_partkey'), 'lineitem_part_join', query_plan, False)
def join_part_lineitem_operator_def(name, query_plan): # type: (str, QueryPlan) -> HashJoin return HashJoin(JoinExpression('p_partkey', 'l_partkey'), name, query_plan, False)
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded, part_sharded, other_parts, sf, expected_result, format_): """The baseline test performs the joins locally, with no projection and no filtering pushed down to S3. This works by: 1. Scanning part and filtering on brand and container. 2. Scanning lineitem. 3. Joining the two tables (essentially filtering out lineitems that don't reference the parts kept in step 1). 4. Computing the average of l_quantity over the joined table, grouped by partkey. 5. Joining these computed averages back onto the joined table from step 3. 6. Filtering out any rows where l_quantity is less than the computed average. TODO: There are a few ways this could be done; the above is just one. :return: None """ print('') print("TPCH Q17 Baseline Join") print("----------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Query plan part_scan = map(lambda p: query_plan.add_operator( tpch_q17.sql_scan_part_select_all_op( part_sharded, p, part_parts, use_pandas, secure, use_native, 'part_scan' + '_' + str(p), query_plan, sf, format_)), range(0, part_parts)) lineitem_scan = map(lambda p: query_plan.add_operator( tpch_q17.sql_scan_lineitem_select_all_op( lineitem_sharded, p, lineitem_parts, use_pandas, secure, use_native, 'lineitem_scan' + '_' + str(p), query_plan, sf, format_)), range(0, lineitem_parts)) part_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_brand_container_op( 'part_project' + '_' + str(p), query_plan)), range(0, part_parts)) part_filter = map(lambda p: query_plan.add_operator(tpch_q17.filter_brand_container_op( 'part_filter' + '_' + str(p), query_plan)), range(0, part_parts)) part_map = map(lambda p: query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)), range(0, part_parts)) lineitem_project = map(lambda p: query_plan.add_operator( tpch_q17.project_lineitem_orderkey_partkey_quantity_extendedprice_op( 'lineitem_project' + '_' + str(p), query_plan)), range(0, lineitem_parts)) lineitem_map = map(lambda p: query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)), range(0, lineitem_parts)) part_lineitem_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_lineitem_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'part_lineitem_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) lineitem_part_avg_group = map(lambda p: query_plan.add_operator( tpch_q17.group_partkey_avg_quantity_5_op('lineitem_part_avg_group' + '_' + str(p), query_plan)), range(0, other_parts)) lineitem_part_avg_group_project = map(lambda p: query_plan.add_operator( tpch_q17.project_partkey_avg_quantity_op( 'lineitem_part_avg_group_project' + '_' + str(p), query_plan)), range(0, other_parts)) part_lineitem_join_avg_group_join_build = \ map(lambda p: query_plan.add_operator( HashJoinBuild('l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) part_lineitem_join_avg_group_join_probe = \ map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'), 'part_lineitem_join_avg_group_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) lineitem_filter = map(lambda p: query_plan.add_operator(
tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)), range(0, other_parts)) extendedprice_sum_aggregate = map(lambda p: query_plan.add_operator( tpch_q17.aggregate_sum_extendedprice_op( use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)), range(0, other_parts)) def aggregate_reduce_fn(df): sum1_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum1_]}) aggregate_reduce = query_plan.add_operator( Aggregate( [ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) extendedprice_sum_aggregate_project = query_plan.add_operator( tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan)) collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), part_project) map(lambda o: o.set_async(False), lineitem_filter) map(lambda o: o.set_async(False), part_filter) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), part_map) map(lambda o: o.set_async(False), lineitem_part_avg_group) map(lambda o: o.set_async(False), lineitem_part_avg_group_project) map(lambda o: o.set_async(False), extendedprice_sum_aggregate) extendedprice_sum_aggregate_project.set_async(False) # Connect the operators # part_scan.connect(part_project) map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan)) map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project)) map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter)) # lineitem_scan.connect(lineitem_project) map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan)) map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project)) # part_lineitem_join.connect_left_producer(part_project) # part_lineitem_join.connect_right_producer(lineitem_project) # part_lineitem_join.connect(lineitem_part_avg_group) map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_map)) map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build)) map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)), enumerate(lineitem_map)) map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe)) # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan)) # lineitem_part_avg_group.connect(lineitem_part_avg_group_project) map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group)) # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project) # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join) # part_lineitem_join_avg_group_join.connect(lineitem_filter) map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]), enumerate(lineitem_part_avg_group_project)) # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo), # enumerate(part_lineitem_join_avg_group_join_build)), # enumerate(part_lineitem_join_avg_group_join_probe)) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_avg_group_join_build)) map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].connect_tuple_producer(o), 
enumerate(part_lineitem_join_probe)) map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe)) # lineitem_filter.connect(extendedprice_sum_aggregate) map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter)) # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project) # extendedprice_sum_aggregate_project.connect(collate) map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate)) aggregate_reduce.connect(extendedprice_sum_aggregate_project) extendedprice_sum_aggregate_project.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("lineitem parts: {}".format(lineitem_parts)) print("part_parts: {}".format(part_parts)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("part_sharded: {}".format(part_sharded)) print("other_parts: {}".format(other_parts)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['avg_yearly'] assert len(tuples) == 1 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL if s3filter.util.constants.TPCH_SF == 10: assert round(float(tuples[1][0]), 10) == 372414.2899999995 # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet elif s3filter.util.constants.TPCH_SF == 1: numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
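# Illustrative sketch: unlike the single HashJoin operator used in the small
# tests, the partitioned plans split every join into a HashJoinBuild and a
# HashJoinProbe. The dictionary-based miniature below illustrates the idea
# only (it is not the operators' implementation); the tuples are made up.

def _sketch_build_probe_join():
    build_rows = [('p1', 'Brand#23'), ('p2', 'Brand#12')]  # part side, keyed on p_partkey
    probe_rows = [('o1', 'p1', 17), ('o2', 'p3', 4)]       # lineitem side, keyed on l_partkey

    # Build phase: hash the build side by its join key.
    hash_table = {}
    for row in build_rows:
        hash_table.setdefault(row[0], []).append(row)

    # Probe phase: stream the probe side, emitting one joined tuple per match.
    return [b + p for p in probe_rows for b in hash_table.get(p[1], [])]

# _sketch_build_probe_join() -> [('p1', 'Brand#23', 'o1', 'p1', 17)]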
def query_plan(settings): # type: (SyntheticFilteredJoinSettings) -> QueryPlan """ :return: None """ if settings.use_shared_mem: system = WorkerSystem(settings.shared_memory_size) else: system = None query_plan = QueryPlan(system, is_async=settings.parallel, buffer_size=settings.buffer_size, use_shared_mem=settings.use_shared_mem) # Define the operators scan_A = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf), "select " " {} " "from " " S3Object " " {} " " {} " .format(','.join(settings.table_A_field_names), ' where {} '.format( settings.table_A_filter_sql) if settings.table_A_filter_sql is not None else '', get_sql_suffix(settings.table_A_key, settings.table_A_parts, p, settings.table_A_sharded, add_where=settings.table_A_filter_sql is None)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_A_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) field_names_map_A = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_A_field_names) ], settings.table_A_field_names)) def project_fn_A(df): df = df.rename(columns=field_names_map_A, copy=False) return df project_A = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_A.iteritems() ], 'project_A_{}'.format(p), query_plan, False, project_fn_A)), range(0, settings.table_A_parts)) scan_B = \ map(lambda p: query_plan.add_operator( SQLTableScan( get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf), "select " " {} " "from " " S3Object " " {} " " {} " .format(','.join(settings.table_B_field_names), ' where {} '.format( settings.table_B_filter_sql) if settings.table_B_filter_sql is not None else '', get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=settings.table_B_filter_sql is None)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_B_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) field_names_map_B = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_B_field_names) ], settings.table_B_field_names)) def project_fn_B(df): df.rename(columns=field_names_map_B, inplace=True) return df project_B = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_B.iteritems() ], 'project_B_{}'.format(p), query_plan, False, project_fn_B)), range(0, settings.table_B_parts)) if settings.table_C_key is not None: scan_C = \ map(lambda p: query_plan.add_operator( SQLTableScan( get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(','.join(settings.table_C_field_names), settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_C_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) field_names_map_C = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_C_field_names) ], settings.table_C_field_names)) def project_fn_C(df): df = df.rename(columns=field_names_map_C, copy=False) return df project_C = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_C.iteritems() ], 'project_C_{}'.format(p), query_plan, False, project_fn_C)), range(0, 
settings.table_C_parts)) map_B_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_C_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) join_build_AB_C = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_AB_C = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_BC_join_key, settings. table_C_BC_join_key), 'join_probe_AB_C_{}'. format(p), query_plan, False)), range(0, settings.table_C_parts)) map_A_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) map_B_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) join_build_A_B = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'. format(p), query_plan, False)), range(0, settings.other_parts)) join_probe_A_B = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_A_AB_join_key, settings. table_B_AB_join_key), 'join_probe_A_B_{}'. format(p), query_plan, False)), range(0, settings.other_parts)) if settings.table_C_key is None: def part_aggregate_fn(df): sum_ = df[settings.table_B_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_B_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.other_parts)) else: def part_aggregate_fn(df): sum_ = df[settings.table_C_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_C_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_C_parts)) def aggregate_reduce_fn(df): sum_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( Project([ProjectExpression(lambda t: t['_0'], 'total_balance')], 'aggregate_project', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # Inline some of the operators map(lambda o: o.set_async(False), project_A) map(lambda o: o.set_async(False), project_B) map(lambda o: o.set_async(False), map_A_to_B) map(lambda o: o.set_async(False), map_B_to_B) if settings.table_C_key is not None: map(lambda o: o.set_async(False), map_B_to_C) map(lambda o: o.set_async(False), map_C_to_C) map(lambda o: o.set_async(False), project_C) map(lambda o: o.set_async(False), part_aggregate) aggregate_project.set_async(False) # Connect the operators connect_many_to_many(scan_A, project_A) 
connect_many_to_many(project_A, map_A_to_B) connect_all_to_all(map_A_to_B, join_build_A_B) connect_many_to_many(join_build_A_B, join_probe_A_B) connect_many_to_many(scan_B, project_B) connect_many_to_many(project_B, map_B_to_B) connect_all_to_all(map_B_to_B, join_probe_A_B) if settings.table_C_key is None: connect_many_to_many(join_probe_A_B, part_aggregate) else: connect_many_to_many(join_probe_A_B, map_B_to_C) connect_all_to_all(map_B_to_C, join_build_AB_C) connect_many_to_many(join_build_AB_C, join_probe_AB_C) connect_many_to_many(scan_C, project_C) connect_many_to_many(project_C, map_C_to_C) connect_all_to_all(map_C_to_C, join_probe_AB_C) connect_many_to_many(join_probe_AB_C, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) return query_plan
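# Illustrative sketch: aggregation happens in two phases. Each part_aggregate
# emits a one-row frame holding its partial sum in column '_0'
# (part_aggregate_fn), and aggregate_reduce sums those partials
# (aggregate_reduce_fn). The self-contained pandas sketch below mirrors that
# shape with made-up partition data.

def _sketch_two_phase_sum():
    import pandas as pd

    partition_details = [pd.Series([1.0, 2.0]), pd.Series([3.5]), pd.Series([4.5, 5.0])]

    # Phase 1: one single-row frame per partition.
    partials = [pd.DataFrame({'_0': [s.sum()]}) for s in partition_details]

    # Phase 2: concatenate the partials and sum column '_0' again.
    return pd.concat(partials)['_0'].astype(float).sum()

# _sketch_two_phase_sum() -> 16.0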
def query_plan(settings): # type: (SyntheticSemiJoinSettings) -> QueryPlan """ :return: None """ if settings.use_shared_mem: system = WorkerSystem(settings.shared_memory_size) else: system = None query_plan = QueryPlan(system, is_async=settings.parallel, buffer_size=settings.buffer_size, use_shared_mem=settings.use_shared_mem) # Define the operators scan_a = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf), "select " " {} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_A_AB_join_key, settings.table_A_filter_sql, get_sql_suffix(settings.table_A_key, settings.table_A_parts, p, settings.table_A_sharded)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_a' + '_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) field_names_map_a = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_A_field_names) ], settings.table_A_field_names)) def project_fn_a(df): df = df.rename(columns=field_names_map_a, copy=False) return df project_a = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_a.iteritems() ], 'project_a' + '_{}'.format(p), query_plan, False, project_fn_a) ), range(0, settings.table_A_parts)) bloom_create_ab_join_key = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_A_AB_join_key, 'bloom_create_ab_join_key' + '_{}'.format(p), query_plan, False, fp_rate=settings.fp_rate)), range(0, settings.table_A_parts)) scan_b_on_ab_join_key = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf), "select " " {},{} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_B_BC_join_key, settings.table_B_AB_join_key, settings.table_B_filter_sql, get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=False)), settings.format_, settings.table_B_AB_join_key, settings.use_pandas, settings.secure, settings.use_native, 'scan_b_on_ab_join_key' + '_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) if settings.table_C_key is None: scan_b_detail_on_b_pk = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf), "select " " {},{} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_B_primary_key, settings.table_B_detail_field_name, settings.table_B_filter_sql, get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=False)), settings.format_, settings.table_B_primary_key, settings.use_pandas, settings.secure, settings.use_native, 'scan_c_detail_on_b_pk' + '_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) field_names_map_b_detail = OrderedDict([ ('_0', settings.table_B_primary_key), ('_1', settings.table_B_detail_field_name) ]) def project_fn_b_detail(df): df.rename(columns=field_names_map_b_detail, inplace=True) return df project_b_detail = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_b_detail.iteritems() ], 'project_b_detail' + '_{}'.format(p), query_plan, False, project_fn_b_detail)), range(0, settings.table_B_parts)) map_b_pk_1 = map( lambda p: query_plan.add_operator( Map(settings.table_B_primary_key, 'map_b_pk_1' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) map_b_pk_2 
= map( lambda p: query_plan.add_operator( Map(settings.table_B_primary_key, 'map_b_pk_2' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) bloom_create_b_pk = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_B_primary_key, 'bloom_create_b_pk' + '_{}'.format(p), query_plan, False, fp_rate=settings.fp_rate)), range(0, settings.table_B_parts)) join_probe_ab_and_b_on_b_pk = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_primary_key, settings. table_B_primary_key), 'join_probe_ab_and_b_on_b_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) join_build_ab_and_b_on_b_pk = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_B_primary_key, 'join_build_ab_and_b_on_b_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) else: scan_c_on_bc_join_key = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf), "select " " {}, {} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_C_primary_key, settings.table_C_BC_join_key, settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.table_C_BC_join_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_c_on_bc_join_key' + '_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) field_names_map_c = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_C_field_names) ], settings.table_C_field_names)) def project_fn_c(df): df.rename(columns=field_names_map_c, inplace=True) return df project_c = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_c.iteritems() ], 'project_c' + '_{}'.format(p), query_plan, False, project_fn_c)), range(0, settings.table_C_parts)) scan_c_detail_on_c_pk = \ map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf), "select " " {},{} " "from " " S3Object " "where " " {} " " {} " .format(settings.table_C_primary_key, settings.table_C_detail_field_name, settings.table_C_filter_sql, get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=False)), settings.table_C_primary_key, settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_c_detail_on_c_pk' + '_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) field_names_map_c_detail = OrderedDict([ ('_0', settings.table_C_primary_key), ('_1', settings.table_C_detail_field_name) ]) def project_fn_c_detail(df): df.rename(columns=field_names_map_c_detail, inplace=True) return df project_c_detail = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_c_detail.iteritems() ], 'project_c_detail' + '_{}'.format(p), query_plan, False, project_fn_c_detail)), range(0, settings.table_C_parts)) map_bc_b_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_B_BC_join_key, 'map_bc_b_join_key' + '_{}'. 
format(p), query_plan, False)), range(0, settings.table_C_parts)) map_c_pk_1 = map( lambda p: query_plan.add_operator( Map(settings.table_C_primary_key, 'map_c_pk_1' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) map_c_pk_2 = map( lambda p: query_plan.add_operator( Map(settings.table_C_primary_key, 'map_c_pk_2' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) bloom_create_c_pk = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_C_primary_key, 'bloom_create_bc_b_to_c_join_key_{}'.format(p), query_plan, False, fp_rate=settings.fp_rate)), range(0, settings.table_C_parts)) join_build_ab_and_c_on_bc_join_key = map( lambda p: query_plan.add_operator( HashJoinBuild( settings.table_B_BC_join_key, 'join_build_ab_and_c_on_bc_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_ab_and_c_on_bc_join_key = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_BC_join_key, settings. table_C_BC_join_key), 'join_probe_ab_and_c_on_bc_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_build_abc_and_c_on_c_pk = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_C_primary_key, 'join_build_abc_and_c_on_c_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_abc_and_c_on_c_pk = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_C_primary_key, settings. table_C_primary_key), 'join_probe_abc_and_c_on_c_pk' + '_{}'.format( p), query_plan, False)), range(0, settings.table_C_parts)) bloom_create_bc_join_key = map( lambda p: query_plan.add_operator( BloomCreate(settings.table_B_BC_join_key, 'bloom_create_bc_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) map_bc_c_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_C_BC_join_key, 'map_bc_c_join_key' + '_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) field_names_map_b = OrderedDict( zip([ '_{}'.format(i) for i, name in enumerate(settings.table_B_field_names) ], settings.table_B_field_names)) def project_fn_b(df): df.rename(columns=field_names_map_b, inplace=True) return df project_b = map( lambda p: query_plan.add_operator( Project([ ProjectExpression(k, v) for k, v in field_names_map_b.iteritems() ], 'project_b' + '_{}'.format(p), query_plan, False, project_fn_b) ), range(0, settings.table_B_parts)) map_ab_a_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_A_AB_join_key, 'map_ab_a_join_key' + '_{}' .format(p), query_plan, False)), range(0, settings.table_A_parts)) map_ab_b_join_key = map( lambda p: query_plan.add_operator( Map(settings.table_B_AB_join_key, 'map_ab_b_join_key' + '_{}' .format(p), query_plan, False)), range(0, settings.table_B_parts)) join_build_a_and_b_on_ab_join_key = map( lambda p: query_plan.add_operator( HashJoinBuild( settings.table_A_AB_join_key, 'join_build_a_and_b_on_ab_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) join_probe_a_and_b_on_ab_join_key = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_A_AB_join_key, settings. 
table_B_AB_join_key), 'join_probe_a_and_b_on_ab_join_key' + '_{}'.format( p), query_plan, False)), range(0, settings.table_B_parts)) if settings.table_C_key is None: def part_aggregate_fn(df): sum_ = df[settings.table_B_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_B_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_B_parts)) else: def part_aggregate_fn(df): sum_ = df[settings.table_C_detail_field_name].astype( np.float).sum() return pd.DataFrame({'_0': [sum_]}) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_C_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)), range(0, settings.table_C_parts)) def aggregate_reduce_fn(df): sum_ = df['_0'].astype(np.float).sum() return pd.DataFrame({'_0': [sum_]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn)) aggregate_project = query_plan.add_operator( Project([ProjectExpression(lambda t: t['_0'], 'total_balance')], 'aggregate_project', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) # Inline some of the operators map(lambda o: o.set_async(False), project_a) map(lambda o: o.set_async(False), project_b) map(lambda o: o.set_async(False), map_ab_a_join_key) map(lambda o: o.set_async(False), map_ab_b_join_key) if settings.table_C_key is None: map(lambda o: o.set_async(False), map_b_pk_1) map(lambda o: o.set_async(False), map_b_pk_2) map(lambda o: o.set_async(False), project_b_detail) else: map(lambda o: o.set_async(False), map_bc_b_join_key) map(lambda o: o.set_async(False), map_bc_c_join_key) map(lambda o: o.set_async(False), map_c_pk_1) map(lambda o: o.set_async(False), map_c_pk_2) map(lambda o: o.set_async(False), project_c) map(lambda o: o.set_async(False), project_c_detail) aggregate_project.set_async(False) # Connect the operators connect_many_to_many(scan_a, project_a) connect_many_to_many(project_a, map_ab_a_join_key) connect_all_to_all(map_ab_a_join_key, join_build_a_and_b_on_ab_join_key) connect_all_to_all(project_a, bloom_create_ab_join_key) # connect_all_to_all(map_A_to_B, join_build_a_and_b_on_ab_join_key) connect_many_to_many(join_build_a_and_b_on_ab_join_key, join_probe_a_and_b_on_ab_join_key) # connect_all_to_all(map_bloom_A_to_B, bloom_create_ab_join_key) connect_many_to_many(bloom_create_ab_join_key, scan_b_on_ab_join_key) connect_many_to_many(scan_b_on_ab_join_key, project_b) # connect_many_to_many(project_b, join_probe_a_and_b_on_ab_join_key) # connect_all_to_all(map_B_to_B, join_probe_a_and_b_on_ab_join_key) connect_many_to_many(project_b, map_ab_b_join_key) connect_all_to_all(map_ab_b_join_key, join_probe_a_and_b_on_ab_join_key) # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bloom_B_to_B) if settings.table_C_key is None: # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, part_aggregate) connect_many_to_many(scan_b_detail_on_b_pk, project_b_detail) connect_many_to_many(project_b_detail, map_b_pk_2) connect_many_to_many(bloom_create_b_pk, scan_b_detail_on_b_pk) 
connect_all_to_all(join_probe_a_and_b_on_ab_join_key, bloom_create_b_pk) connect_all_to_all(map_b_pk_2, join_probe_ab_and_b_on_b_pk) connect_many_to_many(join_probe_ab_and_b_on_b_pk, part_aggregate) connect_many_to_many(join_build_ab_and_b_on_b_pk, join_probe_ab_and_b_on_b_pk) connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_b_pk_1) connect_all_to_all(map_b_pk_1, join_build_ab_and_b_on_b_pk) else: connect_all_to_all(join_probe_a_and_b_on_ab_join_key, bloom_create_bc_join_key) connect_many_to_many(bloom_create_bc_join_key, scan_c_on_bc_join_key) connect_many_to_many(scan_c_on_bc_join_key, project_c) # connect_many_to_many(project_c, join_probe_ab_and_c_on_bc_join_key) connect_all_to_all(map_bc_c_join_key, join_probe_ab_and_c_on_bc_join_key) # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, join_build_ab_and_c_on_bc_join_key) connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bc_b_join_key) connect_all_to_all(map_bc_b_join_key, join_build_ab_and_c_on_bc_join_key) connect_all_to_all(join_probe_ab_and_c_on_bc_join_key, bloom_create_c_pk) # connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, join_build_abc_and_c_on_c_pk) connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, map_c_pk_1) connect_all_to_all(map_c_pk_1, join_build_abc_and_c_on_c_pk) connect_many_to_many(bloom_create_c_pk, scan_c_detail_on_c_pk) # connect_all_to_all(bloom_create_bc_join_key, scan_c_detail_on_c_pk) connect_many_to_many(join_build_abc_and_c_on_c_pk, join_probe_abc_and_c_on_c_pk) # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_B_to_C) # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, join_build_abc_and_c_on_c_pk) connect_many_to_many(scan_c_detail_on_c_pk, project_c_detail) # connect_many_to_many(project_c_detail, map_C_to_C) # connect_all_to_all(project_c_detail, join_probe_abc_and_c_on_c_pk) connect_many_to_many(project_c_detail, map_c_pk_2) connect_many_to_many(project_c, map_bc_c_join_key) connect_many_to_many(join_build_ab_and_c_on_bc_join_key, join_probe_ab_and_c_on_bc_join_key) connect_all_to_all(map_c_pk_2, join_probe_abc_and_c_on_c_pk) connect_many_to_many(join_probe_abc_and_c_on_c_pk, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, aggregate_project) connect_one_to_one(aggregate_project, collate) return query_plan
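# Illustrative sketch: the semi-join plan repeatedly pairs a BloomCreate with a
# SQLTableScanBloomUse so that only probe-side rows whose join key might exist
# on the build side are ever fetched from S3. The set-based miniature below
# captures the effect only; a real Bloom filter admits false positives and
# uses far less memory, and the membership test is pushed into the S3 Select
# predicate rather than applied client-side.

def _sketch_bloom_pruning():
    build_side_keys = {'k1', 'k7', 'k9'}                    # summarised by a BloomCreate-style step
    probe_side_rows = [('k1', 10), ('k2', 20), ('k9', 30)]  # candidate rows on the probe side
    return [row for row in probe_side_rows if row[0] in build_side_keys]

# _sketch_bloom_pruning() -> [('k1', 10), ('k9', 30)]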
def query_plan(settings): # type: (SyntheticBaselineJoinSettings) -> QueryPlan """ :type settings: :return: None """ query_plan = QueryPlan(is_async=settings.parallel, buffer_size=settings.buffer_size) def scan_A_fun(df): df.columns = settings.table_A_field_names criterion = settings.table_A_filter_fn(df) df = df[criterion] return df # Define the operators scan_A = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p), "select " " * " "from " " S3Object " "{}" .format( get_sql_suffix(settings.table_A_key, settings.table_A_parts, p, settings.table_A_sharded, add_where=True)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_A_{}'.format(p), query_plan, False, fn=scan_A_fun)), range(0, settings.table_A_parts)) def scan_B_fun(df): df.columns = settings.table_B_field_names criterion = settings.table_B_filter_fn(df) return df[criterion] scan_B = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p), "select " " * " "from " " S3Object " "{}" .format( get_sql_suffix(settings.table_B_key, settings.table_B_parts, p, settings.table_B_sharded, add_where=True)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_B_{}'.format(p), query_plan, False, fn=scan_B_fun)), range(0, settings.table_B_parts)) """ field_names_map_B = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names)) def project_fn_B(df): df.rename(columns=field_names_map_B, inplace=True) return df project_B = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()], 'project_B_{}'.format(p), query_plan, True, project_fn_B)), range(0, settings.table_B_parts)) filter_b = map(lambda p: query_plan.add_operator(Filter( PredicateExpression(None, pd_expr=settings.table_B_filter_fn), 'filter_b' + '_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) """ def scan_C_fun(df): df.columns = settings.table_C_field_names criterion = settings.table_C_filter_fn(df) return df[criterion] scan_C = \ map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p), "select " " * " "from " " S3Object " "{}" .format( get_sql_suffix(settings.table_C_key, settings.table_C_parts, p, settings.table_C_sharded, add_where=True)), settings.format_, settings.use_pandas, settings.secure, settings.use_native, 'scan_C_{}'.format(p), query_plan, False, fn=scan_C_fun)), range(0, settings.table_C_parts)) """ field_names_map_C = OrderedDict( zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names)) def project_fn_C(df): df.rename(columns=field_names_map_C, inplace=True) return df project_C = map(lambda p: query_plan.add_operator(Project( [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()], 'project_C_{}'.format(p), query_plan, True, project_fn_C)), range(0, settings.table_C_parts)) filter_c = map(lambda p: query_plan.add_operator(Filter( PredicateExpression(None, pd_expr=settings.table_C_filter_fn), 'filter_c' + '_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) """ map_A_to_B = map( lambda p: query_plan.add_operator( Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_A_parts)) map_B_to_B = map( lambda p: query_plan.add_operator( 
Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_B_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_B_parts)) map_C_to_C = map( lambda p: query_plan.add_operator( Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)), range(0, settings.table_C_parts)) join_build_A_B = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'. format(p), query_plan, False)), range(0, settings.table_B_parts)) join_probe_A_B = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_A_AB_join_key, settings. table_B_AB_join_key), 'join_probe_A_B_{}' .format(p), query_plan, False)), range(0, settings.table_B_parts)) join_build_AB_C = map( lambda p: query_plan.add_operator( HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'. format(p), query_plan, False)), range(0, settings.table_C_parts)) join_probe_AB_C = map( lambda p: query_plan.add_operator( HashJoinProbe( JoinExpression(settings.table_B_BC_join_key, settings. table_C_BC_join_key), 'join_probe_AB_C_{}' .format(p), query_plan, False)), range(0, settings.table_C_parts)) def agg_fun(df): return pd.DataFrame({ 'sum': [df[settings.table_C_detail_field_name].astype(float).sum()] }) #return pd.DataFrame( { 'sum' : [ len(df) ] } ) part_aggregate = map( lambda p: query_plan.add_operator( Aggregate([ AggregateExpression( AggregateExpression.SUM, lambda t: float(t[ settings.table_C_detail_field_name])) ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False, agg_fun)), range(0, settings.table_C_parts)) def agg_reduce_fun(df): return pd.DataFrame({'sum': [df['sum'].sum()]}) aggregate_reduce = query_plan.add_operator( Aggregate([ AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])) ], settings.use_pandas, 'aggregate_reduce', query_plan, False, agg_reduce_fun)) """ aggregate_project = query_plan.add_operator(Project( [ ProjectExpression(lambda t: t['_0'], 'total_balance') ], 'aggregate_project', query_plan, False)) """ collate = query_plan.add_operator(Collate('collate', query_plan, False)) # Connect the operators connect_many_to_many(scan_A, map_A_to_B) #connect_many_to_many(project_A, filter_A) #connect_many_to_many(filter_A, map_A_to_B) connect_all_to_all(map_A_to_B, join_build_A_B) connect_many_to_many(join_build_A_B, join_probe_A_B) connect_many_to_many(scan_B, map_B_to_B) #connect_many_to_many(project_B, filter_b) #connect_many_to_many(filter_b, map_B_to_B) connect_all_to_all(map_B_to_B, join_probe_A_B) #connect_many_to_many(join_probe_A_B, part_aggregate) #connect_many_to_one(part_aggregate, aggregate_reduce) #connect_one_to_one(aggregate_reduce, collate) connect_many_to_many(join_build_AB_C, join_probe_AB_C) connect_many_to_many(join_probe_A_B, map_B_to_C) connect_all_to_all(map_B_to_C, join_build_AB_C) connect_many_to_many(scan_C, map_C_to_C) #connect_many_to_many(project_C, filter_c) #connect_many_to_many(filter_c, map_C_to_C) connect_all_to_all(map_C_to_C, join_probe_AB_C) connect_many_to_many(join_probe_AB_C, part_aggregate) connect_many_to_one(part_aggregate, aggregate_reduce) connect_one_to_one(aggregate_reduce, collate) #connect_one_to_one(aggregate_project, collate) return query_plan
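# Illustrative sketch: each baseline scan passes a post-processing hook via
# fn= (scan_A_fun, scan_B_fun, scan_C_fun) that labels the raw S3 Select
# columns and then applies the table's pandas filter. The stand-alone version
# below mirrors that hook; the column names and the predicate are hypothetical.

def _sketch_scan_fn():
    import pandas as pd

    field_names = ['a_pk', 'a_balance']

    def _filter_fn(df):
        # Stand-in for settings.table_A_filter_fn.
        return df['a_balance'].astype(float) > 100.0

    def _scan_fn(df):
        df.columns = field_names      # name the unlabelled S3 Select columns
        return df[_filter_fn(df)]     # keep only the rows that pass the filter

    raw = pd.DataFrame([['1', '50.0'], ['2', '150.0']])
    return _scan_fn(raw)              # a single row: pk '2', balance '150.0'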
def test_join_baseline_pandas(): """Tests a join :return: None """ query_plan = QueryPlan(is_async=True, buffer_size=0) # Query plan supplier_scan = query_plan.add_operator( SQLTableScan('region.csv', 'select * from S3Object;', True, False, False, 'supplier_scan', query_plan, True)) def supplier_project_fn(df): df = df.filter(['_0'], axis='columns') df = df.rename(columns={'_0': 'r_regionkey'}) return df supplier_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_0'], 'r_regionkey')], 'supplier_project', query_plan, True, supplier_project_fn)) nation_scan = query_plan.add_operator( SQLTableScan('nation.csv', 'select * from S3Object;', True, False, False, 'nation_scan', query_plan, True)) def nation_project_fn(df): df = df.filter(['_2'], axis='columns') df = df.rename(columns={'_2': 'n_regionkey'}) return df nation_project = query_plan.add_operator( Project([ProjectExpression(lambda t_: t_['_2'], 'n_regionkey')], 'nation_project', query_plan, True, nation_project_fn)) supplier_nation_join_build = query_plan.add_operator( HashJoinBuild('n_regionkey', 'supplier_nation_join_build', query_plan, True)) supplier_nation_join_probe = query_plan.add_operator( HashJoinProbe(JoinExpression('n_regionkey', 'r_regionkey'), 'supplier_nation_join_probe', query_plan, True)) collate = query_plan.add_operator(Collate('collate', query_plan, True)) supplier_scan.connect(supplier_project) nation_scan.connect(nation_project) nation_project.connect(supplier_nation_join_build) supplier_nation_join_probe.connect_build_producer(supplier_nation_join_build) supplier_nation_join_probe.connect_tuple_producer(supplier_project) supplier_nation_join_probe.connect(collate) # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['n_regionkey', 'r_regionkey'] assert len(tuples) == 25 + 1 assert tuples[0] == field_names num_rows = 0 for t in tuples: num_rows += 1 # Assert that the nation_key in table 1 has been joined with the record in table 2 with the same nation_key if num_rows > 1: lt = IndexedTuple.build(t, field_names) assert lt['n_regionkey'] == lt['r_regionkey']
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded,
        part_sharded, other_parts, sf, fp_rate, expected_result, format_):
    """Runs the TPCH Q17 'Bloom Join' query plan: a bloom filter built from the filtered part
    keys is pushed into the lineitem scan before the part/lineitem hash join.

    :return: None
    """

    print('')
    print("TPCH Q17 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_select_partkey_where_brand_container_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_op('part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_lineitem_join_build_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q17.bloom_create_partkey_op(fp_rate, 'part_bloom_create', query_plan))

    lineitem_bloom_use = map(
        lambda p: query_plan.add_operator(
            tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op(
                lineitem_sharded, p, part_parts, use_pandas, secure, use_native,
                'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)),
        range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_lineitem_filtered_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_lineitem_join_probe_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan, False)),
        range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, other_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, other_parts))

    part_lineitem_join_avg_group_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('l_partkey',
                          'part_lineitem_join_avg_group_join_build' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                          'part_lineitem_join_avg_group_join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, other_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p), query_plan)),
        range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
        ], use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_lineitem_join_probe_map)
    map(lambda o: o.set_async(False), part_lineitem_join_build_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))

    # map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project))
    connect_many_to_one(part_project, part_bloom_create)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # part_project.connect(part_bloom_create)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)),
    #     enumerate(part_bloom_create_map))

    # part_bloom_create.connect(lineitem_bloom_use)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)),
    #     enumerate(part_bloom_create))

    # lineitem_bloom_use.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use))

    # part_lineitem_join.connect_left_producer(part_project)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)),
        enumerate(part_lineitem_join_build_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build))

    # part_lineitem_join.connect_right_producer(lineitem_project)
    # map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project))
    connect_many_to_many(lineitem_project, part_lineitem_join_probe_map)
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(part_lineitem_join_probe_map))

    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))

    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)

    # extendedprice_sum_aggregate_project.connect(collate)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
        assert round(float(tuples[1][0]), 10) == 372414.2899999995
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
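
# A hypothetical example of driving run() above, for illustration only. Every argument value
# here is an assumption (partition counts, scale factor, bloom-filter false-positive rate);
# expected_result and format_ are placeholders, not the values the real benchmark driver passes.
def example_main():
    run(parallel=True, use_pandas=True, secure=False, use_native=False, buffer_size=0,
        lineitem_parts=32, part_parts=4, lineitem_sharded=True, part_sharded=True,
        other_parts=2, sf=1, fp_rate=0.01,
        expected_result=0.0,  # placeholder: substitute the verified result for the chosen scale factor
        format_='csv')  # placeholder: substitute the project's format constant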
def test_r_to_l_join():
    """Tests a hash join with the inputs swapped relative to the other join tests: the nation
    projection is connected as the left (build) producer and the supplier projection as the
    right (probe) producer.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    supplier_scan = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'supplier_scan', query_plan, False))

    supplier_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_3'], 's_nationkey')], 'supplier_project', query_plan, False))

    nation_scan = query_plan.add_operator(
        SQLTableScan('nation.csv', 'select * from S3Object;', False, 'nation_scan', query_plan, False))

    nation_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_0'], 'n_nationkey')], 'nation_project', query_plan, False))

    supplier_nation_join = query_plan.add_operator(
        HashJoin(JoinExpression('n_nationkey', 's_nationkey'), 'supplier_nation_join', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    supplier_scan.connect(supplier_project)
    nation_scan.connect(nation_project)
    supplier_nation_join.connect_left_producer(nation_project)
    supplier_nation_join.connect_right_producer(supplier_project)
    supplier_nation_join.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in collate.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    # collate.print_tuples()

    field_names = ['n_nationkey', 's_nationkey']

    assert len(collate.tuples()) == 10000 + 1

    assert collate.tuples()[0] == field_names

    num_rows = 0
    for t in collate.tuples():
        num_rows += 1
        # Assert that each nation record has been joined with a supplier record carrying the same nation key
        if num_rows > 1:
            lt = IndexedTuple.build(t, field_names)
            assert lt['s_nationkey'] == lt['n_nationkey']

    # Write the metrics
    query_plan.print_metrics()
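
# The assertions in the join tests above rely on two conventions: the first collated tuple
# carries the field names, and IndexedTuple.build() gives name-based access to the positional
# tuples that follow. As a rough stand-in (an assumption, not the s3filter implementation),
# the same idea can be expressed by zipping the header with each data tuple.
def _sketch_indexed_tuple(field_names, tuple_):
    # e.g. _sketch_indexed_tuple(['n_nationkey', 's_nationkey'], ['3', '3'])['s_nationkey'] == '3'
    return dict(zip(field_names, tuple_))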