def lineitem_filter_def(name, query_plan):
    def pd_expr(df):
        return (((df['l_quantity'].astype(np.int) >= 3) &
                 (df['l_quantity'].astype(np.int) <= 3 + 10) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')) |
                ((df['l_quantity'].astype(np.int) >= 16) &
                 (df['l_quantity'].astype(np.int) <= 16 + 10) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')) |
                ((df['l_quantity'].astype(np.int) >= 24) &
                 (df['l_quantity'].astype(np.int) <= 24 + 10) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')))

    return Filter(
        PredicateExpression(
            lambda t_:
            (3 <= int(t_['l_quantity']) <= 3 + 10 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
            (16 <= int(t_['l_quantity']) <= 16 + 10 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
            (24 <= int(t_['l_quantity']) <= 24 + 10 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON'),
            pd_expr),
        name, query_plan, False)
def part_filter_def(name, query_plan):
    def pd_expr(df):
        return (((df['p_brand'] == 'Brand#11') &
                 (df['p_container'].isin(['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'])) &
                 (df['p_size'].astype(np.int) >= 1) &
                 (df['p_size'].astype(np.int) <= 5)) |
                ((df['p_brand'] == 'Brand#44') &
                 (df['p_container'].isin(['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'])) &
                 (df['p_size'].astype(np.int) >= 1) &
                 (df['p_size'].astype(np.int) <= 10)) |
                ((df['p_brand'] == 'Brand#53') &
                 (df['p_container'].isin(['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'])) &
                 (df['p_size'].astype(np.int) >= 1) &
                 (df['p_size'].astype(np.int) <= 15)))

    return Filter(
        PredicateExpression(
            lambda t_:
            (t_['p_brand'] == 'Brand#11' and
             t_['p_container'] in ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'] and
             1 <= int(t_['p_size']) <= 5) or
            (t_['p_brand'] == 'Brand#44' and
             t_['p_container'] in ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'] and
             1 <= int(t_['p_size']) <= 10) or
            (t_['p_brand'] == 'Brand#53' and
             t_['p_container'] in ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'] and
             1 <= int(t_['p_size']) <= 15),
            pd_expr),
        name, query_plan, False)
def filter_def(name, query_plan):
    def pd_expr(df):
        return (((df['p_brand'] == 'Brand#11') &
                 (df['p_container'].isin(['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'])) &
                 (df['l_quantity'].astype(np.int) >= 3) &
                 (df['l_quantity'].astype(np.int) <= 3 + 10) &
                 (df['p_size'].astype(np.int) >= 1) &
                 (df['p_size'].astype(np.int) <= 5) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')) |
                ((df['p_brand'] == 'Brand#44') &
                 (df['p_container'].isin(['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'])) &
                 (df['l_quantity'].astype(np.int) >= 16) &
                 (df['l_quantity'].astype(np.int) <= 16 + 10) &
                 (df['p_size'].astype(np.int) >= 1) &
                 (df['p_size'].astype(np.int) <= 10) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')) |
                ((df['p_brand'] == 'Brand#53') &
                 (df['p_container'].isin(['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'])) &
                 (df['l_quantity'].astype(np.int) >= 24) &
                 (df['l_quantity'].astype(np.int) <= 24 + 10) &
                 (df['p_size'].astype(np.int) >= 1) &
                 (df['p_size'].astype(np.int) <= 15) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')))

    return Filter(
        PredicateExpression(
            lambda t_:
            (t_['p_brand'] == 'Brand#11' and
             t_['p_container'] in ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'] and
             3 <= int(t_['l_quantity']) <= 3 + 10 and
             1 <= int(t_['p_size']) <= 5 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
            (t_['p_brand'] == 'Brand#44' and
             t_['p_container'] in ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'] and
             16 <= int(t_['l_quantity']) <= 16 + 10 and
             1 <= int(t_['p_size']) <= 10 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
            (t_['p_brand'] == 'Brand#53' and
             t_['p_container'] in ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'] and
             24 <= int(t_['l_quantity']) <= 24 + 10 and
             1 <= int(t_['p_size']) <= 15 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON'),
            pd_expr),
        name, query_plan, False)
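# Hedged usage sketch (not part of the original plan code): a quick, standalone check of
# the Brand#11 branch of the combined part/lineitem predicate above on a tiny DataFrame.
# The rows and values are invented for illustration; only pandas and numpy are assumed.
import pandas as pd

toy = pd.DataFrame({
    'p_brand': ['Brand#11', 'Brand#53'],
    'p_container': ['SM CASE', 'LG DRUM'],
    'p_size': ['4', '20'],
    'l_quantity': ['5', '30'],
    'l_shipmode': ['AIR', 'RAIL'],
    'l_shipinstruct': ['DELIVER IN PERSON', 'DELIVER IN PERSON'],
})

mask = ((toy['p_brand'] == 'Brand#11') &
        (toy['p_container'].isin(['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'])) &
        (toy['l_quantity'].astype(int) >= 3) &
        (toy['l_quantity'].astype(int) <= 3 + 10) &
        (toy['p_size'].astype(int) >= 1) &
        (toy['p_size'].astype(int) <= 5) &
        (toy['l_shipmode'].isin(['AIR', 'AIR REG'])) &
        (toy['l_shipinstruct'] == 'DELIVER IN PERSON'))

print(toy[mask])  # expect only the first row to satisfy the Brand#11 branch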
def filter_brand_container_op(name, query_plan):
    def pd_expr(df):
        return (df['p_brand'] == 'Brand#41') & (df['p_container'] == 'SM PACK')

    return Filter(
        PredicateExpression(
            lambda t_: t_['p_brand'] == 'Brand#41' and t_['p_container'] == 'SM PACK',
            pd_expr),
        name, query_plan, False)
def filter_brand12_operator_def(name, query_plan):
    # type: (str, QueryPlan) -> Filter
    def pd_expr(df):
        return df['p_brand'] == 'Brand#12'

    return Filter(
        PredicateExpression(lambda t_: t_['p_brand'] == 'Brand#12', pd_expr),
        name, query_plan, False)
def filter_shipdate_operator_def(max_shipped_date, name, query_plan):
    def pd_expr(df):
        return pd.to_datetime(df['l_shipdate'], cache=True) <= max_shipped_date

    return Filter(
        PredicateExpression(
            lambda t_: cast(t_['l_shipdate'], timestamp) <= cast(max_shipped_date, timestamp),
            pd_expr),
        name, query_plan, False)
def filter_shipdate_operator_def(min_shipped_date, max_shipped_date, name, query_plan):
    def pd_expr(df):
        # df['_10'] = pd.to_datetime(df['_10'])
        return (pd.to_datetime(df['l_shipdate']) >= min_shipped_date) & \
               (pd.to_datetime(df['l_shipdate']) < max_shipped_date)

    return Filter(
        PredicateExpression(
            lambda t_: (cast(t_['l_shipdate'], timestamp) >= cast(min_shipped_date, timestamp)) and
                       (cast(t_['l_shipdate'], timestamp) < cast(max_shipped_date, timestamp)),
            pd_expr),
        name, query_plan, False)
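# Hedged illustration (not from the original source): the pandas side of the range filter
# above keeps rows with min_shipped_date <= l_shipdate < max_shipped_date. The dates below
# are made-up sample values; only pandas is assumed.
import pandas as pd

toy = pd.DataFrame({'l_shipdate': ['1994-01-01', '1994-12-31', '1995-01-01']})
min_shipped_date = pd.Timestamp('1994-01-01')
max_shipped_date = pd.Timestamp('1995-01-01')

mask = (pd.to_datetime(toy['l_shipdate']) >= min_shipped_date) & \
       (pd.to_datetime(toy['l_shipdate']) < max_shipped_date)
print(toy[mask])  # the first two rows pass; the upper bound is exclusive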
def filter_lineitem_quantity_op(name, query_plan):
    """with filter_join_2 as (
           select * from part_lineitem_join_avg_group_join
           where l_quantity < avg_l_quantity_computed00
       )

    :param query_plan:
    :param name:
    :return:
    """

    def pd_expr(df):
        return df['l_quantity'].astype(np.float) < df['avg_l_quantity_computed00']

    return Filter(
        PredicateExpression(
            lambda t_: float(t_['l_quantity']) < t_['avg_l_quantity_computed00'],
            pd_expr),
        name, query_plan, False)
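# Hedged illustration (not from the original source): the predicate above expects a frame in
# which each lineitem row already carries its part's average quantity in the column
# 'avg_l_quantity_computed00'. The numbers are invented; the sketch uses the builtin float
# for astype because newer NumPy versions no longer provide np.float.
import pandas as pd

joined = pd.DataFrame({
    'l_quantity': ['10', '40'],
    'avg_l_quantity_computed00': [25.0, 25.0],
})
mask = joined['l_quantity'].astype(float) < joined['avg_l_quantity_computed00']
print(joined[mask])  # keeps only the 10-unit row, whose quantity is below the average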
def separate_query_plan(shared_mem_buffer_size, parallel, use_pandas, buffer_size,
                        table_parts, lower, upper, sharded, sf, use_shared_mem,
                        inline_ops, merge_ops):
    secure = False
    use_native = False

    if use_shared_mem:
        system = WorkerSystem(shared_mem_buffer_size)
    else:
        system = None

    query_plan = QueryPlan(system, is_async=parallel, buffer_size=buffer_size,
                           use_shared_mem=use_shared_mem)

    # scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', sharded, p, sf=sf),
                      use_pandas, secure, use_native,
                      'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    def filter_fn(df):
        return (df['_5'].astype(np.float) >= lower) & \
               (df['_5'].astype(np.float) <= upper)

    filter_ = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(None, filter_fn),
                   'filter_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], use_pandas, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda p: p.set_async(not inline_ops), filter_)

    connect_many_to_many(scan, filter_)
    connect_many_to_one(filter_, aggregate)
    connect_one_to_one(aggregate, collate)

    profile_file_suffix = get_profile_file_suffix(inline_ops, merge_ops, use_shared_mem)

    scan[0].set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_scan_0" + profile_file_suffix + ".prof"))
    filter_[0].set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_filter_0" + profile_file_suffix + ".prof"))
    aggregate.set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_aggregate" + profile_file_suffix + ".prof"))
    collate.set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_collate" + profile_file_suffix + ".prof"))

    return query_plan
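# Hedged invocation sketch: plausible arguments for separate_query_plan, derived only from
# its signature above. Every concrete value (buffer sizes, bounds, partition count, scale
# factor) is an illustrative assumption, not a setting taken from the original benchmark.
plan = separate_query_plan(
    shared_mem_buffer_size=1024 * 1024,   # assumed shared-memory buffer size (unused here)
    parallel=True, use_pandas=True, buffer_size=0,
    table_parts=32,                       # number of lineitem partitions (assumed)
    lower=0, upper=100000,                # bounds pushed into filter_fn on column '_5' (assumed)
    sharded=True, sf=1,
    use_shared_mem=False, inline_ops=True, merge_ops=False)
plan.execute()  # assumes QueryPlan exposes execute(), as used by the run() functions below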
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """
    :type settings: SyntheticBaselineJoinSettings
    :return: the constructed query plan
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system, is_async=settings.parallel, buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_A = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}".format(
                             get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                            settings.table_A_sharded, add_where=True)),
                         settings.format_, settings.use_pandas, settings.secure, settings.use_native,
                         'scan_A_{}'.format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)],
            settings.table_A_field_names))

    def project_fn_A(df):
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(
        lambda p: query_plan.add_operator(
            Project([ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()],
                    'project_A_{}'.format(p), query_plan, False, project_fn_A)),
        range(0, settings.table_A_parts))

    if settings.table_A_filter_fn is not None:
        filter_A = map(
            lambda p: query_plan.add_operator(
                Filter(PredicateExpression(None, pd_expr=settings.table_A_filter_fn),
                       'filter_A_{}'.format(p), query_plan, False)),
            range(0, settings.table_A_parts))

    scan_B = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                         "select "
                         " * "
                         "from "
                         " S3Object "
                         "{}".format(
                             get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                            settings.table_B_sharded, add_where=True)),
                         settings.format_, settings.use_pandas, settings.secure, settings.use_native,
                         'scan_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)],
            settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(
        lambda p: query_plan.add_operator(
            Project([ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                    'project_B_{}'.format(p), query_plan, False, project_fn_B)),
        range(0, settings.table_B_parts))

    if settings.table_B_filter_fn is not None:
        filter_b = map(
            lambda p: query_plan.add_operator(
                Filter(PredicateExpression(None, pd_expr=settings.table_B_filter_fn),
                       'filter_b_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

    if settings.table_C_key is not None:
        scan_C = map(
            lambda p: query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                             "select "
                             " * "
                             "from "
                             " S3Object "
                             "{}".format(
                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                settings.table_C_sharded, add_where=True)),
                             settings.format_, settings.use_pandas, settings.secure, settings.use_native,
                             'scan_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)],
                settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(
            lambda p: query_plan.add_operator(
                Project([ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p), query_plan, False, project_fn_C)),
            range(0, settings.table_C_parts))

        filter_c = map(
            lambda p: query_plan.add_operator(
                Filter(PredicateExpression(None, pd_expr=settings.table_C_filter_fn),
                       'filter_c_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        map_B_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_B_parts))

        map_C_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_build_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_BC_join_key,
                              'join_build_AB_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(JoinExpression(settings.table_B_BC_join_key, settings.table_C_BC_join_key),
                              'join_probe_AB_C_{}'.format(p), query_plan, False)),
            range(0, settings.table_C_parts))

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key,
                          'join_build_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression(settings.table_A_AB_join_key, settings.table_B_AB_join_key),
                          'join_probe_A_B_{}'.format(p), query_plan, False)),
        range(0, settings.other_parts))

    if settings.table_C_key is None:
        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate(
                    [AggregateExpression(AggregateExpression.SUM,
                                         lambda t: float(t[settings.table_B_detail_field_name]))],
                    settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False,
                    part_aggregate_fn)),
            range(0, settings.other_parts))
    else:
        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate(
                    [AggregateExpression(AggregateExpression.SUM,
                                         lambda t: float(t[settings.table_C_detail_field_name]))],
                    settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan, False,
                    part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))],
            settings.use_pandas, 'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_A)
    if settings.table_A_filter_fn is not None:
        map(lambda o: o.set_async(False), filter_A)
    map(lambda o: o.set_async(False), project_B)
    if settings.table_B_filter_fn is not None:
        map(lambda o: o.set_async(False), filter_b)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
        map(lambda o: o.set_async(False), filter_c)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    if settings.table_A_filter_fn is not None:
        connect_many_to_many(project_A, filter_A)
        connect_many_to_many(filter_A, map_A_to_B)
    else:
        connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, project_B)
    if settings.table_B_filter_fn is not None:
        connect_many_to_many(project_B, filter_b)
        connect_many_to_many(filter_b, map_B_to_B)
    else:
        connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    if settings.table_C_key is None:
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)
        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, filter_c)
        connect_many_to_many(filter_c, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)
        connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts,
        order_parts, lineitem_parts, customer_sharded, order_sharded, lineitem_sharded,
        other_parts, sf, expected_result, customer_filter_sql=None, order_filter_sql=None,
        lineitem_filter_sql=None):
    """
    :return: None
    """

    print('')
    print("TPCH Q3 Baseline Join")
    print("---------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('customer', customer_sharded, p, sf, format_),
                "select "
                " * "
                "from "
                " S3Object "
                " {} "
                " {} ".format(
                    ' where ' + customer_filter_sql if customer_filter_sql is not None else '',
                    get_sql_suffix('customer', customer_parts, p, customer_sharded,
                                   add_where=customer_filter_sql is None)),
                format_, use_pandas, secure, use_native,
                'customer_scan' + '_{}'.format(p), query_plan, False)),
        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0', '_6'], axis=1)
        df.rename(columns={'_0': 'c_custkey', '_6': 'c_mktsegment'}, inplace=True)
        return df

    customer_project = map(
        lambda p: query_plan.add_operator(
            Project([], 'customer_project' + '_{}'.format(p), query_plan, False,
                    customer_project_fn)),
        range(0, customer_parts))

    def customer_filter_fn(df):
        return df['c_mktsegment'] == 'BUILDING'

    customer_filter = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=customer_filter_fn),
                   'customer_filter' + '_{}'.format(p), query_plan, False)),
        range(0, customer_parts))

    customer_map = map(
        lambda p: query_plan.add_operator(
            Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
        range(0, customer_parts))

    order_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('orders', order_sharded, p, sf, format_),
                "select "
                " * "
                "from "
                " S3Object "
                " {} "
                " {} ".format(
                    ' where ' + order_filter_sql if order_filter_sql is not None else '',
                    get_sql_suffix('orders', order_parts, p, order_sharded,
                                   add_where=order_filter_sql is None)),
                format_, use_pandas, secure, use_native,
                'order_scan' + '_{}'.format(p), query_plan, False)),
        range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_4', '_7'], axis=1)
        df.rename(columns={'_1': 'o_custkey', '_0': 'o_orderkey',
                           '_4': 'o_orderdate', '_7': 'o_shippriority'}, inplace=True)
        return df

    order_project = map(
        lambda p: query_plan.add_operator(
            Project([], 'order_project' + '_{}'.format(p), query_plan, False,
                    order_project_fn)),
        range(0, order_parts))

    def order_filter_fn(df):
        return pd.to_datetime(df['o_orderdate']) < pd.Timestamp(
            datetime.strptime('1995-03-01', '%Y-%m-%d').date())

    order_filter = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=order_filter_fn),
                   'order_filter' + '_{}'.format(p), query_plan, False)),
        range(0, order_parts))

    order_map_1 = map(
        lambda p: query_plan.add_operator(
            Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
        range(0, order_parts))

    customer_order_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('c_custkey', 'customer_order_join_build' + '_' + str(p),
                          query_plan, False)),
        range(0, other_parts))

    customer_order_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                          'customer_order_join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                "select "
                " * "
                "from "
                " S3Object "
                " {} "
                " {} ".format(
                    ' where ' + lineitem_filter_sql if lineitem_filter_sql is not None else "",
                    get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded,
                                   add_where=lineitem_filter_sql is None)),
                format_, use_pandas, secure, use_native,
                'lineitem_scan' + '_{}'.format(p), query_plan, False)),
        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_5', '_6', '_10'], axis=1)
        df.rename(columns={'_0': 'l_orderkey', '_5': 'l_extendedprice',
                           '_6': 'l_discount', '_10': 'l_shipdate'}, inplace=True)
        return df

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            Project([], 'lineitem_project' + '_{}'.format(p), query_plan, False,
                    lineitem_project_fn)),
        range(0, lineitem_parts))

    def lineitem_filter_fn(df):
        return pd.to_datetime(df['l_shipdate']) > pd.Timestamp(
            datetime.strptime('1995-03-01', '%Y-%m-%d').date())

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=lineitem_filter_fn),
                   'lineitem_filter' + '_{}'.format(p), query_plan, False)),
        range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
        range(0, lineitem_parts))

    order_map_2 = map(
        lambda p: query_plan.add_operator(
            Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    customer_order_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('o_orderkey', 'customer_order_lineitem_join_build' + '_' + str(p),
                          query_plan, False)),
        range(0, other_parts))

    customer_order_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                          'customer_order_lineitem_join_probe' + '_' + str(p),
                          query_plan, False)),
        range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(np.float)
        df['l_discount'] = df['l_discount'].astype(np.float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(
        lambda p: query_plan.add_operator(
            Group(['l_orderkey', 'o_orderdate', 'o_shippriority'],
                  [AggregateExpression(
                      AggregateExpression.SUM,
                      lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))],
                  'group' + '_{}'.format(p), query_plan, False, groupby_fn)),
        range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(['l_orderkey', 'o_orderdate', 'o_shippriority'],
              [AggregateExpression(
                  AggregateExpression.SUM,
                  lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))],
              'group_reduce', query_plan, False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10,
            [SortExpression('revenue', float, 'DESC'),
             SortExpression('o_orderdate', date, 'ASC')],
            use_pandas, 'top', query_plan, False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), customer_filter)
    map(lambda o: o.set_async(False), order_filter)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_filter)
    connect_many_to_many(customer_filter, customer_map)

    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_filter)
    connect_many_to_many(order_filter, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)
    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_filter)
    connect_many_to_many(lineitem_filter, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)
    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)
    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()
    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1
    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
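# Hedged illustration (not from the original source): the groupby_fn/group_reduce_fn pair in
# the Q3 plan above computes revenue = l_extendedprice * (1 - l_discount), summed per
# (l_orderkey, o_orderdate, o_shippriority) group. The two-row frame below is invented to
# show the arithmetic; only pandas is assumed.
import pandas as pd

rows = pd.DataFrame({
    'l_orderkey': ['1', '1'],
    'o_orderdate': ['1995-03-10', '1995-03-10'],
    'o_shippriority': ['0', '0'],
    'l_extendedprice': [100.0, 200.0],
    'l_discount': [0.1, 0.0],
})
rows['revenue'] = rows['l_extendedprice'] * (1 - rows['l_discount'])
print(rows.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])['revenue']
          .sum().reset_index())  # one group with revenue 90 + 200 = 290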
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf):
    secure = False
    use_native = False

    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', True, p, sf=sf),
                      use_pandas, secure, use_native,
                      'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # project
    def fn(df):
        df.columns = [
            'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity',
            'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus',
            'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct',
            'l_shipmode', 'l_comment'
        ]
        df[['l_extendedprice']] = df[['l_extendedprice']].astype(np.float)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(0, table_parts))

    # perform the filter locally
    def pd_expr(df):
        return (df['l_extendedprice'] >= lower) & (df['l_extendedprice'] <= upper)

    filters = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(
                lambda t_: (t_['l_extendedprice'] >= lower) and (t_['l_extendedprice'] <= upper),
                pd_expr),
                'filter_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(filters[p]), enumerate(project))
    map(lambda (p, o): o.connect(aggregate), enumerate(filters))
    aggregate.connect(collate)

    # scan[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof"))
    # project[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_profile_0" + ".prof"))
    # filters[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_filters_0" + ".prof"))
    # aggregate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + ".prof"))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')

    tuples = collate.tuples()
    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
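# Hedged invocation sketch: example arguments for the indexing-benchmark run() above,
# derived only from its signature. The concrete values (16 partitions, the price bounds,
# scale factor 1) are illustrative assumptions rather than the benchmark's actual settings.
if __name__ == '__main__':
    run(parallel=True, use_pandas=True, buffer_size=0,
        table_parts=16,             # number of sharded lineitem parts to scan (assumed)
        lower=900.0, upper=1100.0,  # l_extendedprice range counted by the final aggregate (assumed)
        sf=1)                       # TPC-H scale factor (assumed)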