def run_baseline_topk(stats, sort_field_index, sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, format_, shards_path): secure = False use_native = False print('') print("Top K Benchmark, Baseline. Sort Field: {}, Order: {}, k: {}".format( sort_field, sort_order, k)) print("----------------------") stats += ['baseline', shards_path, sort_field, sort_order, k, 0, 0] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling table_parts = table_parts_end - table_parts_start + 1 per_part_samples = int(sample_size / table_parts) table_name = os.path.basename(tbl_s3key) # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p ), "select * from S3Object;", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK sort_expr = [SortExpression(sort_field, float, sort_order)] topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) # Start the query query_plan.execute() print('Done') # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() query_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ 0, 0, 0, query_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_first_part, table_parts, queried_columns, select_columns, path, format_): """ Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap :return: """ secure = False use_native = False print('') print("Top K Benchmark, ColumnScan. Sort Field: {}, Order: {}".format( sort_field, sort_order)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling sample_scan = map( lambda p: query_plan.add_operator( #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p), SQLTableScan( "{}/lineitem.typed.1RowGroup.parquet.{}".format( path, p), 'select {} from S3Object;'.format( sort_field), format_, use_pandas, secure, use_native, 'column_scan_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # Sampling project def project_fn1(df): df.columns = [sort_field] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn1)), range(table_first_part, table_first_part + table_parts)) # TopK samples sort_expr = SortExpression(sort_field, float, sort_order) sample_topk = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild( sort_order, 'float', 'select {} from S3object '.format(select_columns), #' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False )) ' {} '.format(sort_field), 'sql_gen', query_plan, False)) # Scan scan = map( lambda p: query_plan.add_operator( #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p), SQLTableScan( "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # Project def project_fn2(df): df.columns = queried_columns df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn2)), range(table_first_part, table_first_part + table_parts)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_first_part, table_first_part + table_parts)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) #profile_path = '../benchmark-output/groupby/' #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof")) #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof")) #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof")) #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof")) #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof")) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk), enumerate(sample_project)) sample_topk.connect(sql_gen) map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def run_local_indexed_sampling(stats, sort_field_index, sort_field, k, sample_size, batch_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): """ Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold Finally scanning the table to retrieve only the records beyond the threshold :return: """ secure = False use_native = False n_threads = multiprocessing.cpu_count() print('') print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format( sort_field, sort_order)) print("----------------------") stats += [ 'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, batch_size ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling tbl_smpler = query_plan.add_operator( TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size, "table_sampler", query_plan, False)) sample_scan = map( lambda p: query_plan.add_operator( TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) map(lambda (i, op): sample_scan[i].set_nthreads(n_threads), enumerate(sample_scan)) # sample_scan = query_plan.add_operator( # TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), # query_plan, False)) # sample_scan.set_nthreads(n_threads) # Sampling project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df = df[[sort_field]].astype(np.float, errors='ignore') return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # sample_project = query_plan.add_operator( # Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # sample_topk = query_plan.add_operator( # Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda o: tbl_smpler.connect(o), sample_scan) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda o: o.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts_end - table_parts_start)) print('') # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') # tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() sampling_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ sql_gen.threshold, sampling_time, 0, sampling_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
def run_head_table_sampling(stats, sort_field_index, sort_field, k, sample_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): secure = False use_native = False print('') print( "Top K Benchmark, Head Table Sampling. Sort Field: {}, Order: {}, k: {}, Sample Size:{}" .format(sort_field, sort_order, k, sample_size)) print("----------------------") stats += [ 'sampling_{}_{}'.format('head_table', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, 1 ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling table_parts = table_parts_end - table_parts_start + 1 per_part_samples = int(sample_size / table_parts) table_name = os.path.basename(tbl_s3key) sample_scan = map( lambda p: query_plan.add_operator( SQLTableScan( "{}.{}".format(shards_path, p), 'select {} from S3Object limit {};'.format( sort_field, per_part_samples), format_, use_pandas, secure, use_native, 'sample_scan_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Sampling project def project_fn(df): df.columns = [sort_field] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda op: op.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Start the query query_plan.execute() print('Done') # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() sampling_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ sql_gen.threshold, sampling_time, 0, sampling_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
def run_memory_indexed_sampling(stats, sort_field_index, sort_field, k, sample_size, batch_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): """ Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold Finally scanning the table to retrieve only the records beyond the threshold :return: """ secure = False use_native = False n_threads = multiprocessing.cpu_count() print('') print( "Top K Benchmark, Memory Indexed Sampling. Sort Field: {}, Order: {}, K: {}, Sample Size: {}, Batch Size: {}" .format(sort_field, sort_order, k, sample_size, batch_size)) print("----------------------") stats += [ 'sampling_{}_{}'.format('memory_indexed', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, batch_size ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling tbl_smpler = query_plan.add_operator( TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size, "table_sampler", query_plan, False)) sample_scanners = map( lambda p: query_plan.add_operator( TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) map(lambda op: op.set_nthreads(n_threads), sample_scanners) # sample_scan = query_plan.add_operator( # TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), # query_plan, False)) # sample_scan.set_nthreads(n_threads) # Sampling project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df = df[[sort_field]].astype(np.float, errors='ignore') return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # sample_project = query_plan.add_operator( # Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # sample_topk = query_plan.add_operator( # Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sample_sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda op: tbl_smpler.connect(op), sample_scanners) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scanners)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda o: o.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Start the query query_plan.execute() print('Done') # tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down # query_plan.stop() sampling_threshold = query_plan.retrieve_sampling_threshold() sampling_runtime = query_plan.get_phase_runtime('sampl') sampling_num_http_requests, sampling_requests_cost = query_plan.requests_cost( 'sampl') sampling_returned_bytes, sampling_returned_rows, sampling_transfer_cost = query_plan.data_transfer_cost( phase_keyword='sampl') sampling_scanned_bytes, sampling_scan_cost = query_plan.data_scanning_cost( 'sampl') total_runtime = query_plan.total_elapsed_time total_http_requests, total_requests_cost = query_plan.requests_cost() total_returned_bytes, total_returned_rows, total_transfer_cost = query_plan.data_transfer_cost( ) total_scanned_bytes, total_scan_cost = query_plan.data_scanning_cost() total_data_cost = query_plan.data_cost()[0] total_computation_cost = query_plan.computation_cost() total_cost = query_plan.cost()[0] stats += [ sampling_threshold, sampling_runtime, total_runtime - sampling_runtime, total_runtime, sampling_returned_rows, sampling_scanned_bytes * BYTE_TO_MB, sampling_returned_bytes * BYTE_TO_MB, sampling_num_http_requests, sampling_requests_cost, sampling_transfer_cost, sampling_scan_cost, total_returned_rows, total_scanned_bytes * BYTE_TO_MB, total_returned_bytes * BYTE_TO_MB, total_http_requests, total_requests_cost, total_transfer_cost, total_scan_cost, total_data_cost, total_computation_cost, total_cost ]
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts, customer_sharded, order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None, order_filter_sql=None, lineitem_filter_sql=None): """ :return: None """ print('') print("TPCH Q3 Bloom Join") print("------------------") query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) customer_scan = map(lambda p: query_plan.add_operator( SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_), "select " " c_custkey " "from " " S3Object " "where " " c_mktsegment = 'BUILDING' " " {} " " {} " .format( ' and ' + customer_filter_sql if customer_filter_sql is not None else '', get_sql_suffix('customer', customer_parts, p, customer_sharded, add_where=False)), format_, use_pandas, secure, use_native, 'customer_scan' + '_{}'.format(p), query_plan, False)), range(0, customer_parts)) def customer_project_fn(df): df = df.filter(items=['_0'], axis=1) df.rename(columns={'_0': 'c_custkey'}, inplace=True) return df customer_project = map(lambda p: query_plan.add_operator( Project([], 'customer_project' + '_{}'.format(p), query_plan, False, customer_project_fn)), range(0, customer_parts)) customer_bloom_create = query_plan.add_operator( BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False, fp_rate)) customer_map = map(lambda p: query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)), range(0, customer_parts)) order_scan = map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_), "select " " o_custkey, o_orderkey, o_orderdate, o_shippriority " "from " " S3Object " "where " " cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) " " {} " " {} " .format( ' and ' + order_filter_sql if order_filter_sql is not None else '', get_sql_suffix('orders', order_parts, p, order_sharded, add_where=False)), 'o_custkey', format_, use_pandas, secure, use_native, 'order_scan' + '_{}'.format(p), query_plan, False)), range(0, order_parts)) def order_project_fn(df): df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1) df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'}, inplace=True) return df order_project = map(lambda p: query_plan.add_operator( Project([], 'order_project' + '_{}'.format(p), query_plan, False, order_project_fn)), range(0, customer_parts)) order_map_1 = map(lambda p: query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)), range(0, order_parts)) customer_order_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('c_custkey', 'customer_order_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'), 'customer_order_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) order_bloom_create = query_plan.add_operator( BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False, fp_rate)) lineitem_scan = map(lambda p: query_plan.add_operator( SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_), "select " " l_orderkey, l_extendedprice, l_discount " "from " " S3Object " "where " " cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) " " {} " " {} " .format( ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '', get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded, add_where=False)), 'l_orderkey', format_, use_pandas, secure, use_native, 'lineitem_scan' + '_{}'.format(p), query_plan, False)), range(0, lineitem_parts)) def lineitem_project_fn(df): df = df.filter(items=['_0', '_1', '_2'], axis=1) df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'}, inplace=True) return df lineitem_project = map(lambda p: query_plan.add_operator( Project([], 'lineitem_project' + '_{}'.format(p), query_plan, False, lineitem_project_fn)), range(0, lineitem_parts)) lineitem_map = map(lambda p: query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)), range(0, lineitem_parts)) order_map_2 = map(lambda p: query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_lineitem_join_build = map(lambda p: query_plan.add_operator( HashJoinBuild('o_orderkey', 'customer_order_lineitem_join_build' + '_' + str(p), query_plan, False)), range(0, other_parts)) customer_order_lineitem_join_probe = map(lambda p: query_plan.add_operator( HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'), 'customer_order_lineitem_join_probe' + '_' + str(p), query_plan, False)), range(0, other_parts)) def groupby_fn(df): df['l_extendedprice'] = df['l_extendedprice'].astype(np.float) df['l_discount'] = df['l_discount'].astype(np.float) df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount']) grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority']) agg_df = grouped['revenue'].sum() return agg_df.reset_index() group = map(lambda p: query_plan.add_operator( Group( ['l_orderkey', 'o_orderdate', 'o_shippriority'], # l_partkey [ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount']))) ], 'group' + '_{}'.format(p), query_plan, False, groupby_fn)), range(0, other_parts)) def group_reduce_fn(df): grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority']) agg_df = grouped['revenue'].sum() return agg_df.reset_index() group_reduce = query_plan.add_operator( Group( ['l_orderkey', 'o_orderdate', 'o_shippriority'], # l_partkey [ AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount']))) ], 'group_reduce', query_plan, False, group_reduce_fn)) top = query_plan.add_operator( Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas, 'top', query_plan, False)) collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan)) # Inline what we can map(lambda o: o.set_async(False), lineitem_project) map(lambda o: o.set_async(False), customer_project) map(lambda o: o.set_async(False), order_project) map(lambda o: o.set_async(False), lineitem_map) map(lambda o: o.set_async(False), customer_map) map(lambda o: o.set_async(False), order_map_1) map(lambda o: o.set_async(False), order_map_2) # Connect the operators connect_many_to_many(customer_scan, customer_project) connect_many_to_many(customer_project, customer_map) connect_many_to_one(customer_project, customer_bloom_create) connect_one_to_many(customer_bloom_create, order_scan) connect_many_to_many(order_scan, order_project) connect_many_to_many(order_project, order_map_1) connect_all_to_all(customer_map, customer_order_join_build) connect_many_to_many(customer_order_join_build, customer_order_join_probe) connect_all_to_all(order_map_1, customer_order_join_probe) # connect_many_to_one(customer_order_join_probe, collate) connect_many_to_one(order_project, order_bloom_create) connect_one_to_many(order_bloom_create, lineitem_scan) connect_many_to_many(lineitem_scan, lineitem_project) connect_many_to_many(lineitem_project, lineitem_map) connect_many_to_many(customer_order_join_probe, order_map_2) connect_all_to_all(order_map_2, customer_order_lineitem_join_build) connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe) connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe) # connect_many_to_one(customer_order_lineitem_join_probe, collate) connect_many_to_many(customer_order_lineitem_join_probe, group) # connect_many_to_one(group, collate) connect_many_to_one(group, group_reduce) connect_one_to_one(group_reduce, top) connect_one_to_one(top, collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print('secure: {}'.format(secure)) print('use_native: {}'.format(use_native)) print("customer_parts: {}".format(customer_parts)) print("order_parts: {}".format(order_parts)) print("lineitem_parts: {}".format(lineitem_parts)) print("customer_sharded: {}".format(customer_sharded)) print("order_sharded: {}".format(order_sharded)) print("lineitem_sharded: {}".format(lineitem_sharded)) print("other_parts: {}".format(other_parts)) print("fp_rate: {}".format(fp_rate)) print("format: {}".format(format_)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id()) # Start the query query_plan.execute() tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue'] assert len(tuples) == 10 + 1 assert tuples[0] == field_names # NOTE: This result has been verified with the equivalent data and query on PostgreSQL test_util.assert_tuples(expected_result, tuples)
def topk_baseline(stats, k, sort_index='_5', col_type=float, col_name='l_extendedprice', sort_order='DESC', use_pandas=True, filtered=False, table_name='tpch-sf1/lineitem.csv', shards_prefix='tpch-sf1/lineitem_sharded', shards_start=0, shards_end=31): """ Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap :return: """ limit = k num_rows = 0 parallel_shards = True processes = multiprocessing.cpu_count() query_stats = [ 'baseline' if filtered is False else 'filtered', shards_prefix, col_name, sort_order, limit ] sql = 'select * from S3Object;' if filtered: sql = '''select l_extendedprice from S3Object;''' sort_index = '_0' print("\n\nBaseline TopK with order {} on field {}\n".format( sort_order, col_name)) query_plan = QueryPlan(is_async=True) # Query plan # ts = query_plan.add_operator( # SQLTableScan('lineitem.csv', 'select * from S3Object limit {};'.format(limit), 'table_scan', query_plan, False)) sort_exp = SortExpression(sort_index, col_type, sort_order) top_op = query_plan.add_operator( Top(limit, sort_exp, use_pandas, 'topk', query_plan, False)) for process in range(processes): proc_parts = [ x for x in range(shards_start, shards_end + 1) if x % processes == process ] pc = query_plan.add_operator( SQLShardedTableScan(table_name, sql, use_pandas, True, False, "topk_table_scan_parts_{}".format(proc_parts), proc_parts, shards_prefix, parallel_shards, query_plan, False)) # pc.set_profiled(True, "topk_table_scan_parts_{}.txt".format(proc_parts)) pc_top = query_plan.add_operator( Top(limit, sort_exp, use_pandas, 'topk_parts_{}'.format(proc_parts), query_plan, False)) pc.connect(pc_top) pc_top.connect(top_op) c = query_plan.add_operator(Collate('collate', query_plan, False)) top_op.connect(c) # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results for t in c.tuples(): num_rows += 1 # print("{}:{}".format(num_rows, t)) # assert num_rows == limit + 1 cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] query_stats += [ 0, 0, query_plan.total_elapsed_time, query_plan.total_elapsed_time, 0, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost, num_rows == limit + 1 ] stats.append(query_stats) # Write the metrics query_plan.print_metrics() query_plan.stop()
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts, path, format_): """ Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap :return: """ secure = False use_native = False print('') print("Top K Benchmark, Filtered. Sort Field: {}, Order: {}".format(sort_field, sort_order)) print("----------------------") # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Scan scan = map(lambda p: query_plan.add_operator( SQLTableScan("{}/topk_data_{}.csv".format(path, p), "select * from S3Object;", format_, use_pandas, secure, use_native, 'scan_{}'.format(p), query_plan, False)), range(0, table_parts)) # Project def project_fn(df): df.columns = ['F0', 'F1', 'F2'] df[ [sort_field] ] = df[ [sort_field] ].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map(lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(0, table_parts)) # TopK sort_expr = SortExpression(sort_field, 'float', sort_order) topk = map(lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(0, table_parts)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator( Collate('collate', query_plan, False)) #profile_path = '../benchmark-output/topk/' #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof")) #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof")) #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof")) #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof")) #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof")) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts)) print('') # Write the plan graph query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') tuples = collate.tuples() collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop()
def __test_topk_baseline(sort_index='_5', col_type=float, col_name='l_quantity', sort_order='DESC', use_pandas=True, filtered=False): """ Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap :return: """ limit = 50 num_rows = 0 shards = 32 parallel_shards = True shards_prefix = "sf1000-lineitem" processes = multiprocessing.cpu_count() sql = 'select * from S3Object;' if filtered: sql = '''select l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_comment from S3Object;''' print("\n\nBaseline TopK with order {} on field {}\n".format( sort_order, col_name)) query_plan = QueryPlan(is_async=True) # Query plan # ts = query_plan.add_operator( # SQLTableScan('lineitem.csv', 'select * from S3Object limit {};'.format(limit), 'table_scan', query_plan, False)) sort_exp = SortExpression(sort_index, col_type, sort_order) top_op = query_plan.add_operator( Top(limit, sort_exp, use_pandas, 'topk', query_plan, False)) for process in range(processes): proc_parts = [x for x in range(shards) if x % processes == process] pc = query_plan.add_operator( SQLShardedTableScan("lineitem.csv", sql, use_pandas, True, "topk_table_scan_parts_{}".format(proc_parts), proc_parts, shards_prefix, parallel_shards, query_plan, False)) # pc.set_profiled(True, "topk_table_scan_parts_{}.txt".format(proc_parts)) pc_top = query_plan.add_operator( Top(limit, sort_exp, use_pandas, 'topk_parts_{}'.format(proc_parts), query_plan, False)) pc.connect(pc_top) pc_top.connect(top_op) c = query_plan.add_operator(Collate('collate', query_plan, False)) top_op.connect(c) # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results for t in c.tuples(): num_rows += 1 print("{}:{}".format(num_rows, t)) assert num_rows == limit + 1 # Write the metrics query_plan.print_metrics() query_plan.stop()