def run_local_indexed_sampling(stats, sort_field_index, sort_field, k, sample_size, batch_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): """ Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold Finally scanning the table to retrieve only the records beyond the threshold :return: """ secure = False use_native = False n_threads = multiprocessing.cpu_count() print('') print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format( sort_field, sort_order)) print("----------------------") stats += [ 'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, batch_size ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling tbl_smpler = query_plan.add_operator( TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size, "table_sampler", query_plan, False)) sample_scan = map( lambda p: query_plan.add_operator( TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) map(lambda (i, op): sample_scan[i].set_nthreads(n_threads), enumerate(sample_scan)) # sample_scan = query_plan.add_operator( # TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), # query_plan, False)) # sample_scan.set_nthreads(n_threads) # Sampling project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df = df[[sort_field]].astype(np.float, errors='ignore') return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # sample_project = query_plan.add_operator( # Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # sample_topk = query_plan.add_operator( # Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda o: tbl_smpler.connect(o), sample_scan) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda o: o.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Plan settings print('') print("Settings") print("--------") print('') print('use_pandas: {}'.format(use_pandas)) print("table parts: {}".format(table_parts_end - table_parts_start)) print('') # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts)) # Start the query query_plan.execute() print('Done') # tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down query_plan.stop() sampling_time = query_plan.total_elapsed_time cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] stats += [ sql_gen.threshold, sampling_time, 0, sampling_time, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost ]
def topk_baseline(stats, k, sort_index='_5', col_type=float, col_name='l_extendedprice', sort_order='DESC', use_pandas=True, filtered=False, table_name='tpch-sf1/lineitem.csv', shards_prefix='tpch-sf1/lineitem_sharded', shards_start=0, shards_end=31): """ Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap :return: """ limit = k num_rows = 0 parallel_shards = True processes = multiprocessing.cpu_count() query_stats = [ 'baseline' if filtered is False else 'filtered', shards_prefix, col_name, sort_order, limit ] sql = 'select * from S3Object;' if filtered: sql = '''select l_extendedprice from S3Object;''' sort_index = '_0' print("\n\nBaseline TopK with order {} on field {}\n".format( sort_order, col_name)) query_plan = QueryPlan(is_async=True) # Query plan # ts = query_plan.add_operator( # SQLTableScan('lineitem.csv', 'select * from S3Object limit {};'.format(limit), 'table_scan', query_plan, False)) sort_exp = SortExpression(sort_index, col_type, sort_order) top_op = query_plan.add_operator( Top(limit, sort_exp, use_pandas, 'topk', query_plan, False)) for process in range(processes): proc_parts = [ x for x in range(shards_start, shards_end + 1) if x % processes == process ] pc = query_plan.add_operator( SQLShardedTableScan(table_name, sql, use_pandas, True, False, "topk_table_scan_parts_{}".format(proc_parts), proc_parts, shards_prefix, parallel_shards, query_plan, False)) # pc.set_profiled(True, "topk_table_scan_parts_{}.txt".format(proc_parts)) pc_top = query_plan.add_operator( Top(limit, sort_exp, use_pandas, 'topk_parts_{}'.format(proc_parts), query_plan, False)) pc.connect(pc_top) pc_top.connect(top_op) c = query_plan.add_operator(Collate('collate', query_plan, False)) top_op.connect(c) # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results for t in c.tuples(): num_rows += 1 # print("{}:{}".format(num_rows, t)) # assert num_rows == limit + 1 cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] query_stats += [ 0, 0, query_plan.total_elapsed_time, query_plan.total_elapsed_time, 0, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost, num_rows == limit + 1 ] stats.append(query_stats) # Write the metrics query_plan.print_metrics() query_plan.stop()
def run_memory_indexed_sampling(stats, sort_field_index, sort_field, k, sample_size, batch_size, parallel, use_pandas, sort_order, buffer_size, table_parts_start, table_parts_end, tbl_s3key, shards_path, format_, sampling_only=True): """ Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold Finally scanning the table to retrieve only the records beyond the threshold :return: """ secure = False use_native = False n_threads = multiprocessing.cpu_count() print('') print( "Top K Benchmark, Memory Indexed Sampling. Sort Field: {}, Order: {}, K: {}, Sample Size: {}, Batch Size: {}" .format(sort_field, sort_order, k, sample_size, batch_size)) print("----------------------") stats += [ 'sampling_{}_{}'.format('memory_indexed', 'non-filtered'), shards_path, sort_field, sort_order, k, sample_size, batch_size ] # Query plan query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size) # Sampling tbl_smpler = query_plan.add_operator( TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size, "table_sampler", query_plan, False)) sample_scanners = map( lambda p: query_plan.add_operator( TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) map(lambda op: op.set_nthreads(n_threads), sample_scanners) # sample_scan = query_plan.add_operator( # TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p), # query_plan, False)) # sample_scan.set_nthreads(n_threads) # Sampling project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df = df[[sort_field]].astype(np.float, errors='ignore') return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] sample_project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # sample_project = query_plan.add_operator( # Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn)) # TopK samples sort_expr = [SortExpression(sort_field, float, sort_order)] sample_topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # sample_topk = query_plan.add_operator( # Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False)) sample_topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False)) # Generate SQL command for second scan sql_gen = query_plan.add_operator( TopKFilterBuild(sort_order, 'float', 'select * from S3object ', ' CAST({} as float) '.format(sort_field), 'sample_sql_gen', query_plan, False)) if not sampling_only: # Scan scan = map( lambda p: query_plan.add_operator( SQLTableScan("{}.{}".format(shards_path, p), "", format_, use_pandas, secure, use_native, 'scan_{}'.format( p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # Project def project_fn(df): df.columns = [ sort_field if x == sort_field_index else x for x in df.columns ] df[[sort_field]] = df[[sort_field]].astype(np.float) return df project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] project = map( lambda p: query_plan.add_operator( Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)), range(table_parts_start, table_parts_end + 1)) # TopK topk = map( lambda p: query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)), range(table_parts_start, table_parts_end + 1)) # TopK reduce topk_reduce = query_plan.add_operator( Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) collate = query_plan.add_operator(Collate('collate', query_plan, False)) map(lambda op: tbl_smpler.connect(op), sample_scanners) map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scanners)) map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project)) map(lambda o: o.connect(sample_topk_reduce), sample_topk) sample_topk_reduce.connect(sql_gen) if not sampling_only: map(lambda (p, o): sql_gen.connect(o), enumerate(scan)) map(lambda (p, o): o.connect(project[p]), enumerate(scan)) map(lambda (p, o): o.connect(topk[p]), enumerate(project)) map(lambda (p, o): o.connect(topk_reduce), enumerate(topk)) topk_reduce.connect(collate) else: sql_gen.connect(collate) # Start the query query_plan.execute() print('Done') # tuples = collate.tuples() # collate.print_tuples(tuples) # Write the metrics query_plan.print_metrics() # Shut everything down # query_plan.stop() sampling_threshold = query_plan.retrieve_sampling_threshold() sampling_runtime = query_plan.get_phase_runtime('sampl') sampling_num_http_requests, sampling_requests_cost = query_plan.requests_cost( 'sampl') sampling_returned_bytes, sampling_returned_rows, sampling_transfer_cost = query_plan.data_transfer_cost( phase_keyword='sampl') sampling_scanned_bytes, sampling_scan_cost = query_plan.data_scanning_cost( 'sampl') total_runtime = query_plan.total_elapsed_time total_http_requests, total_requests_cost = query_plan.requests_cost() total_returned_bytes, total_returned_rows, total_transfer_cost = query_plan.data_transfer_cost( ) total_scanned_bytes, total_scan_cost = query_plan.data_scanning_cost() total_data_cost = query_plan.data_cost()[0] total_computation_cost = query_plan.computation_cost() total_cost = query_plan.cost()[0] stats += [ sampling_threshold, sampling_runtime, total_runtime - sampling_runtime, total_runtime, sampling_returned_rows, sampling_scanned_bytes * BYTE_TO_MB, sampling_returned_bytes * BYTE_TO_MB, sampling_num_http_requests, sampling_requests_cost, sampling_transfer_cost, sampling_scan_cost, total_returned_rows, total_scanned_bytes * BYTE_TO_MB, total_returned_bytes * BYTE_TO_MB, total_http_requests, total_requests_cost, total_transfer_cost, total_scan_cost, total_data_cost, total_computation_cost, total_cost ]
def topk_with_sampling(stats, k, k_scale=1, sort_index='_5', col_type=float, sort_field='l_extendedprice', sort_order='DESC', use_pandas=True, filtered=False, conservative=False, table_name='tpch-sf1/lineitem.csv', shards_prefix='tpch-sf1/lineitem_sharded', shards_start=0, shards_end=31): """ Executes the optimized topk query by firstly retrieving the first k tuples. Based on the retrieved tuples, table scan operator gets only the tuples larger/less than the most significant tuple in the sample :return: """ limit = k num_rows = 0 parallel_shards = True processes = multiprocessing.cpu_count() query_stats = [ 'sampling_{}_{}'.format( 'conservative' if conservative else 'aggressive', 'filtered' if filtered else 'non-filtered'), shards_prefix, sort_field, sort_order, limit * k_scale, limit * k_scale ] sql = 'select * from S3Object;' if filtered: sql = '''select l_extendedprice from S3Object;''' sort_index = '_0' print("\n\nSampling params:") print("Scale: {}, Sort Field: {}, Sort Order: {}\n".format( k_scale, sort_field, sort_order)) query_plan = QueryPlan(is_async=True) # Query plan ts = query_plan.add_operator( TopKTableScan( table_name, sql, use_pandas, True, False, limit, k_scale, SortExpression(sort_index, col_type, sort_order, sort_field), conservative, shards_start, shards_end, parallel_shards, shards_prefix, processes, 'topk_table_scan', query_plan, False)) c = query_plan.add_operator(Collate('collate', query_plan, False)) ts.connect(c) # Write the plan graph # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id()) # Start the query query_plan.execute() # Assert the results for t in c.tuples(): num_rows += 1 # print("{}:{}".format(num_rows, t)) # assert num_rows == limit + 1 cost, bytes_scanned, bytes_returned, rows = query_plan.cost() computation_cost = query_plan.computation_cost() data_cost = query_plan.data_cost()[0] query_stats += [ ts.msv, ts.op_metrics.sampling_time, query_plan.total_elapsed_time, ts.op_metrics.sampling_time + query_plan.total_elapsed_time, # 0 if num_rows >= limit else 1, rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost #, # num_rows == limit + 1 ] stats.append(query_stats) # Write the metrics query_plan.print_metrics() query_plan.stop() topk_op = query_plan.operators["topk_table_scan"] max_table_scan, _ = max([(op, op.op_metrics.elapsed_time()) for op in query_plan.operators.values()], key=lambda tup: tup[1]) OpMetrics.print_overall_metrics( [max_table_scan, topk_op, topk_op.sample_op], "TopKTableScan total time")