Esempio n. 1
0
def run_baseline_topk(stats, sort_field_index, sort_field, k, parallel,
                      use_pandas, sort_order, buffer_size, table_parts_start,
                      table_parts_end, tbl_s3key, format_, shards_path):

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, Baseline. Sort Field: {}, Order: {}, k: {}".format(
        sort_field, sort_order, k))
    print("----------------------")

    stats += ['baseline', shards_path, sort_field, sort_order, k, 0, 0]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    table_parts = table_parts_end - table_parts_start + 1
    per_part_samples = int(sample_size / table_parts)
    table_name = os.path.basename(tbl_s3key)

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan("{}.{}".format(shards_path, p
                                        ), "select * from S3Object;", format_,
                         use_pandas, secure, use_native, 'scan_{}'.format(
                             p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    # Project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))

    # TopK
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                False)), range(table_parts_start, table_parts_end + 1))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    query_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        0, 0, 0, query_time, rows, bytes_scanned, bytes_returned, data_cost,
        computation_cost, cost
    ]
Esempio n. 2
0
def run_head_table_sampling(stats,
                            sort_field_index,
                            sort_field,
                            k,
                            sample_size,
                            parallel,
                            use_pandas,
                            sort_order,
                            buffer_size,
                            table_parts_start,
                            table_parts_end,
                            tbl_s3key,
                            shards_path,
                            format_,
                            sampling_only=True):

    secure = False
    use_native = False
    print('')
    print(
        "Top K Benchmark, Head Table Sampling. Sort Field: {}, Order: {}, k: {}, Sample Size:{}"
        .format(sort_field, sort_order, k, sample_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('head_table', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, 1
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    table_parts = table_parts_end - table_parts_start + 1
    per_part_samples = int(sample_size / table_parts)
    table_name = os.path.basename(tbl_s3key)
    sample_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                "{}.{}".format(shards_path, p),
                'select {} from S3Object limit {};'.format(
                    sort_field, per_part_samples), format_, use_pandas, secure,
                use_native, 'sample_scan_{}'.format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    # Sampling project
    def project_fn(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))

    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(np.float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda op: op.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
Esempio n. 3
0
def run_local_indexed_sampling(stats,
                               sort_field_index,
                               sort_field,
                               k,
                               sample_size,
                               batch_size,
                               parallel,
                               use_pandas,
                               sort_order,
                               buffer_size,
                               table_parts_start,
                               table_parts_end,
                               tbl_s3key,
                               shards_path,
                               format_,
                               sampling_only=True):
    """
    Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold
    Finally scanning the table to retrieve only the records beyond the threshold
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))
    sample_scan = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    map(lambda (i, op): sample_scan[i].set_nthreads(n_threads),
        enumerate(sample_scan))

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df = df[[sort_field]].astype(np.float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))
    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(np.float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda o: tbl_smpler.connect(o), sample_scan)
    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda o: o.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts_end - table_parts_start))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
Esempio n. 4
0
def run_memory_indexed_sampling(stats,
                                sort_field_index,
                                sort_field,
                                k,
                                sample_size,
                                batch_size,
                                parallel,
                                use_pandas,
                                sort_order,
                                buffer_size,
                                table_parts_start,
                                table_parts_end,
                                tbl_s3key,
                                shards_path,
                                format_,
                                sampling_only=True):
    """
    Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold
    Finally scanning the table to retrieve only the records beyond the threshold
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print(
        "Top K Benchmark, Memory Indexed Sampling. Sort Field: {}, Order: {}, K: {}, Sample Size: {}, Batch Size: {}"
        .format(sort_field, sort_order, k, sample_size, batch_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('memory_indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))

    sample_scanners = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    map(lambda op: op.set_nthreads(n_threads), sample_scanners)

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df = df[[sort_field]].astype(np.float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field),
                        'sample_sql_gen', query_plan, False))

    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(np.float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda op: tbl_smpler.connect(op), sample_scanners)
    map(lambda (p, o): o.connect(sample_project[p]),
        enumerate(sample_scanners))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda o: o.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    # query_plan.stop()

    sampling_threshold = query_plan.retrieve_sampling_threshold()
    sampling_runtime = query_plan.get_phase_runtime('sampl')
    sampling_num_http_requests, sampling_requests_cost = query_plan.requests_cost(
        'sampl')
    sampling_returned_bytes, sampling_returned_rows, sampling_transfer_cost = query_plan.data_transfer_cost(
        phase_keyword='sampl')
    sampling_scanned_bytes, sampling_scan_cost = query_plan.data_scanning_cost(
        'sampl')

    total_runtime = query_plan.total_elapsed_time
    total_http_requests, total_requests_cost = query_plan.requests_cost()
    total_returned_bytes, total_returned_rows, total_transfer_cost = query_plan.data_transfer_cost(
    )
    total_scanned_bytes, total_scan_cost = query_plan.data_scanning_cost()

    total_data_cost = query_plan.data_cost()[0]
    total_computation_cost = query_plan.computation_cost()
    total_cost = query_plan.cost()[0]

    stats += [
        sampling_threshold, sampling_runtime, total_runtime - sampling_runtime,
        total_runtime, sampling_returned_rows,
        sampling_scanned_bytes * BYTE_TO_MB,
        sampling_returned_bytes * BYTE_TO_MB, sampling_num_http_requests,
        sampling_requests_cost, sampling_transfer_cost, sampling_scan_cost,
        total_returned_rows, total_scanned_bytes * BYTE_TO_MB,
        total_returned_bytes * BYTE_TO_MB, total_http_requests,
        total_requests_cost, total_transfer_cost, total_scan_cost,
        total_data_cost, total_computation_cost, total_cost
    ]
Esempio n. 5
0
def topk_baseline(stats,
                  k,
                  sort_index='_5',
                  col_type=float,
                  col_name='l_extendedprice',
                  sort_order='DESC',
                  use_pandas=True,
                  filtered=False,
                  table_name='tpch-sf1/lineitem.csv',
                  shards_prefix='tpch-sf1/lineitem_sharded',
                  shards_start=0,
                  shards_end=31):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    limit = k
    num_rows = 0
    parallel_shards = True
    processes = multiprocessing.cpu_count()

    query_stats = [
        'baseline' if filtered is False else 'filtered', shards_prefix,
        col_name, sort_order, limit
    ]

    sql = 'select * from S3Object;'

    if filtered:
        sql = '''select l_extendedprice
                  from S3Object;'''
        sort_index = '_0'

    print("\n\nBaseline TopK with order {} on field {}\n".format(
        sort_order, col_name))

    query_plan = QueryPlan(is_async=True)

    # Query plan
    # ts = query_plan.add_operator(
    #     SQLTableScan('lineitem.csv', 'select * from S3Object limit {};'.format(limit), 'table_scan', query_plan, False))
    sort_exp = SortExpression(sort_index, col_type, sort_order)
    top_op = query_plan.add_operator(
        Top(limit, sort_exp, use_pandas, 'topk', query_plan, False))
    for process in range(processes):
        proc_parts = [
            x for x in range(shards_start, shards_end + 1)
            if x % processes == process
        ]
        pc = query_plan.add_operator(
            SQLShardedTableScan(table_name, sql, use_pandas, True, False,
                                "topk_table_scan_parts_{}".format(proc_parts),
                                proc_parts, shards_prefix, parallel_shards,
                                query_plan, False))
        # pc.set_profiled(True, "topk_table_scan_parts_{}.txt".format(proc_parts))
        pc_top = query_plan.add_operator(
            Top(limit, sort_exp, use_pandas,
                'topk_parts_{}'.format(proc_parts), query_plan, False))
        pc.connect(pc_top)
        pc_top.connect(top_op)

    c = query_plan.add_operator(Collate('collate', query_plan, False))

    top_op.connect(c)

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    for t in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))

    # assert num_rows == limit + 1

    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    query_stats += [
        0, 0, query_plan.total_elapsed_time, query_plan.total_elapsed_time, 0,
        rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost,
        num_rows == limit + 1
    ]
    stats.append(query_stats)

    # Write the metrics
    query_plan.print_metrics()
    query_plan.stop()
Esempio n. 6
0
def topk_with_sampling(stats,
                       k,
                       k_scale=1,
                       sort_index='_5',
                       col_type=float,
                       sort_field='l_extendedprice',
                       sort_order='DESC',
                       use_pandas=True,
                       filtered=False,
                       conservative=False,
                       table_name='tpch-sf1/lineitem.csv',
                       shards_prefix='tpch-sf1/lineitem_sharded',
                       shards_start=0,
                       shards_end=31):
    """
    Executes the optimized topk query by firstly retrieving the first k tuples.
    Based on the retrieved tuples, table scan operator gets only the tuples larger/less than the most significant
    tuple in the sample
    :return:
    """

    limit = k
    num_rows = 0
    parallel_shards = True
    processes = multiprocessing.cpu_count()

    query_stats = [
        'sampling_{}_{}'.format(
            'conservative' if conservative else 'aggressive',
            'filtered' if filtered else 'non-filtered'), shards_prefix,
        sort_field, sort_order, limit * k_scale, limit * k_scale
    ]

    sql = 'select * from S3Object;'

    if filtered:
        sql = '''select l_extendedprice
              from S3Object;'''
        sort_index = '_0'

    print("\n\nSampling params:")
    print("Scale: {}, Sort Field: {}, Sort Order: {}\n".format(
        k_scale, sort_field, sort_order))

    query_plan = QueryPlan(is_async=True)

    # Query plan
    ts = query_plan.add_operator(
        TopKTableScan(
            table_name, sql, use_pandas, True, False, limit, k_scale,
            SortExpression(sort_index, col_type, sort_order, sort_field),
            conservative, shards_start, shards_end, parallel_shards,
            shards_prefix, processes, 'topk_table_scan', query_plan, False))
    c = query_plan.add_operator(Collate('collate', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    for t in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))

    # assert num_rows == limit + 1

    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    query_stats += [
        ts.msv,
        ts.op_metrics.sampling_time,
        query_plan.total_elapsed_time,
        ts.op_metrics.sampling_time + query_plan.total_elapsed_time,
        # 0 if num_rows >= limit else 1,
        rows,
        bytes_scanned,
        bytes_returned,
        data_cost,
        computation_cost,
        cost  #,
        # num_rows == limit + 1
    ]
    stats.append(query_stats)

    # Write the metrics
    query_plan.print_metrics()
    query_plan.stop()
    topk_op = query_plan.operators["topk_table_scan"]
    max_table_scan, _ = max([(op, op.op_metrics.elapsed_time())
                             for op in query_plan.operators.values()],
                            key=lambda tup: tup[1])
    OpMetrics.print_overall_metrics(
        [max_table_scan, topk_op, topk_op.sample_op],
        "TopKTableScan total time")