Code Example #1
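# Assumed context (not shown in this excerpt): these examples rely on os,
# multiprocessing, numpy as np, and the surrounding project's plan and
# operator classes (QueryPlan, SQLTableScan, Project, ProjectExpression,
# SortExpression, Top, TopKFilterBuild, Collate, TableRandomSampleGenerator,
# TableRangeAccess), plus the ROOT_DIR, gen_test_id, and BYTE_TO_MB helpers.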
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size,
        table_first_part, table_parts, queried_columns, select_columns, path,
        format_):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, ColumnScan. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    sample_scan = [
        query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p),
                'select {} from S3Object;'.format(sort_field),
                format_, use_pandas, secure, use_native,
                'column_scan_{}'.format(p), query_plan, False))
        for p in range(table_first_part, table_first_part + table_parts)
    ]

    # Sampling project
    def project_fn1(df):
        df.columns = [sort_field]
        # np.float was removed in modern NumPy; the builtin float is equivalent.
        df[[sort_field]] = df[[sort_field]].astype(float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = [
        query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn1))
        for p in range(table_first_part, table_first_part + table_parts)
    ]

    # TopK samples
    sort_expr = SortExpression(sort_field, float, sort_order)

    sample_topk = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(
            sort_order,
            'float',
            'select {} from S3object '.format(select_columns),
            #' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False ))
            ' {} '.format(sort_field),
            'sql_gen',
            query_plan,
            False))

    # Scan
    scan = [
        query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), "",
                format_, use_pandas, secure, use_native,
                'scan_{}'.format(p), query_plan, False))
        for p in range(table_first_part, table_first_part + table_parts)
    ]

    # Project
    def project_fn2(df):
        df.columns = queried_columns
        # np.float was removed in modern NumPy; the builtin float is equivalent.
        df[[sort_field]] = df[[sort_field]].astype(float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = [
        query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn2))
        for p in range(table_first_part, table_first_part + table_parts)
    ]

    # TopK
    topk = [
        query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                False))
        for p in range(table_first_part, table_first_part + table_parts)
    ]

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/topk/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #topk[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_topk_0" + ".prof"))
    #topk_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_topk_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    for p, o in enumerate(sample_scan):
        o.connect(sample_project[p])
    for o in sample_project:
        o.connect(sample_topk)
    sample_topk.connect(sql_gen)

    for o in scan:
        sql_gen.connect(o)
    for p, o in enumerate(scan):
        o.connect(project[p])
    for p, o in enumerate(project):
        o.connect(topk[p])
    for o in topk:
        o.connect(topk_reduce)
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
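
A minimal invocation sketch, assuming a sharded TPC-H lineitem layout; the S3 path, column names, and format flag below are hypothetical placeholders, not values taken from the benchmark:

# Hypothetical call; adjust path, columns, and format_ to the actual dataset.
run(sort_field='l_extendedprice', k=100, parallel=True, use_pandas=True,
    sort_order='DESC', buffer_size=0, table_first_part=0, table_parts=4,
    queried_columns=['l_orderkey', 'l_extendedprice'],
    select_columns='l_orderkey, l_extendedprice',
    path='tpch-sf10/lineitem_sharded', format_='parquet')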
Code Example #2
def run_local_indexed_sampling(stats,
                               sort_field_index,
                               sort_field,
                               k,
                               sample_size,
                               batch_size,
                               parallel,
                               use_pandas,
                               sort_order,
                               buffer_size,
                               table_parts_start,
                               table_parts_end,
                               tbl_s3key,
                               shards_path,
                               format_,
                               sampling_only=True):
    """
    Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold
    Finally scanning the table to retrieve only the records beyond the threshold
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))
    sample_scan = [
        query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False))
        for p in range(table_parts_start, table_parts_end + 1)
    ]
    for op in sample_scan:
        op.set_nthreads(n_threads)

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        # np.float was removed in modern NumPy; the builtin float is equivalent.
        df = df[[sort_field]].astype(float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = [
        query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn))
        for p in range(table_parts_start, table_parts_end + 1)
    ]
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = [
        query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False))
        for p in range(table_parts_start, table_parts_end + 1)
    ]
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))
    if not sampling_only:
        # Scan
        scan = [
            query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native,
                             'scan_{}'.format(p), query_plan, False))
            for p in range(table_parts_start, table_parts_end + 1)
        ]

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            # np.float was removed in modern NumPy; the builtin float is equivalent.
            df[[sort_field]] = df[[sort_field]].astype(float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = [
            query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn))
            for p in range(table_parts_start, table_parts_end + 1)
        ]

        # TopK
        topk = [
            query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False))
            for p in range(table_parts_start, table_parts_end + 1)
        ]

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    for o in sample_scan:
        tbl_smpler.connect(o)
    for p, o in enumerate(sample_scan):
        o.connect(sample_project[p])
    for p, o in enumerate(sample_project):
        o.connect(sample_topk[p])
    for o in sample_topk:
        o.connect(sample_topk_reduce)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        for o in scan:
            sql_gen.connect(o)
        for p, o in enumerate(scan):
            o.connect(project[p])
        for p, o in enumerate(project):
            o.connect(topk[p])
        for o in topk:
            o.connect(topk_reduce)
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts_end - table_parts_start))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
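
Conceptually, the threshold that TopKFilterBuild derives from the sample can be sketched in plain pandas; this is a simplification for illustration, not the operator's actual implementation:

import pandas as pd

def topk_threshold(sampled_values, k, sort_order):
    # Sort the sampled sort-field values and take the k-th one as the cutoff;
    # the second scan then only needs records beyond this value.
    ascending = (sort_order == 'ASC')
    s = pd.Series(sampled_values).astype(float).sort_values(ascending=ascending)
    return s.iloc[min(k, len(s)) - 1]

# e.g. with sort_order='DESC', the generated second-phase SQL would keep rows
# whose cast(sort_field as float) >= topk_threshold(sampled_values, k, 'DESC').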
Code Example #3
def run_head_table_sampling(stats,
                            sort_field_index,
                            sort_field,
                            k,
                            sample_size,
                            parallel,
                            use_pandas,
                            sort_order,
                            buffer_size,
                            table_parts_start,
                            table_parts_end,
                            tbl_s3key,
                            shards_path,
                            format_,
                            sampling_only=True):

    secure = False
    use_native = False
    print('')
    print(
        "Top K Benchmark, Head Table Sampling. Sort Field: {}, Order: {}, k: {}, Sample Size:{}"
        .format(sort_field, sort_order, k, sample_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('head_table', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, 1
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    table_parts = table_parts_end - table_parts_start + 1
    per_part_samples = int(sample_size / table_parts)
    table_name = os.path.basename(tbl_s3key)
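    # e.g. sample_size=10000 over 32 parts yields ~312 sampled rows per shard,
    # fetched below with a per-part 'limit' clause (numbers are illustrative).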
    sample_scan = [
        query_plan.add_operator(
            SQLTableScan(
                "{}.{}".format(shards_path, p),
                'select {} from S3Object limit {};'.format(
                    sort_field, per_part_samples), format_, use_pandas, secure,
                use_native, 'sample_scan_{}'.format(p), query_plan, False))
        for p in range(table_parts_start, table_parts_end + 1)
    ]

    # Sampling project
    def project_fn(df):
        df.columns = [sort_field]
        # np.float was removed in modern NumPy; the builtin float is equivalent.
        df[[sort_field]] = df[[sort_field]].astype(float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = [
        query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn))
        for p in range(table_parts_start, table_parts_end + 1)
    ]

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = [
        query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False))
        for p in range(table_parts_start, table_parts_end + 1)
    ]

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))

    if not sampling_only:
        # Scan
        scan = [
            query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native,
                             'scan_{}'.format(p), query_plan, False))
            for p in range(table_parts_start, table_parts_end + 1)
        ]

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            # np.float was removed in modern NumPy; the builtin float is equivalent.
            df[[sort_field]] = df[[sort_field]].astype(float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = [
            query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn))
            for p in range(table_parts_start, table_parts_end + 1)
        ]

        # TopK
        topk = [
            query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False))
            for p in range(table_parts_start, table_parts_end + 1)
        ]

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    for p, o in enumerate(sample_scan):
        o.connect(sample_project[p])
    for p, o in enumerate(sample_project):
        o.connect(sample_topk[p])
    for op in sample_topk:
        op.connect(sample_topk_reduce)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        for o in scan:
            sql_gen.connect(o)
        for p, o in enumerate(scan):
            o.connect(project[p])
        for p, o in enumerate(project):
            o.connect(topk[p])
        for o in topk:
            o.connect(topk_reduce)
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
Code Example #4
def run_memory_indexed_sampling(stats,
                                sort_field_index,
                                sort_field,
                                k,
                                sample_size,
                                batch_size,
                                parallel,
                                use_pandas,
                                sort_order,
                                buffer_size,
                                table_parts_start,
                                table_parts_end,
                                tbl_s3key,
                                shards_path,
                                format_,
                                sampling_only=True):
    """
    Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold
    Finally scanning the table to retrieve only the records beyond the threshold
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print(
        "Top K Benchmark, Memory Indexed Sampling. Sort Field: {}, Order: {}, K: {}, Sample Size: {}, Batch Size: {}"
        .format(sort_field, sort_order, k, sample_size, batch_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('memory_indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))

    sample_scanners = [
        query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False))
        for p in range(table_parts_start, table_parts_end + 1)
    ]
    for op in sample_scanners:
        op.set_nthreads(n_threads)

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        # np.float was removed in modern NumPy; the builtin float is equivalent.
        df = df[[sort_field]].astype(float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = [
        query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn))
        for p in range(table_parts_start, table_parts_end + 1)
    ]
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = [
        query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False))
        for p in range(table_parts_start, table_parts_end + 1)
    ]
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field),
                        'sample_sql_gen', query_plan, False))

    if not sampling_only:
        # Scan
        scan = [
            query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native,
                             'scan_{}'.format(p), query_plan, False))
            for p in range(table_parts_start, table_parts_end + 1)
        ]

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            # np.float was removed in modern NumPy; the builtin float is equivalent.
            df[[sort_field]] = df[[sort_field]].astype(float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = [
            query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn))
            for p in range(table_parts_start, table_parts_end + 1)
        ]

        # TopK
        topk = [
            query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False))
            for p in range(table_parts_start, table_parts_end + 1)
        ]

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    for op in sample_scanners:
        tbl_smpler.connect(op)
    for p, o in enumerate(sample_scanners):
        o.connect(sample_project[p])
    for p, o in enumerate(sample_project):
        o.connect(sample_topk[p])
    for o in sample_topk:
        o.connect(sample_topk_reduce)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        for o in scan:
            sql_gen.connect(o)
        for p, o in enumerate(scan):
            o.connect(project[p])
        for p, o in enumerate(project):
            o.connect(topk[p])
        for o in topk:
            o.connect(topk_reduce)
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    # query_plan.stop()

    sampling_threshold = query_plan.retrieve_sampling_threshold()
    sampling_runtime = query_plan.get_phase_runtime('sampl')
    sampling_num_http_requests, sampling_requests_cost = query_plan.requests_cost(
        'sampl')
    sampling_returned_bytes, sampling_returned_rows, sampling_transfer_cost = query_plan.data_transfer_cost(
        phase_keyword='sampl')
    sampling_scanned_bytes, sampling_scan_cost = query_plan.data_scanning_cost(
        'sampl')

    total_runtime = query_plan.total_elapsed_time
    total_http_requests, total_requests_cost = query_plan.requests_cost()
    total_returned_bytes, total_returned_rows, total_transfer_cost = query_plan.data_transfer_cost(
    )
    total_scanned_bytes, total_scan_cost = query_plan.data_scanning_cost()

    total_data_cost = query_plan.data_cost()[0]
    total_computation_cost = query_plan.computation_cost()
    total_cost = query_plan.cost()[0]

    stats += [
        sampling_threshold, sampling_runtime, total_runtime - sampling_runtime,
        total_runtime, sampling_returned_rows,
        sampling_scanned_bytes * BYTE_TO_MB,
        sampling_returned_bytes * BYTE_TO_MB, sampling_num_http_requests,
        sampling_requests_cost, sampling_transfer_cost, sampling_scan_cost,
        total_returned_rows, total_scanned_bytes * BYTE_TO_MB,
        total_returned_bytes * BYTE_TO_MB, total_http_requests,
        total_requests_cost, total_transfer_cost, total_scan_cost,
        total_data_cost, total_computation_cost, total_cost
    ]
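
A hypothetical driver for collecting one stats row per configuration; every argument value below is a placeholder, not taken from the benchmark:

# Hypothetical example; adjust keys, column names, and part range to the dataset.
stats = []
run_memory_indexed_sampling(
    stats, sort_field_index='_5', sort_field='l_extendedprice', k=100,
    sample_size=10000, batch_size=100, parallel=True, use_pandas=True,
    sort_order='DESC', buffer_size=0, table_parts_start=0, table_parts_end=31,
    tbl_s3key='tpch-sf10/lineitem.csv',
    shards_path='tpch-sf10/lineitem_sharded/lineitem.csv', format_='csv',
    sampling_only=True)
print(stats)  # one flat row: labels, threshold, runtimes, bytes, and costs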