import os

import dask_cudf
from blazingsql import BlazingContext


def create_hive_partition_data(input, file_format, table_name, partitions, output, num_files):
    if not os.path.exists(output):
        os.makedirs(output)
    bc = BlazingContext(dask_client=None)
    if file_format == 'psv':
        dtypes = get_dtypes(table_name)
        col_names = get_column_names(table_name)
        bc.create_table(table_name, input, file_format='csv', delimiter="|",
                        dtype=dtypes, names=col_names)
    else:
        bc.create_table(table_name, input)

    columns = bc.describe_table(table_name)
    data_partition_array_dict = []
    for partition in partitions:
        if partition in columns:
            result = bc.sql(f'select distinct({partition}) from {table_name}')
            if isinstance(result, dask_cudf.core.DataFrame):
                result = result.compute()
            valuesPartition = result.to_pandas().to_dict()
            # Keep only the distinct values that were explicitly requested
            # for this partition column.
            finalValues = list(set(valuesPartition[partition].values())
                               & set(partitions[partition]))
            dictOfvalues = {i: finalValues[i] for i in range(0, len(finalValues))}
            valuesPartition[partition] = dictOfvalues
            data_partition_array_dict.append(valuesPartition)
        else:
            print('Column "' + partition + '" does not exist')

    _save_partition_files(bc, table_name, data_partition_array_dict, output,
                          file_format, num_files)
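# Usage sketch (hypothetical paths and partition spec; these values are
# assumptions, not from the original source). It shows the shape that
# create_hive_partition_data expects for `partitions`: a dict mapping each
# partition column to the list of values to keep, which the function
# intersects with the distinct values actually present in the table.
create_hive_partition_data(
    input='/data/tpch/lineitem_*.parquet',    # hypothetical input files
    file_format='parquet',
    table_name='lineitem',
    partitions={'l_returnflag': ['A', 'N', 'R']},
    output='/data/hive/lineitem',             # hypothetical output directory
    num_files=1,
)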
def testing_load_hive_table(table_name, location, partitions, partitions_schema):
    bc = BlazingContext()
    bc.create_table(table_name, location, file_format='parquet',
                    hive_table_name=table_name, partitions=partitions,
                    partitions_schema=partitions_schema)
def testing_load_hive_table(table_name, file_format, location, partitions, partitions_schema):
    bc2 = BlazingContext(dask_client=None)
    if file_format == 'psv':
        dtypes = get_dtypes(table_name)
        col_names = get_column_names(table_name)
        bc2.create_table(table_name, location, file_format='csv',
                         hive_table_name=table_name, partitions=partitions,
                         partitions_schema=partitions_schema, delimiter="|",
                         dtype=dtypes, names=col_names)
    else:
        bc2.create_table(table_name, location, file_format=file_format)
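# Usage sketch (hypothetical table layout; the column names, values, and
# location are assumptions, not from the original source). Per the
# create_table signature used above, `partitions` maps each Hive partition
# column to its registered values, and `partitions_schema` lists
# (column, dtype) pairs for those partition columns.
testing_load_hive_table(
    table_name='ptransactions',
    file_format='parquet',
    location='/data/hive/ptransactions',      # hypothetical Hive warehouse path
    partitions={'t_year': [2017], 't_company_id': [0, 1, 2]},
    partitions_schema=[('t_year', 'int32'), ('t_company_id', 'int32')],
)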
def create_hive_partition_data(input, table_name, partitions, output, num_files_per_parquet):
    if not os.path.exists(output):
        os.makedirs(output)
    bc = BlazingContext()
    bc.create_table(table_name, input)
    columns = bc.describe_table(table_name)
    data_partition_array_dict = []
    for partition in partitions:
        if partition in columns:
            values = bc.sql(f'select distinct({partition}) from {table_name}')
            data_partition_array_dict.append(values.to_pandas().to_dict())
        else:
            print('Column "' + partition + '" does not exist')
    _save_partition_files(bc, table_name, data_partition_array_dict, output,
                          num_files_per_parquet)
import logging

from blazingsql import BlazingContext
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

log = logging.getLogger(__name__)


class BlazingSQLHelper:
    def __init__(self):
        cluster = LocalCUDACluster()
        client = Client(cluster)
        self._bc = BlazingContext(dask_client=client, network_interface='lo')

    def run_query(self, config):
        """This function runs a blazingSQL query.
        :param config: Query-related tables configuration.
        :type config: dict
        :return: Query results.
        :rtype: cudf.DataFrame
        """
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            # Everything else in the table entry is forwarded to create_table.
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        result = result.compute()
        return result

    def drop_table(self, table_names):
        """This function drops blazingSQL tables.
        :param table_names: List of table names to drop.
        :type table_names: List
        """
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
import logging

from blazingsql import BlazingContext

log = logging.getLogger(__name__)


class BlazingSQLHelper:
    def __init__(self, pool=False):
        # Setting pool=True allocates half the GPU memory.
        self._bc = BlazingContext(pool=pool)

    def run_query(self, config):
        """This function runs a blazingSQL query.
        :param config: Query-related tables configuration.
        :type config: dict
        :return: Query results.
        :rtype: cudf.DataFrame
        """
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            # Everything else in the table entry is forwarded to create_table.
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        self.has_data = False
        return result

    def drop_table(self, table_names):
        """This function drops blazingSQL tables.
        :param table_names: List of table names to drop.
        :type table_names: List
        """
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
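# Usage sketch (hypothetical path and table name; they are assumptions, not
# from the original source). It shows the config dict shape that run_query
# expects: a "tables" list whose entries carry "table_name", "input_path",
# and any extra create_table kwargs, plus the "sql" string to execute.
helper = BlazingSQLHelper(pool=True)
config = {
    "tables": [
        {"table_name": "events", "input_path": "/data/events.parquet"},
    ],
    "sql": "SELECT count(*) FROM events",
}
df = helper.run_query(config)
helper.drop_table(["events"])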
def main(drill, dir_data_file, nRals):
    table1 = "customer"
    table2 = "orders"
    table3 = "lineitem"

    # Read Data TPCH ---------------------------------------------------------
    customer_gdf = cs.read_data(table1, dir_data_file)
    orders_gdf = cs.read_data(table2, dir_data_file)
    lineitem_gdf = cs.read_data(table3, dir_data_file)

    # Create Tables ----------------------------------------------------------
    bc = BlazingContext()
    bc.create_table('customer', customer_gdf)
    bc.create_table('orders', orders_gdf)
    bc.create_table('lineitem', lineitem_gdf)

    # Run Query --------------------------------------------------------------
    worder = 1  # Parameter to indicate whether the result sets must be ordered before comparing them
    use_percentage = False
    acceptable_difference = 0.01
    queryType = 'Performance Test'
    # Note: fileSchemaType is expected to be defined by the surrounding test harness.

    print('==============================')
    print(queryType)
    print('==============================')

    queryId = 'TEST_01'
    query = "select min(l_orderkey) from lineitem"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_02'
    query = "select sum(o_custkey), avg(o_totalprice), min(o_custkey), max(o_orderkey), count(o_orderkey) from orders"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_03'
    query = "select max(l_partkey) from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_04'
    query = "select max(l_partkey), min(l_orderkey), sum(l_orderkey), avg(l_orderkey) from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_05'
    query = "select max(l_partkey) from lineitem group by l_orderkey, l_suppkey, l_linenumber"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_06'
    query = """select max(l_partkey), min(l_orderkey), sum(l_orderkey), avg(l_orderkey)
            from lineitem group by l_orderkey, l_suppkey, l_linenumber"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_07'
    query = "select l_orderkey from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_08'
    query = "select l_orderkey, l_extendedprice, l_shipdate from lineitem group by l_orderkey, l_extendedprice, l_shipdate, l_linestatus"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_09'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag
            from orders as o
            inner join lineitem as l on o.o_orderkey = l.l_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_10'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag
            from lineitem as l
            inner join orders as o on o.o_orderkey = l.l_orderkey
            inner join customer as c on c.c_custkey = o.o_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_11'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag
            from lineitem as l
            inner join orders as o on o.o_orderkey = l.l_orderkey
            inner join customer as c on c.c_nationkey = o.o_custkey
                and l.l_linenumber = c.c_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_12'
    query = """select o.o_totalprice, l.l_partkey
            from orders as o
            left outer join lineitem as l on o.o_custkey = l.l_linenumber
                and l.l_suppkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_13'
    query = """select c.c_name, o.o_custkey, l.l_linenumber
            from customer as c
            left outer join orders as o on c.c_custkey = o.o_custkey
            left outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_14'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_linestatus
            from orders as o
            full outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_15'
    query = """select c.c_name, o.o_custkey, l.l_linenumber
            from customer as c
            full outer join orders as o on c.c_custkey = o.o_custkey
            full outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_16'
    query = """select l_returnflag, l_shipdate, l_linestatus
            from lineitem where l_orderkey < 10000
            union all
            select l_returnflag, l_shipdate, l_linestatus
            from lineitem where l_partkey > 100"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_17'
    query = """select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1
            from orders where o_orderkey < 10000
            union all
            select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1
            from orders where o_custkey < 100000
            union all
            select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1
            from orders where o_orderstatus = 'O'
            union all
            select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1
            from orders where o_totalprice < 350"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_18'
    query = """select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority
            from orders where o_custkey<>4318470"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_19'
    query = "select l_orderkey, l_partkey, l_suppkey, l_returnflag from lineitem where l_returnflag<>'g packages.'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_20'
    query = "select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority from orders where o_custkey=88910"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_21'
    query = "select l_orderkey, l_partkey, l_suppkey, l_returnflag from lineitem where l_returnflag='N'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_22'
    query = "select o_orderkey, o_custkey, o_orderstatus, o_totalprice from orders where o_orderkey=1000"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_23'
    query = "select l_orderkey, l_partkey, l_suppkey, l_comment from lineitem where l_comment='dolites wake'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_24'
    query = """select c_custkey, c_nationkey, c_acctbal
            from customer
            where c_custkey < 15000 and c_nationkey = 5
                or c_custkey = c_nationkey * c_nationkey
                or c_nationkey >= 10 or c_acctbal <= 500"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    # queryId = 'TEST_25'
    # query = "select o_orderkey, o_orderstatus, o_totalprice, o_comment from orders where o_orderstatus = 'O'"
    # runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
    #                               acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_26'
    query = "select ((l_orderkey + l_partkey)/2)* l_quantity - l_suppkey*(l_discount - 1000.999) from lineitem"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_27'
    query = """select (l_orderkey + l_partkey*l_quantity)/2,
            (l_orderkey + l_quantity - l_partkey)*(l_orderkey + 3),
            l_orderkey/l_partkey - l_quantity,
            l_quantity*l_linenumber/l_partkey*l_tax
            from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_28'
    query = """select EXTRACT(YEAR FROM l_receiptdate) - EXTRACT(YEAR FROM l_shipdate) as years_late,
            EXTRACT(MONTH FROM l_receiptdate) - EXTRACT(MONTH FROM l_shipdate) as months_late,
            EXTRACT(DAY FROM l_receiptdate) - EXTRACT(DAY FROM l_shipdate) as days_late
            from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_29'
    query = """select l_quantity, sin(l_quantity), cos(l_quantity),
            asin(l_quantity), acos(l_quantity), ln(l_quantity), tan(l_quantity),
            atan(l_quantity), floor(l_quantity), ceil(l_quantity)
            from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_30'
    query = "select o_totalprice, o_custkey + o_custkey from orders order by o_totalprice"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_31'
    query = "select o_orderkey, o_custkey, o_orderstatus from orders order by o_comment"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_32'
    query = "select o_orderkey, o_custkey, o_totalprice, o_orderstatus from orders order by o_orderkey, o_custkey, o_totalprice"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_33'
    query = """select o_orderkey, o_custkey, o_totalprice, o_orderstatus
            from orders order by o_orderstatus, o_shippriority, o_comment"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_34'
    query = "select c_custkey + c_nationkey, c_acctbal from customer order by 1 desc, 2"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_35'
    query = """select custOrders.avgPrice, custOrders.numOrders
            from customer
            inner join (
                select o_custkey as o_custkey, avg(o_totalprice) as avgPrice,
                    count(o_totalprice) as numOrders
                from orders
                where o_custkey <= 100
                group by o_custkey
            ) as custOrders on custOrders.o_custkey = customer.c_custkey
            where customer.c_nationkey <= 5"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_36'
    query = """with ordersTemp as (
                select min(o_orderkey) as priorityKey, o_custkey
                from orders group by o_custkey
            ), ordersjoin as (
                select orders.o_custkey
                from orders
                inner join ordersTemp on ordersTemp.priorityKey = orders.o_orderkey
            )
            select customer.c_custkey, customer.c_nationkey
            from customer
            inner join ordersjoin on ordersjoin.o_custkey = customer.c_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '',
                                  acceptable_difference, use_percentage, fileSchemaType)
import subprocess

import cudf
from blazingsql import BlazingContext
from pyhive import hive

# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")
bc = BlazingContext()

# Register the HDFS filesystem that backs the Hive warehouse.
authority = 'localhost:54310'
hdfs_host = 'localhost'
hdfs_port = 54310
hdfs_driver = 'libhdfs'
result, error_msg, fs = bc.hdfs(authority, host=hdfs_host, port=hdfs_port,
                                user='******', driver=hdfs_driver)

# Create a table from a Hive cursor so BlazingSQL picks up the Hive metadata.
cursor = hive.connect('localhost').cursor()
table = bc.create_table('ptransactions', cursor, file_format='parquet')

for i in range(11):
    query = ("SELECT * FROM ptransactions "
             "where t_year=2017 and t_company_id={t_company_id} LIMIT 10").format(t_company_id=i)
    ddf = bc.sql(query)
    print(query)
    if isinstance(ddf, cudf.DataFrame):
        print(ddf)
    else:
        print(ddf.compute())
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client

client = Client('127.0.0.1:8786')
client.restart()
bc = BlazingContext(dask_client=client, network_interface="lo")
# bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/100MB2Part/'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])
bc.create_table('customer', dir_data_fs + '/customer_*.parquet')

# "BindableTableScan(table=[[main, customer]],
#   filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
#   projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"

# query = """select c_custkey, c_nationkey, c_acctbal
# from customer
# where c_custkey > 2990 and c_custkey < 3010
# """
query = "select sum(c_custkey)/count(c_custkey), min(c_custkey) from customer limit 5"

# [b'c_custkey', b'c_name', b'c_address', b'c_nationkey', b'c_phone', b'c_acctbal', b'c_mktsegment', b'c_comment']
lp = bc.explain(query)
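# Follow-up sketch (not in the original snippet): bc.explain returns the
# optimized relational-algebra plan as a string, so printing it is the usual
# way to inspect what the query will do before executing it with bc.sql.
print(lp)
result = bc.sql(query)
print(result.compute())  # with a dask_client, bc.sql returns a dask_cudf DataFrame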
from blazingsql import BlazingContext

bc = BlazingContext()

company_types = ['int64', 'str', 'date', 'str', 'str', 'str']
bc.create_table('companies', '/home/nomis/20GB_CSV/Companies.csv', dtype=company_types)

address_types = ['int64', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'int64']
bc.create_table('addresses', '/home/nomis/20GB_CSV/Addresses.csv', dtype=address_types)

department_types = ['int64', 'str', 'int64']
bc.create_table('departments', '/home/nomis/20GB_CSV/Departments.csv', dtype=department_types)

category_types = ['int64', 'str', 'int64']
bc.create_table('categories', '/home/nomis/20GB_CSV/Categories.csv', dtype=category_types)

product_types = ['int64', 'str', 'str', 'str', 'str', 'date', 'float64', 'int64']
bc.create_table('products', '/home/nomis/20GB_CSV/Products.csv', dtype=product_types)

review_types = ['int64', 'str', 'str', 'date', 'int64']
bc.create_table('reviews', '/home/nomis/20GB_CSV/Reviews.csv', dtype=review_types)

# Table identifiers below match the names passed to create_table; YEAR()
# yields an integer, so it is compared against a numeric literal.
query = '''
    SELECT a.City FROM addresses a
    JOIN companies c ON a.CompanyId = c.Id
    JOIN departments d ON c.Id = d.CompanyId
    JOIN categories k ON d.Id = k.DepartmentId
    JOIN products p ON k.Id = p.CategoryId
    JOIN reviews r ON p.Id = r.ProductId
    WHERE YEAR(r.PublishDate) >= 2021
    GROUP BY a.City
'''
af = bc.sql(query)
from blazingsql import BlazingContext
from dask.distributed import Client

# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")
bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/DataSet5Part100MB'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])
bc.create_table('customer', [
    dir_data_fs + '/customer_0_0.parquet',
    dir_data_fs + '/customer_1_0.parquet',
    dir_data_fs + '/customer_2_0.parquet',
    dir_data_fs + '/customer_3_0.parquet',
    dir_data_fs + '/customer_4_0.parquet'
])

# "BindableTableScan(table=[[main, customer]],
#   filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
#   projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"

query = """select c_custkey, c_nationkey, c_acctbal
from customer
where c_custkey > 2990 and c_custkey < 3010
"""
from dask.distributed import Client
from blazingsql import BlazingContext
from dask_cuda import LocalCUDACluster

# initialize BlazingContext with a Dask client of local GPUs to distribute query execution
bc = BlazingContext(dask_client=Client(LocalCUDACluster()), network_interface='lo')

# register public AWS S3 bucket
bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')

# create a table from that S3 bucket
col_names = ['key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count']
bc.create_table('taxi', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=col_names)

# query the table & write results locally as parquet
bc.sql('SELECT * FROM taxi').to_parquet('../../data/yellow_cab')
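# Optional follow-up sketch (assumes the write above succeeded): read the
# written parquet back with dask_cudf and peek at a few rows to verify it.
import dask_cudf

taxi_check = dask_cudf.read_parquet('../../data/yellow_cab')
print(taxi_check.head())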
import time
import pprint
from collections import OrderedDict
from blazingsql import BlazingContext
from dask.distributed import Client

client = Client('127.0.0.1:8786')
bc = BlazingContext(dask_client=client, network_interface="lo")

bc.create_table('customer', '/home/aocsa/tpch/100MB2Part/tpch/customer_0_0.parquet')
bc.create_table('orders', '/home/aocsa/tpch/100MB2Part/tpch/orders_*.parquet')

algebra = """LogicalProject(c_custkey=[$0], c_nationkey=[$3])
  LogicalFilter(condition=[<($0, 10)])
    LogicalTableScan(table=[[main, customer]])
"""

plan = """
{
    'expr': 'LogicalProject(c_custkey=[$0], c_nationkey=[$3])',
    'children': [
        {
            'expr': 'LogicalFilter(condition=[<($0, 10)])',
            'children': [
                {
                    'expr': 'LogicalTableScan(table=[[main, customer]])',
                    'children': []
                }
            ]
        }
    ]
}
"""
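# Sketch (hedged, not from the original snippet): BlazingContext.sql accepts a
# precomputed relational-algebra string via its `algebra` keyword, bypassing
# the Calcite parse/optimize step — presumably what the `algebra` string above
# is for. The query text is unused when an algebra plan is supplied.
result = bc.sql("", algebra=algebra)
print(result.compute())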
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client

# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")
bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])
path_parquet = dir_data_fs + 'simple.parquet'
print(path_parquet)
bc.create_table('simple', path_parquet)

# "BindableTableScan(table=[[main, customer]],
#   filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
#   projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"

# query = """select c_custkey, c_nationkey, c_acctbal
# from customer
# where c_custkey > 2990 and c_custkey < 3010
# """
query = """select * from simple
where int64_field < 300000 or int64_field > 900000
"""
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client

client = Client('127.0.0.1:8786')
client.restart()
bc = BlazingContext(dask_client=client, network_interface="lo")
# bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/100MB2Part/tpch'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])
bc.create_table('part', dir_data_fs + '/part_*.parquet')

# "BindableTableScan(table=[[main, customer]],
#   filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
#   projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"

# query = """select c_custkey, c_nationkey, c_acctbal
# from customer
# where c_custkey > 2990 and c_custkey < 3010
# """
query = "select count(p_partkey), sum(p_partkey), max(p_partkey), min(p_partkey) from part where p_partkey < 100"

# [b'c_custkey', b'c_name', b'c_address', b'c_nationkey', b'c_phone', b'c_acctbal', b'c_mktsegment', b'c_comment']
lp = bc.explain(query)