def init_context(): bc = None dask_client = None nRals = int(Settings.data["RunSettings"]["nRals"]) nGpus = int(Settings.data["RunSettings"]["nGPUs"]) if nRals == 1: bc = BlazingContext() else: os.chdir(Settings.data["TestSettings"]["logDirectory"]) dask_client = try_to_get_dask_client(nRals, nGpus) if dask_client is not None: dask_conn = Settings.data["TestSettings"]["daskConnection"] iface = Settings.data["RunSettings"]["networkInterface"] print("Using dask: " + dask_conn) if "local" != dask_conn: dask_client.restart() bc = BlazingContext( dask_client=dask_client, network_interface=iface, pool=False, initial_pool_size=None, allocator="managed", ) else: # Fallback: could not found a valid dask server bc = BlazingContext() return (bc, dask_client)
def init_context(config_options={}):
    bc = None
    dask_client = None
    nRals = int(Settings.data["RunSettings"]["nRals"])
    nGpus = int(Settings.data["RunSettings"]["nGPUs"])

    if nRals == 1:
        bc = BlazingContext(config_options=config_options)
    else:
        os.chdir(Settings.data["TestSettings"]["logDirectory"])
        iface = Settings.data["RunSettings"]["networkInterface"]
        dask_client = try_to_get_dask_client(nRals, nGpus, iface)
        if dask_client is not None:
            dask_conn = Settings.data["TestSettings"]["daskConnection"]
            # print("Using dask: " + dask_conn)
            # if "local" != dask_conn:
            bc = BlazingContext(
                dask_client=dask_client,
                network_interface=iface,
                # pool=True,
                # initial_pool_size=300000000,
                allocator="default",
                config_options=config_options,
            )
        else:
            # Fallback: could not find a valid dask server
            bc = BlazingContext(config_options=config_options)

    return (bc, dask_client)
def testing_load_hive_table(table_name, location, partitions, partitions_schema):
    bc = BlazingContext()
    bc.create_table(table_name,
                    location,
                    file_format='parquet',
                    hive_table_name=table_name,
                    partitions=partitions,
                    partitions_schema=partitions_schema)
def create_hive_partition_data(input, file_format, table_name, partitions, output, num_files):
    if not os.path.exists(output):
        os.makedirs(output)

    bc = BlazingContext(dask_client=None)
    if file_format == 'psv':
        dtypes = get_dtypes(table_name)
        col_names = get_column_names(table_name)
        bc.create_table(table_name, input, file_format='csv', delimiter="|",
                        dtype=dtypes, names=col_names)
    else:
        bc.create_table(table_name, input)

    columns = bc.describe_table(table_name)
    data_partition_array_dict = []
    for partition in partitions:
        if partition in columns:
            result = bc.sql(f'select distinct({partition}) from {table_name}')
            if type(result) is dask_cudf.core.DataFrame:
                result = result.compute()
            valuesPartition = result.to_pandas().to_dict()
            finalValues = list(set(valuesPartition[partition].values()) &
                               set(partitions[partition]))
            dictOfvalues = {i: finalValues[i] for i in range(0, len(finalValues))}
            valuesPartition[partition] = dictOfvalues
            data_partition_array_dict.append(valuesPartition)
        else:
            print('Column "' + partition + '" does not exist')

    _save_partition_files(bc, table_name, data_partition_array_dict, output,
                          file_format, num_files)
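# A minimal usage sketch for create_hive_partition_data above, assuming TPCH-style
# input. The paths, the 'orders' table name, and the partition values are
# illustrative assumptions, not part of the original snippet.
create_hive_partition_data(
    input='/data/tpch/orders/*.parquet',            # hypothetical source files
    file_format='parquet',
    table_name='orders',
    partitions={'o_orderstatus': ['O', 'F', 'P']},  # column -> partition values to keep
    output='/data/tpch/orders_partitioned',         # hypothetical output directory
    num_files=1,
)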
def create_blazing_context(sched):
    from blazingsql import BlazingContext
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    if not sched:
        cluster = LocalCUDACluster()
        # Need dev version 1.18+ of reticulate
        # Error: C stack usage 193896484868 is too close to the limit
        client = Client(cluster)
    else:
        client = Client(sched)

    ctx = BlazingContext(dask_client=client, network_interface='lo')
    # FIX: develop better client handling on the R side
    ctx.dors_client = client
    return ctx
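# A hedged usage sketch: create_blazing_context either spins up a local
# dask-cuda cluster (falsy scheduler argument) or attaches to an existing
# scheduler address. The address below is only an illustration.
ctx_local = create_blazing_context(None)               # local LocalCUDACluster
ctx_remote = create_blazing_context('127.0.0.1:8786')  # existing dask scheduler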
def init_context():
    bc = None
    dask_client = None
    nRals = int(Settings.data['RunSettings']['nRals'])
    nGpus = int(Settings.data['RunSettings']['nGPUs'])

    if nRals == 1:
        bc = BlazingContext()
    else:
        os.chdir(Settings.data['TestSettings']['logDirectory'])
        dask_client = try_to_get_dask_client(nRals, nGpus)
        if dask_client is not None:
            dask_conn = Settings.data['TestSettings']['daskConnection']
            iface = Settings.data['RunSettings']['networkInterface']
            print("Using dask: " + dask_conn)
            if 'local' != dask_conn:
                dask_client.restart()
            bc = BlazingContext(
                dask_client=dask_client,
                network_interface=iface,
                pool=False,
                initial_pool_size=None,
                enable_logging=False,
                allocator="managed")
        else:
            # Fallback: could not find a valid dask server
            bc = BlazingContext()

    return (bc, dask_client)
def init_context(
        useProgressBar: bool = False,
        config_options={
            "ENABLE_GENERAL_ENGINE_LOGS": True,
            "ENABLE_COMMS_LOGS": True,
            "ENABLE_TASK_LOGS": True,
            "ENABLE_OTHER_ENGINE_LOGS": True
        }):
    bc = None
    dask_client = None
    nRals = int(Settings.data["RunSettings"]["nRals"])
    nGpus = int(Settings.data["RunSettings"]["nGPUs"])

    if nRals == 1:
        bc = BlazingContext(config_options=config_options,
                            enable_progress_bar=True)
    else:
        os.chdir(Settings.data["TestSettings"]["logDirectory"])
        iface = Settings.data["RunSettings"]["networkInterface"]
        dask_client = try_to_get_dask_client(nRals, nGpus, iface)
        if dask_client is not None:
            dask_conn = Settings.data["TestSettings"]["daskConnection"]
            # print("Using dask: " + dask_conn)
            # if "local" != dask_conn:
            bc = BlazingContext(
                dask_client=dask_client,
                network_interface=iface,
                # pool=True,
                # initial_pool_size=300000000,
                allocator="default",
                config_options=config_options,
                enable_progress_bar=True)
        else:
            # Fallback: could not find a valid dask server
            bc = BlazingContext(config_options=config_options,
                                enable_progress_bar=True)

    return (bc, dask_client)
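# A minimal sketch of how this init_context variant might be driven from a test
# script, assuming Settings has already been loaded. Overriding a single engine
# log flag and the table path below are illustrative assumptions; only keys that
# appear in the default config_options dict above are used.
bc, dask_client = init_context(config_options={"ENABLE_TASK_LOGS": False})
bc.create_table('nation', '/data/tpch/nation_*.parquet')  # hypothetical path
print(bc.sql('select count(*) from nation'))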
class BlazingSQLHelper:
    def __init__(self):
        cluster = LocalCUDACluster()
        client = Client(cluster)
        self._bc = BlazingContext(dask_client=client, network_interface='lo')

    """This function runs blazingSQL query.

    :param config: Query related tables configuration.
    :type config: dict
    :return: Query results.
    :rtype: cudf.DataFrame
    """
    def run_query(self, config):
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)

        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        result = result.compute()
        return result

    """This function drops blazingSQL tables.

    :param table_names: List of table names to drop.
    :type table_names: List
    """
    def drop_table(self, table_names):
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
class BlazingSQLHelper:
    def __init__(self, pool=False):
        # Setting pool=True allocates half the GPU memory.
        self._bc = BlazingContext(pool=pool)

    """This function runs blazingSQL query.

    :param config: Query related tables configuration.
    :type config: dict
    :return: Query results.
    :rtype: cudf.DataFrame
    """
    def run_query(self, config):
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)

        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        self.has_data = False
        return result

    """This function drops blazingSQL tables.

    :param table_names: List of table names to drop.
    :type table_names: List
    """
    def drop_table(self, table_names):
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
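# A sketch of the config dict that run_query expects, inferred from the loop
# above: each table entry supplies a table_name, an input_path, and any extra
# keyword arguments that are forwarded to create_table. The file path, table
# name, and column names are illustrative assumptions.
config = {
    "tables": [
        {
            "table_name": "alerts",                 # hypothetical table
            "input_path": "/data/alerts/*.csv",     # hypothetical path
            "names": ["ts", "src_ip", "dst_ip"],    # extra kwargs -> create_table
        }
    ],
    "sql": "select src_ip, count(*) as hits from alerts group by src_ip",
}
helper = BlazingSQLHelper(pool=False)
df = helper.run_query(config)
helper.drop_table(["alerts"])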
def testing_load_hive_table(table_name, file_format, location, partitions, partitions_schema):
    bc2 = BlazingContext(dask_client=None)
    if file_format == 'psv':
        dtypes = get_dtypes(table_name)
        col_names = get_column_names(table_name)
        bc2.create_table(table_name,
                         location,
                         file_format='csv',
                         hive_table_name=table_name,
                         partitions=partitions,
                         partitions_schema=partitions_schema,
                         delimiter="|",
                         dtype=dtypes,
                         names=col_names)
    else:
        bc2.create_table(table_name, location, file_format=file_format)
def create_hive_partition_data(input, table_name, partitions, output, num_files_per_parquet):
    if not os.path.exists(output):
        os.makedirs(output)

    bc = BlazingContext()
    bc.create_table(table_name, input)

    columns = bc.describe_table(table_name)
    data_partition_array_dict = []
    for partition in partitions:
        if partition in columns:
            values = bc.sql(f'select distinct({partition}) from {table_name}')
            data_partition_array_dict.append(values.to_pandas().to_dict())
        else:
            print('Column "' + partition + '" does not exist')

    _save_partition_files(bc, table_name, data_partition_array_dict, output,
                          num_files_per_parquet)
from dask.distributed import Client
from blazingsql import BlazingContext
from dask_cuda import LocalCUDACluster

# initialize BlazingContext with the Dask Client of local GPUs to distribute query execution
bc = BlazingContext(dask_client=Client(LocalCUDACluster()), network_interface='lo')

# register public AWS S3 bucket
bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')

# create a table from that S3 bucket
col_names = ['key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count']
bc.create_table('taxi', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=col_names)

# query the table & write results locally as parquet
bc.sql('SELECT * FROM taxi').to_parquet(f'../../data/yellow_cab')
def __init__(self):
    cluster = LocalCUDACluster()
    client = Client(cluster)
    self._bc = BlazingContext(dask_client=client, network_interface='lo')
from blazingsql import BlazingContext

bc = BlazingContext()

company_types = ['int64', 'str', 'date', 'str', 'str', 'str']
bc.create_table('companies', '/home/nomis/20GB_CSV/Companies.csv', dtype=company_types)

address_types = ['int64', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'int64']
bc.create_table('addresses', '/home/nomis/20GB_CSV/Addresses.csv', dtype=address_types)

department_types = ['int64', 'str', 'int64']
bc.create_table('departments', '/home/nomis/20GB_CSV/Departments.csv', dtype=department_types)

category_types = ['int64', 'str', 'int64']
bc.create_table('categories', '/home/nomis/20GB_CSV/Categories.csv', dtype=category_types)

product_types = ['int64', 'str', 'str', 'str', 'str', 'date', 'float64', 'int64']
bc.create_table('products', '/home/nomis/20GB_CSV/Products.csv', dtype=product_types)

review_types = ['int64', 'str', 'str', 'date', 'int64']
bc.create_table('reviews', '/home/nomis/20GB_CSV/Reviews.csv', dtype=review_types)

query = '''
    SELECT a.City
    FROM Addresses a
    JOIN Companies c ON a.CompanyId = c.Id
    JOIN Departments d ON c.Id = d.CompanyId
    JOIN Categories k ON d.Id = k.DepartmentId
    JOIN Products p ON k.Id = p.CategoryId
    JOIN Reviews r ON p.Id = r.ProductId
    WHERE YEAR(r.PublishDate) >= '2021'
    GROUP BY a.City
'''
af = bc.sql(query)
def __init__(self, pool=False):
    # Setting pool=True allocates half the GPU memory.
    self._bc = BlazingContext(pool=pool)
def main(drill, dir_data_file, nRals):
    table1 = "customer"
    table2 = "orders"
    table3 = "lineitem"

    # Read Data TPCH ----------------------------------------------------------
    customer_gdf = cs.read_data(table1, dir_data_file)
    orders_gdf = cs.read_data(table2, dir_data_file)
    lineitem_gdf = cs.read_data(table3, dir_data_file)

    # Create Tables -----------------------------------------------------------
    bc = BlazingContext()
    bc.create_table('customer', customer_gdf)
    bc.create_table('orders', orders_gdf)
    bc.create_table('lineitem', lineitem_gdf)

    # Run Query ---------------------------------------------------------------
    worder = 1  # Parameter to indicate if it is necessary to order the result sets before comparing them
    use_percentage = False
    acceptable_difference = 0.01
    queryType = 'Performance Test'

    print('==============================')
    print(queryType)
    print('==============================')

    queryId = 'TEST_01'
    query = "select min(l_orderkey) from lineitem"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_02'
    query = "select sum(o_custkey), avg(o_totalprice), min(o_custkey), max(o_orderkey), count(o_orderkey) from orders"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_03'
    query = "select max(l_partkey) from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_04'
    query = "select max(l_partkey), min(l_orderkey), sum(l_orderkey), avg(l_orderkey) from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_05'
    query = "select max(l_partkey) from lineitem group by l_orderkey, l_suppkey, l_linenumber"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_06'
    query = """select max(l_partkey), min(l_orderkey), sum(l_orderkey), avg(l_orderkey)
            from lineitem group by l_orderkey, l_suppkey, l_linenumber"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_07'
    query = "select l_orderkey from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_08'
    query = "select l_orderkey, l_extendedprice, l_shipdate from lineitem group by l_orderkey, l_extendedprice, l_shipdate, l_linestatus"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_09'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag
            from orders as o inner join lineitem as l on o.o_orderkey = l.l_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_10'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag
            from lineitem as l inner join orders as o on o.o_orderkey = l.l_orderkey
            inner join customer as c on c.c_custkey = o.o_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_11'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag
            from lineitem as l inner join orders as o on o.o_orderkey = l.l_orderkey
            inner join customer as c on c.c_nationkey = o.o_custkey and l.l_linenumber = c.c_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_12'
    query = """select o.o_totalprice, l.l_partkey
            from orders as o left outer join lineitem as l
            on o.o_custkey = l.l_linenumber and l.l_suppkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_13'
    query = """select c.c_name, o.o_custkey, l.l_linenumber
            from customer as c left outer join orders as o on c.c_custkey = o.o_custkey
            left outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_14'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_linestatus
            from orders as o full outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_15'
    query = """select c.c_name, o.o_custkey, l.l_linenumber
            from customer as c full outer join orders as o on c.c_custkey = o.o_custkey
            full outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_16'
    query = """select l_returnflag, l_shipdate, l_linestatus from lineitem where l_orderkey < 10000
            union all
            select l_returnflag, l_shipdate, l_linestatus from lineitem where l_partkey > 100"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_17'
    query = """select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1
            from orders where o_orderkey < 10000
            union all
            select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1
            from orders where o_custkey < 100000
            union all
            select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1
            from orders where o_orderstatus = 'O'
            union all
            select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1
            from orders where o_totalprice < 350"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_18'
    query = """select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority
            from orders where o_custkey<>4318470"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_19'
    query = "select l_orderkey, l_partkey, l_suppkey, l_returnflag from lineitem where l_returnflag<>'g packages.'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_20'
    query = "select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority from orders where o_custkey=88910"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_21'
    query = "select l_orderkey, l_partkey, l_suppkey, l_returnflag from lineitem where l_returnflag='N'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_22'
    query = "select o_orderkey, o_custkey, o_orderstatus, o_totalprice from orders where o_orderkey=1000"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_23'
    query = "select l_orderkey, l_partkey, l_suppkey, l_comment from lineitem where l_comment='dolites wake'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_24'
    query = """select c_custkey, c_nationkey, c_acctbal from customer
            where c_custkey < 15000 and c_nationkey = 5
            or c_custkey = c_nationkey * c_nationkey
            or c_nationkey >= 10 or c_acctbal <= 500"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    # queryId = 'TEST_25'
    # query = "select o_orderkey, o_orderstatus, o_totalprice, o_comment from orders where o_orderstatus = 'O'"
    # runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_26'
    query = "select ((l_orderkey + l_partkey)/2)* l_quantity - l_suppkey*(l_discount - 1000.999) from lineitem"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_27'
    query = """select (l_orderkey + l_partkey*l_quantity)/2,
            (l_orderkey + l_quantity - l_partkey)*(l_orderkey + 3),
            l_orderkey/l_partkey - l_quantity,
            l_quantity*l_linenumber/l_partkey*l_tax from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_28'
    query = """select EXTRACT(YEAR FROM l_receiptdate) - EXTRACT(YEAR FROM l_shipdate) as years_late,
            EXTRACT(MONTH FROM l_receiptdate) - EXTRACT(MONTH FROM l_shipdate) as months_late,
            EXTRACT(DAY FROM l_receiptdate) - EXTRACT(DAY FROM l_shipdate) as days_late from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_29'
    query = """select l_quantity, sin(l_quantity), cos(l_quantity), asin(l_quantity), acos(l_quantity),
            ln(l_quantity), tan(l_quantity), atan(l_quantity), floor(l_quantity), ceil(l_quantity) from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_30'
    query = "select o_totalprice, o_custkey + o_custkey from orders order by o_totalprice"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_31'
    query = "select o_orderkey, o_custkey, o_orderstatus from orders order by o_comment"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_32'
    query = "select o_orderkey, o_custkey, o_totalprice, o_orderstatus from orders order by o_orderkey, o_custkey, o_totalprice"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_33'
    query = """select o_orderkey, o_custkey, o_totalprice, o_orderstatus from orders
            order by o_orderstatus, o_shippriority, o_comment"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_34'
    query = "select c_custkey + c_nationkey, c_acctbal from customer order by 1 desc, 2"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_35'
    query = """select custOrders.avgPrice, custOrders.numOrders from customer
            inner join
            (select o_custkey as o_custkey, avg(o_totalprice) as avgPrice, count(o_totalprice) as numOrders
            from orders where o_custkey <= 100 group by o_custkey) as custOrders
            on custOrders.o_custkey = customer.c_custkey
            where customer.c_nationkey <= 5"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_36'
    query = """with ordersTemp as (
                select min(o_orderkey) as priorityKey, o_custkey from orders group by o_custkey
            ), ordersjoin as (
                select orders.o_custkey from orders
                inner join ordersTemp on ordersTemp.priorityKey = orders.o_orderkey
            )
            select customer.c_custkey, customer.c_nationkey from customer
            inner join ordersjoin on ordersjoin.o_custkey = customer.c_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
import time
import pprint
import cudf
from blazingsql import BlazingContext
from dask.distributed import Client
from pyhive import hive
import os
import subprocess

# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

authority = 'localhost:54310'
hdfs_host = 'localhost'
hdfs_port = 54310
hdfs_driver = 'libhdfs'
result, error_msg, fs = bc.hdfs(authority,
                                host=hdfs_host,
                                port=hdfs_port,
                                user='******',
                                driver=hdfs_driver)

cursor = hive.connect('localhost').cursor()
table = bc.create_table('ptransactions', cursor, file_format='parquet')

for i in range(11):
    query = "SELECT * FROM ptransactions where t_year=2017 and t_company_id={t_company_id} LIMIT 10".format(t_company_id=i)
from blazingsql import BlazingContext

bc = BlazingContext()


def main():
    """
    main():

    parameters:
    -----------
    None

    description:
    ------------
    Creates a connection to the GPU underneath and holds it in "bc". Next, we
    access the "colab" environment S3 bucket (which holds a parquet file) and
    create a table named "taxi" with it. Finally, this example ends off by
    saying that we wish to understand how BSQL is planning to extract the data
    rather than actually extracting it.

    result:
    -------
    'LogicalSort(fetch=[2])
      BindableTableScan(table=[[main, taxi]], projects=[[3, 4]], aliases=[[passenger_count, trip_distance]])
    '
    """
    global bc
    bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')
    bc.create_table('taxi', 's3://blazingsql-colab/yellow_taxi/taxi_data.parquet')
    query = 'SELECT passenger_count, trip_distance FROM taxi LIMIT 2'
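# A hedged completion of the truncated main() above, assuming the docstring's
# intent: print the logical plan with bc.explain (the call used elsewhere in
# this collection) instead of executing the query.
print(bc.explain(query))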
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client

# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet',
#                              dir_data_fs + '/customer_1_0.parquet',
#                              dir_data_fs + '/customer_2_0.parquet'])

path_parquet = dir_data_fs + 'simple.parquet'
print(path_parquet)
bc.create_table('simple', path_parquet)

# "BindableTableScan(table=[[main, customer]],
#   filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
#   projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"

# query = """select c_custkey, c_nationkey, c_acctbal
#            from customer
#            where c_custkey > 2990 and c_custkey < 3010
#            """

query = """select * from simple
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client
from collections import OrderedDict
from blazingsql import BlazingContext

client = Client('127.0.0.1:8786')
bc = BlazingContext(dask_client=client, network_interface="lo")

bc.create_table('customer', '/home/aocsa/tpch/100MB2Part/tpch/customer_0_0.parquet')
bc.create_table('orders', '/home/aocsa/tpch/100MB2Part/tpch/orders_*.parquet')

algebra = """LogicalProject(c_custkey=[$0], c_nationkey=[$3])
  LogicalFilter(condition=[<($0, 10)])
    LogicalTableScan(table=[[main, customer]])
"""

plan = """
{
    'expr': 'LogicalProject(c_custkey=[$0], c_nationkey=[$3])',
    'children': [
        {
            'expr': 'LogicalFilter(condition=[<($0, 10)])',
            'children': [
                {
                    'expr': 'LogicalTableScan(table=[[main, customer]])',
                    'children': []
                }
def main():
    install("gitpython")

    if not os.path.exists(conda_prefix + "/blazingsql-testing-files/"):
        git_clone()
    else:
        git_pull()

    unzip()

    queryType = ' Local Tests '

    if len(sys.argv) == 2:
        n_nodos = sys.argv[1]
    else:
        n_nodos = "1"

    if n_nodos == "1":
        print("Executing test in Single Node")
        bc = BlazingContext()
    else:
        print("Executing test in Distributed Mode")
        from dask.distributed import Client
        client = Client('127.0.0.1:8786')
        print("Dask client ready!")
        bc = BlazingContext(dask_client=client, network_interface='lo')

    log_dict = {}

    def executionLocalTest(queryType):
        # Read Data TPCH ------------------------------------------------------
        tables = ['nation', 'region', 'supplier', 'customer',
                  'lineitem', 'orders', 'part', 'partsupp']
        data_types = [DataType.PARQUET]  # TODO json

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            # Run Query -------------------------------------------------------
            worder = 1  # Parameter to indicate if it is necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_01'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_02'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_03'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_04'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_05'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_06'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_07'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

    executionLocalTest(queryType)

    queryType = ' S3 Tests '

    def executionS3Test(queryType):
        # Read Data TPCH ------------------------------------------------------
        authority = "tpch_s3"
        print(authority)
        print(bucket_name)
        hola = S3EncryptionType.NONE
        print(hola)
        print(access_key_id)
        print(secret_key)

        bc.s3(authority,
              bucket_name=bucket_name,
              encryption_type=S3EncryptionType.NONE,
              access_key_id=access_key_id,
              secret_key=secret_key)

        dir_data_lc = "s3://" + authority + "/" + "DataSet100Mb2part/"
        tables = ['nation', 'region', 'supplier', 'customer',
                  'lineitem', 'orders', 'part', 'partsupp']
        data_types = [DataType.PARQUET]  # TODO json

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            # Run Query -------------------------------------------------------
            worder = 1  # Parameter to indicate if it is necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_08'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_09'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_10'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_11'
            # print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            # result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_12'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_13'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_14'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

    executionS3Test(queryType)

    queryType = ' GS Tests '

    def executionGSTest(queryType):
        authority = "tpch_gs"

        bc.gs(authority,
              project_id=gs_project_id,
              bucket_name=gs_bucket_name,
              use_default_adc_json_file=True,
              adc_json_file='')

        dir_data_lc = 'gcs://' + authority + '/100MB2Part/'
        tables = ['nation', 'region', 'supplier', 'customer',
                  'lineitem', 'orders', 'part', 'partsupp']
        data_types = [DataType.PARQUET]

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            # Run Query -------------------------------------------------------
            worder = 1  # Parameter to indicate if it is necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_15'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_16'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_17'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_18'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_19'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_20'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_21'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_22'
            # print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            # result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    executionGSTest(queryType)

    green = bcolors.OKGREEN
    endc = bcolors.ENDC
    print(green + "=======================================")
    print("SUMMARY TESTS")
    print("=======================================" + endc)
    for key, value in log_dict.items():
        print('"{}" : {} '.format(key, value))
    print(green + "=======================================" + endc)
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client

client = Client('127.0.0.1:8786')
client.restart()
bc = BlazingContext(dask_client=client, network_interface="lo")
# bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/100MB2Part/'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet',
#                              dir_data_fs + '/customer_1_0.parquet',
#                              dir_data_fs + '/customer_2_0.parquet'])
bc.create_table('customer', dir_data_fs + '/customer_*.parquet')

# "BindableTableScan(table=[[main, customer]],
#   filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
#   projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"

# query = """select c_custkey, c_nationkey, c_acctbal
#            from customer
#            where c_custkey > 2990 and c_custkey < 3010
#            """

query = "select sum(c_custkey)/count(c_custkey), min(c_custkey) from customer limit 5"
# [b'c_custkey', b'c_name', b'c_address', b'c_nationkey', b'c_phone', b'c_acctbal', b'c_mktsegment', b'c_comment']

lp = bc.explain(query)
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client

# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/DataSet5Part100MB'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet',
#                              dir_data_fs + '/customer_1_0.parquet',
#                              dir_data_fs + '/customer_2_0.parquet'])
bc.create_table('customer', [
    dir_data_fs + '/customer_0_0.parquet',
    dir_data_fs + '/customer_1_0.parquet',
    dir_data_fs + '/customer_2_0.parquet',
    dir_data_fs + '/customer_3_0.parquet',
    dir_data_fs + '/customer_4_0.parquet'
])

# "BindableTableScan(table=[[main, customer]],
#   filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
#   projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"

# query = """select c_custkey, c_nationkey, c_acctbal
#            from customer
#            where c_custkey > 2990 and c_custkey < 3010
#            """
def attach_to_cluster(config, create_blazing_context=False):
    """Attaches to an existing cluster if available.

    By default, tries to attach to a cluster running on localhost:8786 (dask's default).
    This is currently hardcoded to assume the dashboard is running on port 8787.

    Optionally, this will also create a BlazingContext.
    """
    host = config.get("cluster_host")
    port = config.get("cluster_port", "8786")

    if host is not None:
        try:
            content = requests.get(
                "http://" + host + ":8787/info/main/workers.html"
            ).content.decode("utf-8")
            url = content.split("Scheduler ")[1].split(":" + str(port))[0]
            client = Client(address=f"{url}:{port}")
            print(f"Connected to {url}:{port}")
        except requests.exceptions.ConnectionError as e:
            sys.exit(
                f"Unable to connect to existing dask scheduler dashboard to determine cluster type: {e}"
            )
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")
    else:
        raise ValueError("Must pass a cluster address to the host argument.")

    def maybe_create_worker_directories(dask_worker):
        worker_dir = dask_worker.local_directory
        if not os.path.exists(worker_dir):
            os.mkdir(worker_dir)

    client.run(maybe_create_worker_directories)

    # Get ucx config variables
    ucx_config = client.submit(_get_ucx_config).result()
    config.update(ucx_config)

    # Save worker information
    gpu_sizes = ["16GB", "32GB", "40GB"]
    worker_counts = worker_count_info(client, gpu_sizes=gpu_sizes)
    for size in gpu_sizes:
        key = size + "_workers"
        if config.get(key) is not None and config.get(key) != worker_counts[size]:
            print(
                f"Expected {config.get(key)} {size} workers in your cluster, but got {worker_counts[size]}. It can take a moment for all workers to join the cluster. You may also have misconfigured hosts."
            )
            sys.exit(-1)
    config["16GB_workers"] = worker_counts["16GB"]
    config["32GB_workers"] = worker_counts["32GB"]
    config["40GB_workers"] = worker_counts["40GB"]

    bc = None
    if create_blazing_context:
        bc = BlazingContext(
            dask_client=client,
            pool=os.environ.get("BLAZING_POOL", False),
            network_interface=os.environ.get("INTERFACE", "ib0"),
            config_options=get_config_options(),
            allocator=os.environ.get("BLAZING_ALLOCATOR_MODE", "managed"),
            initial_pool_size=os.environ.get("BLAZING_INITIAL_POOL_SIZE", None))

    return client, bc
def attach_to_cluster(config, create_blazing_context=False):
    """Attaches to an existing cluster if available.

    By default, tries to attach to a cluster running on localhost:8786 (dask's default).
    This is currently hardcoded to assume the dashboard is running on port 8787.

    Optionally, this will also create a BlazingContext.
    """
    scheduler_file = config.get("scheduler_file_path")
    host = config.get("cluster_host")
    port = config.get("cluster_port", "8786")

    if scheduler_file is not None:
        try:
            with open(scheduler_file) as fp:
                print(fp.read())
            client = Client(scheduler_file=scheduler_file)
            print('Connected!')
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")
    elif host is not None:
        try:
            content = requests.get(
                "http://" + host + ":8787/info/main/workers.html"
            ).content.decode("utf-8")
            url = content.split("Scheduler ")[1].split(":" + str(port))[0]
            client = Client(address=f"{url}:{port}")
            print(f"Connected to {url}:{port}")
            config["protocol"] = str(url)[0:3]
        except requests.exceptions.ConnectionError as e:
            sys.exit(
                f"Unable to connect to existing dask scheduler dashboard to determine cluster type: {e}"
            )
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")
    else:
        raise ValueError(
            "Must pass a scheduler file or cluster address to the host argument."
        )

    def maybe_create_worker_directories(dask_worker):
        worker_dir = dask_worker.local_directory
        if not os.path.exists(worker_dir):
            os.mkdir(worker_dir)

    client.run(maybe_create_worker_directories)

    # Get ucx config variables
    ucx_config = client.submit(_get_ucx_config).result()
    config.update(ucx_config)

    # Save worker information
    # Assumes all GPUs are the same size
    expected_workers = config.get("num_workers")
    worker_counts = worker_count_info(client)
    for gpu_size, count in worker_counts.items():
        if count != 0:
            current_workers = worker_counts.pop(gpu_size)
            break

    if expected_workers is not None and expected_workers != current_workers:
        print(
            f"Expected {expected_workers} {gpu_size} workers in your cluster, but got {current_workers}. It can take a moment for all workers to join the cluster. You may also have misconfigured hosts."
        )
        sys.exit(-1)

    config["16GB_workers"] = worker_counts.get("16GB", 0)
    config["32GB_workers"] = worker_counts.get("32GB", 0)
    config["40GB_workers"] = worker_counts.get("40GB", 0)

    bc = None
    if create_blazing_context:
        from blazingsql import BlazingContext
        bc = BlazingContext(
            dask_client=client,
            pool=os.environ.get("BLAZING_POOL", False),
            network_interface=os.environ.get("INTERFACE", "ib0"),
            config_options=get_bsql_config_options(),
            allocator=os.environ.get("BLAZING_ALLOCATOR_MODE", "managed"),
            initial_pool_size=os.environ.get("BLAZING_INITIAL_POOL_SIZE", None))

    return client, bc
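# A hedged usage sketch for attach_to_cluster, assuming a scheduler file was
# written by dask-scheduler. The file path, the num_workers value, and the env
# var values are illustrative assumptions; the variable names themselves
# (INTERFACE, BLAZING_ALLOCATOR_MODE, BLAZING_POOL, BLAZING_INITIAL_POOL_SIZE)
# are the ones read in the function above.
os.environ["INTERFACE"] = "eth0"
os.environ["BLAZING_ALLOCATOR_MODE"] = "managed"

config = {
    "scheduler_file_path": "/tmp/dask-scheduler.json",  # hypothetical path
    "num_workers": 2,
}
client, bc = attach_to_cluster(config, create_blazing_context=True)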
# (snippet begins mid-query inside main(); the leading part of the SQL string is not shown)
            imp_sk,
            price_change,
            SUM( CASE WHEN ((ss_sold_date_sk >= c.imp_start_date)
                AND (ss_sold_date_sk < (c.imp_start_date + c.no_days_comp_price)))
                THEN ss_quantity ELSE 0 END) AS current_ss_quant,
            SUM( CASE WHEN ((ss_sold_date_sk >= (c.imp_start_date - c.no_days_comp_price))
                AND (ss_sold_date_sk < c.imp_start_date))
                THEN ss_quantity ELSE 0 END) AS prev_ss_quant
        FROM store_sales ss
        JOIN temp_table c ON c.i_item_sk = ss.ss_item_sk
        GROUP BY ss_item_sk, imp_sk, price_change
    ) ss ON (ws.ws_item_sk = ss.ss_item_sk and ws.imp_sk = ss.imp_sk)
    GROUP BY ws.ws_item_sk
    """
    result = bc.sql(query)
    return result


if __name__ == "__main__":
    client = attach_to_cluster(cli_args)
    bc = BlazingContext(
        dask_client=client,
        pool=True,
        network_interface=os.environ.get("INTERFACE", "eth0"),
    )
    result_df = main(cli_args["data_dir"])
    write_result(
        result_df,
        output_directory=cli_args["output_dir"],
    )
def main():
    # gitpython
    install("gitpython")
    # install("tarfile")

    if not os.path.exists(conda_prefix + "/blazingsql-testing-files/"):
        git_clone()
    else:
        git_pull()

    unzip()

    queryType = ' Local Tests '
    bc = BlazingContext()
    log_dict = {}

    def executionLocalTest(queryType):
        # Read Data TPCH ------------------------------------------------------
        tables = ['nation', 'region', 'supplier', 'customer',
                  'lineitem', 'orders', 'part', 'partsupp']
        data_types = [DataType.PARQUET]  # TODO json

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            # Run Query -------------------------------------------------------
            worder = 1  # Parameter to indicate if it is necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_01'
            print("Executing " + queryId + " ... ")
            query = """ select l_returnflag, l_linestatus, sum(l_quantity) as sum_qty,
                sum(l_extendedprice) as sum_base_price,
                sum(l_extendedprice*(1-l_discount)) as sum_disc_price,
                sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge,
                sum(l_quantity)/count(l_quantity) as avg_qty,
                sum(l_extendedprice)/count(l_extendedprice) as avg_price,
                sum(l_discount)/count(l_discount) as avg_disc, count(*) as count_order
                from lineitem
                where cast(l_shipdate as date) <= date '1998-09-01'
                group by l_returnflag, l_linestatus
                order by l_returnflag, l_linestatus"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_02'
            print("Executing " + queryId + " ... ")
            query = """ select s.s_acctbal, s.s_name, n.n_name, p.p_partkey, p.p_mfgr, s.s_address, s.s_phone, s.s_comment
                from supplier as s
                INNER JOIN nation as n ON s.s_nationkey = n.n_nationkey
                INNER JOIN partsupp as ps ON s.s_suppkey = ps.ps_suppkey
                INNER JOIN part as p ON p.p_partkey = ps.ps_partkey
                INNER JOIN region as r ON r.r_regionkey = n.n_regionkey
                where r.r_name = 'EUROPE' and p.p_size = 15 and p.p_type like '%BRASS'
                and ps.ps_supplycost = (
                    select min(psq.ps_supplycost)
                    from partsupp as psq
                    INNER JOIN supplier as sq ON sq.s_suppkey = psq.ps_suppkey
                    INNER JOIN nation as nq ON nq.n_nationkey = sq.s_nationkey
                    INNER JOIN region as rq ON rq.r_regionkey = nq.n_regionkey
                    where rq.r_name = 'EUROPE'
                )
                order by s.s_acctbal desc, n.n_name, s.s_name, p.p_partkey"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_03'
            print("Executing " + queryId + " ... ")
            query = """ select l.l_orderkey, sum(l.l_extendedprice * (1 - l.l_discount)) as revenue,
                o.o_orderdate, o.o_shippriority
                from orders as o
                INNER JOIN lineitem as l ON l.l_orderkey = o.o_orderkey
                INNER JOIN customer as c ON c.c_custkey = o.o_custkey
                where c.c_mktsegment = 'BUILDING'
                and o.o_orderdate < date '1995-03-15' and l.l_shipdate > date '1995-03-15'
                group by l.l_orderkey, o.o_orderdate, o.o_shippriority
                order by revenue desc, o.o_orderdate"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_04'
            # print("Executing " + queryId + " ... ")
            query = """ select o_orderpriority, count(*) as order_count
                from orders
                where o_orderdate >= date '1993-07-01' and o_orderdate < date '1994-10-01'
                and exists (select * from lineitem
                            where l_orderkey = o_orderkey and l_commitdate < l_receiptdate)
                group by o_orderpriority
                order by o_orderpriority"""
            # result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_05'
            print("Executing " + queryId + " ... ")
            query = """ select n.n_name, sum(l.l_extendedprice * (1 - l.l_discount)) as revenue
                from orders o
                inner join lineitem l on l.l_orderkey = o.o_orderkey
                inner join customer c on o.o_custkey = c.c_custkey
                inner join supplier s on l.l_suppkey = s.s_suppkey and c.c_nationkey = s.s_nationkey
                inner join nation n on n.n_nationkey = s.s_nationkey
                inner join region r on n.n_regionkey = r.r_regionkey
                where r.r_name = 'ASIA'
                and o.o_orderdate >= date '1994-01-01' and o.o_orderdate < date '1995-01-01'
                group by n.n_name
                order by revenue desc"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_06'
            print("Executing " + queryId + " ... ")
            query = """ select sum(l_extendedprice*l_discount) as revenue
                from lineitem
                where l_shipdate >= date '1994-01-01' and l_shipdate < date '1995-01-01'
                and l_discount between 0.05 and 0.07 and l_quantity < 24"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_07'
            print("Executing " + queryId + " ... ")
            query = """ select supp_nation, cust_nation, l_year, sum(volume) as revenue
                from
                (select n1.n_name as supp_nation, n2.n_name as cust_nation,
                    extract(year from l.l_shipdate) as l_year,
                    l.l_extendedprice * (1 - l.l_discount) as volume
                from nation as n1
                INNER JOIN supplier as s ON s.s_nationkey = n1.n_nationkey
                INNER JOIN lineitem as l ON l.l_suppkey = s.s_suppkey
                INNER JOIN orders as o ON o.o_orderkey = l.l_orderkey
                INNER JOIN customer as c ON c.c_custkey = o.o_custkey
                INNER JOIN nation as n2 ON n2.n_nationkey = c.c_nationkey
                where ( (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
                    or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') )
                and cast(l.l_shipdate as date) between date '1995-01-01' and date '1996-12-31') as shipping
                group by supp_nation, cust_nation, l_year
                order by supp_nation, cust_nation, l_year"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

    executionLocalTest(queryType)

    queryType = ' S3 Tests '

    def executionS3Test(queryType):
        # Read Data TPCH ------------------------------------------------------
        authority = "tpch_s3"

        bc.s3(authority,
              bucket_name=bucket_name,
              encryption_type=S3EncryptionType.NONE,
              access_key_id=access_key_id,
              secret_key=secret_key)

        dir_data_lc = "s3://" + authority + "/" + "DataSet100Mb2part/"
        tables = ['nation', 'region', 'supplier', 'customer',
                  'lineitem', 'orders', 'part', 'partsupp']
        data_types = [DataType.PARQUET]  # TODO json

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            # Run Query -------------------------------------------------------
            worder = 1  # Parameter to indicate if it is necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01
            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_08'
            print("Executing " + queryId + " ... ")
            query = """ select o_year, sum(case when nationl = 'BRAZIL' then volume else 0 end) / sum(volume) as mkt_share
                from (
                    select extract(year from o.o_orderdate) as o_year,
                        l.l_extendedprice * (1 - l.l_discount) as volume, n2.n_name as nationl
                    from part as p
                    INNER JOIN lineitem as l ON p.p_partkey = l.l_partkey
                    INNER JOIN supplier as s ON s.s_suppkey = l.l_suppkey
                    INNER JOIN orders as o ON o.o_orderkey = l.l_orderkey
                    INNER JOIN customer as c ON c.c_custkey = o.o_custkey
                    INNER JOIN nation as n1 ON n1.n_nationkey = c.c_nationkey
                    INNER JOIN region as r ON r.r_regionkey = n1.n_regionkey
                    INNER JOIN nation as n2 ON n2.n_nationkey = s.s_nationkey
                    where r.r_name = 'AMERICA'
                    and o.o_orderdate >= date '1995-01-01' and o.o_orderdate <= date '1996-12-31'
                    and p.p_type = 'ECONOMY ANODIZED STEEL'
                ) as all_nations
                group by o_year
                order by o_year"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_09'
            # print("Executing " + queryId + " ... ")
            query = """ select nationl, o_year, sum(amount) as sum_profit
                from (
                    select n_name as nationl, extract(year from o_orderdate) as o_year,
                        l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
                    from lineitem
                    INNER JOIN orders ON o_orderkey = l_orderkey
                    INNER JOIN partsupp ON ps_suppkey = l_suppkey and ps_partkey = l_partkey
                    INNER JOIN part ON p_partkey = l_partkey
                    INNER JOIN supplier ON s_suppkey = l_suppkey
                    INNER JOIN nation ON n_nationkey = s_nationkey
                    where p_name like '%green%'
                ) as profit
                group by nationl, o_year
                order by nationl, o_year desc"""
            # result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_10'
            print("Executing " + queryId + " ... ")
            query = """ select c.c_custkey, c.c_name, sum(l.l_extendedprice * (1 - l.l_discount)) as revenue,
                c.c_acctbal, n.n_name, c.c_address, c.c_phone, c.c_comment
                from customer as c
                INNER JOIN nation as n ON n.n_nationkey = c.c_nationkey
                INNER JOIN orders as o ON o.o_custkey = c.c_custkey
                INNER JOIN lineitem as l ON l.l_orderkey = o.o_orderkey
                where o.o_orderdate >= date '1993-10-01' and o.o_orderdate < date '1994-10-01'
                and l.l_returnflag = 'R'
                group by c.c_custkey, c.c_name, c.c_acctbal, c.c_phone, n.n_name, c.c_address, c.c_comment"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_11'
            # print("Executing " + queryId + " ... ")
            query = """ select ps.ps_partkey, sum(ps.ps_supplycost * ps.ps_availqty) as valuep
                from partsupp as ps
                INNER JOIN supplier as s ON ps.ps_suppkey = s.s_suppkey
                INNER JOIN nation as n ON s.s_nationkey = n.n_nationkey
                where n.n_name = 'GERMANY'
                group by ps.ps_partkey
                having sum(ps.ps_supplycost * ps.ps_availqty) > (
                    select sum(psq.ps_supplycost)
                    from partsupp as psq
                    INNER JOIN supplier as sq ON psq.ps_suppkey = sq.s_suppkey
                    INNER JOIN nation as nq ON sq.s_nationkey = nq.n_nationkey
                    where nq.n_name = 'GERMANY'
                )
                order by valuep desc"""
            # result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_12'
            print("Executing " + queryId + " ... ")
            query = """ select l.l_shipmode,
                sum(case when o.o_orderpriority ='1-URGENT' or o.o_orderpriority ='2-HIGH'
                    then 1 else 0 end) as high_line_count,
                sum(case when o.o_orderpriority <> '1-URGENT' and o.o_orderpriority <> '2-HIGH'
                    then 1 else 0 end) as low_line_count
                from lineitem as l
                INNER JOIN orders as o ON o.o_orderkey = l.l_orderkey
                where l.l_shipmode in ('MAIL', 'SHIP')
                and l.l_commitdate < l.l_receiptdate and l.l_shipdate < l.l_commitdate
                and l.l_receiptdate >= date '1994-01-01' and l.l_receiptdate < date '1995-01-01'
                group by l.l_shipmode
                order by l.l_shipmode"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_13'
            print("Executing " + queryId + " ... ")
            query = """ select c_count, count(*) as custdist
                from (select c.c_custkey, count(o.o_orderkey)
                      from customer as c
                      LEFT OUTER JOIN orders as o ON c.c_custkey = o.o_custkey
                      where o.o_comment not like '%special%requests%'
                      group by c.c_custkey) as c_orders (c_custkey, c_count)
                group by c_count
                order by custdist desc, c_count desc"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_14'
            print("Executing " + queryId + " ... ")
            query = """ select 100.00 * sum(case when p.p_type like 'PROMO%'
                    then l.l_extendedprice * (1 - l.l_discount) else 0 end)
                / sum(l.l_extendedprice * (1 - l.l_discount)) as promo_revenue
                from lineitem as l
                INNER JOIN part as p ON p.p_partkey = l.l_partkey
                where l.l_shipdate >= date '1995-09-01' and l.l_shipdate < date '1995-10-01'"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

    executionS3Test(queryType)

    queryType = ' GS Tests '

    def executionGSTest(queryType):
        authority = "tpch_gs"

        bc.gs(authority,
              project_id=gs_project_id,
              bucket_name=gs_bucket_name,
              use_default_adc_json_file=True,
              adc_json_file='')

        dir_data_lc = 'gcs://' + authority + '/100MB2Part/'
        tables = ['nation', 'region', 'supplier', 'customer',
                  'lineitem', 'orders', 'part', 'partsupp']
        data_types = [DataType.PARQUET]

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            # Run Query -------------------------------------------------------
            worder = 1  # Parameter to indicate if it is necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_15'
            print("Executing " + queryId + " ... ")
            query = """with revenue (suplier_no, total_revenue) as (
                    select l_suppkey, cast(sum(l_extendedprice * (1-l_discount)) as int)
                    from lineitem
                    where l_shipdate >= date '1996-01-01' and l_shipdate < date '1996-04-01'
                    group by l_suppkey
                )
                select s.s_suppkey, s.s_name, s.s_address, s.s_phone, re.total_revenue
                from supplier as s
                INNER JOIN revenue as re on s.s_suppkey = re.suplier_no
                where re.total_revenue = cast((select max(total_revenue) from revenue) as int)
                order by s.s_suppkey"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_16'
            # print("Executing " + queryId + " ... ")
            query = """ select p.p_brand, p.p_type, p.p_size, count(distinct ps.ps_suppkey) as supplier_cnt
                from partsupp ps
                inner join part p on p.p_partkey = ps.ps_partkey
                where p.p_brand <> 'Brand#45' and p.p_type not like 'MEDIUM POLISHED%'
                and p.p_size in (49, 14, 23, 45, 19, 3, 36, 9)
                and ps.ps_suppkey not in (select s.s_suppkey from supplier s
                                          where s.s_comment like '%Customer%Complaints%')
                group by p.p_brand, p.p_type, p.p_size
                order by supplier_cnt desc, p.p_brand, p.p_type, p.p_size"""
            # result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_17'
            print("Executing " + queryId + " ... ")
            query = """ select sum(l1.l_extendedprice) / 7.0 as avg_yearly
                from lineitem l1
                inner join part p on p.p_partkey = l1.l_partkey
                where p.p_brand = 'Brand#23' and p.p_container = 'MED BOX'
                and l1.l_quantity < (select 0.2 * avg(l2.l_quantity) from lineitem l2
                                     where l2.l_partkey = p.p_partkey)"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_18'
            print("Executing " + queryId + " ... ")
            query = """ select c.c_name, c.c_custkey, o.o_orderkey, o.o_orderdate, o.o_totalprice, sum(l.l_quantity)
                from customer c
                inner join orders o on c.c_custkey = o.o_custkey
                inner join lineitem l on o.o_orderkey = l.l_orderkey
                where o.o_orderkey in (select l2.l_orderkey from lineitem l2
                                       group by l2.l_orderkey having sum(l2.l_quantity) > 300)
                group by c.c_name, c.c_custkey, o.o_orderkey, o.o_orderdate, o.o_totalprice
                order by o.o_totalprice desc, o.o_orderdate"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_19'
            print("Executing " + queryId + " ... ")
            query = """ select sum(l.l_extendedprice * (1 - l.l_discount)) as revenue
                from lineitem as l
                INNER JOIN part as p ON l.l_partkey = p.p_partkey
                where
                ( p.p_brand = 'Brand#12'
                  and p.p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
                  and l.l_quantity >= 1 and l.l_quantity <= 11 and p.p_size between 1 and 5
                  and l.l_shipmode in ('AIR', 'AIR REG') and l.l_shipinstruct = 'DELIVER IN PERSON' )
                or
                ( p.p_brand = 'Brand#23'
                  and p.p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
                  and l.l_quantity >= 10 and l.l_quantity <= 20 and p.p_size between 1 and 10
                  and l.l_shipmode in ('AIR', 'AIR REG') and l.l_shipinstruct = 'DELIVER IN PERSON' )
                or
                ( p.p_brand = 'Brand#34'
                  and p.p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
                  and l.l_quantity >= 20 and l.l_quantity <= 30 and p.p_size between 1 and 15
                  and l.l_shipmode in ('AIR', 'AIR REG') and l.l_shipinstruct = 'DELIVER IN PERSON' )"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_20'
            print("Executing " + queryId + " ... ")
            query = """ select s.s_name, s.s_address
                from supplier s
                inner join nation n on s.s_nationkey = n.n_nationkey
                where s.s_suppkey in (
                    select ps.ps_suppkey from partsupp ps
                    where ps.ps_partkey in (select p.p_partkey from part p where p.p_name like 'forest%')
                    and ps_availqty > (select 0.5 * sum(l.l_quantity) from lineitem l
                                       where l.l_partkey = ps.ps_partkey and l.l_suppkey = ps.ps_suppkey
                                       and l.l_shipdate >= date '1994-01-01' and l.l_shipdate < date '1995-01-01'))
                and n.n_name = 'CANADA'
                order by s.s_name"""
            result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_21'
            print("Executing " + queryId + " ... ")
            query = """ select s_name, count(*) as numwait
                from supplier
                inner join lineitem l1 on s_suppkey = l1.l_suppkey
                inner join orders on o_orderkey = l1.l_orderkey
                inner join nation on s_nationkey = n_nationkey
                where o_orderstatus = 'F'
                and l1.l_receiptdate > l1.l_commitdate
                and exists (select * from lineitem l2
                            where l2.l_orderkey = l1.l_orderkey and l2.l_suppkey <> l1.l_suppkey)
                and not exists (select * from lineitem l3
                                where l3.l_orderkey = l1.l_orderkey and l3.l_suppkey <> l1.l_suppkey
                                and l3.l_receiptdate > l3.l_commitdate)
                and n_name = ' SAUDI ARABIA'
                group by s_name
                order by numwait desc, s_name"""
            result = run_query(bc, spark, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)
            log_dict[queryId] = result

            queryId = 'TEST_22'
            # print("Executing " + queryId + " ... ")
            query = """ select cntrycode, count(*) as numcust, sum(c_acctbal) as totacctbal
                from (select substring(c_phone from 1 for 2) as cntrycode, c_acctbal
                      from customer
                      where substring(c_phone from 1 for 2) in ('13','31','23','29','30','18','17')
                      and c_acctbal > (select avg(c_acctbal) from customer
                                       where c_acctbal > 0.00
                                       and substring (c_phone from 1 for 2) in ('13','31','23','29','30','18','17'))
                      and not exists (select * from orders where o_custkey = c_custkey)) as custsale
                group by cntrycode
                order by cntrycode"""
            # result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    executionGSTest(queryType)

    green = bcolors.OKGREEN
    endc = bcolors.ENDC
    print(green + "=======================================")
    print("SUMMARY TESTS")
    print("=======================================" + endc)
    for key, value in log_dict.items():
        print('"{}" : {} '.format(key, value))
    print(green + "=======================================" + endc)