def main(dask_client, drill, dir_data_file, bc, nRals):
    """E2E tests for queries over the BOOLEAN column of `bool_orders`.

    Runs filters, aggregations and (commented-out) joins involving the
    `o_confirmed` BOOLEAN column and validates each result set against
    Apache Drill via `runTest.run_query`.

    NOTE(review): `queryType` is read throughout but never assigned in this
    function — presumably a module-level constant in the original file;
    confirm before running this chunk standalone.
    """
    # Snapshot GPU memory so usage across the whole run can be logged at the end.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        # Tables needed by the queries below; `bool_orders` carries the
        # BOOLEAN `o_confirmed` column under test.
        tables = ["nation", "bool_orders", "region", "customer"]
        bool_orders_index = tables.index("bool_orders")
        data_types = [DataType.CSV]  # TODO ORC gdf parquet json

        # Create Tables ------------------------------------------------------------------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            # NOTE(review): `bool_orders_index` kwarg presumably tells the
            # helper which table needs special (boolean) handling — verify
            # against `cs.create_tables`.
            cs.create_tables(bc, dir_data_file, fileSchemaType,
                             tables=tables,
                             bool_orders_index=bool_orders_index)

            # Set up util params -------------------------------------------------------------------------------------------------------
            worder = 1                  # order result sets before comparing
            use_percentage = False
            non_aceptable_diff = 0      # exact match required
            min_aceptable_diff = 0.01   # small numeric tolerance allowed

            print('==============================')
            print(queryType)
            print('==============================')

            # queryId = 'TEST_00'
            # query = "select 1 from nation"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, '', non_aceptable_diff, use_percentage, fileSchemaType)
            #
            # queryId = 'TEST_0A'
            # query = "select *, 2 from nation"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, '', non_aceptable_diff, use_percentage, fileSchemaType)
            #

            # Run Queries -------------------------------------------------------------------------------------------------------------
            queryId = 'TEST_01'
            query = "select * from bool_orders order by o_orderkey, o_custkey limit 300"
            runTest.run_query(bc, drill, query, queryId, queryType,
                              0, '', min_aceptable_diff,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_02'
            query = "select o_custkey, o_confirmed from bool_orders where o_confirmed is null limit 30"
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', non_aceptable_diff,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_03'
            query = """select o_custkey, 0.75 * o_totalprice, o_confirmed
                    from bool_orders where o_confirmed = true
                    order by o_custkey, o_orderKey limit 20"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              0, '', min_aceptable_diff,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_04'
            query = """select o_custkey, o_confirmed from bool_orders
                    where o_confirmed is not NULL
                    order by o_custkey, o_confirmed desc limit 25"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              0, '', non_aceptable_diff,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_05'
            query = "select o_custkey, 0.95 * o_totalprice, o_confirmed from bool_orders where o_confirmed is null"
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', min_aceptable_diff,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_06'
            query = "select o_custkey, 0.75 * o_totalprice as proffit from bool_orders where o_custkey < 300 and o_confirmed = False"
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', min_aceptable_diff,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_07'
            query = """select o_custkey, 0.75 * o_totalprice as proffit
                    from bool_orders
                    where o_custkey < 300 and o_confirmed is not NULL
                    order by o_custkey, o_orderkey limit 200"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              0, '', min_aceptable_diff,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_08'
            # when count() and o_confirmed is null (o_confirmed is not null) are mixed then fails
            query = "select count(o_orderstatus) from bool_orders where o_orderstatus <> 'O' and o_confirmed = true"
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', non_aceptable_diff,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_09'
            query = "select sum(o_orderkey)/count(o_orderkey), max(o_totalprice) from bool_orders where o_confirmed = true and o_orderkey < 2500"
            runTest.run_query(
                bc, drill, query, queryId, queryType,
                worder, '', min_aceptable_diff,
                True, fileSchemaType)  # TODO: Change sum/count for avg KC

            queryId = 'TEST_10'
            query = "select sum(o_orderkey)/count(o_orderkey), max(o_totalprice) from bool_orders where o_confirmed IS NULL and o_orderkey < 2500"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, '', min_aceptable_diff, True) #TODO: Change sum/count for avg KC

            queryId = 'TEST_11'
            query = """select o_custkey, min(o_totalprice) from bool_orders
                    where o_custkey < 300 and o_confirmed is not NULL
                    group by o_custkey"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', min_aceptable_diff,
                              use_percentage, fileSchemaType)

            # queryId = 'TEST_12'
            # query = """select count(o_custkey), max(o_totalprice), min(o_totalprice), sum(o_totalprice)/count(o_totalprice)
            #         from bool_orders group by o_custkey"""
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, '', min_aceptable_diff, True) #TODO: Change sum/count for avg KC
            #
            # queryId = 'TEST_13'
            # query = """select count(o_custkey), o_orderkey, o_confirmed from bool_orders
            #         where o_orderkey < 100 and o_confirmed = FALSE group by o_orderkey, (o_orderkey + o_custkey), o_confirmed"""
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, "o_orderkey", non_aceptable_diff, use_percentage, fileSchemaType)
            #
            # queryId = 'TEST_14'
            # query = "select o.o_custkey, c.c_name as customer_name from bool_orders as o inner join customer as c on c.c_custkey = o.o_custkey where o.o_confirmed = False and o.o_orderstatus <> 'O'"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, "o_custkey", non_aceptable_diff, use_percentage, fileSchemaType)
            #
            # queryId = 'TEST_15'
            # query = "select o.o_custkey, c.c_name, n.n_regionkey from nation as n inner join customer as c on n.n_nationkey = c.c_nationkey inner join bool_orders as o on c.c_custkey = o.o_custkey where o.o_confirmed = False and o.o_orderstatus <> 'O'"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, "o_custkey", non_aceptable_diff, use_percentage, fileSchemaType)
            #
            # queryId = 'TEST_16'
            # query = "select o.o_custkey, c.c_name, n.n_regionkey from customer as c inner join bool_orders as o on c.c_custkey = o.o_custkey inner join nation as n on n.n_nationkey = c.c_nationkey where o.o_confirmed = false and o.o_orderstatus <> 'O'"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, "o_custkey", non_aceptable_diff, use_percentage, fileSchemaType)
            #
            # queryId = 'TEST_17'
            # query = """select count(o_custkey), max(o_totalprice), min(o_totalprice), sum(o_totalprice)/count(o_totalprice)
            #         from bool_orders group by o_totalprice, o_custkey, o_orderkey order by o_totalprice, o_custkey, o_orderkey desc"""
            # runTest.run_query(bc, drill, query, queryId, queryType, 0, '', min_aceptable_diff, True) #TODO: Change sum/count for avg KC

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    # Report GPU memory delta for this test suite.
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    """E2E tests for the SQL ROUND() function (with and without precision,
    including negative precision and ROUND over aggregates), validated
    against Apache Drill.
    """
    # Snapshot GPU memory so usage across the whole run can be logged at the end.
    start_mem = gpuMemory.capture_gpu_memory_usage()
    queryType = "Round"

    def executionTest(queryType):
        tables = ["nation", "region", "customer", "orders", "lineitem"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0  # exact match required

            print("==============================")
            print(queryType + " Tests")
            print("==============================")

            # ROUND with no explicit precision.
            queryId = "TEST_01"
            query = """select ROUND(orders.o_orderkey), ROUND(orders.o_totalprice)
                    from customer left outer join orders
                    on customer.c_custkey = orders.o_custkey
                    where customer.c_nationkey = 3
                    and customer.c_custkey < 500"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # ROUND with positive and negative precision.
            queryId = "TEST_02"
            query = """select ROUND(orders.o_totalprice, 2), ROUND(orders.o_totalprice, -2)
                    from customer left outer join orders
                    on customer.c_custkey = orders.o_custkey
                    where customer.c_nationkey = 3
                    and customer.c_custkey < 500"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Explicit zero precision.
            queryId = "TEST_03"
            query = """select customer.c_custkey, orders.o_orderkey,
                    ROUND(orders.o_custkey,0)
                    from customer left outer join orders
                    on customer.c_custkey = orders.o_custkey
                    where customer.c_nationkey = 3
                    and customer.c_custkey < 500"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # ROUND nested inside an aggregate over a full outer join.
            queryId = "TEST_04"
            query = """select MAX(ROUND(n1.n_regionkey,3))
                    from nation as n1 full outer join nation as n2
                    on n1.n_nationkey = n2.n_nationkey + 6 """
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # WSM NEED TO REVISIT THIS
            queryId = "TEST_05"
            query = "select ROUND(AVG(o_totalprice)) from orders"
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest(queryType)

    # Report GPU memory delta for this test suite.
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    """E2E tests for the SQL SUBSTRING() function (in projections, WHERE
    predicates, concatenation and nested queries), validated against
    Apache Drill.
    """
    # Snapshot GPU memory so usage across the whole run can be logged at the end.
    start_mem = gpuMemory.capture_gpu_memory_usage()
    queryType = 'Substring'

    def executionTest():
        tables = ["partsupp", "customer", "nation"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET
        ]  # TODO json

        # Create Tables ------------------------------------------------------------------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query -----------------------------------------------------------------------------
            worder = 1  # Parameter to indicate if its necessary to order the resulsets before compare them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            # SUBSTRING over a CAST-ed numeric column.
            queryId = 'TEST_01'
            query = """select SUBSTRING(CAST(ps_partkey as VARCHAR),1,1), ps_availqty
                    from partsupp
                    where ps_availqty > 7000 and ps_supplycost > 700
                    order by ps_partkey, ps_availqty limit 50"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', acceptable_difference,
                              use_percentage, fileSchemaType)

            # SUBSTRING inside the WHERE predicate.
            queryId = 'TEST_02'
            query = """select c_custkey, c_name from customer
                    where SUBSTRING(c_name,1,17) = 'Customer#00000000'"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', acceptable_difference,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_03'
            query = """select c_custkey, SUBSTRING(c_name, 1, 8) from customer
                    where c_name = 'Customer#000000009'"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', acceptable_difference,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_04'
            query = """select * from nation where SUBSTRING(n_name,1,1) = 'I'"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', acceptable_difference,
                              use_percentage, fileSchemaType)

            # Multiple SUBSTRING projections with varying start/length.
            queryId = 'TEST_05'
            query = """select c_custkey, c_name, SUBSTRING(c_name,1,1),
                    SUBSTRING(c_name,2,1), SUBSTRING(c_name,1,2),
                    SUBSTRING(c_name,2,2)
                    from customer where c_custkey < 20"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', acceptable_difference,
                              use_percentage, fileSchemaType)

            # SUBSTRING result cast back to INT and used in arithmetic.
            queryId = 'TEST_06'
            query = """select c.c_custkey, SUBSTRING(c.c_name, 10, 18),
                    CAST(SUBSTRING(c.c_name, 10, 18) as INT),
                    CAST(SUBSTRING(c.c_name, 10, 18) as INT) + 1
                    from customer c where c.c_custkey < 50"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', acceptable_difference,
                              use_percentage, fileSchemaType)

            # String concatenation on a SUBSTRING over an empty result set
            # (c_custkey < 0 matches no rows).
            queryId = 'TEST_07'
            query = """select c.c_custkey, SUBSTRING(c.c_name, 1, 8),
                    SUBSTRING(c.c_name, 10, 18) || '**'
                    from customer c where c.c_custkey < 0"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', acceptable_difference,
                              use_percentage, fileSchemaType)

            # SUBSTRING applied to the alias of an inner query.
            queryId = 'TEST_08'
            query = """select * from (
                        select c.c_custkey, SUBSTRING(c.c_name, 1, 8) as n1,
                        SUBSTRING(c.c_name, 10, 18) || '**' as n2
                        from customer c where c.c_custkey < 50
                    ) as n where SUBSTRING(n.n1, 1,7) = 'Customer'"""
            runTest.run_query(bc, drill, query, queryId, queryType,
                              worder, '', acceptable_difference,
                              use_percentage, fileSchemaType)

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    # Report GPU memory delta for this test suite.
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """E2E tests for literal/constant projections (numbers, strings, dates,
    timestamps) alongside table columns, validated mostly against Spark
    (with explicit `nulls last` ordering) and once against Drill.

    NOTE(review): `queryType` is read below but never assigned in this
    function — presumably a module-level constant in the original file.
    """
    # Snapshot GPU memory so usage across the whole run can be logged at the end.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["orders", "customer"]
        data_types = [DataType.ORC]  # TODO gdf csv parquet json

        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            # Integer literal projected next to a date column.
            queryId = "TEST_01"
            query = """select 2, o_orderdate from orders
                    order by o_orderdate asc limit 5"""
            query_spark = """select 2, o_orderdate from orders
                    order by o_orderdate nulls last limit 5"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "o_orderdate",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                query_spark=query_spark,
            )

            # String literal projection.
            queryId = "TEST_02"
            query = """select 'Rommel',c_name from customer
                    order by c_name limit 5"""
            query_spark = """select 'Rommel',c_name from customer
                    order by c_name nulls last limit 5"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "c_name",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                query_spark=query_spark,
            )

            # Date-like string literal projection.
            queryId = "TEST_03"
            query = """select '1990-01-01', c_custkey from customer
                    order by c_custkey limit 5"""
            query_spark = """select '1990-01-01', c_custkey from customer
                    order by c_custkey nulls last limit 5"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                query_spark=query_spark,
            )

            # Timestamp literal; NOTE(review): this one validates against
            # drill (not spark) and passes True for use_percentage — confirm
            # this asymmetry is intentional.
            queryId = "TEST_04"
            query = """select timestamp '1990-01-01 00:00:00', c_custkey
                    from customer order by c_custkey limit 5"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                True,
                fileSchemaType,
            )  # TODO: Change sum/count for avg KC

            # Aliased numeric constant.
            queryId = "TEST_05"
            query = """select 80000 as constant, c_custkey from customer
                    order by c_custkey limit 5"""
            query_spark = """select 80000 as constant, c_custkey from customer
                    order by c_custkey nulls last limit 5"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                query_spark=query_spark,
            )

            queryId = "TEST_06"
            # TODO: Blazing not support: 2+2
            query = """select 2+2, o_orderdate from orders
                    order by o_orderkey limit 5"""
            # runTest.run_query(bc, spark, query, queryId, queryType,
            #                   0, '', acceptable_difference, use_percentage, fileSchemaType)

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    # Report GPU memory delta for this test suite.
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """E2E tests for window functions with ROWS BETWEEN ... PRECEDING/FOLLOWING
    frame bounds (min/max/avg over ordered frames, including CTEs and
    multi-table joins), validated against Spark.

    NOTE(review): `queryType` is read below but never assigned in this
    function — presumably a module-level constant in the original file.
    """
    # Snapshot GPU memory so usage across the whole run can be logged at the end.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["orders", "nation", "lineitem", "customer", "region"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            # ------------ ROWS bounding ----------------

            # Symmetric 1-row frame around each row.
            queryId = "TEST_01"
            query = """select min(n_nationkey) over
                        (
                            order by n_name
                            ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
                        ) min_val,
                        n_nationkey, n_regionkey, n_name
                    from nation"""
            runTest.run_query(bc, spark, query, queryId, queryType,
                              0, "", acceptable_difference,
                              use_percentage, fileSchemaType)

            # Two window aggregates sharing the same asymmetric frame.
            queryId = "TEST_02"
            query = """select min(o_orderkey) over
                        (
                            order by o_totalprice
                            ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING
                        ) min_keys,
                        max(o_orderkey) over
                        (
                            order by o_totalprice
                            ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING
                        ) max_keys,
                        o_orderkey, o_orderpriority
                    from orders
                    where o_orderpriority <> '2-HIGH'
                    and o_clerk = 'Clerk#000000880'
                    and o_orderstatus is not null
                    and o_totalprice is not null
                    order by o_orderstatus, o_totalprice
                    limit 50"""
            runTest.run_query(bc, spark, query, queryId, queryType,
                              worder, "", acceptable_difference,
                              use_percentage, fileSchemaType)

            # Window aggregate over a CTE built from a join.
            queryId = "TEST_03"
            query = """with new_nation as (
                        select n.n_nationkey as n_natio1,
                            n.n_name as n_nam1,
                            n.n_regionkey as n_region1
                        from nation as n
                        inner join region as r
                        on n.n_nationkey = r.r_regionkey
                    )
                    select avg(cast(nn.n_natio1 as double)) over
                        (
                            order by nn.n_nam1
                            ROWS BETWEEN 3 PRECEDING AND 2 FOLLOWING
                        ) avg_keys,
                        nn.n_natio1, nn.n_nam1, nn.n_region1
                    from new_nation nn
                    order by nn.n_natio1, avg_keys"""
            runTest.run_query(bc, spark, query, queryId, queryType,
                              worder, "", acceptable_difference,
                              use_percentage, fileSchemaType)

            # Multi-key ordering inside the window frame.
            queryId = "TEST_04"
            query = """select max(l_partkey) over
                        (
                            order by l_extendedprice desc, l_orderkey, l_quantity
                            ROWS BETWEEN 6 PRECEDING AND 2 FOLLOWING
                        ) max_keys,
                        l_linestatus, l_extendedprice
                    from lineitem
                    where l_shipmode not in ('MAIL', 'SHIP', 'AIR')
                    and l_linestatus = 'F'
                    and l_extendedprice is not null
                    order by l_extendedprice, l_orderkey, max_keys
                    limit 50"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Window aggregates over a three-table join.
            queryId = "TEST_05"
            query = """select l.l_orderkey, l.l_linenumber, l.l_suppkey,
                        l.l_partkey, c.c_custkey,
                        max(l.l_partkey) over
                        (
                            order by l.l_orderkey, l.l_linenumber, l.l_suppkey
                            ROWS BETWEEN 6 PRECEDING AND 2 FOLLOWING
                        ) max_pkeys,
                        max(c.c_custkey) over
                        (
                            order by l.l_orderkey, l.l_linenumber, l.l_suppkey
                            ROWS BETWEEN 6 PRECEDING AND 2 FOLLOWING
                        ) max_cust
                    from lineitem as l
                    inner join orders as o on o.o_orderkey = l.l_orderkey
                    inner join customer as c on c.c_custkey = o.o_custkey
                    where l.l_quantity = 1 and c.c_custkey < 10000
                    order by l.l_orderkey, l.l_linenumber, l.l_partkey, c.c_custkey """
            runTest.run_query(bc, spark, query, queryId, queryType,
                              0, "", acceptable_difference,
                              use_percentage, fileSchemaType)

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    # Report GPU memory delta for this test suite.
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    """E2E tests for WHERE-clause predicates (comparison operators, AND/OR
    combinations, empty result sets, predicates in joins, and column-vs-
    expression comparisons), validated against Apache Drill.

    NOTE(review): `queryType` is read below but never assigned in this
    function — presumably a module-level constant in the original file.
    """
    # Snapshot GPU memory so usage across the whole run can be logged at the end.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["customer", "orders", "nation", "lineitem"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            print(fileSchemaType)
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            # Simple single-predicate filter.
            queryId = "TEST_01"
            query = """select c_custkey, c_nationkey, c_acctbal
                    from customer where c_custkey < 15"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # AND of two predicates.
            queryId = "TEST_02"
            query = """select c_custkey, c_nationkey, c_acctbal
                    from customer
                    where c_custkey < 150 and c_nationkey = 5"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Predicate that matches no rows (c_custkey < 0).
            queryId = "TEST_03"
            query = """select c_custkey, c_nationkey as nkey
                    from customer where c_custkey < 0"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Empty set AND-ed with another predicate.
            queryId = "TEST_04"
            query = """select c_custkey, c_nationkey as nkey
                    from customer
                    where c_custkey < 0 and c_nationkey >= 30"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Empty set OR-ed with a live predicate.
            queryId = "TEST_05"
            query = """select c_custkey, c_nationkey as nkey
                    from customer
                    where c_custkey < 0 or c_nationkey >= 24"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_06"
            query = """select c_custkey, c_nationkey as nkey
                    from customer
                    where c_custkey < 0 and c_nationkey >= 3"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Three AND-ed predicates.
            queryId = "TEST_07"
            query = """select c_custkey, c_nationkey, c_acctbal
                    from customer
                    where c_custkey < 150
                    and c_nationkey = 5 and c_acctbal > 600"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Three OR-ed predicates.
            queryId = "TEST_08"
            query = """select c_custkey, c_nationkey as nkey
                    from customer
                    where c_custkey > 0 or c_nationkey >= 24
                    or c_acctbal > 700"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Mixed AND/OR precedence.
            queryId = "TEST_09"
            query = """select c_custkey, c_nationkey, c_acctbal
                    from customer
                    where c_custkey < 150 and c_nationkey = 5
                    or c_custkey = 200 or c_nationkey >= 10
                    or c_acctbal <= 500"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Filter applied on top of an inner join, with parenthesized range.
            queryId = "TEST_10"
            query = """select c.c_custkey, c.c_nationkey, n.n_regionkey
                    from customer as c
                    inner join nation as n
                    on c.c_nationkey = n.n_nationkey
                    where n.n_regionkey = 1
                    and (c.c_custkey > 10 and c.c_custkey < 50)"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Open range on both sides.
            queryId = "TEST_11"
            query = """select * from nation
                    where n_regionkey > 2 and n_regionkey < 5"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Gives [RunExecuteGraph Error] std::exception and crashes with ORC
            # Related Issue: https://github.com/BlazingDB/blazingsql/issues/1324
            # queryId = "TEST_12"
            # query = """select c_custkey, c_nationkey as nkey from customer
            #         where -c_nationkey + c_acctbal > 750.3"""
            # runTest.run_query(
            #     bc,
            #     drill,
            #     query,
            #     queryId,
            #     queryType,
            #     worder,
            #     "c_custkey",
            #     acceptable_difference,
            #     use_percentage,
            #     fileSchemaType,
            # )

            # Gives [RunExecuteGraph Error] std::exception and crashes with ORC
            # Related Issue: https://github.com/BlazingDB/blazingsql/issues/1324
            # queryId = "TEST_13"
            # query = """select c_custkey, c_nationkey as nkey from customer
            #         where -c_nationkey + c_acctbal > 750"""
            # runTest.run_query(
            #     bc,
            #     drill,
            #     query,
            #     queryId,
            #     queryType,
            #     worder,
            #     "c_custkey",
            #     acceptable_difference,
            #     use_percentage,
            #     fileSchemaType,
            # )

            # Chain of <> exclusions bounded by a range.
            queryId = "TEST_14"
            query = """select c_custkey, c_name, c_acctbal
                    from customer
                    where c_custkey <> 10 and c_custkey <> 11
                    and c_custkey <> 100 and c_custkey <> 1000
                    and c_custkey < 1001"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # String inequality predicate.
            queryId = 'TEST_15'
            query = """select l_orderkey, l_partkey, l_suppkey, l_returnflag
                    from lineitem
                    where l_returnflag <> 'g packages.'"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # String equality combined with numeric ranges.
            queryId = "TEST_16"
            query = """select l_orderkey, l_partkey, l_suppkey, l_returnflag
                    from lineitem
                    where l_returnflag='N' and l_linenumber < 3
                    and l_orderkey < 50"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Column compared against a column expression.
            queryId = "TEST_17"
            query = """select c_custkey, c_nationkey, c_acctbal
                    from customer
                    where c_custkey < 15000 and c_nationkey = 5
                    or c_custkey = c_nationkey * c_nationkey
                    or c_nationkey >= 10 or c_acctbal <= 500"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    # Report GPU memory delta for this test suite.
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    """E2E tests for error-message validation: each query is deliberately
    invalid (unknown column, unknown table, unknown function, syntax error)
    and `run_query` is expected to match the engine's error text via the
    `message_validation` argument.

    NOTE(review): `queryType` is read below but never assigned in this
    function — presumably a module-level constant in the original file.
    """
    # Snapshot GPU memory so usage across the whole run can be logged at the end.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["customer", "orders", "nation"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            print(fileSchemaType)
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            # Unknown column name.
            queryId = "TEST_01"
            query = """select c_custkeynew, c_nationkey, c_acctbal
                    from customer where c_custkey < 15"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                message_validation="Column 'c_custkeynew' not found in any table",
            )

            # Unknown table name.
            queryId = "TEST_02"
            query = """select c_custkey, c_nationkey, c_acctbal
                    from customer1
                    where c_custkey < 150 and c_nationkey = 5"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                message_validation="Object 'customer1' not found",
            )

            # Unknown function name.
            queryId = "TEST_03"
            query = """select maxi(c_custkey), c_nationkey as nkey
                    from customer where c_custkey < 0"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                message_validation="No match found for function signature maxi(<NUMERIC>)",
            )

            # Syntax error (double alias); the expected message reproduces the
            # parser's multi-line SqlSyntaxException output.
            # NOTE(review): the exact internal line breaks of this expected
            # message were lost in the source mangling — reconstructed here to
            # match the Calcite-style error layout; verify against the engine.
            queryId = "TEST_04"
            query = """select max(c_custkey) c_nationkey as nkey
                    from customer where c_custkey < 0"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "c_custkey",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                message_validation="""SqlSyntaxException

select max(c_custkey) c_nationkey as nkey
                                  ^^
from customer where c_custkey < 0

Encountered "as" at line 1, column 35.
Was expecting one of:
    <EOF>
    "EXCEPT" ...
    "FETCH" ...
    "FROM" ...
    "INTERSECT" ...
    "LIMIT" ...
    "OFFSET" ...
    "ORDER" ...
    "MINUS" ...
    "UNION" ...
    "," ...""",
            )

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    # Report GPU memory delta for this test suite.
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    """Exercise queries nested over intermediate results that were modified
    with cudf operations (column arithmetic, drops, dtype casts) and then
    re-registered as BlazingSQL tables."""
    # TODO percy should be possible to make this test distributed
    if int(nRals) != 1:
        print(queryType + " will run only when nRals = 1")
        return

    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["customer", "nation"]
        # TODO parquet json
        data_types = [
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            def fetch(sql):
                # Run a raw BlazingSQL statement and collect its result frame.
                return get_results(nRals, bc.sql(sql))

            def fetch_main(sql):
                # Qualify a test query against the "main" schema, then run it.
                return fetch(runTest.get_blazingsql_query("main", sql))

            # Adding plus 3 to a column
            queryId = "TEST_01"
            frame = fetch_main(
                "select c_custkey, c_nationkey, c_acctbal from customer")
            frame["c_custkey"] = frame["c_custkey"] + 3
            bc.create_table("temp", frame)
            nested = fetch("select * from main.temp")
            runTest.run_query(
                bc,
                drill,
                """select c_custkey + 3, c_nationkey,
                    c_acctbal from customer""",
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                nested_query=True,
                blz_result=nested,
                print_result=True,
            )

            # Dropping a column
            queryId = "TEST_02"
            frame = fetch_main(
                "select c_custkey, c_nationkey, c_acctbal from customer")
            frame.drop(columns=["c_custkey"], inplace=True)
            bc.create_table("temp2", frame)
            nested = fetch("select * from main.temp2")
            runTest.run_query(
                bc,
                drill,
                "select c_nationkey, c_acctbal from customer",
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                nested_query=True,
                blz_result=nested,
                print_result=True,
            )

            # Adding plus 3 to a column and dropping another
            queryId = "TEST_03"
            frame = fetch_main(
                """select c_custkey, c_nationkey, c_acctbal
                    from customer where c_acctbal > 1000""")
            frame["c_acctbal"] = frame["c_acctbal"] + 3
            frame.drop(columns=["c_custkey"], inplace=True)
            bc.create_table("temp3", frame)
            nested = fetch("select * from main.temp3")
            runTest.run_query(
                bc,
                drill,
                (
                    """select c_nationkey, c_acctbal + 3
                        from customer where c_acctbal > 1000"""
                ),
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                nested_query=True,
                blz_result=nested,
                print_result=True,
            )

            # Reutilizing blazinsql results twice
            queryId = "TEST_04"
            frame = fetch_main(
                """select c_custkey, c_nationkey, c_acctbal
                    from customer where c_acctbal > 1000""")
            frame["c_acctbal_new"] = frame["c_acctbal"] + 3
            frame.drop(columns=["c_acctbal"], inplace=True)
            frame.drop(columns=["c_custkey"], inplace=True)
            bc.create_table("temp0", frame)
            second_frame = fetch("select * from main.temp0")
            second_frame.drop(columns=["c_nationkey"], inplace=True)
            bc.create_table("temp4", second_frame)
            nested = fetch("select * from main.temp4")
            runTest.run_query(
                bc,
                drill,
                """select c_acctbal + 3 as c_acctbal_new
                    from customer where c_acctbal > 1000""",
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                nested_query=True,
                blz_result=nested,
                print_result=True,
            )

            # Adding another column
            queryId = "TEST_05"
            frame = fetch_main(
                "select c_acctbal from customer where c_acctbal > 1000")
            frame["c_acctbal_new"] = frame["c_acctbal"] + 3
            bc.create_table("temp5", frame)
            nested = fetch("select * from main.temp5")
            runTest.run_query(
                bc,
                drill,
                """select c_acctbal, c_acctbal + 3 as c_acctbal_new
                    from customer where c_acctbal > 1000""",
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                nested_query=True,
                blz_result=nested,
                print_result=True,
            )

            # Left join against an (empty) intermediate result table
            queryId = "TEST_06"
            empty_frame = fetch(
                """select n_nationkey, n_regionkey from nation
                    where n_nationkey < 0""")
            bc.create_table("results", empty_frame)
            joined = fetch(
                """select n.n_nationkey, r.n_regionkey
                    from main.nation as n
                    left join main.results as r
                    on n.n_nationkey = r.n_nationkey""")
            runTest.run_query(
                bc,
                drill,
                """select n.n_nationkey, r.n_regionkey
                    from nation as n
                    left join (select n_nationkey, n_regionkey
                                from nation where n_nationkey < 0) as r
                    on n.n_nationkey = r.n_nationkey""",
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                nested_query=True,
                blz_result=joined,
            )

            queryId = "TEST_07"
            # Calling a cudf function that returns the same column
            frame = fetch("select n_nationkey, n_regionkey from main.nation")
            frame["n_nationkey"] = frame["n_nationkey"].astype("int32")
            bc.create_table("results_tmp", frame)
            nested = fetch("select * from main.results_tmp")
            runTest.run_query(
                bc,
                drill,
                "select n_nationkey, n_regionkey from nation",
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                nested_query=True,
                blz_result=nested,
                print_result=True,
            )

            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """Run the TPC-H query suite once per configuration-option set.

    The incoming context/client are discarded because every option set gets
    its own freshly initialized context from init_context(); results are
    validated against Drill, except ORC which is validated against Spark.
    """
    # Tear down the caller-supplied context: each config set builds its own.
    if dask_client is not None:
        dask_client.close()
        dask_client.shutdown()
        del dask_client
    del bc

    conf_opt_1 = {
        "JOIN_PARTITION_SIZE_THRESHOLD": 10,
        "MAX_DATA_LOAD_CONCAT_CACHE_BYTE_SIZE": 10,
        "MAX_KERNEL_RUN_THREADS": 5,
    }
    conf_opt_2 = {
        "JOIN_PARTITION_SIZE_THRESHOLD": 10,
        "MAX_DATA_LOAD_CONCAT_CACHE_BYTE_SIZE": 10,
        "MAX_KERNEL_RUN_THREADS": 1,
        "ENABLE_GENERAL_ENGINE_LOGS": False,
        "ENABLE_COMMS_LOGS": True,
        "ENABLE_TASK_LOGS": False,
        "ENABLE_OTHER_ENGINE_LOGS": True,
    }
    conf_opt_3 = {
        "MAX_NUM_ORDER_BY_PARTITIONS_PER_NODE": 1,
        "NUM_BYTES_PER_ORDER_BY_PARTITION": 10,
    }
    conf_opt_4 = {
        "MAX_ORDER_BY_SAMPLES_PER_NODE": 5,
        "MAX_SEND_MESSAGE_THREADS": 1,
        "TRANSPORT_BUFFER_BYTE_SIZE": 10000,
        "TRANSPORT_POOL_NUM_BUFFERS": 10000,
        "ENABLE_GENERAL_ENGINE_LOGS": False,
        "ENABLE_COMMS_LOGS": False,
        "ENABLE_TASK_LOGS": True,
        "ENABLE_OTHER_ENGINE_LOGS": True,
    }
    conf_opt_5 = {
        "MAX_ORDER_BY_SAMPLES_PER_NODE": 5000000,
        "MAX_SEND_MESSAGE_THREADS": 200,
        "TRANSPORT_BUFFER_BYTE_SIZE": 10000,
        "TRANSPORT_POOL_NUM_BUFFERS": 10,
    }
    conf_opt_6 = {
        "MAX_ORDER_BY_SAMPLES_PER_NODE": 5,
        "MAX_SEND_MESSAGE_THREADS": 200,
        "TRANSPORT_BUFFER_BYTE_SIZE": 100000000,
        "TRANSPORT_POOL_NUM_BUFFERS": 10,
        "ENABLE_GENERAL_ENGINE_LOGS": True,
        "ENABLE_COMMS_LOGS": True,
        "ENABLE_TASK_LOGS": False,
        "ENABLE_OTHER_ENGINE_LOGS": False,
    }
    conf_opt_7 = {
        "MAX_ORDER_BY_SAMPLES_PER_NODE": 5000000,
        "MAX_SEND_MESSAGE_THREADS": 1,
        "TRANSPORT_BUFFER_BYTE_SIZE": 100000000,
        "TRANSPORT_POOL_NUM_BUFFERS": 100,
    }
    conf_opt_8 = {
        "BLAZING_DEVICE_MEM_CONSUMPTION_THRESHOLD": 0.0001,
        "MEMORY_MONITOR_PERIOD": 5000000,
        "ENABLE_GENERAL_ENGINE_LOGS": True,
        "ENABLE_COMMS_LOGS": False,
        "ENABLE_TASK_LOGS": True,
        "ENABLE_OTHER_ENGINE_LOGS": False,
    }

    all_set_list = [
        conf_opt_1,
        conf_opt_2,
        conf_opt_3,
        conf_opt_4,
        conf_opt_5,
        conf_opt_6,
        conf_opt_7,
        conf_opt_8,
    ]

    start_mem = gpuMemory.capture_gpu_memory_usage()
    queryType = "Config Options"

    def executionTest(queryType, setInd, config_options):
        """Build a context with config_options and run the full TPC-H suite."""
        bc, dask_client = init_context(config_options)

        tables = [
            "nation",
            "region",
            "customer",
            "lineitem",
            "orders",
            "supplier",
            "part",
            "partsupp",
        ]
        # TODO orc, json
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.PARQUET,
        ]

        # Create Tables ------------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            worder = 1  # order resultsets before comparing them
            use_percentage = False
            acceptable_difference = 0.001

            print("==============================")
            print(queryType)
            print("Test set: " + str(setInd + 1)
                  + " Options: " + str(config_options))
            print("==============================")

            def run_tpch(queryId, engine, diff, pct):
                # Fetch TPC-H query `queryId` and validate it against `engine`.
                query = tpch.get_tpch_query(queryId)
                runTest.run_query(
                    bc,
                    engine,
                    query,
                    queryId,
                    queryType,
                    worder,
                    "",
                    diff,
                    pct,
                    fileSchemaType,
                )

            def run_tpch_split(queryId, drill_pct=False, spark_pct=False):
                # ORC results are validated against Spark; every other
                # format is validated against Drill.
                if fileSchemaType == DataType.ORC:
                    run_tpch(queryId, spark, acceptable_difference, spark_pct)
                else:
                    run_tpch(queryId, drill, acceptable_difference, drill_pct)

            run_tpch("TEST_01", drill, acceptable_difference, use_percentage)
            run_tpch("TEST_02", drill, acceptable_difference, use_percentage)
            run_tpch("TEST_03", drill, 0.1, use_percentage)
            run_tpch_split("TEST_04", drill_pct=True)
            run_tpch_split("TEST_05")
            run_tpch("TEST_06", spark, acceptable_difference, True)
            run_tpch_split("TEST_07")
            run_tpch_split("TEST_08")
            run_tpch_split("TEST_09", drill_pct=True, spark_pct=True)
            run_tpch_split("TEST_10")
            # TEST_11 is fetched but intentionally disabled (was commented).
            tpch.get_tpch_query("TEST_11")
            run_tpch_split("TEST_12")
            run_tpch("TEST_13", drill, acceptable_difference, use_percentage)
            run_tpch_split("TEST_14")
            run_tpch("TEST_15", spark, acceptable_difference, use_percentage)
            run_tpch("TEST_16", drill, acceptable_difference, use_percentage)
            run_tpch("TEST_17", spark, acceptable_difference, use_percentage)
            run_tpch_split("TEST_18")
            run_tpch("TEST_19", drill, acceptable_difference, True)
            run_tpch("TEST_20", spark, acceptable_difference, True)
            run_tpch("TEST_21", spark, acceptable_difference, use_percentage)
            # TEST_22 is fetched but intentionally disabled (was commented).
            tpch.get_tpch_query("TEST_22")

        # Tear down this set's context so the next set starts clean.
        if dask_client is not None:
            dask_client.run(gc.collect)
            dask_client.run_on_scheduler(gc.collect)
            dask_client.close()
            dask_client.shutdown()
            del dask_client
        del bc

    for setInd, config_options in enumerate(all_set_list):
        executionTest(queryType, setInd, config_options)
        gc.collect()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """Run the standard TPC-H query suite (TEST_01..TEST_22).

    Each query result is validated against Drill, except where Drill is
    known to disagree (certain queries, and ORC inputs), which are
    validated against Spark instead.
    """
    start_mem = gpuMemory.capture_gpu_memory_usage()
    queryType = "TPCH Queries"

    def executionTest(queryType):
        tables = [
            "nation",
            "region",
            "customer",
            "lineitem",
            "orders",
            "supplier",
            "part",
            "partsupp",
        ]
        # TODO orc, json
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.PARQUET,
        ]

        # Create Tables ------------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            worder = 1  # order resultsets before comparing them
            use_percentage = False
            acceptable_difference = 0.001

            print("==============================")
            print(queryType)
            print("==============================")

            def run_tpch(queryId, engine, diff, pct):
                # Fetch TPC-H query `queryId` and validate it against `engine`.
                query = tpch.get_tpch_query(queryId)
                runTest.run_query(
                    bc,
                    engine,
                    query,
                    queryId,
                    queryType,
                    worder,
                    "",
                    diff,
                    pct,
                    fileSchemaType,
                )

            def run_tpch_split(queryId, drill_pct=False, spark_pct=False):
                # ORC results are validated against Spark; every other
                # format is validated against Drill.
                if fileSchemaType == DataType.ORC:
                    run_tpch(queryId, spark, acceptable_difference, spark_pct)
                else:
                    run_tpch(queryId, drill, acceptable_difference, drill_pct)

            run_tpch("TEST_01", drill, acceptable_difference, use_percentage)
            run_tpch("TEST_02", drill, acceptable_difference, use_percentage)
            run_tpch("TEST_03", drill, 0.1, use_percentage)
            run_tpch_split("TEST_04", drill_pct=True)
            run_tpch_split("TEST_05")
            run_tpch("TEST_06", spark, acceptable_difference, True)
            run_tpch_split("TEST_07")
            run_tpch_split("TEST_08")
            run_tpch_split("TEST_09", drill_pct=True, spark_pct=True)
            run_tpch_split("TEST_10")
            # TEST_11 is fetched but intentionally disabled (was commented).
            tpch.get_tpch_query("TEST_11")
            run_tpch_split("TEST_12")
            run_tpch("TEST_13", drill, acceptable_difference, use_percentage)
            run_tpch_split("TEST_14")
            run_tpch("TEST_15", spark, acceptable_difference, use_percentage)
            run_tpch("TEST_16", drill, acceptable_difference, use_percentage)
            run_tpch("TEST_17", spark, acceptable_difference, use_percentage)
            run_tpch_split("TEST_18")
            run_tpch("TEST_19", drill, acceptable_difference, True)
            # TEST_20 and TEST_21 are intentionally disabled (were commented).
            # TEST_22 is fetched but intentionally disabled (was commented).
            tpch.get_tpch_query("TEST_22")

    executionTest(queryType)
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """Run aggregation, CTE, and string-function queries (TRIM, REPLACE,
    REVERSE, REGEXP_REPLACE, CHAR_LENGTH, ...) and validate against Drill
    or Spark depending on each engine's known quirks."""
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["customer", "orders", "nation", "region"]
        # TODO json
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            def validate(queryId, query, engine, ordered=worder, **extra):
                # Run one test query against the chosen validation engine.
                runTest.run_query(
                    bc,
                    engine,
                    query,
                    queryId,
                    queryType,
                    ordered,
                    "",
                    acceptable_difference,
                    use_percentage,
                    fileSchemaType,
                    **extra,
                )

            # Spark here because Drill outputs some inf's instead of NaN
            validate(
                "TEST_01",
                """select o_orderkey, sum(o_totalprice)/count(o_orderstatus)
                    from orders
                    where o_custkey < 100
                    group by o_orderstatus, o_orderkey""",
                spark,
            )

            validate(
                "TEST_02",
                """select o_orderkey, o_orderstatus from orders
                    where o_custkey < 10 and o_orderstatus <> 'O'
                    order by o_orderkey, o_orderstatus limit 50""",
                drill,
            )

            validate(
                "TEST_03",
                """select count(o_orderstatus)
                    from orders where o_orderstatus <> 'O'""",
                drill,
            )

            validate(
                "TEST_04",
                """select count(o_orderkey), sum(o_orderkey), o_clerk
                    from orders where o_custkey < 1000
                    group by o_clerk, o_orderstatus""",
                drill,
            )

            validate(
                "TEST_05",
                """select avg(CAST(o_orderkey AS DOUBLE))
                    from orders group by o_orderstatus""",
                drill,
            )

            validate(
                "TEST_06",
                """select count(o_shippriority), sum(o_totalprice)
                    from orders group by o_shippriority""",
                drill,
            )

            validate(
                "TEST_07",
                """with regionTemp as (
                        select r_regionkey, r_name
                        from region where r_regionkey > 2
                    ), nationTemp as (
                        select n_nationkey, n_regionkey as fkey, n_name
                        from nation where n_nationkey > 3
                        order by n_nationkey
                    )
                    select regionTemp.r_name, nationTemp.n_name
                    from regionTemp inner join nationTemp
                    on regionTemp.r_regionkey = nationTemp.fkey""",
                drill,
            )

            validate(
                "TEST_08",
                """select c_custkey, CHAR_LENGTH(c_comment)
                    from customer
                    where MOD(CHAR_LENGTH(c_comment), 7) = 0""",
                spark,
            )

            validate(
                "TEST_09",
                "select sum(CHAR_LENGTH(c_comment)) from customer",
                spark,
            )

            validate(
                "TEST_10",
                """SELECT REPLACE(c_comment, 'in', '') as empty_replace,
                        REPLACE(c_comment, 'the', '&&') as and_replace,
                        REPLACE(c_comment, 'a', '$e*u') as a_replace
                    FROM customer """,
                spark,
            )

            validate(
                "TEST_11",
                """SELECT REPLACE(SUBSTRING(c_comment, 2, 10), 'e', '&&')
                            as rep_sub,
                        CAST(REPLACE(c_comment, 'a', 'e') LIKE '%the%' AS INT)
                            as rep_like,
                        SUBSTRING(REPLACE(c_comment, 'e', '&#'), 2, 30)
                            as sub_rep
                    FROM customer """,
                spark,
            )

            validate(
                "TEST_12",
                """SELECT TRIM(c_comment) as both_space_trim,
                        TRIM(LEADING 'Cu' FROM c_name) as leading_char_trim,
                        TRIM(TRAILING 'E' FROM c_mktsegment)
                            as trailing_char_trim,
                        LTRIM(c_comment) as left_trim,
                        RTRIM(c_comment) as right_trim
                    FROM customer """,
                spark,
            )

            validate(
                "TEST_13",
                """SELECT TRIM(TRAILING 'er' FROM SUBSTRING(c_name, 0, 7))
                            as trim_subs,
                        TRIM(LEADING 'CU' FROM UPPER(c_name)) as trim_upper,
                        LOWER(TRIM('AE' FROM c_mktsegment)) as lower_trim,
                        LTRIM(REPLACE(c_name, 'Customer', ' Test'))
                            as ltrim_replace,
                        CAST(RTRIM(c_comment) LIKE '%the%' AS INT)
                            as rtrim_like
                    FROM customer """,
                spark,
            )

            validate(
                "TEST_14",
                """SELECT REVERSE(c_comment) as reversed_c,
                        SUBSTRING(REVERSE(c_comment), 2, 10)
                            as sub_reversed_c,
                        CAST(SUBSTRING(REVERSE(c_comment), 2, 10)
                            LIKE '%or%' AS INT) as cast_sub_reversed_c
                    FROM customer """,
                spark,
            )

            validate(
                "TEST_15",
                """SELECT COUNT(SUBSTRING(REVERSE(c_comment), 2, 10))
                            as reverse_subbed_count
                    FROM customer
                    GROUP BY SUBSTRING(REVERSE(c_comment), 2, 10) """,
                spark,
            )

            validate(
                "TEST_16",
                """SELECT REGEXP_REPLACE(c_comment, 'a', 'the') as rep_a,
                        REGEXP_REPLACE(c_comment, 'e|a|i|o|u', 'Z')
                            as vowel_z,
                        CHAR_LENGTH(REGEXP_REPLACE(c_comment, '[a-z]+', 'L'))
                            char_len_rep,
                        REGEXP_REPLACE(c_comment, '[a-z]+(.+)', 'Z')
                            as capture_rep
                    FROM customer """,
                spark,
            )

            # BlazingSQL and Spark spell backreferences/positions differently,
            # so the same intent is expressed with two distinct queries.
            bsql_form = """SELECT REGEXP_REPLACE(c_comment,
                        '[a-zA-Z]{2}([a-z]{2})([a-z]{2})', '\\2--\\1')
                            as backref,
                        REGEXP_REPLACE(c_comment,
                        '[a-zA-Z]{2}([a-z]{1})([a-z]{1})', '\\2--\\1', 4)
                            as backref_pos,
                        REGEXP_REPLACE(c_comment, 'e|a|i|o|u', 'Z', 4)
                            as vowel_pos,
                        REGEXP_REPLACE(c_comment, '[a-z]+(.+)', 'Z', 10)
                            as capture_pos
                    FROM customer """
            spark_form = """SELECT REGEXP_REPLACE(c_comment,
                        '[a-zA-Z]{2}([a-z]{2})([a-z]{2})', '$2--$1')
                            as backref,
                        CONCAT( SUBSTRING(c_comment, 0, 3),
                            REGEXP_REPLACE(SUBSTRING(c_comment, 4),
                            '[a-zA-Z]{2}([a-z]{1})([a-z]{1})', '$2--$1') )
                            as backref_pos,
                        CONCAT(SUBSTRING(c_comment, 0, 3),
                            REGEXP_REPLACE(SUBSTRING(c_comment, 4),
                            'e|a|i|o|u', 'Z')) as vowel_pos,
                        CONCAT(SUBSTRING(c_comment, 0, 9),
                            REGEXP_REPLACE(SUBSTRING(c_comment, 10),
                            '[a-z]+(.+)', 'Z')) as capture_pos
                    FROM customer """
            validate("TEST_17", bsql_form, spark, query_spark=spark_form)

            validate(
                "TEST_18",
                """select c_custkey, c_nationkey, c_name from customer
                    where c_custkey > 300 and c_custkey < 600
                    order by c_nationkey, c_custkey""",
                drill,
                ordered=0,
            )

            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_lc, bc, nRals):
    """Run TPCH validation queries against Drill, each paired with the
    Calcite relational-algebra plan expected from BlazingSQL.

    Only TEST_03 is currently executed; the remaining ``run_query`` calls
    are commented out (see the per-test TODOs).  NOTE(review): this suite
    logs memory under the module-level ``testName`` rather than a local
    ``queryType`` — presumably because queryType here varies per file
    format; confirm against the runner.
    """
    # GPU memory before the suite; the delta is logged on exit.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        # Read Data TPCH------------------------------------------------------
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        for fileSchemaType in data_types:
            # Per-file-format label derived from the module-level testName.
            queryType = test_name(testName, fileSchemaType)
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_lc, fileSchemaType)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select count(c_custkey) as c1, count(c_acctbal) as c2
                    from customer"""
            # Expected plan: global aggregate over a column-pruned scan.
            algebra = """LogicalAggregate(group=[{}], c1=[COUNT($0)], c2=[COUNT($1)])
  BindableTableScan(table=[[main, customer]], projects=[[0, 5]], aliases=[[c_custkey, c_acctbal]])"""
            # runTest.run_query(bc, drill, query, queryId, queryType, worder,
            #     '', acceptable_difference, use_percentage, algebra=algebra)

            queryId = "TEST_02"
            query = """select count(c_custkey), sum(c_acctbal),
                    sum(c_acctbal)/count(c_acctbal), min(c_custkey),
                    max(c_nationkey), (max(c_nationkey) + min(c_nationkey))/2
                    c_nationkey from customer
                    where c_custkey < 100 group by c_nationkey"""
            algebra = """LogicalProject(EXPR$0=[$1], EXPR$1=[$2], EXPR$2=[/($2, $3)], EXPR$3=[$4], EXPR$4=[$5], c_nationkey=[/(+($5, $6), 2)])
  LogicalAggregate(group=[{0}], EXPR$0=[COUNT($1)], EXPR$1=[SUM($2)], agg#2=[COUNT($2)], EXPR$3=[MIN($1)], EXPR$4=[MAX($0)], agg#5=[MIN($0)])
    LogicalProject(c_nationkey=[$1], c_custkey=[$0], c_acctbal=[$2])
      BindableTableScan(table=[[main, customer]], filters=[[<($0, 100)]], projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"""
            # TODO: Change sum/count for avg KC
            # runTest.run_query(bc, drill, query, queryId, queryType, worder,
            #     '', acceptable_difference, True, algebra=algebra)

            queryId = "TEST_03"
            query = """select n1.n_nationkey as supp_nation,
                    n2.n_nationkey as cust_nation,
                    l.l_extendedprice * l.l_discount
                    from supplier as s
                    inner join lineitem as l on s.s_suppkey = l.l_suppkey
                    inner join orders as o on o.o_orderkey = l.l_orderkey
                    inner join customer as c on c.c_custkey = o.o_custkey
                    inner join nation as n1 on s.s_nationkey = n1.n_nationkey
                    inner join nation as n2 on c.c_nationkey = n2.n_nationkey
                    where n1.n_nationkey = 1
                    and n2.n_nationkey = 2
                    and o.o_orderkey < 10000"""
            # Expected plan: five-way inner join; the nation and orders
            # filters are pushed down into their BindableTableScans.
            algebra = """LogicalProject(supp_nation=[$1], cust_nation=[$3], EXPR$2=[$2])
  LogicalJoin(condition=[=($0, $3)], joinType=[inner])
    LogicalProject(c_nationkey=[$1], n_nationkey=[$3], *=[$2])
      LogicalJoin(condition=[=($0, $3)], joinType=[inner])
        LogicalProject(s_nationkey=[$0], c_nationkey=[$4], *=[$2])
          LogicalJoin(condition=[=($3, $1)], joinType=[inner])
            LogicalProject(s_nationkey=[$0], o_custkey=[$4], *=[$2])
              LogicalJoin(condition=[=($3, $1)], joinType=[inner])
                LogicalProject(s_nationkey=[$1], l_orderkey=[$2], *=[$4])
                  LogicalJoin(condition=[=($0, $3)], joinType=[inner])
                    BindableTableScan(table=[[main, supplier]], projects=[[0, 3]], aliases=[[s_suppkey, s_nationkey]])
                    LogicalProject(l_orderkey=[$0], l_suppkey=[$1], *=[*($2, $3)])
                      BindableTableScan(table=[[main, lineitem]], projects=[[0, 2, 5, 6]], aliases=[[l_orderkey, l_suppkey, *]])
                BindableTableScan(table=[[main, orders]], filters=[[<($0, 10000)]], projects=[[0, 1]], aliases=[[o_orderkey, o_custkey]])
            BindableTableScan(table=[[main, customer]], projects=[[0, 3]], aliases=[[c_custkey, c_nationkey]])
        BindableTableScan(table=[[main, nation]], filters=[[=($0, 1)]], projects=[[0]], aliases=[[n_nationkey]])
    BindableTableScan(table=[[main, nation]], filters=[[=($0, 2)]], projects=[[0]], aliases=[[n_nationkey]])"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                algebra=algebra,
            )

            queryId = "TEST_04"
            query = """select count(n1.n_nationkey) as n1key,
                    count(n2.n_nationkey) as n2key,
                    count(n2.n_nationkey) as cstar
                    from nation as n1
                    full outer join nation as n2
                    on n1.n_nationkey = n2.n_nationkey + 6"""
            algebra = """LogicalProject(n1key=[$0], n2key=[$1], cstar=[$1])
  LogicalAggregate(group=[{}], n1key=[COUNT($0)], cstar=[COUNT($1)])
    LogicalProject(n_nationkey=[$0], n_nationkey0=[$1])
      LogicalJoin(condition=[=($0, $2)], joinType=[full])
        BindableTableScan(table=[[main, nation]], projects=[[0]], aliases=[[n_nationkey]])
        LogicalProject(n_nationkey=[$0], $f4=[+($0, 6)])
          BindableTableScan(table=[[main, nation]], projects=[[0]], aliases=[[n_nationkey, $f4]])"""
            # TODO: Change count(n2.n_nationkey) as cstar as count(*)
            # as cstar when it will be supported KC
            # runTest.run_query(bc, drill, query, queryId, queryType, worder,
            #     '', acceptable_difference, use_percentage, algebra = algebra)

            queryId = "TEST_05"
            query = """select count(o_orderkey), sum(o_orderkey), o_clerk
                    from orders where o_custkey < 1000
                    group by o_clerk, o_orderstatus"""
            algebra = """LogicalProject(EXPR$0=[$2], EXPR$1=[$3], o_clerk=[$0])
  LogicalAggregate(group=[{0, 1}], EXPR$0=[COUNT($2)], EXPR$1=[SUM($2)])
    LogicalProject(o_clerk=[$3], o_orderstatus=[$2], o_orderkey=[$0])
      BindableTableScan(table=[[main, orders]], filters=[[<($1, 1000)]], projects=[[0, 1, 2, 6]], aliases=[[o_orderkey, $f1, o_orderstatus, o_clerk]])"""
            # runTest.run_query(bc, drill, query, queryId, queryType, worder,
            #     '', acceptable_difference, True, algebra = algebra)

            queryId = "TEST_06"
            # TODO: Change sum/count for avg KC
            query = """select sum(o_orderkey)/count(o_orderkey)
                    from orders group by o_orderstatus"""
            algebra = """LogicalProject(EXPR$0=[/($1, $2)])
  LogicalAggregate(group=[{0}], agg#0=[SUM($1)], agg#1=[COUNT($1)])
    BindableTableScan(table=[[main, orders]], projects=[[2, 0]], aliases=[[o_orderstatus, o_orderkey]])"""
            # runTest.run_query(bc, drill, query, queryId, queryType, worder,
            #     '', acceptable_difference, True, algebra = algebra)

            queryId = "TEST_07"
            query = """with regionTemp as (
                    select r_regionkey, r_name from region
                    where r_regionkey > 2
                ), nationTemp as (
                    select n_nationkey, n_regionkey as fkey, n_name
                    from nation where n_nationkey > 3 order by n_nationkey
                )
                select regionTemp.r_name, nationTemp.n_name
                from regionTemp inner join nationTemp
                on regionTemp.r_regionkey = nationTemp.fkey"""
            algebra = """LogicalProject(r_name=[$1], n_name=[$3])
  LogicalJoin(condition=[=($0, $2)], joinType=[inner])
    BindableTableScan(table=[[main, region]], filters=[[>($0, 2)]], projects=[[0, 1]], aliases=[[r_regionkey, r_name]])
    LogicalProject(fkey=[$2], n_name=[$1])
      BindableTableScan(table=[[main, nation]], filters=[[>($0, 3)]], projects=[[0, 1, 2]], aliases=[[n_nationkey, n_name, fkey]])"""
            # runTest.run_query(bc, drill, query, queryId, queryType, worder,
            #     '', acceptable_difference, use_percentage, algebra = algebra)

            # In generator mode a single file format is enough.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    # Logged under the module-level testName (not a local queryType).
    gpuMemory.log_memory_usage(testName, start_mem, end_mem)
def main(dask_client, drill, dir_data_lc, bc, nRals):
    """File System S3 suite: register an S3 bucket as a BlazingSQL
    filesystem, create the TPCH tables from it, and validate a battery of
    queries against Drill for each supported file format.

    Fixes in this revision:
      * TEST_01 called ``query(...)`` — i.e. invoked the SQL *string* —
        which raises TypeError; it now goes through ``runTest.run_query``
        like every other case.
      * TEST_19's SQL read ``rom orders`` (invalid SQL); corrected to
        ``from orders`` to match the same query in the algebra suite.
    """
    start_mem = gpuMemory.capture_gpu_memory_usage()
    queryType = "File System S3"

    def executionTest(queryType):
        # Read Data TPCH------------------------------------------------------
        authority = "tpch_s3"
        awsS3BucketName = Settings.data["TestSettings"]["awsS3BucketName"]
        awsS3AccessKeyId = Settings.data["TestSettings"]["awsS3AccessKeyId"]
        awsS3SecretKey = Settings.data["TestSettings"]["awsS3SecretKey"]
        # Register the bucket as the "tpch_s3" filesystem namespace.
        bc.s3(
            authority,
            bucket_name=awsS3BucketName,
            encryption_type=S3EncryptionType.NONE,
            access_key_id=awsS3AccessKeyId,
            secret_key=awsS3SecretKey,
        )

        # dir_df = dir_data_lc[dir_data_lc.find("DataSet"):len(dir_data_lc)]
        # Tables are read from the bucket, not the local data directory.
        dir_data_lc = "s3://" + authority + "/" + "DataSet100Mb2part/"

        tables = [
            "nation", "region", "supplier", "customer", "lineitem", "orders"
        ]
        data_types = [DataType.CSV, DataType.PARQUET]  # TODO json

        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_lc, fileSchemaType, tables=tables)

            # Run Query -----------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select count(c_custkey) as c1, count(c_acctbal) as c2
                    from customer"""
            # FIX: previously `query(bc, drill, ...)` — calling a str.
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_02"
            query = """select count(n_nationkey), count(n_regionkey)
                    from nation"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_03"
            query = """select count(s_suppkey), count(s_nationkey)
                    from supplier"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_04"
            query = """select count(c_custkey), sum(c_acctbal),
                    sum(c_acctbal)/count(c_acctbal), min(c_custkey),
                    max(c_nationkey), (max(c_nationkey) + min(c_nationkey))/2
                    c_nationkey from customer
                    where c_custkey < 100 group by c_nationkey"""
            # Percentage comparison: division results differ slightly
            # between engines.
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, True, fileSchemaType,
            )  # TODO: Change sum/count for avg KC

            queryId = "TEST_05"
            query = """select c.c_custkey, c.c_nationkey, n.n_regionkey
                    from customer as c
                    inner join nation as n on c.c_nationkey = n.n_nationkey
                    where n.n_regionkey = 1 and c.c_custkey < 50"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_06"
            query = """select c_custkey, c_nationkey, c_acctbal
                    from customer
                    order by c_nationkey, c_custkey, c_acctbal"""
            # worder = 0: the query orders its own output.
            runTest.run_query(
                bc, drill, query, queryId, queryType, 0, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_07"
            query = """select c_custkey + c_nationkey, c_acctbal
                    from customer order by 1, 2"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, 0, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_08"
            query = """select n1.n_nationkey as supp_nation,
                    n2.n_nationkey as cust_nation,
                    l.l_extendedprice * l.l_discount
                    from supplier as s
                    inner join lineitem as l on s.s_suppkey = l.l_suppkey
                    inner join orders as o on o.o_orderkey = l.l_orderkey
                    inner join customer as c on c.c_custkey = o.o_custkey
                    inner join nation as n1 on s.s_nationkey = n1.n_nationkey
                    inner join nation as n2 on c.c_nationkey = n2.n_nationkey
                    where n1.n_nationkey = 1
                    and n2.n_nationkey = 2
                    and o.o_orderkey < 10000"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_09"
            # Contradictory predicates: expected empty result set.
            query = """select c_custkey, c_nationkey as nkey
                    from customer where c_custkey < 0
                    and c_nationkey >= 30"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_10"
            query = """select sin(c_acctbal), cos(c_acctbal),
                    asin(c_acctbal), acos(c_acctbal), ln(c_acctbal),
                    tan(c_acctbal), atan(c_acctbal), floor(c_acctbal),
                    c_acctbal from customer"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_11"
            query = """select n1.n_nationkey as n1key,
                    n2.n_nationkey as n2key,
                    n1.n_nationkey + n2.n_nationkey
                    from nation as n1
                    full outer join nation as n2
                    on n1.n_nationkey = n2.n_nationkey + 6
                    where n1.n_nationkey < 10 and n1.n_nationkey > 5"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_12"
            query = """select count(n1.n_nationkey) as n1key,
                    count(n2.n_nationkey) as n2key, count(*) as cstar
                    from nation as n1
                    full outer join nation as n2
                    on n1.n_nationkey = n2.n_nationkey + 6"""
            # TODO: Change count(n2.n_nationkey) as cstar as count(*) as cstar
            # when it will be supported KC
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_13"
            query = """select o_orderkey, o_custkey from orders
                    where o_orderkey < 10 and o_orderkey >= 1"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_14"
            query = """select 100168549 - sum(o_orderkey)/count(o_orderkey),
                    56410984/sum(o_totalprice),
                    (123 - 945/max(o_orderkey)) /
                    (sum(81619/o_orderkey)/count(81619/o_orderkey))
                    from orders where o_orderkey < 50"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, True, fileSchemaType,
            )  # TODO: Change sum/count for avg KC

            queryId = "TEST_15"
            query = """select o_orderkey,
                    sum(o_totalprice)/count(o_orderstatus)
                    from orders where o_custkey < 100
                    group by o_orderstatus, o_orderkey"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, True, fileSchemaType,
            )  # TODO: Change sum/count for avg KC

            queryId = "TEST_16"
            query = """select o_orderkey, o_orderstatus from orders
                    where o_custkey < 10 and o_orderstatus <> 'O'
                    order by o_orderkey, o_orderstatus limit 50"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_17"
            query = """select count(o_orderstatus) from orders
                    where o_orderstatus <> 'O'"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_18"
            query = """select count(o_orderkey), sum(o_orderkey), o_clerk
                    from orders where o_custkey < 1000
                    group by o_clerk, o_orderstatus"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_19"
            # FIX: SQL previously read "rom orders" (typo for "from").
            query = """select sum(o_orderkey)/count(o_orderkey)
                    from orders group by o_orderstatus"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, True, fileSchemaType,
            )  # TODO: Change sum/count for avg KC

            queryId = "TEST_20"
            query = """select count(o_shippriority), sum(o_totalprice)
                    from orders group by o_shippriority"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_21"
            query = """with regionTemp as (
                    select r_regionkey, r_name from region
                    where r_regionkey > 2
                ), nationTemp as (
                    select n_nationkey, n_regionkey as fkey, n_name
                    from nation where n_nationkey > 3 order by n_nationkey
                )
                select regionTemp.r_name, nationTemp.n_name
                from regionTemp inner join nationTemp
                on regionTemp.r_regionkey = nationTemp.fkey"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            # In generator mode a single file format is enough.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest(queryType)

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """Aggregation-over-outer-join suite: MIN/MAX/AVG/COUNT where the
    WHERE clause filters on IS NULL / IS NOT NULL join results.

    Most cases are validated against Drill; TEST_06 compares against
    Spark instead (see the inline note).  AVG inputs are CAST to DOUBLE
    so both engines agree on the result type.
    """
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = [
            "nation", "region", "customer", "lineitem", "orders", "supplier"
        ]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            # Left join, then keep only rows whose preserved-side key is
            # NULL — exercises aggregates over a NULL-only/empty input.
            query = """select MIN(n.n_nationkey), MAX(r.r_regionkey),
                    AVG(CAST((n.n_nationkey + r.r_regionkey) AS DOUBLE))
                    from nation as n left outer join region as r
                    on n.n_nationkey = r.r_regionkey
                    where n.n_nationkey IS NULL"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_02"
            query = """select SUM(n1.n_nationkey) as n1key,
                    AVG(CAST((n2.n_nationkey + n1.n_nationkey) AS DOUBLE))
                    as n2key
                    from nation as n1 full outer join nation as n2
                    on n1.n_nationkey = n2.n_nationkey + 10
                    where n1.n_nationkey IS NOT NULL"""
            # AVG produces a double: allow a small numeric tolerance here.
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                0.01,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_03"
            query = """select COUNT(n1.n_nationkey) as n1key,
                    COUNT(n2.n_nationkey + n1.n_nationkey) as n2key
                    from nation as n1 full outer join nation as n2
                    on n1.n_nationkey = n2.n_nationkey + 10
                    where n1.n_nationkey IS NOT NULL"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_04"
            query = """select COUNT(n1.n_regionkey),
                    AVG(CAST(n1.n_regionkey AS DOUBLE))
                    from nation as n1 full outer join nation as n2
                    on n1.n_nationkey = n2.n_nationkey + 6
                    WHERE n1.n_regionkey IS NOT NULL"""
            # AVG tolerance, as in TEST_02.
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                0.01,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_05"
            query = """select MIN(n.n_nationkey), MAX(n.n_nationkey)
                    from nation as n left outer join region as r
                    on n.n_nationkey = r.r_regionkey
                    WHERE n.n_nationkey IS NULL"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = 'TEST_06'
            query = """select COUNT(n.n_nationkey), AVG(r.r_regionkey)
                    from nation as n left outer join region as r
                    on n.n_nationkey = r.r_regionkey
                    WHERE n.n_regionkey IS NULL"""
            runTest.run_query(
                bc,
                spark,  # Drill shows: Different number of columns blzSQLresult: 2 PyDrill result: 0
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_07"
            query = """select n.n_nationkey, n.n_name, r.r_regionkey,
                    r.r_name from nation as n
                    left outer join region as r
                    on n.n_nationkey = r.r_regionkey
                    WHERE r.r_name IS NULL"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_08"
            query = """select n.n_nationkey, n.n_name, r.r_regionkey,
                    r.r_name from nation as n
                    left outer join region as r
                    on n.n_nationkey = r.r_regionkey
                    WHERE n.n_name IS NOT NULL"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_09"
            # Same aggregates as TEST_01 but with a RIGHT outer join.
            query = """select MIN(n.n_nationkey), MAX(r.r_regionkey),
                    AVG(CAST((n.n_nationkey + r.r_regionkey) AS DOUBLE))
                    from nation as n right outer join region as r
                    on n.n_nationkey = r.r_regionkey
                    where n.n_nationkey IS NULL"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_10"
            query = """select n.n_nationkey, n.n_name, r.r_regionkey,
                    r.r_name from nation as n
                    right outer join region as r
                    on n.n_nationkey = r.r_regionkey
                    WHERE r.r_name IS NULL"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_11"
            query = """select n.n_nationkey, n.n_name, r.r_regionkey,
                    r.r_name from nation as n
                    right outer join region as r
                    on n.n_nationkey = r.r_regionkey
                    WHERE n.n_name IS NOT NULL"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # In generator mode a single file format is enough.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    """ORDER BY suite: every query sorts its own output (by columns or
    ordinals), so results are compared against Drill without an extra
    client-side sort (worder = 0)."""
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["customer", "lineitem", "orders"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # All six cases share identical comparison settings, so drive them
        # from one (queryId, sql) table per file format.
        cases = [
            (
                "TEST_01",
                """select c_custkey, c_acctbal from customer
                order by c_acctbal desc, c_custkey""",
            ),
            (
                "TEST_02",
                "select c_acctbal from customer order by c_acctbal",
            ),
            (
                "TEST_03",
                """select c_custkey, c_nationkey, c_acctbal from customer
                order by c_nationkey, c_acctbal, c_custkey desc""",
            ),
            (
                "TEST_04",
                """select c_custkey + c_nationkey, c_acctbal from customer
                order by 1 desc, 2""",
            ),
            (
                "TEST_05",
                """select l_linenumber, l_orderkey from lineitem
                where l_orderkey < 50000
                order by l_linenumber desc, l_suppkey asc,
                l_partkey desc, l_orderkey""",
            ),
            (
                "TEST_06",
                """select o_orderkey, o_custkey, o_totalprice, o_orderstatus
                from orders where o_orderkey < 100
                order by o_custkey, o_orderstatus,
                o_shippriority, o_comment""",
            ),
        ]

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # The queries are self-ordering, so no re-sort before comparing.
            worder = 0
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            for queryId, query in cases:
                runTest.run_query(
                    bc,
                    drill,
                    query,
                    queryId,
                    queryType,
                    worder,
                    "",
                    acceptable_difference,
                    use_percentage,
                    fileSchemaType,
                )

            # In generator mode a single file format is enough.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    """Nested-query suite: derived tables and sub-selects joined back to
    base tables, validated against Drill for every file format."""
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = [
            "nation",
            "region",
            "customer",
            "orders",
            "part",
            "partsupp",
            "supplier",
        ]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # The four cases share identical comparison settings; keep them in
        # a single (queryId, sql) table.
        cases = [
            (
                "TEST_01",
                """select maxPrice, avgSize from (
                    select avg(CAST(p_size AS DOUBLE)) as avgSize,
                    max(p_retailprice) as maxPrice,
                    min(p_retailprice) as minPrice
                    from part
                ) as partAnalysis
                order by maxPrice, avgSize""",
            ),
            (
                "TEST_02",
                """select custOrders.avgPrice, custOrders.numOrders
                from customer inner join (
                    select o_custkey as o_custkey,
                    avg(o_totalprice) as avgPrice,
                    count(o_totalprice) as numOrders
                    from orders
                    where o_custkey <= 100 group by o_custkey
                ) as custOrders
                on custOrders.o_custkey = customer.c_custkey
                where customer.c_nationkey <= 5""",
            ),
            (
                "TEST_03",
                """select partSuppTemp.partKey, partAnalysis.avgSize from (
                    select min(p_partkey) as partKey,
                    avg(CAST(p_size AS DOUBLE)) as avgSize,
                    max(p_retailprice) as maxPrice,
                    min(p_retailprice) as minPrice
                    from part
                ) as partAnalysis
                inner join (
                    select ps_partkey as partKey, ps_suppkey as suppKey
                    from partsupp where ps_availqty > 2
                ) as partSuppTemp
                on partAnalysis.partKey = partSuppTemp.partKey
                inner join (
                    select s_suppkey as suppKey from supplier
                ) as supplierTemp
                on supplierTemp.suppKey = partSuppTemp.suppKey""",
            ),
            (
                "TEST_04",
                """select avg(CAST(custKey AS DOUBLE)) from (
                    select customer.c_custkey as custKey from (
                        select min(o_custkey) as o_custkey from orders
                    ) as tempOrders
                    inner join customer
                    on tempOrders.o_custkey = customer.c_custkey
                    where customer.c_nationkey > 6
                ) as joinedTables""",
            ),
        ]

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Re-sort result sets before comparing them (worder = 1).
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            for queryId, query in cases:
                runTest.run_query(
                    bc,
                    drill,
                    query,
                    queryId,
                    queryType,
                    worder,
                    "",
                    acceptable_difference,
                    use_percentage,
                    fileSchemaType,
                )

            # In generator mode a single file format is enough.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, spark, dir_data_file, bc, nRals):
    """TIMESTAMPDIFF suite validated against Spark.

    BlazingSQL spells day differences as ``timestampdiff(DAY, t1, t2)``
    while Spark uses ``datediff(t2, t1)``, so every case carries a
    separate ``query_spark``.

    Fix in this revision: every ``runTest.run_query`` call was missing the
    ``fileSchemaType`` positional argument that all other suites in this
    file pass right after ``use_percentage``; it is now supplied.
    """
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["lineitem", "orders"]
        data_types = [DataType.CSV, DataType.PARQUET]  # TODO json

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            # Spark needs its own schema registered per file format.
            cs.init_spark_schema(
                spark,
                Settings.data["TestSettings"]["dataDirectory"],
                fileSchemaType=fileSchemaType,
            )
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select l_shipdate, l_commitdate,
                    timestampdiff(DAY, l_commitdate, l_shipdate) as diff
                    from lineitem limit 20"""
            query_spark = """select l_shipdate, l_commitdate,
                    datediff(l_shipdate, l_commitdate) as diff
                    from lineitem limit 20"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,  # FIX: was missing from this call
                query_spark=query_spark,
                print_result=True,
            )

            queryId = "TEST_02"
            query = """select l_shipdate,
                    timestampdiff(DAY, date '1970-01-01', l_shipdate)
                    as diff from lineitem limit 20"""
            query_spark = """select l_shipdate,
                    datediff(l_shipdate, date '1970-01-01')
                    as diff from lineitem limit 20"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,  # FIX: was missing from this call
                query_spark=query_spark,
                print_result=True,
            )

            queryId = "TEST_03"
            query = """select * from orders
                    where timestampdiff(DAY, date '1995-02-04',
                    o_orderdate) < 25"""
            query_spark = """select * from orders
                    where datediff(o_orderdate, date '1995-02-04') < 25"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,  # FIX: was missing from this call
                query_spark=query_spark,
                print_result=True,
            )

            # In generator mode a single file format is enough.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, spark, dir_data_file, bc, nRals):
    """Cross-join suite validated against Spark: plain cross joins,
    cross joins with filters/grouping, and a CTE feeding a cross join."""
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["nation", "region", "customer", "lineitem", "orders"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # Identical comparison settings for all cases: one (id, sql) table.
        cases = [
            ("TEST_01", "select * from nation cross join region"),
            (
                "TEST_02",
                """ select o_orderkey, o_totalprice,
                    l_linenumber, l_shipmode
                from orders cross join lineitem
                where o_orderkey < 6
                and l_receiptdate > date '1996-07-12'
                and l_linenumber > 5
                and o_totalprice < 74029.55
                and o_clerk = 'Clerk#000000880'
                and l_shipmode IN ('FOB', 'RAIL')
                order by o_orderkey, o_totalprice, l_linenumber""",
            ),
            (
                "TEST_03",
                """select o_orderkey, n_nationkey
                from nation cross join orders
                where o_totalprice > 4000.0
                and o_orderdate > date '1998-07-12'
                and o_orderkey > 425000
                group by o_orderkey, n_nationkey
                order by o_orderkey, n_nationkey""",
            ),
            (
                "TEST_04",
                """ with cust_nation as (
                    select c_custkey, c_name, n_nationkey, n_name
                    from customer
                    inner join nation
                    on c_nationkey = n_nationkey
                    where n_nationkey > 21
                    and c_acctbal > 525.0
                    and c_custkey > 13450
                    order by c_custkey, n_nationkey
                ), ord_lineitem as (
                    select o_orderkey, l_quantity
                    from orders
                    left join lineitem
                    on o_orderkey = l_orderkey
                    where l_shipdate > date '1998-11-12'
                    and o_totalprice > 3500.0
                    and l_quantity > 48.0
                    and l_shipmode in ('AIR', 'FOB', 'SHIP')
                    order by o_orderkey
                )
                select c_custkey, n_name, l_quantity
                from ord_lineitem
                cross join cust_nation
                where n_name = 'RUSSIA'
                order by c_custkey""",
            ),
        ]

        # Create Tables ------------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Re-sort result sets before comparing; exact match required.
            worder = 1
            use_percentage = False
            acceptable_difference = 0

            print("==============================")
            print(queryType)
            print("==============================")

            for queryId, query in cases:
                runTest.run_query(
                    bc,
                    spark,
                    query,
                    queryId,
                    queryType,
                    worder,
                    "",
                    acceptable_difference,
                    use_percentage,
                    fileSchemaType,
                )

            # In generator mode a single file format is enough.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """Run the "String case" suite (UPPER/LOWER/SUBSTRING queries).

    Most queries are validated against Drill; TEST_03, TEST_08 and TEST_10
    use Spark as the reference engine instead.
    """
    # Snapshot GPU memory so usage attributable to this suite can be logged.
    start_mem = gpuMemory.capture_gpu_memory_usage()
    queryType = "String case"

    def executionTest():
        tables = ["customer", "nation", "orders"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select c_custkey, UPPER(c_name) from customer
                    where SUBSTRING(c_name,1,17) = 'Customer#00000000'"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_02"
            # NOTE(review): TPCH c_mktsegment values are upper-case, so the
            # lower-case literal presumably selects zero rows — confirm this
            # empty-result comparison is intended.
            query = """select c_custkey, upper(c_comment) from customer
                    where c_mktsegment = 'household'"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_03"
            query = """select LOWER(c_name), UPPER(c_address) from customer
                    where c_custkey < 42"""
            runTest.run_query(
                bc, spark, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_04"
            query = """select c.c_custkey, UPPER(SUBSTRING(c.c_name, 1, 8)),
                        LOWER(SUBSTRING(c.c_name, 10, 18)) || '**'
                    from customer c where c.c_custkey < 20"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_05"
            query = """select o_orderkey, upper(o_comment), lower(o_orderstatus)
                    from orders where o_custkey < 120"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_06"
            query = """select lower(o_orderpriority), lower(o_orderstatus)
                    from orders group by o_orderpriority, o_orderstatus"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_07"
            query = """select count(o_orderkey), sum(o_orderkey), lower(o_clerk)
                    from orders where o_custkey < 1000
                    group by o_clerk, o_orderstatus"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_08"
            query = """select count(o_orderkey), sum(o_orderkey), upper(o_clerk)
                    from orders where o_custkey < 1000
                    group by o_clerk, o_orderstatus"""
            runTest.run_query(
                bc, spark, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_09"
            query = """select LOWER(n_name), UPPER(n_comment) from nation
                    where n_regionkey = 4"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_10"
            query = """select upper(n_comment), lower(upper(n_comment))
                    from nation where n_nationkey between 5 and 15"""
            runTest.run_query(
                bc, spark, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
                print_result=True,
            )

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_lc, bc, nRals):
    """Run TPCH queries against Hive tables stored on a Kerberized HDFS.

    Connects to HDFS with a Kerberos ticket; if the connection fails the
    whole suite is skipped with a warning instead of raising.

    NOTE(review): ``queryType`` is read below (warning message, prints,
    memory log) but is never assigned in this function — presumably a
    module-level constant; TODO confirm it is defined before this runs.

    FIX: the TEST_04 and TEST_14 run_query calls omitted the trailing
    ``fileSchemaType`` argument that every other call in the suite passes
    (compare the identical TEST_04 elsewhere in this file); it is now
    supplied so the calls match run_query's positional signature.
    """
    # Snapshot GPU memory so usage attributable to this suite can be logged.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        # Read Data TPCH------------------------------------------------------
        authority = "hadoop.docker.com:9000"
        ktoken = "../KrbHDFS/myconf/krb5cc_0"
        # Resolve the ticket cache to an absolute path for the HDFS driver.
        krbticket = os.path.abspath(ktoken)
        hdfs_host = "172.22.0.3"
        hdfs_port = 9000
        hdfs_driver = "libhdfs"
        print("Using krb ticket: " + krbticket)
        result, error_msg, fs = bc.hdfs(
            authority,
            host=hdfs_host,
            port=hdfs_port,
            user="******",
            driver=hdfs_driver,
            kerb_ticket=krbticket,
        )

        # Best-effort: skip (do not fail) the suite when HDFS is unreachable.
        if result is False:
            msg = ("""WARNING: Could not connect to HDFS instance %s:%d
                    using driver %s, error was: %s"""
                   % (hdfs_host, hdfs_port, hdfs_driver, error_msg))
            print(msg)
            print("WARNING: Will ignore " + queryType)
            return

        print("Success connection to HDFS:")
        print(fs)

        hdfs_dir_data_lc = "hdfs://" + authority + dir_data_lc
        print("TPCH files at: " + hdfs_dir_data_lc)

        tables = [
            "nation",
            "region",
            "supplier",
            "customer",
            "lineitem",
            "orders",
            "part",
        ]
        # tables = ['customer']

        # TODO json1
        data_types = [DataType.CSV, DataType.ORC, DataType.PARQUET]

        for fileSchemaType in data_types:
            # if skip_test(dask_client, nRals, fileSchemaType, queryType):
            #     continue
            cs.create_hive_tables(bc, hdfs_dir_data_lc, fileSchemaType,
                                  tables=tables)

            # Run Query -----------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select count(c_custkey) as c1, count(c_acctbal) as c2
                    from customer"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_02"
            query = """select count(n_nationkey), count(n_regionkey)
                    from nation"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_03"
            query = """select count(s_suppkey), count(s_nationkey)
                    from supplier"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_04"
            query = """select count(c_custkey), sum(c_acctbal),
                        sum(c_acctbal)/count(c_acctbal), min(c_custkey),
                        max(c_nationkey),
                        (max(c_nationkey) + min(c_nationkey))/2 c_nationkey
                    from customer where c_custkey < 100
                    group by c_nationkey"""
            # FIX: fileSchemaType was previously missing from this call.
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              "", 0.01, True,
                              fileSchemaType)  # TODO: Change sum/count for avg KC

            queryId = "TEST_05"
            query = """select c.c_custkey, c.c_nationkey, n.n_regionkey
                    from customer as c
                    inner join nation as n
                    on c.c_nationkey = n.n_nationkey
                    where n.n_regionkey = 1 and c.c_custkey < 50"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_06"
            query = """select c_custkey, c_nationkey, c_acctbal from customer
                    order by c_nationkey, c_custkey, c_acctbal"""
            # worder=0: the query itself orders the result set.
            runTest.run_query(
                bc, drill, query, queryId, queryType, 0, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_07"
            query = """select c_custkey + c_nationkey, c_acctbal from customer
                    order by 1, 2"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, 0, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_08"
            query = """select n1.n_nationkey as supp_nation,
                        n2.n_nationkey as cust_nation,
                        l.l_extendedprice * l.l_discount
                    from supplier as s
                    inner join lineitem as l on s.s_suppkey = l.l_suppkey
                    inner join orders as o on o.o_orderkey = l.l_orderkey
                    inner join customer as c on c.c_custkey = o.o_custkey
                    inner join nation as n1 on s.s_nationkey = n1.n_nationkey
                    inner join nation as n2 on c.c_nationkey = n2.n_nationkey
                    where n1.n_nationkey = 1
                    and n2.n_nationkey = 2 and o.o_orderkey < 10000"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_09"
            query = """select c_custkey, c_nationkey as nkey from customer
                    where c_custkey < 0 and c_nationkey >= 30"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_10"
            query = """select sin(c_acctbal), cos(c_acctbal),
                        asin(c_acctbal), acos(c_acctbal),
                        ln(c_acctbal), tan(c_acctbal),
                        atan(c_acctbal), floor(c_acctbal),
                        c_acctbal
                    from customer"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                0.01, use_percentage, fileSchemaType,
            )

            queryId = "TEST_11"
            query = """select n1.n_nationkey as n1key,
                        n2.n_nationkey as n2key,
                        n1.n_nationkey + n2.n_nationkey
                    from nation as n1
                    full outer join nation as n2
                    on n1.n_nationkey = n2.n_nationkey + 6
                    where n1.n_nationkey < 10 and n1.n_nationkey > 5"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_12"
            query = """select count(n1.n_nationkey) as n1key,
                        count(n2.n_nationkey) as n2key, count(*) as cstar
                    from nation as n1
                    full outer join nation as n2
                    on n1.n_nationkey = n2.n_nationkey + 6"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_13"
            query = """select o_orderkey, o_custkey from orders
                    where o_orderkey < 10 and o_orderkey >= 1"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_14"
            query = """select 100168549 - sum(o_orderkey)/count(o_orderkey),
                        56410984/sum(o_totalprice),
                        (123 - 945/max(o_orderkey)) /
                        (sum(81619/o_orderkey)/count(81619/o_orderkey))
                    from orders where o_orderkey < 50"""
            # FIX: fileSchemaType was previously missing from this call.
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              "", 0.01, True,
                              fileSchemaType)  # TODO: Change sum/count for avg KC

            queryId = "TEST_15"
            query = """select EXTRACT(YEAR FROM l_receiptdate) -
                        EXTRACT(YEAR FROM l_shipdate) as years_late,
                        EXTRACT(MONTH FROM l_receiptdate) -
                        EXTRACT(MONTH FROM l_shipdate) as months_late,
                        EXTRACT(DAY FROM l_receiptdate) -
                        EXTRACT(DAY FROM l_shipdate) as days_late
                    from lineitem where l_shipdate < DATE '1993-01-01'"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

    executionTest()
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """Run the "Bindable Alias" suite (column aliases in projections/unions).

    Drill is the reference engine except TEST_09 on ORC, which uses Spark.
    """
    # Snapshot GPU memory so usage attributable to this suite can be logged.
    start_mem = gpuMemory.capture_gpu_memory_usage()
    queryType = "Bindable Alias"

    def executionTest(queryType):
        tables = ["nation", "region", "lineitem", "orders"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO ORC gdf parquet json

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order the
            # resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select n_regionkey as rkey, n_nationkey from nation
                    where n_regionkey < 3"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
                print_result=True,
            )

            queryId = "TEST_02"
            query = """select n_nationkey, n_regionkey as nr from nation
                    where n_regionkey < 3"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_03"
            query = """select n_regionkey as rkey, n_nationkey from nation
                    where n_regionkey < 3 and n_nationkey > 5"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_04"
            query = """select n_nationkey as nkey, n_comment from nation
                    where n_nationkey > 5"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_05"
            query = "select n_regionkey as rkey, n_nationkey from nation"
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_06"
            query = """select n_regionkey as rkey, n_nationkey as nkey
                    from nation"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_07"
            query = """select n_nationkey as nkey, n_regionkey as rkey
                    from nation"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_08"
            query = """(select r_name as name, r_regionkey as key from region )
                    union all
                    (select n_name as name, n_nationkey as key from nation )"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_09"
            query = """select o_orderkey as okey, o_custkey as ckey,
                        o_orderdate as odate from orders
                    where o_orderpriority = '1-URGENT'
                    order by okey, ckey, odate"""
            # ORC is compared against Spark instead of Drill for this query.
            if fileSchemaType == DataType.ORC:
                runTest.run_query(
                    bc, spark, query, queryId, queryType, worder, "",
                    acceptable_difference, use_percentage, fileSchemaType,
                )
            else:
                runTest.run_query(
                    bc, drill, query, queryId, queryType, worder, "",
                    acceptable_difference, use_percentage, fileSchemaType,
                )

            queryId = "TEST_10"
            query = """select (l_quantity + 2) as new_quantity from lineitem
                    group by l_quantity
                    order by l_quantity asc limit 30"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_11"
            query = """select count(o_orderkey) as count_ok from orders
                    group by o_orderstatus"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_12"
            query = "select n_nationkey from nation where n_regionkey < 3"
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            # NOTE(review): unlike sibling suites, the GENERATOR-mode early
            # break is commented out here — confirm that is intentional.
            # if Settings.execution_mode == ExecutionMode.GENERATOR:
            #     print("==============================")
            #     break

    executionTest(queryType)
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    """Run the COUNT(DISTINCT ...) suite, comparing results against Drill.

    NOTE(review): the label 'Count Distinc' is presumably a typo for
    'Count Distinct', but it may be matched by test-selection config —
    confirm before renaming.
    """
    # Snapshot GPU memory so usage attributable to this suite can be logged.
    start_mem = gpuMemory.capture_gpu_memory_usage()
    queryType = 'Count Distinc'

    def executionTest():
        tables = [
            'partsupp', 'lineitem', 'part', 'supplier', 'orders', 'customer',
            'region', 'nation'
        ]
        data_types = [
            DataType.DASK_CUDF, DataType.CUDF, DataType.CSV, DataType.ORC,
            DataType.PARQUET
        ]  # TODO json

        # Create Tables ------------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_01'
            query = "select count(distinct (n_regionkey + n_nationkey)), n_regionkey from nation group by n_regionkey"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              "n_regionkey", acceptable_difference,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_02'
            query = "select count(distinct o_custkey), o_orderkey from orders where o_orderkey<100 group by o_orderkey, (o_orderkey + o_custkey)"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              "o_orderkey", acceptable_difference,
                              use_percentage, fileSchemaType)

            queryId = 'TEST_03'
            query = "select count(distinct(o_orderkey + o_custkey)) as new_col, sum(o_orderkey), o_custkey from orders group by o_custkey"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              "o_custkey", acceptable_difference,
                              use_percentage, fileSchemaType)

            # queryId = 'TEST_04'
            # query = "select count(distinct(o_custkey)), avg(o_totalprice), (o_orderkey + o_custkey) as num from orders where o_custkey < 100 group by o_custkey, o_orderkey"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, '', 0.01, use_percentage, fileSchemaType)

            queryId = 'TEST_05'
            query = "select count(distinct(o_custkey)), max(o_totalprice), min(o_totalprice), avg(o_totalprice) from orders group by o_custkey"
            # 0.01 tolerance: result contains float aggregates (avg/min/max).
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', 0.01, use_percentage, fileSchemaType)

            queryId = 'TEST_06'
            query = "select n_nationkey, count(distinct (n_regionkey + n_nationkey))/count(n_nationkey) from nation group by n_nationkey"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', 0.01, use_percentage, fileSchemaType)

            queryId = 'TEST_07'
            query = "select count(distinct(o_custkey)), count(distinct(o_totalprice)), sum(o_orderkey) from orders group by o_custkey"  # count(distinct(o_orderdate)),
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # queryId = 'TEST_08'
            # query = "select COUNT(DISTINCT(n.n_nationkey)), AVG(r.r_regionkey) from nation as n left outer join region as r on n.n_nationkey = r.r_regionkey"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, '', 0.01, use_percentage, fileSchemaType)

            queryId = 'TEST_09'
            query = "select MIN(n.n_nationkey), MAX(r.r_regionkey), COUNT(DISTINCT(n.n_nationkey + r.r_regionkey)) from nation as n left outer join region as r on n.n_nationkey = r.r_regionkey"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # queryId = 'TEST_10'
            # query = "select COUNT(DISTINCT(n1.n_nationkey)) as n1key, COUNT(DISTINCT(n2.n_nationkey)) as n2key from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_regionkey"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            # queryId = 'TEST_11'
            # query = "select r.r_regionkey, n.n_nationkey, COUNT(n.n_nationkey), COUNT(DISTINCT(r.r_regionkey)), SUM(DISTINCT(n.n_nationkey + r.r_regionkey)) from nation as n left outer join region as r on n.n_nationkey = r.r_regionkey GROUP BY r.r_regionkey, n.n_nationkey"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_12'
            query = "select n1.n_regionkey, n2.n_nationkey, MIN(n1.n_regionkey), MAX(n1.n_regionkey), AVG(n2.n_nationkey) from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6 GROUP BY n1.n_regionkey, n2.n_nationkey"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    """Run math-function queries (sin/cos/asin/acos/ln/tan/atan/floor/ceil)
    on the customer table, comparing results against Drill.

    NOTE(review): ``queryType`` is read below (skip_test, prints, memory log)
    but is never assigned in this function — presumably a module-level
    constant; TODO confirm it is defined before this runs.
    """
    # Snapshot GPU memory so usage attributable to this suite can be logged.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["customer"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            # Float tolerance: trig/log results differ slightly across engines.
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select sin(c_acctbal), cos(c_acctbal),
                        asin(c_acctbal), acos(c_acctbal),
                        ln(c_acctbal), tan(c_acctbal),
                        atan(c_acctbal), floor(c_acctbal),
                        ceil(c_acctbal), c_acctbal
                    from customer"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_02"
            query = """select sin(c_acctbal), cos(c_acctbal),
                        asin(c_acctbal), acos(c_acctbal), c_acctbal
                    from customer"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_03"
            query = """select sin(c_acctbal), cos(c_acctbal),
                        asin(c_acctbal), acos(c_acctbal),
                        ln(c_acctbal), c_acctbal
                    from customer"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_04"
            query = """select sin(c_acctbal), cos(c_acctbal),
                        asin(c_acctbal), acos(c_acctbal),
                        ln(c_acctbal), tan(c_acctbal),
                        atan(c_acctbal), c_acctbal
                    from customer"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_05"
            query = """select sin(c_acctbal), cos(c_acctbal),
                        asin(c_acctbal), acos(c_acctbal),
                        ln(c_acctbal), tan(c_acctbal),
                        atan(c_acctbal), floor(c_acctbal), c_acctbal
                    from customer"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_06"
            query = "select floor(c_acctbal), c_acctbal from customer"
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            # In GENERATOR mode only the first file schema type is exercised.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """Run date/timestamp queries (EXTRACT, CURRENT_DATE/TIME/TIMESTAMP).

    Drill is the reference engine except TEST_01–TEST_03 on ORC, which use
    Spark. TEST_08/TEST_09 disable result comparison because the CURRENT_*
    values change per execution; TEST_10 checks CURRENT_DATE by hand.

    NOTE(review): ``queryType`` is read below (skip_test, prints, memory log)
    but is never assigned in this function — presumably a module-level
    constant; TODO confirm it is defined before this runs.
    """
    # Snapshot GPU memory so usage attributable to this suite can be logged.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["nation", "region", "customer", "orders", "lineitem"]
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select EXTRACT(YEAR FROM l_receiptdate) -
                        EXTRACT(YEAR FROM l_shipdate) as years_late,
                        EXTRACT(MONTH FROM l_receiptdate) -
                        EXTRACT(MONTH FROM l_shipdate) as months_late,
                        EXTRACT(DAY FROM l_receiptdate) -
                        EXTRACT(DAY FROM l_shipdate) as days_late
                    from lineitem where l_shipdate < DATE '1993-01-01'"""
            # ORC is compared against Spark instead of Drill for this query.
            if fileSchemaType == DataType.ORC:
                runTest.run_query(
                    bc, spark, query, queryId, queryType, worder, "",
                    acceptable_difference, use_percentage, fileSchemaType,
                )
            else:
                runTest.run_query(
                    bc, drill, query, queryId, queryType, worder, "",
                    acceptable_difference, use_percentage, fileSchemaType,
                )

            queryId = "TEST_02"
            query = """select o_orderkey as okey, o_custkey as ckey,
                        (EXTRACT(YEAR FROM o_orderdate) - 5) from orders
                    where o_orderstatus = 'O' order by okey"""
            if fileSchemaType == DataType.ORC:
                runTest.run_query(
                    bc, spark, query, queryId, queryType, worder, "",
                    acceptable_difference, use_percentage, fileSchemaType,
                )
            else:
                runTest.run_query(
                    bc, drill, query, queryId, queryType, worder, "",
                    acceptable_difference, use_percentage, fileSchemaType,
                )

            queryId = "TEST_03"
            query = """select orders.o_orderkey, orders.o_orderdate,
                        orders.o_orderstatus
                    from orders inner join lineitem
                    on lineitem.l_orderkey = orders.o_orderkey
                    where orders.o_orderkey < 30
                    and lineitem.l_orderkey < 20
                    order by orders.o_orderkey, lineitem.l_linenumber,
                        orders.o_custkey, lineitem.l_orderkey"""
            if fileSchemaType == DataType.ORC:
                runTest.run_query(
                    bc, spark, query, queryId, queryType, worder, "",
                    acceptable_difference, use_percentage, fileSchemaType,
                )
            else:
                runTest.run_query(
                    bc, drill, query, queryId, queryType, worder, "",
                    acceptable_difference, use_percentage, fileSchemaType,
                )

            queryId = "TEST_04"
            query = """select customer.c_nationkey, customer.c_name,
                        orders.o_orderdate, lineitem.l_receiptdate
                    from customer left outer join orders
                    on customer.c_custkey = orders.o_custkey
                    inner join lineitem
                    on lineitem.l_orderkey = orders.o_orderkey
                    where customer.c_nationkey = 3
                    and customer.c_custkey < 100
                    and orders.o_orderdate < '1990-01-01'
                    order by orders.o_orderkey, lineitem.l_linenumber"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            queryId = "TEST_05"
            query = """select orders.o_orderkey, orders.o_orderdate,
                        lineitem.l_receiptdate, orders.o_orderstatus
                    from orders inner join lineitem
                    on lineitem.l_receiptdate = orders.o_orderdate
                    where orders.o_orderkey < 30
                    and lineitem.l_orderkey < 20
                    order by orders.o_orderkey, lineitem.l_linenumber"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            # Tests for `CURRENT_DATE`, `CURRENT_TIME` and `CURRENT_TIMESTAMP`
            queryId = "TEST_06"
            query = """with current_table as (
                        select o_orderkey, current_date, o_custkey
                        from orders where o_orderkey < 350
                    ) select o_orderkey, o_custkey from current_table"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            # Note: As we don't have support for time dtype, then
            # `current_time` will return the same as `current_timestamp`
            queryId = "TEST_07"
            query = """with current_table as (
                        select o_orderkey, current_time, o_custkey,
                            current_timestamp
                        from orders where o_orderkey < 750
                    ) select o_orderkey, o_custkey from current_table
                    limit 50"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, "",
                acceptable_difference, use_percentage, fileSchemaType,
            )

            # As `current_date`, `current_time` and `current_timestamp`
            # always will return different values by each new execution,
            # let's not compare
            # NOTE(review): comparing="false" is the string "false", not the
            # boolean False — presumably what run_query expects; verify.
            queryId = "TEST_08"
            query = """select current_date, o_orderkey, current_time,
                        current_timestamp
                    from orders where o_orderkey < 750"""
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              "", acceptable_difference, use_percentage,
                              fileSchemaType, comparing="false")

            queryId = "TEST_09"
            query = """select orders.o_orderkey, CURRENT_DATE,
                        orders.o_orderdate, lineitem.l_receiptdate,
                        orders.o_orderstatus
                    from orders inner join lineitem
                    on lineitem.l_receiptdate = orders.o_orderdate
                    where orders.o_orderkey < 40
                    and lineitem.l_orderkey < 30
                    order by orders.o_orderkey, lineitem.l_linenumber"""
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              "", acceptable_difference, use_percentage,
                              fileSchemaType, comparing="false")

            # This test just compare the `CURRENT_DATE` value got from Blazing
            # against the `CURRENT DATE` from python (to validate the we get
            # the righ current date)
            queryId = "TEST_10"
            query = """select CURRENT_DATE from region"""
            result = bc.sql(query)
            import dask_cudf
            # Distributed results must be collected before conversion.
            if isinstance(result, dask_cudf.core.DataFrame):
                result = result.compute()
            current_blaz_date = result.to_pandas()['CURRENT_DATE'].astype(
                'str').iloc[0]
            from datetime import date
            current_python_date = str(date.today())
            if current_blaz_date != current_python_date:
                raise Exception(
                    "Blazing CURRENT_DATE and python CURRENT DATE are differents"
                )

            # NOTE(review): unlike sibling suites, the GENERATOR-mode early
            # break is commented out here — confirm that is intentional.
            # if Settings.execution_mode == ExecutionMode.GENERATOR:
            #     print("==============================")
            #     break

    executionTest()
    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_lc, bc, nRals):
    # Entry point for the aggregation / join / order-by TPCH suite.
    # Creates the TPCH tables per file format, runs each query through
    # BlazingSQL and validates the result set against Apache Drill, and
    # logs the GPU memory delta for the whole run.
    # NOTE(review): `queryType`, `runTest`, `cs`, `gpuMemory`, `Settings`,
    # `skip_test`, `DataType`, `ExecutionMode` come from the enclosing module.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        #Read Data TPCH------------------------------------------------------------------------------------------------------------
        tables = [
            'nation', 'region', 'supplier', 'customer', 'lineitem', 'orders'
        ]
        # File formats exercised on each pass; JSON is still pending.
        data_types = [
            DataType.DASK_CUDF, DataType.CSV, DataType.ORC, DataType.PARQUET
        ]  # TODO json

        for fileSchemaType in data_types:
            # Skip formats not applicable to the current cluster setup.
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_lc, fileSchemaType, tables=tables)

            #Run Query -----------------------------------------------------------------------------
            worder = 1  #Parameter to indicate if its necessary to order the resulsets before compare them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_01'
            query = "select count(c_custkey) as c1, count(c_acctbal) as c2 from customer"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_02'
            query = "select count(n_nationkey), count(n_regionkey) from nation"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_03'
            query = "select count(s_suppkey), count(s_nationkey) from supplier"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # Mixed aggregates; compared with percentage tolerance (True)
            # because sum/count division amplifies rounding differences.
            queryId = 'TEST_04'
            query = "select count(c_custkey), sum(c_acctbal), sum(c_acctbal)/count(c_acctbal), min(c_custkey), max(c_nationkey), (max(c_nationkey) + min(c_nationkey))/2 c_nationkey from customer where c_custkey < 100 group by c_nationkey"
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, '',
                acceptable_difference, True,
                fileSchemaType)  #TODO: Change sum/count for avg KC

            queryId = 'TEST_05'
            query = "select c.c_custkey, c.c_nationkey, n.n_regionkey from customer as c inner join nation as n on c.c_nationkey = n.n_nationkey where n.n_regionkey = 1 and c.c_custkey < 50"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # Ordered queries pass 0 instead of worder: the result already
            # carries an ORDER BY, so no re-sorting before comparison.
            queryId = 'TEST_06'
            query = "select c_custkey, c_nationkey, c_acctbal from customer order by c_nationkey, c_custkey, c_acctbal"
            runTest.run_query(bc, drill, query, queryId, queryType, 0, '',
                              acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_07'
            query = "select c_custkey + c_nationkey, c_acctbal from customer order by 1, 2"
            runTest.run_query(bc, drill, query, queryId, queryType, 0, '',
                              acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_08'
            query = """select n1.n_nationkey as supp_nation, n2.n_nationkey as cust_nation, l.l_extendedprice * l.l_discount from supplier as s inner join lineitem as l on s.s_suppkey = l.l_suppkey inner join orders as o on o.o_orderkey = l.l_orderkey inner join customer as c on c.c_custkey = o.o_custkey inner join nation as n1 on s.s_nationkey = n1.n_nationkey inner join nation as n2 on c.c_nationkey = n2.n_nationkey where n1.n_nationkey = 1 and n2.n_nationkey = 2 and o.o_orderkey < 10000"""
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # Intentionally empty result (c_custkey < 0).
            queryId = 'TEST_09'
            query = "select c_custkey, c_nationkey as nkey from customer where c_custkey < 0 and c_nationkey >=30"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_10'
            query = "select sin(c_acctbal), cos(c_acctbal), asin(c_acctbal), acos(c_acctbal), ln(c_acctbal), tan(c_acctbal), atan(c_acctbal), floor(c_acctbal), c_acctbal from customer"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_11'
            query = "select n1.n_nationkey as n1key, n2.n_nationkey as n2key, n1.n_nationkey + n2.n_nationkey from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6 where n1.n_nationkey < 10 and n1.n_nationkey > 5"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_12'
            query = """select count(n1.n_nationkey) as n1key, count(n2.n_nationkey) as n2key, count(n2.n_nationkey) as cstar from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6"""
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, '',
                acceptable_difference, use_percentage, fileSchemaType
            )  #TODO: Change count(n2.n_nationkey) as cstar as count(*) as cstar when it will be supported KC

            queryId = 'TEST_13'
            query = "select o_orderkey, o_custkey from orders where o_orderkey < 10 and o_orderkey >= 1"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_14'
            query = "select 100168549 - sum(o_orderkey)/count(o_orderkey), 56410984/sum(o_totalprice), (123 - 945/max(o_orderkey))/(sum(81619/o_orderkey)/count(81619/o_orderkey)) from orders"
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, '',
                acceptable_difference, True,
                fileSchemaType)  #TODO: Change sum/count for avg KC

            queryId = 'TEST_15'
            query = "select o_orderkey, sum(o_totalprice)/count(o_orderstatus) from orders where o_custkey < 100 group by o_orderstatus, o_orderkey"
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, '',
                acceptable_difference, True,
                fileSchemaType)  #TODO: Change sum/count for avg KC

            queryId = 'TEST_16'
            query = "select o_orderkey, o_orderstatus from orders where o_custkey < 10 and o_orderstatus <> 'O' order by o_orderkey, o_orderstatus limit 50"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_17'
            query = "select count(o_orderstatus) from orders where o_orderstatus <> 'O'"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_18'
            query = "select count(o_orderkey), sum(o_orderkey), o_clerk from orders where o_custkey < 1000 group by o_clerk, o_orderstatus"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, True,
                              fileSchemaType)

            queryId = 'TEST_19'
            query = "select sum(o_orderkey)/count(o_orderkey) from orders group by o_orderstatus"
            runTest.run_query(
                bc, drill, query, queryId, queryType, worder, '',
                acceptable_difference, True,
                fileSchemaType)  #TODO: Change sum/count for avg KC

            queryId = 'TEST_20'
            query = "select count(o_shippriority), sum(o_totalprice) from orders group by o_shippriority"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_21'
            query = """with regionTemp as ( select r_regionkey, r_name from region where r_regionkey > 2 ), nationTemp as(select n_nationkey, n_regionkey as fkey, n_name from nation where n_nationkey > 3 order by n_nationkey) select regionTemp.r_name, nationTemp.n_name from regionTemp inner join nationTemp on regionTemp.r_regionkey = nationTemp.fkey"""
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # In generator mode only the first file format is needed to
            # produce the validation files.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    # Entry point for the timestamp-comparison suite. Runs only against ORC
    # (other formats pending) and validates every query against Spark; some
    # queries need a `query_spark` variant with explicit NULLS LAST because
    # Spark's default null ordering differs from BlazingSQL's.
    # NOTE(review): `drill` is accepted but unused here — all comparisons go
    # through `spark`.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["lineitem", "orders", "nation", "region", "customer"]
        data_types = [DataType.ORC]  # TODO gdf csv parquet json

        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select * from orders where o_orderdate > TIMESTAMP '1997-09-05 19:00:00' order by o_orderkey limit 10"""
            query_spark = """select * from orders where o_orderdate > TIMESTAMP '1997-09-05 19:00:00' order by o_orderkey nulls last limit 10"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                query_spark=query_spark,
            )

            queryId = "TEST_02"
            query = """select l_suppkey, l_shipdate from lineitem where l_shipdate < TIMESTAMP '1993-01-01 10:12:48' and l_suppkey < 100 order by l_orderkey"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_03"
            query = """select o_orderkey, o_orderdate from orders where o_orderdate >= TIMESTAMP '1997-09-05 19:00:00' order by o_orderkey limit 20"""
            query_spark = """select o_orderkey, o_orderdate from orders where o_orderdate >= TIMESTAMP '1997-09-05 19:00:00' order by o_orderkey nulls last limit 20"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                query_spark=query_spark,
            )

            queryId = "TEST_04"
            query = """select orders.o_orderkey, orders.o_orderdate, orders.o_orderstatus, lineitem.l_receiptdate from orders inner join lineitem on lineitem.l_orderkey = orders.o_orderkey where orders.o_orderkey < 70 and lineitem.l_orderkey < 120 and orders.o_orderdate < TIMESTAMP '1992-01-01 10:12:48' order by orders.o_orderkey, lineitem.l_linenumber, orders.o_custkey, lineitem.l_orderkey"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Implicit string-to-timestamp comparison in the WHERE clause.
            queryId = "TEST_05"
            query = """select customer.c_nationkey, customer.c_name, orders.o_orderdate from customer left outer join orders on customer.c_custkey = orders.o_custkey inner join lineitem on lineitem.l_orderkey = orders.o_orderkey where customer.c_nationkey = 3 and customer.c_custkey < 100 and orders.o_orderdate < '1990-01-01 20:00:00' order by orders.o_orderkey, lineitem.l_linenumber"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_06"
            query = """select orders.o_orderkey, orders.o_orderdate, orders.o_orderstatus from orders inner join lineitem on lineitem.l_orderkey = orders.o_orderkey where orders.o_orderkey < 20 and lineitem.l_orderkey < 16 order by orders.o_orderkey"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Explicit CAST to timestamp.
            queryId = "TEST_07"
            query = """select cast(o_orderdate as timestamp) from orders order by o_orderkey limit 5"""
            query_spark = """select cast(o_orderdate as timestamp) from orders order by o_orderkey nulls last limit 5"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                query_spark=query_spark,
            )

            queryId = "TEST_08"
            query = """select o_orderkey, cast(o_orderdate as timestamp) from orders where o_orderdate = date '1996-12-01' order by o_orderkey limit 5"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_09"
            query = """with dateTemp as ( select o_orderdate from orders where o_orderdate > TIMESTAMP '1960-05-05 12:34:55' order by o_orderkey ) select count(*) from dateTemp"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_10"
            query = """with dateTemp as ( select o_orderdate from orders where o_orderdate <= TIMESTAMP '1996-12-01 00:00:00' order by o_orderdate, o_orderkey limit 5 ) select o_orderdate as myDate from dateTemp order by o_orderdate desc"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_11"
            query = """with dateTemp as ( select o_orderdate from orders where o_orderdate = TIMESTAMP '1996-12-01 19:00:00' order by o_orderkey) select count(*) from dateTemp"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_12"
            query = """select o_orderkey, o_orderdate from orders where o_orderdate = date '1996-12-01' order by o_orderkey limit 5"""
            runTest.run_query(
                bc,
                spark,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Generator mode only needs the first file format.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(drill, dir_data_lc, bc, nRals):
    # Entry point for the mixed-filesystem suite: registers HDFS and S3
    # filesystems on the BlazingContext, then creates each TPCH table from a
    # different (filesystem, format) combination and joins across them.
    # NOTE(review): the HDFS endpoint/IP and the S3 bucket are hard-coded and
    # credentials are blanked/masked — this test presumably only runs inside a
    # specific CI environment; verify before running elsewhere.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    queryType = "Mixed filesystem Test"

    def executionTest(queryType):
        # Register HDFS
        bc.hdfs("35.185.48.245", host="35.185.48.245", port=54310,
                user="******")
        # Strip everything before "DataSet" so the same relative layout can be
        # addressed on the remote filesystems.
        dir_df = dir_data_lc[dir_data_lc.find("DataSet"): len(dir_data_lc)]
        dir_data_fs_hdfs = "hdfs://35.185.48.245/" + dir_df

        # Register S3
        authority = "tpch_s3"
        # TODO percy kharo e2e-gpuci security
        bc.s3(
            authority,
            bucket_name="blazingsql-bucket",
            encryption_type=EncryptionType.NONE,
            access_key_id="",
            secret_key="",
        )
        dir_df = dir_data_lc[dir_data_lc.find("DataSet"): len(dir_data_lc)]
        dir_data_fs_s3 = "s3://" + authority + "/" + dir_df

        # parque local
        region_files = cs.get_filenames_for_table("region", dir_data_lc,
                                                  "parquet", "")
        bc.create_table("region", region_files)

        # csv local
        nation_files = cs.get_filenames_for_table("nation", dir_data_lc,
                                                  "psv", "")
        bc.create_table(
            "nation",
            nation_files,
            delimiter="|",
            dtype=cs.get_dtypes("nation"),
            names=cs.get_column_names("nation"),
        )

        # parquet from hdfs
        lineitem_files = cs.get_filenames_for_table(
            "lineitem", dir_data_lc, "parquet", dir_data_fs_hdfs
        )
        bc.create_table("lineitem", lineitem_files)

        # parquet from S3
        # NOTE(review): despite the comment, this passes "" (local root)
        # instead of dir_data_fs_s3 — confirm whether customer is really meant
        # to be read from S3.
        customer_files = cs.get_filenames_for_table(
            "customer", dir_data_lc, "parquet", ""
        )
        bc.create_table("customer", customer_files)

        # csv con hdfs
        supplier_files = cs.get_filenames_for_table(
            "supplier", dir_data_lc, "psv", dir_data_fs_hdfs
        )
        bc.create_table(
            "supplier",
            supplier_files,
            delimiter="|",
            dtype=cs.get_dtypes("supplier"),
            names=cs.get_column_names("supplier"),
        )

        # csv from S3
        orders_files = cs.get_filenames_for_table(
            "orders", dir_data_lc, "psv", dir_data_fs_s3
        )
        bc.create_table(
            "orders",
            orders_files,
            delimiter="|",
            dtype=cs.get_dtypes("orders"),
            names=cs.get_column_names("orders"),
        )

        # Run Query ------------------------------------------------------
        # Parameter to indicate if its necessary to order
        # the resulsets before compare them
        worder = 1
        use_percentage = False
        acceptable_difference = 0.01

        print("==============================")
        print(queryType)
        print("==============================")

        queryId = "TEST_01"
        # parquet-s3, csv-local, parque-local
        query = """select sum(c.c_custkey)/count(c.c_custkey), sum(c.c_acctbal)/count(c.c_acctbal), n.n_nationkey, r.r_regionkey from customer as c inner join nation as n on c.c_nationkey = n.n_nationkey inner join region as r on r.r_regionkey = n.n_regionkey group by n.n_nationkey, r.r_regionkey"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_02"
        # parquet-s3, csv-s3
        query = """select sum(custKey)/count(custKey) from (select customer.c_custkey as custKey from (select min(o_custkey) as o_custkey from orders ) as tempOrders inner join customer on tempOrders.o_custkey = customer.c_custkey where customer.c_nationkey > 6) as joinedTables"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_03"
        # csv-s3, parquet-hdfs
        query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from orders as o inner join lineitem as l on o.o_orderkey = l.l_orderkey where l.l_orderkey < 1000"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_04"
        # csv-local, parquet-local, parquet-s3
        query = """select sum(c.c_custkey)/count(c.c_custkey), sum(c.c_acctbal)/count(c.c_acctbal), n.n_nationkey, r.r_regionkey from customer as c inner join nation as n on c.c_nationkey = n.n_nationkey inner join region as r on r.r_regionkey = n.n_regionkey group by n.n_nationkey, r.r_regionkey"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            True,
        )  # TODO: Change sum/count for avg KC

        queryId = "TEST_05"
        # parquet-s3, csv-local
        query = """select c.c_custkey, c.c_nationkey, n.n_regionkey from customer as c inner join nation as n on c.c_nationkey = n.n_nationkey where n.n_regionkey = 1 and c.c_custkey < 50"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_06"
        # csv-hdfs, parquet-hdfs, parquet-s3, csv-local
        query = """select n1.n_nationkey as supp_nation, n2.n_nationkey as cust_nation, l.l_extendedprice * l.l_discount from supplier as s inner join lineitem as l on s.s_suppkey = l.l_suppkey inner join orders as o on o.o_orderkey = l.l_orderkey inner join customer as c on c.c_custkey = o.o_custkey inner join nation as n1 on s.s_nationkey = n1.n_nationkey inner join nation as n2 on c.c_nationkey = n2.n_nationkey where n1.n_nationkey = 1 and n2.n_nationkey = 2 and o.o_orderkey < 10000"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

        queryId = "TEST_07"
        # csv-local, parquet-local
        query = """with regionTemp as ( select r_regionkey, r_name from region where r_regionkey > 2 ), nationTemp as( select n_nationkey, n_regionkey as fkey, n_name from nation where n_nationkey > 3 order by n_nationkey ) select regionTemp.r_name, nationTemp.n_name from regionTemp inner join nationTemp on regionTemp.r_regionkey = nationTemp.fkey"""
        runTest.run_query(
            bc,
            drill,
            query,
            queryId,
            queryType,
            worder,
            "",
            acceptable_difference,
            use_percentage,
            # fileSchemaType,
        )

    executionTest(queryType)

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, dir_data_file, bc, nRals):
    # Entry point for the COALESCE suite: exercises COALESCE over outer-join
    # results (literal fallbacks, column fallbacks, inside aggregates and
    # GROUP BY) per file format, validating against Drill.
    # Tests 05, 06 and 09 build their queries but have the run commented out
    # (presumably known failures — TODO confirm before re-enabling).
    start_mem = gpuMemory.capture_gpu_memory_usage()

    queryType = "Coalesce"

    def executionTest(queryType):
        tables = ["nation", "region", "customer", "orders", "lineitem"]
        # TODO json
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]  # TODO json

        # Create Tables -----------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Run Query ------------------------------------------------------
            # Parameter to indicate if its necessary to order
            # the resulsets before compare them
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            # if fileSchemaType == DataType.PARQUET : print('Save file arrow')

            print("==============================")
            print(queryType)
            print("==============================")

            queryId = "TEST_01"
            query = """select n.n_nationkey, COALESCE(r.r_regionkey,-1) from nation as n left outer join region as r on n.n_nationkey = r.r_regionkey where n.n_nationkey < 10"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
                print_result=True,
            )

            queryId = "TEST_02"
            query = """select COALESCE(orders.o_orderkey, 100), COALESCE(orders.o_totalprice, 0.01) from customer left outer join orders on customer.c_custkey = orders.o_custkey where customer.c_nationkey = 3 and customer.c_custkey < 500"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # COALESCE falling back to a column instead of a literal.
            queryId = "TEST_03"
            query = """select COALESCE(orders.o_orderkey, customer.c_custkey), COALESCE(orders.o_totalprice, customer.c_acctbal) from customer left outer join orders on customer.c_custkey = orders.o_custkey where customer.c_nationkey = 3 and customer.c_custkey < 500"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_04"
            query = """select customer.c_custkey, orders.o_orderkey, COALESCE(orders.o_custkey,123456) from customer left outer join orders on customer.c_custkey = orders.o_custkey where customer.c_nationkey = 3 and customer.c_custkey < 500"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Disabled: query kept for reference, run commented out.
            queryId = "TEST_05"
            query = """select COUNT(DISTINCT(COALESCE(n1.n_regionkey,32))), AVG(COALESCE(n1.n_regionkey,32)) from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6"""
            # runTest.run_query(bc, drill, query, queryId, queryType, worder,
            # '', acceptable_difference, use_percentage, fileSchemaType)

            # Disabled: query kept for reference, run commented out.
            queryId = "TEST_06"
            query = """select SUM(COALESCE(n2.n_nationkey, 100)), COUNT(DISTINCT(COALESCE(n1.n_nationkey,32))), n2.n_regionkey as n1key from nation as n1 full outer join nation as n2 on n1.n_nationkey = n2.n_nationkey + 6 GROUP BY n2.n_regionkey"""
            # runTest.run_query(bc, drill, query, queryId, queryType, worder,
            # '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = "TEST_07"
            query = """select MIN(COALESCE(n.n_nationkey, r.r_regionkey)), MAX(COALESCE(n.n_nationkey, 8)) from nation as n left outer join region as r on n.n_nationkey = r.r_regionkey"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_08"
            query = """select AVG(CAST(COALESCE(n.n_nationkey, r.r_regionkey) AS DOUBLE)), MAX(COALESCE(n.n_nationkey, 8)), COUNT(COALESCE(n.n_nationkey, 12)), n.n_nationkey from nation as n left outer join region as r on n.n_nationkey = r.r_regionkey GROUP BY n.n_nationkey"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # queryId = 'TEST_09'
            # query = "select SUM(COALESCE(n2.n_nationkey, 100)),
            # COUNT(DISTINCT(COALESCE(n1.n_nationkey,32))),
            # COALESCE(n2.n_regionkey, 100) as n1key from nation as n1
            # full outer join nation as n2
            # on n1.n_nationkey = n2.n_nationkey + 6
            # GROUP BY COALESCE(n2.n_regionkey, 100)"
            # runTest.run_query(bc, drill, query, queryId, queryType, worder,
            # '', acceptable_difference, use_percentage, fileSchemaType)

            # COALESCE over string columns.
            queryId = "TEST_10"
            query = "SELECT COALESCE(l_shipinstruct, l_comment) FROM lineitem"
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_11"
            query = """select n.n_nationkey, COALESCE(r.r_comment, n.n_comment) from nation as n left outer join region as r on n.n_nationkey = r.r_regionkey"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_12"
            query = """SELECT COALESCE(l.l_shipinstruct, o.o_orderstatus) FROM lineitem l inner join orders o on l.l_orderkey = o.o_orderkey where o.o_totalprice < 1574.23"""
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_13"
            query = """ WITH t1_l AS ( SELECT * FROM orders ), t1_r AS ( SELECT * FROM customer ), main_lr AS( SELECT COALESCE(o.o_comment, c.c_comment) AS info FROM t1_l o FULL JOIN t1_r c ON o.o_custkey = c.c_custkey AND o.o_orderkey = c.c_nationkey ) SELECT * FROM main_lr """
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            queryId = "TEST_14"
            query = """ WITH ltable3 AS ( select lineitem.l_orderkey as orderkey, lineitem.l_linestatus as linestatus from lineitem where mod(lineitem.l_orderkey, 2) = 0 ), rtable1 AS ( select lineitem.l_orderkey as orderkey, lineitem.l_linestatus as linestatus from lineitem where mod(lineitem.l_partkey, 6) = 0 ), rtable2 AS ( select lineitem.l_orderkey as orderkey, lineitem.l_linestatus as linestatus from lineitem where mod(lineitem.l_suppkey, 4) = 0 ), rtable3 AS ( select coalesce(l.orderkey, r.orderkey) as orderkey, coalesce(l.linestatus, r.linestatus) as linestatus from rtable1 l full join rtable2 r on l.orderkey = r.orderkey -- and l.linestatus = r.linestatus
 ), lastjoin AS ( select l.orderkey, coalesce(l.linestatus, r.linestatus) as linestatus from ltable3 l full join rtable3 r on l.orderkey = r.orderkey and l.linestatus = r.linestatus ) select * from lastjoin """
            runTest.run_query(
                bc,
                drill,
                query,
                queryId,
                queryType,
                worder,
                "",
                acceptable_difference,
                use_percentage,
                fileSchemaType,
            )

            # Generator mode only needs the first file format.
            if Settings.execution_mode == ExecutionMode.GENERATOR:
                print("==============================")
                break

    executionTest(queryType)

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    # Entry point for the ORDER BY / LIMIT suite. Most queries validate
    # against Drill; for ORC, TEST_02 and TEST_07 switch to Spark with an
    # explicit NULLS LAST/FIRST variant because null ordering differs between
    # the engines.
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["orders", "customer", "partsupp", "lineitem"]
        data_types = [
            DataType.DASK_CUDF, DataType.CUDF, DataType.CSV, DataType.ORC,
            DataType.PARQUET
        ]  # TODO json

        #Create Tables ------------------------------------------------------------------------------------------------------------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            #Run Query -----------------------------------------------------------------------------
            # worder is 0 here: these queries carry their own ORDER BY, so
            # results are compared without re-sorting (except where a literal
            # 1 is passed per test).
            worder = 0
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_01'
            query = "select o_orderkey from orders order by 1 limit 10"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_02'
            query = "select o_orderdate, o_orderkey, o_clerk from orders order by o_orderdate, o_orderkey, o_custkey, o_orderstatus, o_clerk limit 1000"
            query_spark = "select o_orderdate, o_orderkey, o_clerk from orders order by o_orderdate nulls last, o_orderkey nulls last, o_custkey nulls last, o_orderstatus nulls last, o_clerk nulls last limit 1000"
            if fileSchemaType == DataType.ORC:
                runTest.run_query(bc, spark, query, queryId, queryType,
                                  worder, '', acceptable_difference,
                                  use_percentage, fileSchemaType,
                                  query_spark=query_spark)
            else:
                runTest.run_query(bc, drill, query, queryId, queryType,
                                  worder, '', acceptable_difference,
                                  use_percentage, fileSchemaType)

            queryId = 'TEST_03'
            query = """select o_orderkey from orders where o_custkey < 300 and o_orderdate >= '1990-08-01' order by o_orderkey limit 50"""
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_04'
            query = """select ps_partkey, ps_availqty from partsupp where ps_availqty < 3 and ps_availqty >= 1 order by ps_partkey, ps_availqty limit 50"""
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # queryId = 'TEST_05'
            # query = """select o_orderkey, o_orderstatus from orders where o_custkey < 10 and o_orderstatus = 'O'
            # order by o_orderkey, o_orderstatus limit 50"""
            # runTest.run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_06'
            query = """select orders.o_totalprice, customer.c_name from orders inner join customer on orders.o_custkey = customer.c_custkey order by customer.c_name, orders.o_orderkey limit 10"""
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # UNION ALL of two differently-ordered LIMIT branches; compared
            # with ordering enabled (1) because the union's overall row order
            # is not deterministic.
            queryId = 'TEST_07'
            query = """(select l_shipdate, l_orderkey, l_linestatus from lineitem where l_linenumber = 1 order by 1, 2, 3, l_linenumber limit 10) union all (select l_shipdate, l_orderkey, l_linestatus from lineitem where l_linenumber = 1 order by 1 desc, 2, 3, l_linenumber limit 10)"""
            query_spark = """(select l_shipdate, l_orderkey, l_linestatus from lineitem where l_linenumber = 1 order by 1 nulls last, 2 nulls last, 3 nulls last, l_linenumber nulls last limit 10) union all (select l_shipdate, l_orderkey, l_linestatus from lineitem where l_linenumber = 1 order by 1 desc nulls first, 2 nulls last, 3 nulls last, l_linenumber nulls last limit 10)"""
            if fileSchemaType == DataType.ORC:
                runTest.run_query(bc, spark, query, queryId, queryType, 1,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType, query_spark=query_spark)
            else:
                runTest.run_query(bc, drill, query, queryId, queryType, 1,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

            # Intentionally empty result (c_custkey < 0).
            queryId = 'TEST_08'
            query = "select c_custkey from customer where c_custkey < 0 order by c_custkey limit 40"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_09'
            query = "select c_custkey, c_name from customer where c_custkey < 10 order by 1 limit 30"
            runTest.run_query(bc, drill, query, queryId, queryType, worder,
                              '', acceptable_difference, use_percentage,
                              fileSchemaType)

            # No ORDER BY: compare with ordering enabled (1).
            queryId = 'TEST_10'
            query = "select c_custkey, c_name from customer where c_custkey < 10 limit 30"
            runTest.run_query(bc, drill, query, queryId, queryType, 1, '',
                              acceptable_difference, use_percentage,
                              fileSchemaType)

            queryId = 'TEST_11'
            query = "select avg(CAST(c_custkey AS DOUBLE)), min(c_custkey) from customer limit 5"
            runTest.run_query(bc, drill, query, queryId, queryType, 1, '',
                              acceptable_difference, use_percentage,
                              fileSchemaType)

            # if Settings.execution_mode == ExecutionMode.GENERATOR:
            # print("==============================")
            # break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)
def main(dask_client, drill, spark, dir_data_file, bc, nRals):
    """Run the group-by / aggregation end-to-end suite.

    For every supported file schema, (re)creates the test tables and runs a
    fixed set of GROUP BY queries through BlazingSQL, comparing each result
    against a reference engine (Drill by default; Spark for the cases where
    Drill's null/date handling differs). GPU memory usage is sampled before
    and after the whole run and logged.
    """
    start_mem = gpuMemory.capture_gpu_memory_usage()

    def executionTest():
        tables = ["customer", "lineitem", "orders"]
        # TODO json
        data_types = [
            DataType.DASK_CUDF,
            DataType.CUDF,
            DataType.CSV,
            DataType.ORC,
            DataType.PARQUET,
        ]

        # Create tables, then run every query, once per file schema ---------
        for fileSchemaType in data_types:
            if skip_test(dask_client, nRals, fileSchemaType, queryType):
                continue
            cs.create_tables(bc, dir_data_file, fileSchemaType, tables=tables)

            # Comparison parameters: order result sets before comparing,
            # and allow a small absolute numeric difference.
            worder = 1
            use_percentage = False
            acceptable_difference = 0.01

            print("==============================")
            print(queryType)
            print("==============================")

            def check(engine, test_id, sql):
                # One shared wrapper for the run_query boilerplate that is
                # identical across every test in this suite.
                runTest.run_query(
                    bc,
                    engine,
                    sql,
                    test_id,
                    queryType,
                    worder,
                    "",
                    acceptable_difference,
                    use_percentage,
                    fileSchemaType,
                )

            check(drill, "TEST_01", """select count(c_custkey), sum(c_acctbal),
                    avg(c_acctbal), min(c_custkey), max(c_nationkey),
                    c_nationkey from customer group by c_nationkey""")

            check(drill, "TEST_02", """select count(c_custkey), sum(c_acctbal),
                    count(c_acctbal), avg(c_acctbal), min(c_custkey),
                    max(c_custkey), c_nationkey from customer
                    where c_custkey < 50 group by c_nationkey""")

            check(drill, "TEST_03", """select count(c_custkey) + sum(c_acctbal) +
                    avg(c_acctbal), min(c_custkey) - max(c_nationkey),
                    c_nationkey * 2 as key from customer
                    where c_nationkey * 2 < 40 group by c_nationkey * 2""")

            check(drill, "TEST_04", """select c_nationkey, count(c_acctbal)
                    from customer group by c_nationkey, c_custkey""")

            # NOTE(review): TEST_05/TEST_06 share the ORC skip (mirrored by
            # TEST_10/TEST_11 below) — confirm the guard scope is intended.
            if fileSchemaType != DataType.ORC:
                check(drill, "TEST_05", """select l.l_suppkey, l.l_linestatus,
                        min(l.l_commitdate), max(l.l_commitdate),
                        max(l.l_orderkey), count(l.l_orderkey)
                        FROM ( SELECT l_suppkey, l_linestatus, l_shipmode,
                        l_orderkey, l_commitdate from lineitem
                        WHERE l_linenumber = 6
                        and l_commitdate < DATE '1993-01-01' ) AS l
                        LEFT OUTER JOIN orders AS o
                        ON l.l_orderkey + 100 = o.o_orderkey
                        GROUP BY l.l_suppkey, l.l_linestatus
                        order by l.l_suppkey, l.l_linestatus
                        limit 10000""")

                check(drill, "TEST_06", """select l.l_suppkey, l.l_linestatus,
                        max(l.l_shipmode), max(l.l_orderkey),
                        count(l.l_orderkey) FROM lineitem AS l
                        LEFT OUTER JOIN orders AS o
                        ON l.l_orderkey + 100 = o.o_orderkey
                        GROUP BY l.l_suppkey, l.l_linestatus""")

            check(drill, "TEST_07", """SELECT count(distinct l_orderkey),
                    count(distinct l_partkey), count(distinct l_suppkey)
                    FROM lineitem
                    GROUP BY l_orderkey, l_partkey, l_suppkey""")

            # Failed test with nulls
            # Reported issue: https://github.com/BlazingDB/blazingsql/issues/1403
            testsWithNulls = Settings.data["RunSettings"]["testsWithNulls"]
            if testsWithNulls != "true":
                check(spark, "TEST_08", """SELECT count(distinct l_partkey,
                        l_suppkey) FROM lineitem
                        GROUP BY l_partkey, l_suppkey""")

            # Drill's date handling differs for ORC, so validate against
            # Spark for that schema only.
            query_09 = """select l_orderkey, l_extendedprice, l_shipdate
                    from lineitem where l_orderkey < 100
                    group by l_orderkey, l_extendedprice,
                    l_shipdate, l_linestatus"""
            if fileSchemaType == DataType.ORC:
                check(spark, "TEST_09", query_09)
            else:
                check(drill, "TEST_09", query_09)

            if fileSchemaType != DataType.ORC:
                check(drill, "TEST_10", """select l.l_suppkey, l.l_linestatus,
                        min(l.l_commitdate), max(l.l_commitdate),
                        max(l.l_orderkey), count(l.l_orderkey)
                        FROM ( SELECT l_suppkey, l_linestatus, l_shipmode,
                        l_orderkey, l_commitdate from lineitem
                        WHERE l_linenumber = 6
                        and l_commitdate < DATE '1993-01-01' ) AS l
                        RIGHT OUTER JOIN orders AS o
                        ON l.l_orderkey + 100 = o.o_orderkey
                        GROUP BY l.l_suppkey, l.l_linestatus
                        order by l.l_suppkey, l.l_linestatus
                        limit 10000""")

                check(drill, "TEST_11", """select l.l_suppkey, l.l_linestatus,
                        max(l.l_shipmode), max(l.l_orderkey),
                        count(l.l_orderkey) FROM lineitem AS l
                        RIGHT OUTER JOIN orders AS o
                        ON l.l_orderkey + 100 = o.o_orderkey
                        GROUP BY l.l_suppkey, l.l_linestatus""")

            # if Settings.execution_mode == ExecutionMode.GENERATOR:
            #     print("==============================")
            #     break

    executionTest()

    end_mem = gpuMemory.capture_gpu_memory_usage()
    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)