Example #1
def init_context():
    bc = None
    dask_client = None
    nRals = int(Settings.data["RunSettings"]["nRals"])
    nGpus = int(Settings.data["RunSettings"]["nGPUs"])
    if nRals == 1:
        bc = BlazingContext()
    else:
        os.chdir(Settings.data["TestSettings"]["logDirectory"])
        dask_client = try_to_get_dask_client(nRals, nGpus)
        if dask_client is not None:
            dask_conn = Settings.data["TestSettings"]["daskConnection"]
            iface = Settings.data["RunSettings"]["networkInterface"]
            print("Using dask: " + dask_conn)
            if "local" != dask_conn:
                dask_client.restart()
            bc = BlazingContext(
                dask_client=dask_client,
                network_interface=iface,
                pool=False,
                initial_pool_size=None,
                allocator="managed",
            )
        else:
            # Fallback: could not find a valid dask server
            bc = BlazingContext()
    return (bc, dask_client)
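A minimal usage sketch for this helper (not part of the original example), assuming the suite's Settings are already loaded; the table name, file path, and query are illustrative placeholders:

# Hypothetical caller: unpack the context plus the (possibly None) dask client.
bc, dask_client = init_context()

# Illustrative table and query; the parquet path is a placeholder.
bc.create_table('lineitem', '/path/to/lineitem.parquet')
result = bc.sql('select count(*) from lineitem')
print(result)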
Example #2
def init_context(config_options={}):
    bc = None
    dask_client = None
    nRals = int(Settings.data["RunSettings"]["nRals"])
    nGpus = int(Settings.data["RunSettings"]["nGPUs"])
    if nRals == 1:
        bc = BlazingContext(config_options=config_options)
    else:
        os.chdir(Settings.data["TestSettings"]["logDirectory"])
        iface = Settings.data["RunSettings"]["networkInterface"]
        dask_client = try_to_get_dask_client(nRals, nGpus, iface)
        if dask_client is not None:
            dask_conn = Settings.data["TestSettings"]["daskConnection"]

            # print("Using dask: " + dask_conn)
            # if "local" != dask_conn:

            bc = BlazingContext(
                dask_client=dask_client,
                network_interface=iface,
                # pool=True,
                # initial_pool_size=300000000,
                allocator="default",
                config_options=config_options,
            )
        else:
            # Fallback: could not find a valid dask server
            bc = BlazingContext(config_options=config_options)
    return (bc, dask_client)
Example #3
def testing_load_hive_table(table_name, location, partitions, partitions_schema):
	bc = BlazingContext()

	bc.create_table(table_name, location,
					file_format='parquet',
					hive_table_name=table_name,
					partitions=partitions,
					partitions_schema=partitions_schema)
Example #4
def create_hive_partition_data(input, file_format, table_name, partitions, output, num_files):
	if not os.path.exists(output):
		os.makedirs(output)

	bc = BlazingContext(dask_client=None)
	if file_format == 'psv':
		dtypes = get_dtypes(table_name)
		col_names = get_column_names(table_name)
		bc.create_table(table_name, input, file_format='csv', delimiter="|", dtype=dtypes,names=col_names)
	else:
		bc.create_table(table_name, input)

	columns = bc.describe_table(table_name)
	data_partition_array_dict = []
	for partition in partitions:
		if partition in columns:
			result = bc.sql(f'select distinct({partition}) from {table_name}')

			if type(result) is dask_cudf.core.DataFrame:
				result = result.compute()

			valuesPartition = result.to_pandas().to_dict()
			finalValues = list(set(valuesPartition[partition].values()) & set(partitions[partition]))
			dictOfvalues = {i: finalValues[i] for i in range(0, len(finalValues))}
			valuesPartition[partition] = dictOfvalues
			data_partition_array_dict.append(valuesPartition)
		else:
			print('Column "' + partition + '" does not exist')

	_save_partition_files(bc, table_name, data_partition_array_dict, output, file_format, num_files)
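Because the loop above indexes partitions[partition], the partitions argument is evidently a mapping from column name to candidate values for that column. A hedged sketch of a call, in which the paths, table name, and values are all illustrative:

# Hypothetical invocation; every literal below is a placeholder.
create_hive_partition_data(
    input='/data/tpch/orders/*.parquet',
    file_format='parquet',
    table_name='orders',
    partitions={'o_orderstatus': ['F', 'O', 'P']},  # column -> candidate values
    output='/data/tpch/orders_partitioned',
    num_files=4,
)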
Example #5
def create_blazing_context(sched):
    from blazingsql import BlazingContext
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    if not sched:
        cluster = LocalCUDACluster()
        # Need dev version 1.18+ of reticulate
        # Error: C stack usage  193896484868 is too close to the limit
        client = Client(cluster)
    else:
        client = Client(sched)
    ctx = BlazingContext(dask_client=client, network_interface='lo')

    # FIX: develop better client handling on the R side
    ctx.dors_client = client
    return ctx
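The sched argument doubles as a switch: a falsy value spins up a fresh LocalCUDACluster, while a scheduler address attaches to an existing one. A brief sketch, assuming a dask scheduler is listening on the illustrative address:

# Single node: let the helper create its own LocalCUDACluster.
ctx_local = create_blazing_context(None)

# Multi node: attach to an already-running dask scheduler (address illustrative).
ctx_remote = create_blazing_context('127.0.0.1:8786')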
Example #6
def init_context():
    bc = None
    dask_client = None
    nRals = int(Settings.data['RunSettings']['nRals'])
    nGpus = int(Settings.data['RunSettings']['nGPUs'])
    if nRals == 1:
        bc = BlazingContext()
    else:
        os.chdir(Settings.data['TestSettings']['logDirectory'])
        dask_client = try_to_get_dask_client(nRals, nGpus)
        if dask_client is not None:
            dask_conn = Settings.data['TestSettings']['daskConnection']
            iface = Settings.data['RunSettings']['networkInterface']
            print("Using dask: " + dask_conn)
            if 'local' != dask_conn:
                dask_client.restart()
            bc = BlazingContext(
                dask_client=dask_client,
                network_interface=iface,
                pool=False,
                initial_pool_size=None,
                enable_logging=False,
                allocator="managed",
            )
        else:
            # Fallback: could not find a valid dask server
            bc = BlazingContext()
    return (bc, dask_client)
Example #7
def init_context(
    useProgressBar: bool = False,
    config_options={
        "ENABLE_GENERAL_ENGINE_LOGS": True,
        "ENABLE_COMMS_LOGS": True,
        "ENABLE_TASK_LOGS": True,
        "ENABLE_OTHER_ENGINE_LOGS": True
    }):
    bc = None
    dask_client = None
    nRals = int(Settings.data["RunSettings"]["nRals"])
    nGpus = int(Settings.data["RunSettings"]["nGPUs"])
    if nRals == 1:
        bc = BlazingContext(config_options=config_options,
                            enable_progress_bar=useProgressBar)
    else:
        os.chdir(Settings.data["TestSettings"]["logDirectory"])
        iface = Settings.data["RunSettings"]["networkInterface"]
        dask_client = try_to_get_dask_client(nRals, nGpus, iface)
        if dask_client is not None:
            dask_conn = Settings.data["TestSettings"]["daskConnection"]

            # print("Using dask: " + dask_conn)
            # if "local" != dask_conn:

            bc = BlazingContext(
                dask_client=dask_client,
                network_interface=iface,
                # pool=True,
                # initial_pool_size=300000000,
                allocator="default",
                config_options=config_options,
                enable_progress_bar=useProgressBar)
        else:
            # Fallback: could not find a valid dask server
            bc = BlazingContext(config_options=config_options,
                                enable_progress_bar=useProgressBar)

    return (bc, dask_client)
Example #8
class BlazingSQLHelper:
    def __init__(self):
        cluster = LocalCUDACluster()
        client = Client(cluster)
        self._bc = BlazingContext(dask_client=client, network_interface='lo')

    """This function runs blazingSQL query. 
    
    :param config: Query related tables configuration.
    :type config: dict
    :return: Query results.
    :rtype: cudf.DataFrame
    """

    def run_query(self, config):
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        result = result.compute()
        return result

    """This function drops blazingSQL tables.
    :param table_names: List of table names to drop.
    :type table_names: List
    """

    def drop_table(self, table_names):
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
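run_query pops table_name and input_path out of each table entry and forwards whatever keys remain to create_table, so a config looks roughly like the sketch below; the path, table name, and the extra delimiter kwarg are illustrative:

# Hypothetical config; any key besides table_name/input_path is passed through
# to create_table as a keyword argument.
config = {
    "tables": [
        {"table_name": "taxi", "input_path": "/data/taxi_00.csv", "delimiter": ","},
    ],
    "sql": "SELECT passenger_count, COUNT(*) FROM taxi GROUP BY passenger_count",
}
helper = BlazingSQLHelper()
df = helper.run_query(config)  # a cudf.DataFrame, per the docstring
helper.drop_table(["taxi"])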
Example #9
class BlazingSQLHelper:
    def __init__(self, pool=False):
        # Setting pool=True allocates half the GPU memory.
        self._bc = BlazingContext(pool=pool)

    """This function runs blazingSQL query. 
    
    :param config: Query related tables configuration.
    :type config: dict
    :return: Query results.
    :rtype: cudf.DataFrame
    """

    def run_query(self, config):
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        self.has_data = False
        return result

    """This function drops blazingSQL tables.
    :param table_names: List of table names to drop.
    :type table_names: List
    """

    def drop_table(self, table_names):
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
Example #10
def testing_load_hive_table(table_name, file_format, location, partitions, partitions_schema):
	bc2 = BlazingContext(dask_client=None)

	if file_format == 'psv':
		dtypes = get_dtypes(table_name)
		col_names = get_column_names(table_name)
		bc2.create_table(table_name, location,
						file_format='csv',
						hive_table_name=table_name,
						partitions=partitions,
						partitions_schema=partitions_schema,
						delimiter="|",
						dtype=dtypes,
						names=col_names)
	else:
		bc2.create_table(table_name, location, file_format=file_format)
Example #11
def create_hive_partition_data(input, table_name, partitions, output, num_files_per_parquet):
	if not os.path.exists(output):
		os.makedirs(output)

	bc = BlazingContext()
	bc.create_table(table_name, input)

	columns = bc.describe_table(table_name)
	data_partition_array_dict = []
	for partition in partitions:
		if partition in columns:
			values = bc.sql(f'select distinct({partition}) from {table_name}')
			data_partition_array_dict.append(values.to_pandas().to_dict())
		else:
			print('Column "' + partition + '" does not exist')

	_save_partition_files(bc, table_name, data_partition_array_dict, output, num_files_per_parquet)
Example #12
from dask.distributed import Client
from blazingsql import BlazingContext
from dask_cuda import LocalCUDACluster

# initialize BlazingContext with a Dask client over the local GPUs to distribute query execution
bc = BlazingContext(dask_client=Client(LocalCUDACluster()),
                    network_interface='lo')

# register public AWS S3 bucket
bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')

# create a table from that S3 bucket
col_names = [
    'key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y',
    'passenger_count'
]
bc.create_table('taxi',
                's3://blazingsql-colab/taxi_data/taxi_00.csv',
                names=col_names)

# query the table & write results locally as parquet
bc.sql('SELECT * FROM taxi').to_parquet('../../data/yellow_cab')
Example #13
def __init__(self):
    cluster = LocalCUDACluster()
    client = Client(cluster)
    self._bc = BlazingContext(dask_client=client, network_interface='lo')
Example #14
from blazingsql import BlazingContext
bc = BlazingContext()

company_types = ['int64', 'str', 'date', 'str', 'str', 'str']
bc.create_table('companies', '/home/nomis/20GB_CSV/Companies.csv', dtype=company_types)
address_types = ['int64', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'int64']
bc.create_table('addresses', '/home/nomis/20GB_CSV/Addresses.csv', dtype=address_types)
department_types = ['int64', 'str', 'int64']
bc.create_table('departments', '/home/nomis/20GB_CSV/Departments.csv', dtype=department_types)
category_types = ['int64', 'str', 'int64']
bc.create_table('categories', '/home/nomis/20GB_CSV/Categories.csv', dtype=category_types)
product_types = ['int64', 'str', 'str', 'str', 'str', 'date', 'float64', 'int64']
bc.create_table('products', '/home/nomis/20GB_CSV/Products.csv', dtype=product_types)
review_types = ['int64', 'str', 'str', 'date', 'int64']
bc.create_table('reviews', '/home/nomis/20GB_CSV/Reviews.csv', dtype=review_types)

query = '''
SELECT a.City
FROM addresses a
    JOIN companies c ON a.CompanyId = c.Id
    JOIN departments d ON c.Id = d.CompanyId
    JOIN categories k ON d.Id = k.DepartmentId
    JOIN products p ON k.Id = p.CategoryId
    JOIN reviews r ON p.Id = r.ProductId
WHERE YEAR(r.PublishDate) >= 2021
GROUP BY a.City
    '''

af = bc.sql(query)
Example #15
def __init__(self, pool=False):
    # Setting pool=True allocates half the GPU memory.
    self._bc = BlazingContext(pool=pool)
Example #16
def main(drill, dir_data_file, nRals):

    table1 = "customer"
    table2 = "orders"
    table3 = "lineitem"

    #Read Data TPCH------------------------------------------------------------------------------------------------------------
    customer_gdf = cs.read_data(table1, dir_data_file)
    orders_gdf = cs.read_data(table2, dir_data_file)
    lineitem_gdf = cs.read_data(table3, dir_data_file)

    #Create Tables ------------------------------------------------------------------------------------------------------------
    bc = BlazingContext()

    bc.create_table('customer', customer_gdf)
    bc.create_table('orders', orders_gdf)
    bc.create_table('lineitem', lineitem_gdf)

    #Run Query -----------------------------------------------------------------------------
    worder = 1  # Parameter to indicate if it's necessary to order the result sets before comparing them
    use_percentage = False
    acceptable_difference = 0.01
    queryType = 'Performance Test'

    print('==============================')
    print(queryType)
    print('==============================')

    queryId = 'TEST_01'
    query = "select min(l_orderkey) from lineitem"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_02'
    query = "select sum(o_custkey), avg(o_totalprice), min(o_custkey), max(o_orderkey), count(o_orderkey) from orders"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_03'
    query = "select max(l_partkey) from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_04'
    query = "select max(l_partkey), min(l_orderkey), sum(l_orderkey), avg(l_orderkey) from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_05'
    query = "select max(l_partkey) from lineitem group by l_orderkey, l_suppkey, l_linenumber"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_06'
    query = """select max(l_partkey), min(l_orderkey), sum(l_orderkey), avg(l_orderkey) from lineitem group by l_orderkey, 
    l_suppkey, l_linenumber"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_07'
    query = "select l_orderkey from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_08'
    query = "select l_orderkey, l_extendedprice, l_shipdate from lineitem group by l_orderkey, l_extendedprice, l_shipdate, l_linestatus"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_09'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from orders as o
            inner join lineitem as l on o.o_orderkey = l.l_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_10'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from lineitem as l 
            inner join orders as o on o.o_orderkey = l.l_orderkey
            inner join customer as c on c.c_custkey = o.o_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_11'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from lineitem as l 
    inner join orders as o on o.o_orderkey = l.l_orderkey
    inner join customer as c on c.c_nationkey = o.o_custkey and l.l_linenumber = c.c_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_12'
    query = """select o.o_totalprice, l.l_partkey from orders as o
        left outer join lineitem as l on o.o_custkey = l.l_linenumber and l.l_suppkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_13'
    query = """select c.c_name, o.o_custkey, l.l_linenumber from customer as c
        left outer join orders as o on c.c_custkey = o.o_custkey
        left outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_14'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_linestatus from orders as o
        full outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_15'
    query = """select c.c_name, o.o_custkey, l.l_linenumber from customer as c
        full outer join orders as o on c.c_custkey = o.o_custkey
        full outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_16'
    query = """select l_returnflag, l_shipdate, l_linestatus from lineitem where l_orderkey < 10000
        union all
        select l_returnflag, l_shipdate, l_linestatus from lineitem where l_partkey > 100"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_17'
    query = """select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1 from orders where o_orderkey < 10000
        union all
        select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1 from orders where o_custkey < 100000
        union all
        select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1 from orders where o_orderstatus = 'O'
        union all
        select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1 from orders where o_totalprice < 350"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_18'
    query = """select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority from orders 
    where o_custkey<>4318470"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_19'
    query = "select l_orderkey, l_partkey, l_suppkey, l_returnflag from lineitem where l_returnflag<>'g packages.'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_20'
    query = "select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority from orders where o_custkey=88910"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_21'
    query = "select l_orderkey, l_partkey, l_suppkey, l_returnflag from lineitem where l_returnflag='N'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_22'
    query = "select o_orderkey, o_custkey, o_orderstatus, o_totalprice from orders where o_orderkey=1000"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_23'
    query = "select l_orderkey, l_partkey, l_suppkey, l_comment from lineitem where l_comment='dolites wake'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_24'
    query = """select c_custkey, c_nationkey, c_acctbal from customer 
    where c_custkey < 15000 and c_nationkey = 5 or c_custkey = c_nationkey * c_nationkey or c_nationkey >= 10 
    or c_acctbal <= 500"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    #     queryId = 'TEST_25'
    #     query = "select o_orderkey, o_orderstatus, o_totalprice, o_comment from orders where o_orderstatus = 'O'"
    #     runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_26'
    query = "select ((l_orderkey + l_partkey)/2)* l_quantity - l_suppkey*(l_discount - 1000.999) from lineitem"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_27'
    query = """select (l_orderkey + l_partkey*l_quantity)/2, (l_orderkey + l_quantity - l_partkey)*(l_orderkey + 3), 
    l_orderkey/l_partkey - l_quantity, l_quantity*l_linenumber/l_partkey*l_tax from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_28'
    query = """select EXTRACT(YEAR FROM l_receiptdate) - EXTRACT(YEAR FROM l_shipdate) as years_late, 
        EXTRACT(MONTH FROM l_receiptdate) - EXTRACT(MONTH FROM l_shipdate) as months_late, 
        EXTRACT(DAY FROM l_receiptdate) - EXTRACT(DAY FROM l_shipdate) as days_late 
        from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_29'
    query = """select l_quantity, sin(l_quantity), cos(l_quantity), asin(l_quantity), acos(l_quantity), ln(l_quantity), 
    tan(l_quantity), atan(l_quantity), floor(l_quantity), ceil(l_quantity) from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_30'
    query = "select o_totalprice, o_custkey + o_custkey from orders order by o_totalprice"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_31'
    query = "select o_orderkey, o_custkey, o_orderstatus from orders order by o_comment"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_32'
    query = "select o_orderkey, o_custkey, o_totalprice, o_orderstatus from orders order by o_orderkey, o_custkey, o_totalprice"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_33'
    query = """select o_orderkey, o_custkey, o_totalprice, o_orderstatus from orders order by o_orderstatus, 
    o_shippriority, o_comment"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_34'
    query = "select c_custkey + c_nationkey, c_acctbal from customer order by 1 desc, 2"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_35'
    query = """select custOrders.avgPrice, custOrders.numOrders from customer
    inner join
    (select o_custkey as o_custkey, avg(o_totalprice) as avgPrice, count(o_totalprice) as numOrders from orders
    where o_custkey <= 100 group by o_custkey) as custOrders
    on custOrders.o_custkey = customer.c_custkey
    where customer.c_nationkey <= 5"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_36'
    query = """with ordersTemp as ( 
            select min(o_orderkey) as priorityKey, o_custkey from orders group by o_custkey 
        ), ordersjoin as(
            select orders.o_custkey from orders inner join ordersTemp on ordersTemp.priorityKey = orders.o_orderkey
        ) 
        select customer.c_custkey, customer.c_nationkey from customer inner join ordersjoin on ordersjoin.o_custkey =  customer.c_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)
Example #17
import time
import pprint
import cudf
from blazingsql import BlazingContext
from dask.distributed import Client
from pyhive import hive
import os
import subprocess

# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

authority = 'localhost:54310'
hdfs_host = 'localhost'
hdfs_port = 54310
hdfs_driver = 'libhdfs'
result, error_msg, fs = bc.hdfs(authority,
                                host=hdfs_host,
                                port=hdfs_port,
                                user='******',
                                driver=hdfs_driver)

cursor = hive.connect('localhost').cursor()

table = bc.create_table('ptransactions', cursor, file_format='parquet')
for i in range(11):
    query = "SELECT * FROM ptransactions where t_year=2017 and t_company_id={t_company_id} LIMIT 10".format(
        t_company_id=i)
Example #18
from blazingsql import BlazingContext

bc = BlazingContext()

def main():
    """
    main():
        parameters:
        -----------
        None
    
        description:
        ------------
        Creates a connection to the GPU underneath and holds it in "bc".
        Next, we access the "colab" environment S3 bucket (which holds a parquet
        file) and create a table named "taxi" with it.
        Finally, this example ends by asking how BSQL plans to extract the
        data rather than actually extracting it.
    
        result:
        -------
        'LogicalSort(fetch=[2])
             BindableTableScan(table=[[main, taxi]], projects=[[3, 4]], aliases=[[passenger_count, trip_distance]])
        '
    """
    global bc
    bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')
    
    bc.create_table('taxi', 's3://blazingsql-colab/yellow_taxi/taxi_data.parquet')
    
    query = 'SELECT passenger_count, trip_distance FROM taxi LIMIT 2'
    # Presumably the plan quoted in the docstring comes from explaining,
    # rather than running, the query:
    print(bc.explain(query))
Example #19
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client
# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])
path_parquet = dir_data_fs + 'simple.parquet'
print(path_parquet)
bc.create_table('simple', path_parquet)

# "BindableTableScan(table=[[main, customer]],
# filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
# projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"
# query = """select c_custkey, c_nationkey, c_acctbal
#             from
#               customer
#             where
#               c_custkey > 2990 and c_custkey < 3010
#             """

query = """select *
            from 
              simple
Example #20
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client

from collections import OrderedDict

client = Client('127.0.0.1:8786')
bc = BlazingContext(dask_client=client, network_interface="lo")

bc.create_table('customer',
                '/home/aocsa/tpch/100MB2Part/tpch/customer_0_0.parquet')
bc.create_table('orders', '/home/aocsa/tpch/100MB2Part/tpch/orders_*.parquet')

algebra = """LogicalProject(c_custkey=[$0], c_nationkey=[$3])
  LogicalFilter(condition=[<($0, 10)])
    LogicalTableScan(table=[[main, customer]])
"""

plan = """
{
		'expr': 'LogicalProject(c_custkey=[$0], c_nationkey=[$3])',
		'children': [
			{
				'expr': 'LogicalFilter(condition=[<($0, 10)])',
				'children': [
					{
						'expr': 'LogicalTableScan(table=[[main, customer]])',
						'children': []
					}
Example #21
def main():

    install("gitpython")

    if not os.path.exists(conda_prefix + "/blazingsql-testing-files/"):
        git_clone()
    else:
        git_pull()

    unzip()

    queryType = ' Local Tests '

    if len(sys.argv) == 2:
        n_nodos = sys.argv[1]
    else:
        n_nodos = "1"

    if n_nodos == "1":
        print("Executing test in Single Node")
        bc = BlazingContext()
    else:
        print("Executing test in Distributed Mode")
        from dask.distributed import Client
        client = Client('127.0.0.1:8786')
        print("Dask client ready!")
        bc = BlazingContext(dask_client=client, network_interface='lo')

    log_dict = {}

    def executionLocalTest(queryType):

        #Read Data TPCH------------------------------------------------------------------------------------------------------------

        tables = [
            'nation', 'region', 'supplier', 'customer', 'lineitem', 'orders',
            'part', 'partsupp'
        ]

        data_types = [DataType.PARQUET]  # TODO json

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            #   Run Query -----------------------------------------------------------------------------
            worder = 1  # Parameter to indicate if it's necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_01'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_02'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_03'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_04'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            queryId = 'TEST_05'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_06'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_07'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

    executionLocalTest(queryType)

    queryType = ' S3 Tests '

    def executionS3Test(queryType):

        #Read Data TPCH------------------------------------------------------------------------------------------------------------

        authority = "tpch_s3"

        print(authority)
        print(bucket_name)
        hola = S3EncryptionType.NONE
        print(hola)
        print(access_key_id)
        print(secret_key)

        bc.s3(authority,
              bucket_name=bucket_name,
              encryption_type=S3EncryptionType.NONE,
              access_key_id=access_key_id,
              secret_key=secret_key)

        dir_data_lc = "s3://" + authority + "/" + "DataSet100Mb2part/"

        tables = [
            'nation', 'region', 'supplier', 'customer', 'lineitem', 'orders',
            'part', 'partsupp'
        ]
        data_types = [DataType.PARQUET]  # TODO json

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            #   Run Query -----------------------------------------------------------------------------
            worder = 1  # Parameter to indicate if it's necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_08'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_09'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            queryId = 'TEST_10'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_11'
            #print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            #result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_12'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_13'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_14'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

    executionS3Test(queryType)

    queryType = ' GS Tests '

    def executionGSTest(queryType):

        authority = "tpch_gs"

        bc.gs(authority,
              project_id=gs_project_id,
              bucket_name=gs_bucket_name,
              use_default_adc_json_file=True,
              adc_json_file='')

        dir_data_lc = 'gcs://' + authority + '/100MB2Part/'

        tables = [
            'nation', 'region', 'supplier', 'customer', 'lineitem', 'orders',
            'part', 'partsupp'
        ]
        data_types = [DataType.PARQUET]

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            #   Run Query -----------------------------------------------------------------------------
            worder = 1  # Parameter to indicate if it's necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_15'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_16'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            queryId = 'TEST_17'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_18'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_19'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_20'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_21'
            print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            result = run_query(bc, spark, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_22'
            #print("Executing " + queryId + " ... ")
            query = tpch.get_tpch_query(queryId)
            #result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    executionGSTest(queryType)

    green = bcolors.OKGREEN
    endc = bcolors.ENDC

    print(green + "=======================================")
    print("SUMMARY TESTS")
    print("=======================================" + endc)

    for key, value in log_dict.items():
        print('"{}" : {} '.format(key, value))

    print(green + "=======================================" + endc)
Example #22
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client
client = Client('127.0.0.1:8786')
client.restart()
bc = BlazingContext(dask_client=client, network_interface="lo")

# bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/100MB2Part/'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])

bc.create_table('customer', dir_data_fs + '/customer_*.parquet')

# "BindableTableScan(table=[[main, customer]],
# filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
# projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"
# query = """select c_custkey, c_nationkey, c_acctbal
#             from
#               customer
#             where
#               c_custkey > 2990 and c_custkey < 3010
#             """

query = "select sum(c_custkey)/count(c_custkey), min(c_custkey) from customer limit 5"

# [b'c_custkey', b'c_name', b'c_address', b'c_nationkey', b'c_phone', b'c_acctbal', b'c_mktsegment', b'c_comment']
lp = bc.explain(query)
Example #23
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client
# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/DataSet5Part100MB'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])

bc.create_table('customer', [
    dir_data_fs + '/customer_0_0.parquet', dir_data_fs +
    '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet',
    dir_data_fs + '/customer_3_0.parquet',
    dir_data_fs + '/customer_4_0.parquet'
])

# "BindableTableScan(table=[[main, customer]],
# filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
# projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"
# query = """select c_custkey, c_nationkey, c_acctbal
#             from
#               customer
#             where
#               c_custkey > 2990 and c_custkey < 3010
#             """
Example #24
def attach_to_cluster(config, create_blazing_context=False):
    """Attaches to an existing cluster if available.
    By default, tries to attach to a cluster running on localhost:8786 (dask's default).

    This is currently hardcoded to assume the dashboard is running on port 8787.

    Optionally, this will also create a BlazingContext.
    """
    host = config.get("cluster_host")
    port = config.get("cluster_port", "8786")

    if host is not None:
        try:
            content = requests.get(
                "http://" + host +
                ":8787/info/main/workers.html").content.decode("utf-8")
            url = content.split("Scheduler ")[1].split(":" + str(port))[0]
            client = Client(address=f"{url}:{port}")
            print(f"Connected to {url}:{port}")
        except requests.exceptions.ConnectionError as e:
            sys.exit(
                f"Unable to connect to existing dask scheduler dashboard to determine cluster type: {e}"
            )
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")

    else:
        raise ValueError("Must pass a cluster address to the host argument.")

    def maybe_create_worker_directories(dask_worker):
        worker_dir = dask_worker.local_directory
        if not os.path.exists(worker_dir):
            os.mkdir(worker_dir)

    client.run(maybe_create_worker_directories)

    # Get ucx config variables
    ucx_config = client.submit(_get_ucx_config).result()
    config.update(ucx_config)

    # Save worker information
    gpu_sizes = ["16GB", "32GB", "40GB"]
    worker_counts = worker_count_info(client, gpu_sizes=gpu_sizes)
    for size in gpu_sizes:
        key = size + "_workers"
        if config.get(
                key) is not None and config.get(key) != worker_counts[size]:
            print(
                f"Expected {config.get(key)} {size} workers in your cluster, but got {worker_counts[size]}. It can take a moment for all workers to join the cluster. You may also have misconfigred hosts."
            )
            sys.exit(-1)

    config["16GB_workers"] = worker_counts["16GB"]
    config["32GB_workers"] = worker_counts["32GB"]
    config["40GB_workers"] = worker_counts["40GB"]

    bc = None
    if create_blazing_context:
        bc = BlazingContext(
            dask_client=client,
            pool=os.environ.get("BLAZING_POOL", False),
            network_interface=os.environ.get("INTERFACE", "ib0"),
            config_options=get_config_options(),
            allocator=os.environ.get("BLAZING_ALLOCATOR_MODE", "managed"),
            initial_pool_size=os.environ.get("BLAZING_INITIAL_POOL_SIZE",
                                             None))

    return client, bc
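A hedged sketch of attaching with this helper, assuming a dask scheduler and its dashboard are reachable at the illustrative address. Note that pool and initial_pool_size come from environment variables as raw strings, so any non-empty BLAZING_POOL value (even "False") is truthy:

# Hypothetical config; only cluster_host and cluster_port are read by the helper.
config = {"cluster_host": "127.0.0.1", "cluster_port": "8786"}
client, bc = attach_to_cluster(config, create_blazing_context=True)
print(config["16GB_workers"], "workers with 16GB GPUs")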
Example #25
def attach_to_cluster(config, create_blazing_context=False):
    """Attaches to an existing cluster if available.
    By default, tries to attach to a cluster running on localhost:8786 (dask's default).

    This is currently hardcoded to assume the dashboard is running on port 8787.

    Optionally, this will also create a BlazingContext.
    """
    scheduler_file = config.get("scheduler_file_path")
    host = config.get("cluster_host")
    port = config.get("cluster_port", "8786")

    if scheduler_file is not None:
        try:
            with open(scheduler_file) as fp:
                print(fp.read())
            client = Client(scheduler_file=scheduler_file)
            print('Connected!')
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")

    elif host is not None:
        try:
            content = requests.get(
                "http://" + host +
                ":8787/info/main/workers.html").content.decode("utf-8")
            url = content.split("Scheduler ")[1].split(":" + str(port))[0]
            client = Client(address=f"{url}:{port}")
            print(f"Connected to {url}:{port}")
            config["protocol"] = str(url)[0:3]
        except requests.exceptions.ConnectionError as e:
            sys.exit(
                f"Unable to connect to existing dask scheduler dashboard to determine cluster type: {e}"
            )
        except OSError as e:
            sys.exit(f"Unable to create a Dask Client connection: {e}")

    else:
        raise ValueError(
            "Must pass a scheduler file or cluster address to the host argument."
        )

    def maybe_create_worker_directories(dask_worker):
        worker_dir = dask_worker.local_directory
        if not os.path.exists(worker_dir):
            os.mkdir(worker_dir)

    client.run(maybe_create_worker_directories)

    # Get ucx config variables
    ucx_config = client.submit(_get_ucx_config).result()
    config.update(ucx_config)

    # Save worker information
    # Assumes all GPUs are the same size
    expected_workers = config.get("num_workers")
    worker_counts = worker_count_info(client)
    for gpu_size, count in worker_counts.items():
        if count != 0:
            current_workers = worker_counts.pop(gpu_size)
            break

    if expected_workers is not None and expected_workers != current_workers:
        print(
            f"Expected {expected_workers} {gpu_size} workers in your cluster, but got {current_workers}. It can take a moment for all workers to join the cluster. You may also have misconfigred hosts."
        )
        sys.exit(-1)

    config["16GB_workers"] = worker_counts.get("16GB", 0)
    config["32GB_workers"] = worker_counts.get("32GB", 0)
    config["40GB_workers"] = worker_counts.get("40GB", 0)

    bc = None
    if create_blazing_context:
        from blazingsql import BlazingContext
        bc = BlazingContext(
            dask_client=client,
            pool=os.environ.get("BLAZING_POOL", False),
            network_interface=os.environ.get("INTERFACE", "ib0"),
            config_options=get_bsql_config_options(),
            allocator=os.environ.get("BLAZING_ALLOCATOR_MODE", "managed"),
            initial_pool_size=os.environ.get("BLAZING_INITIAL_POOL_SIZE",
                                             None))

    return client, bc
Example #26
				imp_sk,
				price_change,
				SUM( CASE WHEN ((ss_sold_date_sk >= c.imp_start_date) AND (ss_sold_date_sk < (c.imp_start_date + c.no_days_comp_price))) THEN ss_quantity ELSE 0 END) AS current_ss_quant,
				SUM( CASE WHEN ((ss_sold_date_sk >= (c.imp_start_date - c.no_days_comp_price)) AND (ss_sold_date_sk < c.imp_start_date)) THEN ss_quantity ELSE 0 END) AS prev_ss_quant
			FROM store_sales ss
			JOIN temp_table c ON c.i_item_sk = ss.ss_item_sk
			GROUP BY ss_item_sk, imp_sk, price_change
		) ss
		ON (ws.ws_item_sk = ss.ss_item_sk and ws.imp_sk = ss.imp_sk)
		GROUP BY  ws.ws_item_sk
  	"""

    result = bc.sql(query)
    return result


if __name__ == "__main__":
    client = attach_to_cluster(cli_args)

    bc = BlazingContext(
        dask_client=client,
        pool=True,
        network_interface=os.environ.get("INTERFACE", "eth0"),
    )

    result_df = main(cli_args["data_dir"])
    write_result(
        result_df,
        output_directory=cli_args["output_dir"],
    )
Example #27
def main():

    #gitpython

    install("gitpython")

    #install("tarfile")

    if not os.path.exists(conda_prefix + "/blazingsql-testing-files/"):
        git_clone()
    else:
        git_pull()

    unzip()

    queryType = ' Local Tests '

    bc = BlazingContext()

    log_dict = {}

    def executionLocalTest(queryType):

        #Read Data TPCH------------------------------------------------------------------------------------------------------------

        tables = [
            'nation', 'region', 'supplier', 'customer', 'lineitem', 'orders',
            'part', 'partsupp'
        ]

        data_types = [DataType.PARQUET]  # TODO json

        for fileSchemaType in data_types:
            create_tables(bc, data_dir, fileSchemaType, tables=tables)

            #   Run Query -----------------------------------------------------------------------------
            worder = 1  # Parameter to indicate if it's necessary to order the result sets before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_01'
            print("Executing " + queryId + " ... ")
            query = """ select
                    l_returnflag, l_linestatus, sum(l_quantity) as sum_qty, sum(l_extendedprice) as sum_base_price,
                    sum(l_extendedprice*(1-l_discount)) as sum_disc_price, sum(l_extendedprice*(1-l_discount)*(1+l_tax)) as sum_charge,
                    sum(l_quantity)/count(l_quantity) as avg_qty, sum(l_extendedprice)/count(l_extendedprice) as avg_price, sum(l_discount)/count(l_discount) as avg_disc,
                    count(*) as count_order
                from 
                    lineitem
                where
                    cast(l_shipdate as date) <= date '1998-09-01'
                group by
                    l_returnflag, l_linestatus
                order by
                    l_returnflag, l_linestatus"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_02'
            print("Executing " + queryId + " ... ")
            query = """ select 
                        s.s_acctbal, s.s_name, n.n_name, p.p_partkey, p.p_mfgr, s.s_address, s.s_phone, s.s_comment
                    from 
                        supplier as s 
                    INNER JOIN nation as n ON s.s_nationkey = n.n_nationkey 
                    INNER JOIN partsupp as ps ON s.s_suppkey = ps.ps_suppkey
                    INNER JOIN part as p ON p.p_partkey = ps.ps_partkey 
                    INNER JOIN region as r ON r.r_regionkey = n.n_regionkey
                    where r.r_name = 'EUROPE' and p.p_size = 15
                        and p.p_type like '%BRASS'
                        and ps.ps_supplycost = (
                            select 
                                min(psq.ps_supplycost)
                            from 
                                partsupp as psq
                            INNER JOIN supplier as sq ON sq.s_suppkey = psq.ps_suppkey
                            INNER JOIN nation as nq ON nq.n_nationkey = sq.s_nationkey
                            INNER JOIN region as rq ON rq.r_regionkey = nq.n_regionkey
                            where
                                rq.r_name = 'EUROPE'
                        )
                    order by 
                        s.s_acctbal desc, n.n_name, s.s_name, p.p_partkey"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_03'
            print("Executing " + queryId + " ... ")
            query = """ select 
                    l.l_orderkey, sum(l.l_extendedprice * (1 - l.l_discount)) as revenue, o.o_orderdate, o.o_shippriority
                from 
                    orders as o
                INNER JOIN lineitem as l ON l.l_orderkey = o.o_orderkey
                INNER JOIN customer as c ON c.c_custkey = o.o_custkey
                where
                    c.c_mktsegment = 'BUILDING'
                    and o.o_orderdate < date '1995-03-15' 
                    and l.l_shipdate > date '1995-03-15'
                group by
                    l.l_orderkey, o.o_orderdate, o.o_shippriority
                order by
                    revenue desc, o.o_orderdate"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_04'  # currently disabled
            # print("Executing " + queryId + " ... ")
            query = """ select
                    o_orderpriority, count(*) as order_count
                from
                    orders
                where
                    o_orderdate >= date '1993-07-01'
                    and o_orderdate < date '1994-10-01'
                    and exists (select
                                    *
                                    from
                                    lineitem
                                    where
                                    l_orderkey = o_orderkey
                                    and l_commitdate < l_receiptdate)
                group by
                    o_orderpriority 
                order by 
                    o_orderpriority"""
            #result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_05'
            print("Executing " + queryId + " ... ")
            query = """ select
                        n.n_name, sum(l.l_extendedprice * (1 - l.l_discount)) as revenue
                    from orders o 
                    inner join lineitem l on l.l_orderkey = o.o_orderkey
                    inner join customer c on o.o_custkey = c.c_custkey
                    inner join supplier s on l.l_suppkey = s.s_suppkey and c.c_nationkey = s.s_nationkey
                    inner join nation n on n.n_nationkey = s.s_nationkey
                    inner join region r on n.n_regionkey = r.r_regionkey
                    where
                        r.r_name = 'ASIA' 
                        and o.o_orderdate >= date '1994-01-01'
                        and o.o_orderdate < date '1995-01-01'                    
                    group by
                        n.n_name
                    order by
                        revenue desc"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_06'
            print("Executing " + queryId + " ... ")
            query = """ select
                    sum(l_extendedprice*l_discount) as revenue
                from
                    lineitem
                where
                    l_shipdate >= date '1994-01-01' 
                    and l_shipdate < date '1995-01-01'
                    and l_discount between 0.05 and 0.07 
                    and l_quantity < 24"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_07'
            print("Executing " + queryId + " ... ")
            query = """ select
                    supp_nation, cust_nation, l_year, sum(volume) as revenue
                from 
                    (select 
                        n1.n_name as supp_nation, n2.n_name as cust_nation,
                        extract(year from l.l_shipdate) as l_year, l.l_extendedprice * (1 - l.l_discount) as volume
                    from 
                        nation as n1
                    INNER JOIN supplier as s ON s.s_nationkey = n1.n_nationkey
                    INNER JOIN lineitem as l ON l.l_suppkey = s.s_suppkey 
                    INNER JOIN orders as o ON o.o_orderkey = l.l_orderkey
                    INNER JOIN customer as c ON c.c_custkey = o.o_custkey
                    INNER JOIN nation as n2 ON n2.n_nationkey = c.c_nationkey
                    where
                        (
                            (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
                            or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') 
                        )
                    and cast(l.l_shipdate as date) between date '1995-01-01' and date '1996-12-31') as shipping
                group by
                    supp_nation, cust_nation, l_year
                order by 
                    supp_nation, cust_nation, l_year
                    """
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

    executionLocalTest(queryType)

    queryType = ' S3 Tests '

    def executionS3Test(queryType):

        # Read TPC-H data --------------------------------------------------------------------------------------------------------

        authority = "tpch_s3"

        bc.s3(authority,
              bucket_name=bucket_name,
              encryption_type=S3EncryptionType.NONE,
              access_key_id=access_key_id,
              secret_key=secret_key)
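        # bc.s3() registers the bucket under the "tpch_s3" authority, so that
        # s3://tpch_s3/... paths resolve through these credentials.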

        dir_data_lc = "s3://" + authority + "/" + "DataSet100Mb2part/"

        tables = [
            'nation', 'region', 'supplier', 'customer', 'lineitem', 'orders',
            'part', 'partsupp'
        ]
        data_types = [DataType.PARQUET]  # TODO json

        for fileSchemaType in data_types:
            create_tables(bc, dir_data_lc, fileSchemaType, tables=tables)

            #   Run Query -----------------------------------------------------------------------------
            worder = 1  # indicates whether the result sets must be ordered before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_08'
            print("Executing " + queryId + " ... ")
            query = """ select 
                    o_year, sum(case when nationl = 'BRAZIL' then volume else 0 end) / sum(volume) as mkt_share
                from (
                    select 
                        extract(year from o.o_orderdate) as o_year, l.l_extendedprice * (1 - l.l_discount) as volume, n2.n_name as nationl
                    from 
                        part as p
                    INNER JOIN lineitem as l ON p.p_partkey = l.l_partkey
                    INNER JOIN supplier as s ON s.s_suppkey = l.l_suppkey
                    INNER JOIN orders as o ON o.o_orderkey = l.l_orderkey 
                    INNER JOIN customer as c ON c.c_custkey = o.o_custkey
                    INNER JOIN nation as n1 ON n1.n_nationkey = c.c_nationkey 
                    INNER JOIN region as r ON r.r_regionkey = n1.n_regionkey
                    INNER JOIN nation as n2 ON n2.n_nationkey = s.s_nationkey
                    where 
                        r.r_name = 'AMERICA' 
                        and o.o_orderdate >= date '1995-01-01' and o.o_orderdate <= date '1996-12-31'
                        and p.p_type = 'ECONOMY ANODIZED STEEL' ) as all_nations
                group by
                    o_year
                order by
                    o_year"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_09'  # currently disabled
            # print("Executing " + queryId + " ... ")
            query = """ select
                    nationl, o_year, sum(amount) as sum_profit
                from
                    ( select n_name as nationl, extract(year from o_orderdate) as o_year,
                            l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
                        from lineitem
                        INNER JOIN orders ON o_orderkey = l_orderkey
                        INNER JOIN partsupp ON ps_suppkey = l_suppkey and ps_partkey = l_partkey
                        INNER JOIN part ON p_partkey = l_partkey
                        INNER JOIN supplier ON s_suppkey = l_suppkey
                        INNER JOIN nation ON n_nationkey = s_nationkey                     
                        where
                            p_name like '%green%' ) as profit
                group by 
                    nationl, o_year 
                order by 
                    nationl, o_year desc"""
            #result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_10'
            print("Executing " + queryId + " ... ")
            query = """ select
                    c.c_custkey, c.c_name, sum(l.l_extendedprice * (1 - l.l_discount)) as revenue, c.c_acctbal,
                    n.n_name, c.c_address, c.c_phone, c.c_comment
                from
                    customer as c
                INNER JOIN nation as n ON n.n_nationkey = c.c_nationkey
                INNER JOIN orders as o ON o.o_custkey = c.c_custkey
                INNER JOIN lineitem as l ON l.l_orderkey = o.o_orderkey
                where 
                o.o_orderdate >= date '1993-10-01'
                and o.o_orderdate < date '1994-10-01'
                and l.l_returnflag = 'R'
                group by
                    c.c_custkey, c.c_name, c.c_acctbal, c.c_phone, n.n_name, c.c_address, c.c_comment"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_11'  # currently disabled
            # print("Executing " + queryId + " ... ")
            query = """ select
                            ps.ps_partkey, sum(ps.ps_supplycost * ps.ps_availqty) as valuep
                        from 
                            partsupp as ps
                        INNER JOIN supplier as s ON ps.ps_suppkey = s.s_suppkey
                        INNER JOIN nation as n ON s.s_nationkey = n.n_nationkey
                        where
                            n.n_name = 'GERMANY'
                        group by 
                            ps.ps_partkey having sum(ps.ps_supplycost * ps.ps_availqty) > ( select
                                                                        sum(psq.ps_supplycost)
                                                                        from
                                                                            partsupp as psq
                                                                        INNER JOIN supplier as sq ON psq.ps_suppkey = sq.s_suppkey
                                                                        INNER JOIN nation as nq ON sq.s_nationkey = nq.n_nationkey
                                                                        where
                                                                            nq.n_name = 'GERMANY'
                                                                        )
                        order by 
                            valuep desc"""
            #result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_12'
            print("Executing " + queryId + " ... ")
            query = """ select
                    l.l_shipmode, sum(case when o.o_orderpriority ='1-URGENT' or o.o_orderpriority ='2-HIGH'
                    then 1 else 0 end) as high_line_count, sum(case when o.o_orderpriority <> '1-URGENT'
                    and o.o_orderpriority <> '2-HIGH' then 1 else 0 end) as low_line_count
                from
                    lineitem as l
                INNER JOIN orders as o ON o.o_orderkey = l.l_orderkey
                where
                    l.l_shipmode in ('MAIL', 'SHIP') 
                    and l.l_commitdate < l.l_receiptdate
                    and l.l_shipdate < l.l_commitdate 
                    and l.l_receiptdate >= date '1994-01-01'
                    and l.l_receiptdate < date '1995-01-01'
                group by l.l_shipmode
                order by l.l_shipmode"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_13'
            print("Executing " + queryId + " ... ")
            query = """ select
                    c_count, count(*) as custdist
                from
                    (select
                        c.c_custkey, count(o.o_orderkey)
                    from
                        customer as c
                    LEFT OUTER JOIN orders as o ON c.c_custkey = o.o_custkey
                    where o.o_comment not like '%special%requests%'
                    group by
                        c.c_custkey) as c_orders (c_custkey, c_count)
                group by
                    c_count
                order by
                    custdist desc, c_count desc"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_14'
            print("Executing " + queryId + " ... ")
            query = """ select 100.00 * sum( case when p.p_type like 'PROMO%' then l.l_extendedprice * (1 - l.l_discount)
                                    else 0 end) / sum(l.l_extendedprice * (1 - l.l_discount) ) as promo_revenue
                from 
                    lineitem as l
                INNER JOIN part as p ON p.p_partkey = l.l_partkey
                where
                    l.l_shipdate >= date '1995-09-01' 
                    and l.l_shipdate < date '1995-10-01'"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

    executionS3Test(queryType)

    queryType = ' GS Tests '

    def executionGSTest(queryType):

        authority = "tpch_gs"

        bc.gs(authority,
              project_id=gs_project_id,
              bucket_name=gs_bucket_name,
              use_default_adc_json_file=True,
              adc_json_file='')
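        # bc.gs() registers the GCS bucket under the "tpch_gs" authority; with
        # use_default_adc_json_file=True it authenticates through Application
        # Default Credentials rather than an explicit JSON key file.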

        dir_data_lc = 'gcs://' + authority + '/100MB2Part/'

        tables = [
            'nation', 'region', 'supplier', 'customer', 'lineitem', 'orders',
            'part', 'partsupp'
        ]
        data_types = [DataType.PARQUET]

        for fileSchemaType in data_types:
            create_tables(bc, dir_data_lc, fileSchemaType, tables=tables)

            #   Run Query -----------------------------------------------------------------------------
            worder = 1  # indicates whether the result sets must be ordered before comparing them
            use_percentage = False
            acceptable_difference = 0.01

            print('==============================')
            print(queryType)
            print('==============================')

            queryId = 'TEST_15'
            print("Executing " + queryId + " ... ")
            query = """with revenue (suplier_no, total_revenue) as
                (
                select
                l_suppkey, cast(sum(l_extendedprice * (1-l_discount)) as int)
                from
                lineitem
                where
                l_shipdate >= date '1996-01-01' and l_shipdate < date '1996-04-01'
                group by
                l_suppkey
                )
                select
                s.s_suppkey, s.s_name, s.s_address, s.s_phone, re.total_revenue
                from
                supplier as s
                INNER JOIN revenue as re on s.s_suppkey = re.supplier_no
                where
                re.total_revenue = cast((select max(total_revenue) from revenue) as int)
                order by
                s.s_suppkey"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_16'  # currently disabled
            # print("Executing " + queryId + " ... ")
            query = """ select
                    p.p_brand, p.p_type, p.p_size, count(distinct ps.ps_suppkey) as supplier_cnt
                from partsupp ps
                inner join part p on p.p_partkey = ps.ps_partkey
                where
                    p.p_brand <> 'Brand#45'
                    and p.p_type not like 'MEDIUM POLISHED%' and p.p_size in (49, 14, 23, 45, 19, 3, 36, 9)
                    and ps.ps_suppkey not in (select s.s_suppkey from supplier s where s.s_comment like '%Customer%Complaints%')
                group by
                    p.p_brand, p.p_type, p.p_size
                order by
                    supplier_cnt desc, p.p_brand, p.p_type, p.p_size"""
            #result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

            queryId = 'TEST_17'
            print("Executing " + queryId + " ... ")
            query = """ select
                    sum(l1.l_extendedprice) / 7.0 as avg_yearly
                from lineitem l1
                inner join part p on p.p_partkey = l1.l_partkey
                where
                    p.p_brand = 'Brand#23' and p.p_container = 'MED BOX'
                    and l1.l_quantity < (select 0.2 * avg(l2.l_quantity) from lineitem l2
                                    where l2.l_partkey = p.p_partkey)"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_18'
            print("Executing " + queryId + " ... ")
            query = """ select
                    c.c_name, c.c_custkey, o.o_orderkey, o.o_orderdate, o.o_totalprice, sum(l.l_quantity)
                from
                    customer c
                    inner join orders o on c.c_custkey = o.o_custkey
                    inner join lineitem l on o.o_orderkey = l.l_orderkey
                where
                    o.o_orderkey in (select l2.l_orderkey from lineitem l2 group by l2.l_orderkey having
                                    sum(l2.l_quantity) > 300)
                group by
                    c.c_name, c.c_custkey, o.o_orderkey, o.o_orderdate, o.o_totalprice
                order by
                    o.o_totalprice desc, o.o_orderdate"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_19'
            print("Executing " + queryId + " ... ")
            query = """ select 
                    sum(l.l_extendedprice * (1 - l.l_discount) ) as revenue
                from 
                    lineitem as l
                INNER JOIN part as p ON l.l_partkey = p.p_partkey
                where
                    (
                    p.p_brand = 'Brand#12'
                    and p.p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
                    and l.l_quantity >= 1 and l.l_quantity <= 11
                    and p.p_size between 1 and 5
                    and l.l_shipmode in ('AIR', 'AIR REG')
                    and l.l_shipinstruct = 'DELIVER IN PERSON'
                    )
                    or
                    (
                    p.p_brand = 'Brand#23'
                    and p.p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
                    and l.l_quantity >= 10 and l.l_quantity <= 20
                    and p.p_size between 1 and 10
                    and l.l_shipmode in ('AIR', 'AIR REG')
                    and l.l_shipinstruct = 'DELIVER IN PERSON'
                    )
                    or
                    (
                    p.p_brand = 'Brand#34'
                    and p.p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
                    and l.l_quantity >= 20 and l.l_quantity <= 30
                    and p.p_size between 1 and 15
                    and l.l_shipmode in ('AIR', 'AIR REG')
                    and l.l_shipinstruct = 'DELIVER IN PERSON'
                    )"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_20'
            print("Executing " + queryId + " ... ")
            query = """ select s.s_name, s.s_address
                    from supplier s
                inner join nation n on s.s_nationkey = n.n_nationkey
                where
                    s.s_suppkey in (select ps.ps_suppkey from partsupp ps where ps.ps_partkey in (select p.p_partkey
                                                                                        from part p where
                                                                                    p.p_name like 'forest%')
                                and ps_availqty > (select 0.5 * sum(l.l_quantity) from lineitem l where
                                                    l.l_partkey = ps.ps_partkey and l.l_suppkey = ps.ps_suppkey
                                                    and l.l_shipdate >= date '1994-01-01' 
                                                    and l.l_shipdate < date '1995-01-01'))
                    and n.n_name = 'CANADA'
                order by s.s_name"""
            result = run_query(bc, drill, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_21'
            print("Executing " + queryId + " ... ")
            query = """ select
                    s_name, count(*) as numwait
                from
                    supplier
                    inner join lineitem l1 on s_suppkey = l1.l_suppkey
                    inner join orders on o_orderkey = l1.l_orderkey
                    inner join nation on s_nationkey = n_nationkey
                where
                    o_orderstatus = 'F'
                    and l1.l_receiptdate > l1.l_commitdate and exists (select * from lineitem l2
                                                                        where l2.l_orderkey = l1.l_orderkey
                                                                            and l2.l_suppkey <> l1.l_suppkey)
                    and not exists (select * from lineitem l3 where l3.l_orderkey = l1.l_orderkey
                                    and l3.l_suppkey <> l1.l_suppkey and l3.l_receiptdate > l3.l_commitdate)
                    and n_name = 'SAUDI ARABIA'
                group by
                    s_name
                order by
                    numwait desc, s_name"""
            result = run_query(bc, spark, query, queryId, queryType, worder,
                               '', acceptable_difference, use_percentage,
                               fileSchemaType)

            log_dict[queryId] = result

            queryId = 'TEST_22'  # currently disabled
            # print("Executing " + queryId + " ... ")
            query = """ select
                    cntrycode, count(*) as numcust, sum(c_acctbal) as totacctbal
                from (select substring(c_phone from 1 for 2) as cntrycode, c_acctbal from customer
                    where substring(c_phone from 1 for 2) in ('13','31','23','29','30','18','17')
                    and c_acctbal > (select avg(c_acctbal) from customer where c_acctbal > 0.00
                    and substring (c_phone from 1 for 2) in ('13','31','23','29','30','18','17'))
                    and not exists (select * from orders where o_custkey = c_custkey)) as custsale
                group by cntrycode
                order by cntrycode"""
            #result = run_query(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    executionGSTest(queryType)

    green = bcolors.OKGREEN
    endc = bcolors.ENDC

    print(green + "=======================================")
    print("SUMMARY TESTS")
    print("=======================================" + endc)

    for key, value in log_dict.items():
        print('"{}" : {} '.format(key, value))

    print(green + "=======================================" + endc)
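
The setup helpers install, git_clone, git_pull and unzip used at the top of main() are not shown in the snippet; a plausible sketch under the assumption that the testing files live in a Git repository and ship a data tarball under conda_prefix (the repository URL and archive name are assumptions):

import subprocess
import sys
import tarfile


def install(package):
    # Install a package into the current interpreter's environment via pip.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


def git_clone():
    from git import Repo  # provided by the gitpython package installed above
    Repo.clone_from(
        "https://github.com/BlazingDB/blazingsql-testing-files.git",  # assumed URL
        conda_prefix + "/blazingsql-testing-files/",
    )


def git_pull():
    from git import Repo
    Repo(conda_prefix + "/blazingsql-testing-files/").remotes.origin.pull()


def unzip():
    path = conda_prefix + "/blazingsql-testing-files/"
    with tarfile.open(path + "data.tar.gz") as archive:  # assumed archive name
        archive.extractall(path)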