import os

import dask_cudf
from blazingsql import BlazingContext

# Assumes get_dtypes, get_column_names and _save_partition_files are defined
# elsewhere in this module.
def create_hive_partition_data(input, file_format, table_name, partitions, output, num_files):
    if not os.path.exists(output):
        os.makedirs(output)

    bc = BlazingContext(dask_client=None)
    if file_format == 'psv':
        dtypes = get_dtypes(table_name)
        col_names = get_column_names(table_name)
        bc.create_table(table_name, input, file_format='csv', delimiter="|",
                        dtype=dtypes, names=col_names)
    else:
        bc.create_table(table_name, input)

    columns = bc.describe_table(table_name)
    data_partition_array_dict = []
    for partition in partitions:
        if partition in columns:
            result = bc.sql(f'select distinct({partition}) from {table_name}')
            if isinstance(result, dask_cudf.core.DataFrame):
                result = result.compute()
            valuesPartition = result.to_pandas().to_dict()
            # Keep only the distinct values that were requested for this column.
            finalValues = list(set(valuesPartition[partition].values()) &
                               set(partitions[partition]))
            dictOfvalues = {i: finalValues[i] for i in range(0, len(finalValues))}
            valuesPartition[partition] = dictOfvalues
            data_partition_array_dict.append(valuesPartition)
        else:
            print('Column "' + partition + '" does not exist')

    _save_partition_files(bc, table_name, data_partition_array_dict, output,
                          file_format, num_files)

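# A minimal usage sketch for the variant above. The input path, partition
# column, partition values, and output directory are hypothetical placeholders.
create_hive_partition_data(
    input='/data/tpch/lineitem.psv',                      # hypothetical pipe-delimited dump
    file_format='psv',
    table_name='lineitem',
    partitions={'l_shipmode': ['AIR', 'RAIL', 'TRUCK']},  # column -> values to materialize
    output='/data/tpch/lineitem_partitioned/',
    num_files=1,
)
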
# Simpler variant: `partitions` is an iterable of column names, and every
# distinct value of each column becomes its own partition.
def create_hive_partition_data(input, table_name, partitions, output, num_files_per_parquet):
    if not os.path.exists(output):
        os.makedirs(output)

    bc = BlazingContext()
    bc.create_table(table_name, input)

    columns = bc.describe_table(table_name)
    data_partition_array_dict = []
    for partition in partitions:
        if partition in columns:
            values = bc.sql(f'select distinct({partition}) from {table_name}')
            data_partition_array_dict.append(values.to_pandas().to_dict())
        else:
            print('Column "' + partition + '" does not exist')

    _save_partition_files(bc, table_name, data_partition_array_dict, output,
                          num_files_per_parquet)

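# A hypothetical call to the simpler variant; path and column name are
# illustrative placeholders, not real data.
create_hive_partition_data(
    input='/data/tpch/orders.parquet',       # assumed parquet input
    table_name='orders',
    partitions=['o_orderpriority'],
    output='/data/tpch/orders_partitioned/',
    num_files_per_parquet=1,
)
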
import logging

from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from blazingsql import BlazingContext

log = logging.getLogger(__name__)


class BlazingSQLHelper:
    def __init__(self):
        # One Dask worker per local GPU; queries are distributed over the cluster.
        cluster = LocalCUDACluster()
        client = Client(cluster)
        self._bc = BlazingContext(dask_client=client, network_interface='lo')

    def run_query(self, config):
        """This function runs a blazingSQL query.

        :param config: Query related tables configuration.
        :type config: dict
        :return: Query results.
        :rtype: cudf.DataFrame
        """
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            # Remaining keys are forwarded to create_table as keyword arguments.
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        # Distributed execution returns a dask_cudf DataFrame; materialize it.
        result = result.compute()
        return result

    def drop_table(self, table_names):
        """This function drops blazingSQL tables.

        :param table_names: List of table names to drop.
        :type table_names: list
        """
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)

# Single-GPU variant of the helper above; no Dask cluster is created.
class BlazingSQLHelper:
    def __init__(self, pool=False):
        # Setting pool=True allocates half the GPU memory as a memory pool.
        self._bc = BlazingContext(pool=pool)

    def run_query(self, config):
        """This function runs a blazingSQL query.

        :param config: Query related tables configuration.
        :type config: dict
        :return: Query results.
        :rtype: cudf.DataFrame
        """
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            # Remaining keys are forwarded to create_table as keyword arguments.
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        self.has_data = False
        return result

    def drop_table(self, table_names):
        """This function drops blazingSQL tables.

        :param table_names: List of table names to drop.
        :type table_names: list
        """
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)

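# A hedged usage sketch for either helper above; the table name, input path,
# and SQL are illustrative placeholders. Extra per-table keys (e.g. `names`
# or `dtype` for CSV inputs) are passed through to create_table.
config = {
    "tables": [
        {"table_name": "logs", "input_path": "/data/logs/*.parquet"},  # hypothetical path
    ],
    "sql": "SELECT COUNT(*) AS n FROM logs",
}
helper = BlazingSQLHelper()
result = helper.run_query(config)   # cudf.DataFrame of query results
helper.drop_table(["logs"])
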
import subprocess

import cudf
from blazingsql import BlazingContext
from pyhive import hive

# To run distributed, connect a Dask client instead:
# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")
bc = BlazingContext()

# Register the HDFS filesystem with BlazingSQL.
authority = 'localhost:54310'
hdfs_host = 'localhost'
hdfs_port = 54310
hdfs_driver = 'libhdfs'
result, error_msg, fs = bc.hdfs(authority, host=hdfs_host, port=hdfs_port,
                                user='******', driver=hdfs_driver)

# Create a table from a parquet-backed table in the Hive metastore.
cursor = hive.connect('localhost').cursor()
table = bc.create_table('ptransactions', cursor, file_format='parquet')

for i in range(11):
    query = ("SELECT * FROM ptransactions "
             "where t_year=2017 and t_company_id={t_company_id} "
             "LIMIT 10").format(t_company_id=i)
    ddf = bc.sql(query)
    print(query)
    if isinstance(ddf, cudf.DataFrame):
        print(ddf)
    else:
        # Distributed runs return a dask_cudf DataFrame; materialize it first.
        print(ddf.compute())

from dask.distributed import Client
from blazingsql import BlazingContext
from dask_cuda import LocalCUDACluster

# initialize BlazingContext with a Dask client of local GPUs to distribute query execution
bc = BlazingContext(dask_client=Client(LocalCUDACluster()), network_interface='lo')

# register the public AWS S3 bucket
bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')

# create a table from a CSV file in that S3 bucket
col_names = ['key', 'fare', 'pickup_x', 'pickup_y',
             'dropoff_x', 'dropoff_y', 'passenger_count']
bc.create_table('taxi', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=col_names)

# query the table & write the results locally as parquet
bc.sql('SELECT * FROM taxi').to_parquet('../../data/yellow_cab')
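
# A small follow-on sketch, assuming the to_parquet call above succeeded:
# register the freshly written parquet files as a table and query them back.
bc.create_table('yellow_cab', '../../data/yellow_cab/*.parquet')
print(bc.sql('SELECT COUNT(*) AS trips FROM yellow_cab').compute())  # dask_cudf result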