import os

import dask_cudf
from blazingsql import BlazingContext


def create_hive_partition_data(input, file_format, table_name, partitions, output, num_files):
    if not os.path.exists(output):
        os.makedirs(output)

    bc = BlazingContext(dask_client=None)
    if file_format == 'psv':
        # Pipe-delimited text files need explicit dtypes and column names.
        dtypes = get_dtypes(table_name)
        col_names = get_column_names(table_name)
        bc.create_table(table_name, input, file_format='csv', delimiter="|",
                        dtype=dtypes, names=col_names)
    else:
        bc.create_table(table_name, input)

    columns = bc.describe_table(table_name)
    data_partition_array_dict = []
    for partition in partitions:
        if partition in columns:
            result = bc.sql(f'select distinct({partition}) from {table_name}')
            if type(result) is dask_cudf.core.DataFrame:
                result = result.compute()
            # Keep only the distinct values that were requested for this partition column.
            valuesPartition = result.to_pandas().to_dict()
            finalValues = list(set(valuesPartition[partition].values()) & set(partitions[partition]))
            dictOfvalues = {i: finalValues[i] for i in range(0, len(finalValues))}
            valuesPartition[partition] = dictOfvalues
            data_partition_array_dict.append(valuesPartition)
        else:
            print('Column "' + partition + '" does not exist')

    _save_partition_files(bc, table_name, data_partition_array_dict, output, file_format, num_files)
def create_hive_partition_data(input, table_name, partitions, output, num_files_per_parquet):
    if not os.path.exists(output):
        os.makedirs(output)

    bc = BlazingContext()
    bc.create_table(table_name, input)

    columns = bc.describe_table(table_name)
    data_partition_array_dict = []
    for partition in partitions:
        if partition in columns:
            # Collect the distinct values of each requested partition column.
            values = bc.sql(f'select distinct({partition}) from {table_name}')
            data_partition_array_dict.append(values.to_pandas().to_dict())
        else:
            print('Column "' + partition + '" does not exist')

    _save_partition_files(bc, table_name, data_partition_array_dict, output, num_files_per_parquet)
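# Hypothetical usage sketch (not from the original source): the paths, table
# name, and partition column below are illustrative assumptions. It calls the
# simpler variant above, which only takes the partition column names; the
# fuller variant additionally takes a file_format and an output file count,
# and intersects the distinct values with the requested per-column values.
if __name__ == '__main__':
    create_hive_partition_data(
        input='/data/tpch/orders/*.parquet',        # assumed source files
        table_name='orders',                        # assumed table name
        partitions=['o_orderpriority'],             # assumed partition column
        output='/data/tpch/orders_partitioned',     # assumed output directory
        num_files_per_parquet=1,
    )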