def gather_file_properties(self): """Gathers properties of the files loaded into the benchmark table. """ # gather file properties from the files' path # pylint: disable=line-too-long benchmark_details_pattern = \ r'fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\w+)' self.file_type, compression, self.num_columns, self.column_types, \ num_files, table_size = \ re.findall(benchmark_details_pattern, self.path)[0] self.compression_format = ( file_constants.FILE_CONSTANTS['compressionFormats'][compression]) # get schema from the staging table that the file was generated from source_staging_table_name = '{0:s}_{1:s}'.format( self.column_types, self.num_columns) source_staging_table_util = table_util.TableUtil( source_staging_table_name, self.staging_dataset_id, project=self.staging_project, ) if self.file_type == 'parquet' or self.file_type == 'avro': self.bq_schema = None else: self.bq_schema = source_staging_table_util.table.schema
def _set_benchmark_properties(self): """Sets properties from results of the benchmark load. Internal method that gathers and sets properties from the benchmark load to be used as results. """ # get properties from benchmark table benchmark_table_util = table_util.TableUtil( self.benchmark_table_name, self.benchmark_dataset_id, ) benchmark_table_util.set_table_properties() self.num_rows = benchmark_table_util.table.num_rows # get properties from the load job self.benchmark_time = self.load_job.created self.job_start_time = self.load_job.started self.job_end_time = self.load_job.ended self.job_duration = self.job_end_time - self.job_start_time self.job_user = self.load_job.user_email self.num_files = self.load_job.input_files self.job_location = self.load_job.location self.job_source_format = self.load_job.source_format # get properties from file # pylint: disable=line-too-long benchmark_details_pattern = \ r'gs://([\w\'-]+)/fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\d+)(\w+)' bucket, self.file_type, compression, self.num_columns, \ self.column_types, expected_num_files, self.staging_data_size, \ staging_data_unit = \ re.findall(benchmark_details_pattern, self.job_source_uri)[0] self.compression_format = ( file_constants.FILE_CONSTANTS['compressionFormats'][compression]) self.file_size = int( (self.load_job.input_file_bytes / int(self.num_files)) / BYTES_IN_MB) # get properties from BQ logs str_timestamp = str(self.benchmark_time) sharded_table_timestamp = str_timestamp.split(' ')[0].replace('-', '') abs_path = os.path.abspath(os.path.dirname(__file__)) log_query_file = os.path.join(abs_path, '../queries/log_query.txt') with open(log_query_file, 'r') as input_file: log_query_bql = input_file.read().format(self.project_id, self.bq_logs_dataset, sharded_table_timestamp, self.job_id) log_query_config = bigquery.QueryJobConfig() log_query_config.use_legacy_sql = False log_query_job = self.bq_client.query(query=log_query_bql, location='US', job_config=log_query_config) log_query_job.result() for row in log_query_job: self.totalSlotMs = row['totalSlotMs'] self.avgSlots = row['avgSlots']
def create_table(self): """Creates the bencmark table in BigQuery. The method creates an empty table using the schema from the staging table that the files were generated from. It uses the current timestamp to name the benchmark table to create a random, unique name. """ self.job_destination_table = '{0:d}'.format(int(time.time())) self.benchmark_table_util = table_util.TableUtil( self.job_destination_table, self.dataset_id, bq_schema=self.bq_schema, ) self.benchmark_table_util.create_table()
def create_files(self): """Creates all file combinations and store in GCS. Generates list of file combination from parameters in file_parameters.FILE_PARAMETERS dictionary, and creates each file in the list, as long as it doesn't yet exist in self.bucket. While each file is generated from a BigQuery staging table and stored in GCS, the method of creating the file varies depending on the parameters in the combination. """ # Gather file parameters and constants. files_consts = file_constants.FILE_CONSTANTS file_types = self.file_params['fileType'] extract_formats = files_consts['extractFormats'] file_compression_types = self.file_params['fileCompressionTypes'] file_counts = self.file_params['numFiles'] # Begin the process of iterating through each combination. logging.info('Starting to create files by exporting staging tables to ' 'bucket {0:s}'.format(self.bucket_name)) skip_message = 'Skipped path and its subsequent files: {0:s}' # Gather a list of the staging tables. The staging tables already # include the columnTypes, numColumns, and stagingDataSizes # parameters (ex: the staging table 100_STRING_10_10MB has # columnType=100_STRING, numColumns=10, and stagingDataSizes=10MB). tables = self.primitive_staging_tables if len(tables) == 0: logging.info('Dataset {0:s} contains no tables. Please create ' 'staging tables in {0:s}.'.format( self.primitive_staging_dataset_id)) # For each staging table, extract to each fileType, each # compressionType, and copy each file so that the combination has # the correct numFiles. for (table_list_item, file_type, num_files) in \ itertools.product(tables, file_types, file_counts): for compression_type in file_compression_types[file_type]: staging_table_util = table_util.TableUtil( table_list_item.table_id, table_list_item.dataset_id, ) staging_table_util.set_table_properties() gcs_prefix = 'gs://{0:s}/'.format(self.bucket_name) dest_string = ('fileType={0:s}/' 'compression={1:s}/' 'numColumns={2:d}/' 'columnTypes={3:s}/' 'numFiles={4:d}/' 'tableSize={5:d}MB/') destination_path = dest_string.format( file_type, compression_type, staging_table_util.num_columns, staging_table_util.column_types, num_files, int(staging_table_util.table_size / 1000000), ) if compression_type == 'none': extension = file_type else: extensions = (files_consts['compressionExtensions']) extension = extensions[compression_type] file_string = 'file1' destination_prefix = '{0:s}{1:s}{2:s}'.format( gcs_prefix, destination_path, file_string, ) if num_files == 1: # If the number of files in the current combination is 1, # check to see if the one file doesn't yet exist. blob_name = '{0:s}{1:s}.{2:s}'.format( destination_path, file_string, extension, ) if not storage.Blob( bucket=self.bucket, name=blob_name, ).exists(self.gcs_client): # If the one file doesn't yet exist, it needs to be # created. The method of creation depends on the file # type. if file_type == 'parquet': # If the file type is parquet, use the # _create_parquet_files() method, which uses # DataFlow for file creation. self._create_parquet_file( blob_name, staging_table_util, destination_prefix, ) else: # Otherwise, use the_extract_tables_to_files() # method, which uses BigQuery extract jobs. destination_format = extract_formats[file_type] self._extract_tables_to_files( blob_name, compression_type, destination_format, destination_prefix, extension, staging_table_util, ) else: # If the one file one file already exists, # skip its creation. logging.info(skip_message.format(blob_name)) else: # If the numFiles parameter in the current iteration is not # 1, that means multiple files need to be created for the # combination. In this case, obtain the file from the # combination in which all parameters are identical to the # current combination, except in which numFiles=1. file will # be used to make copies for the combinations where numFiles # > 1, since copying files is faster than running a new # extraction or DataFlow job. For example, if the current # combination is fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=100/tableSize=10MB/ # pylint: disable=line-too-long # then fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=1/tableSize=10MB/file1.csv # pylint: disable=line-too-long # will be used to make copies for the current combination. file1_destination_path = dest_string.format( file_type, compression_type, staging_table_util.num_columns, staging_table_util.column_types, 1, int(staging_table_util.table_size / 1000000), ) file1_blob_name = '{0:s}{1:s}.{2:s}'.format( file1_destination_path, file_string, extension, ) # Before making copies for the current combination, check # that the first file in the combination doesn't yet exist. # If it doesn't, then proceed. If it does exist, assume that # all other files in the combination already exist too. # While this can be a risky assumption, it saves a lot of # time, since combinations can contain as many as 10000 # files. If a combination is stopped in the middle of # generating a large number of files, the restart_ # incomplete_combination() method can be used to ensure the # combination gets completed without taking the time to # check each file's existence here. first_of_n_blobs = '{0:s}{1:s}.{2:s}'.format( destination_path, file_string, extension, ) if not storage.Blob( bucket=self.bucket, name=first_of_n_blobs, ).exists(self.gcs_client): # If the first file in the combination doesn't exist, # run copy_blobs() to create each file in the # combination. start_num = 1 self.copy_blobs( file1_blob_name, destination_path, extension, start_num, num_files, ) else: # Otherwise, skip creating the first file and all # subsequent files in the combination. logging.info(skip_message.format(first_of_n_blobs))
def main(argv=None): args = parse_args(argv) create_results_table = args.create_results_table create_benchmark_schemas = args.create_benchmark_schemas benchmark_table_schemas_dir = args.benchmark_table_schemas_directory create_staging_tables = args.create_staging_tables create_files = args.create_files restart_file = args.restart_file create_benchmark_tables = args.create_benchmark_tables duplicate_benchmark_tables = args.duplicate_benchmark_tables bq_project_id = args.bq_project_id benchmark_dataset_id = args.benchmark_dataset_id staging_project_id = args.staging_project_id staging_dataset_id = args.staging_dataset_id resized_staging_dataset_id = args.resized_staging_dataset_id results_table_name = args.results_table_name results_dataset_id = args.results_dataset_id results_table_schema_path = args.results_table_schema_path gcs_project_id = args.gcs_project_id bucket_name = args.bucket_name dataflow_temp_location = args.dataflow_temp_location dataflow_staging_location = args.dataflow_temp_location bq_logs_dataset = args.bq_logs_dataset file_params = file_parameters.FILE_PARAMETERS # Run provided commands if create_results_table: logging.info('Creating results table {0:s} from schema in ' '{1:s}.'.format( results_table_name, results_table_schema_path, )) results_table_util = table_util.TableUtil( table_id=results_table_name, dataset_id=results_dataset_id, json_schema_filename=results_table_schema_path, ) results_table_util.create_table() logging.info('Done creating results table.') if create_benchmark_schemas: benchmark_schema_creator = schema_creator.SchemaCreator( schemas_dir=benchmark_table_schemas_dir, file_params=file_params) benchmark_schema_creator.create_schemas() if create_staging_tables: benchmark_staging_table_generator = ( staging_table_generator.StagingTableGenerator( project=bq_project_id, staging_dataset_id=staging_dataset_id, resized_dataset_id=resized_staging_dataset_id, json_schema_path=benchmark_table_schemas_dir, file_params=file_params, num_rows=500)) benchmark_staging_table_generator.create_staging_tables( dataflow_staging_location=dataflow_staging_location, dataflow_temp_location=dataflow_staging_location, ) benchmark_staging_table_generator.create_resized_tables() if create_files: benchmark_file_generator = file_generator.FileGenerator( project_id=gcs_project_id, primitive_staging_dataset_id=resized_staging_dataset_id, bucket_name=bucket_name, file_params=file_params, dataflow_staging_location=dataflow_staging_location, dataflow_temp_location=dataflow_temp_location, ) if restart_file: benchmark_file_generator.restart_incomplete_combination( restart_file) benchmark_file_generator.create_files() if create_benchmark_tables: benchmark_tables_processor = tables_processor.TablesProcessor( bq_project=bq_project_id, gcs_project=gcs_project_id, staging_project=staging_project_id, staging_dataset_id=staging_dataset_id, dataset_id=benchmark_dataset_id, bucket_name=bucket_name, results_table_name=results_table_name, results_table_dataset_id=results_dataset_id, duplicate_benchmark_tables=duplicate_benchmark_tables, file_params=file_params, bq_logs_dataset=bq_logs_dataset, ) benchmark_tables_processor.create_benchmark_tables()