Code example #1
    def gather_file_properties(self):
        """Gathers properties of the files loaded into the benchmark table.
        """
        # gather file properties from the files' path
        # pylint: disable=line-too-long
        benchmark_details_pattern = \
            r'fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\w+)'
        self.file_type, compression, self.num_columns, self.column_types, \
            num_files, table_size = \
            re.findall(benchmark_details_pattern, self.path)[0]

        self.compression_format = (
            file_constants.FILE_CONSTANTS['compressionFormats'][compression])

        # get schema from the staging table that the file was generated from
        source_staging_table_name = '{0:s}_{1:s}'.format(
            self.column_types, self.num_columns)

        source_staging_table_util = table_util.TableUtil(
            source_staging_table_name,
            self.staging_dataset_id,
            project=self.staging_project,
        )
        # Parquet and Avro files are self-describing, so BigQuery does not
        # need an explicit schema when loading them.
        if self.file_type in ('parquet', 'avro'):
            self.bq_schema = None
        else:
            self.bq_schema = source_staging_table_util.table.schema
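
As a quick illustration of how benchmark_details_pattern decomposes a path, the sketch below runs the same regular expression against a sample path that follows the layout above; the path itself is invented for this example.

    import re

    # Hypothetical path that follows the layout the pattern expects.
    sample_path = ('fileType=csv/compression=none/numColumns=10/'
                   'columnTypes=100_STRING/numFiles=1/tableSize=10MB')

    benchmark_details_pattern = (
        r'fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/'
        r'columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\w+)')

    # findall() returns one tuple per match; the method above takes the first.
    file_type, compression, num_columns, column_types, num_files, table_size = \
        re.findall(benchmark_details_pattern, sample_path)[0]

    print(file_type, compression, num_columns, column_types, num_files, table_size)
    # csv none 10 100_STRING 1 10MB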
Code example #2
    def _set_benchmark_properties(self):
        """Sets properties from results of the benchmark load.

        Internal method that gathers and sets properties from the benchmark
        load to be used as results.
        """

        # get properties from benchmark table
        benchmark_table_util = table_util.TableUtil(
            self.benchmark_table_name,
            self.benchmark_dataset_id,
        )
        benchmark_table_util.set_table_properties()
        self.num_rows = benchmark_table_util.table.num_rows

        # get properties from the load job
        self.benchmark_time = self.load_job.created
        self.job_start_time = self.load_job.started
        self.job_end_time = self.load_job.ended
        self.job_duration = self.job_end_time - self.job_start_time
        self.job_user = self.load_job.user_email
        self.num_files = self.load_job.input_files
        self.job_location = self.load_job.location
        self.job_source_format = self.load_job.source_format

        # get properties from file
        # pylint: disable=line-too-long
        benchmark_details_pattern = \
            r'gs://([\w\'-]+)/fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\d+)(\w+)'
        bucket, self.file_type, compression, self.num_columns, \
            self.column_types, expected_num_files, self.staging_data_size, \
            staging_data_unit = \
            re.findall(benchmark_details_pattern, self.job_source_uri)[0]
        self.compression_format = (
            file_constants.FILE_CONSTANTS['compressionFormats'][compression])
        self.file_size = int(
            (self.load_job.input_file_bytes / int(self.num_files)) /
            BYTES_IN_MB)

        # get properties from BQ logs
        str_timestamp = str(self.benchmark_time)
        sharded_table_timestamp = str_timestamp.split(' ')[0].replace('-', '')
        abs_path = os.path.abspath(os.path.dirname(__file__))
        log_query_file = os.path.join(abs_path, '../queries/log_query.txt')
        with open(log_query_file, 'r') as input_file:
            log_query_bql = input_file.read().format(self.project_id,
                                                     self.bq_logs_dataset,
                                                     sharded_table_timestamp,
                                                     self.job_id)
        log_query_config = bigquery.QueryJobConfig()
        log_query_config.use_legacy_sql = False
        log_query_job = self.bq_client.query(query=log_query_bql,
                                             location='US',
                                             job_config=log_query_config)
        log_query_job.result()
        for row in log_query_job:
            self.totalSlotMs = row['totalSlotMs']
            self.avgSlots = row['avgSlots']
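
The result fields derived from the load job are simple arithmetic on values the BigQuery client already exposes. Below is a minimal sketch of those two calculations with made-up numbers; it assumes BYTES_IN_MB is the usual 2**20 constant, since the real constant is defined elsewhere in the module.

    import datetime

    BYTES_IN_MB = 2 ** 20  # assumed value; the real constant lives in the module

    # Made-up load-job values, standing in for self.load_job above.
    job_start_time = datetime.datetime(2020, 1, 1, 12, 0, 0)
    job_end_time = datetime.datetime(2020, 1, 1, 12, 0, 42)
    input_file_bytes = 4 * 10 * BYTES_IN_MB  # four files of roughly 10 MB each
    num_files = 4

    job_duration = job_end_time - job_start_time                        # 42-second timedelta
    file_size = int((input_file_bytes / int(num_files)) / BYTES_IN_MB)  # 10 (MB per file)

    print(job_duration, file_size)   # 0:00:42 10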
Code example #3
    def create_table(self):
        """Creates the bencmark table in BigQuery.

        The method creates an empty table using the schema from the staging
        table that the files were generated from. It uses the current Unix
        timestamp as the table name so that each benchmark table gets a
        unique name.
        """
        self.job_destination_table = '{0:d}'.format(int(time.time()))
        self.benchmark_table_util = table_util.TableUtil(
            self.job_destination_table,
            self.dataset_id,
            bq_schema=self.bq_schema,
        )
        self.benchmark_table_util.create_table()
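
Because the table name is just the current Unix timestamp in whole seconds, two benchmark tables created within the same second would collide; the name is unique in practice rather than random. A tiny sketch of the naming scheme:

    import time

    # Same naming scheme as create_table() above: the integer Unix timestamp.
    job_destination_table = '{0:d}'.format(int(time.time()))
    print(job_destination_table)   # e.g. '1700000000'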
Code example #4
    def create_files(self):
        """Creates all file combinations and store in GCS.

        Generates a list of file combinations from the parameters in the
        file_parameters.FILE_PARAMETERS dictionary, and creates each file in
        the list, as long as it doesn't yet exist in self.bucket.
        While each file is generated from a BigQuery staging table and stored
        in GCS, the method of creating the file varies depending on the
        parameters in the combination.
        """
        # Gather file parameters and constants.
        files_consts = file_constants.FILE_CONSTANTS
        file_types = self.file_params['fileType']
        extract_formats = files_consts['extractFormats']
        file_compression_types = self.file_params['fileCompressionTypes']
        file_counts = self.file_params['numFiles']

        # Begin the process of iterating through each combination.
        logging.info('Starting to create files by exporting staging tables to '
                     'bucket {0:s}'.format(self.bucket_name))
        skip_message = 'Skipped path and its subsequent files: {0:s}'

        # Gather a list of the staging tables. The staging tables already
        # include the columnTypes, numColumns, and stagingDataSizes
        # parameters (ex: the staging table 100_STRING_10_10MB has
        # columnType=100_STRING, numColumns=10, and stagingDataSizes=10MB).
        tables = self.primitive_staging_tables
        if len(tables) == 0:
            logging.info('Dataset {0:s} contains no tables. Please create '
                         'staging tables in {0:s}.'.format(
                             self.primitive_staging_dataset_id))
        # For each staging table, extract to each fileType, each
        # compressionType, and copy each file so that the combination has
        # the correct numFiles.
        for (table_list_item, file_type, num_files) in \
                itertools.product(tables, file_types, file_counts):
            for compression_type in file_compression_types[file_type]:

                staging_table_util = table_util.TableUtil(
                    table_list_item.table_id,
                    table_list_item.dataset_id,
                )
                staging_table_util.set_table_properties()

                gcs_prefix = 'gs://{0:s}/'.format(self.bucket_name)
                dest_string = ('fileType={0:s}/'
                               'compression={1:s}/'
                               'numColumns={2:d}/'
                               'columnTypes={3:s}/'
                               'numFiles={4:d}/'
                               'tableSize={5:d}MB/')
                destination_path = dest_string.format(
                    file_type,
                    compression_type,
                    staging_table_util.num_columns,
                    staging_table_util.column_types,
                    num_files,
                    int(staging_table_util.table_size / 1000000),
                )

                if compression_type == 'none':
                    extension = file_type
                else:
                    extensions = files_consts['compressionExtensions']
                    extension = extensions[compression_type]

                file_string = 'file1'

                destination_prefix = '{0:s}{1:s}{2:s}'.format(
                    gcs_prefix,
                    destination_path,
                    file_string,
                )

                if num_files == 1:
                    # If the number of files in the current combination is 1,
                    # check to see if the one file doesn't yet exist.
                    blob_name = '{0:s}{1:s}.{2:s}'.format(
                        destination_path,
                        file_string,
                        extension,
                    )
                    if not storage.Blob(
                            bucket=self.bucket,
                            name=blob_name,
                    ).exists(self.gcs_client):
                        # If the one file doesn't yet exist, it needs to be
                        # created. The method of creation depends on the file
                        # type.
                        if file_type == 'parquet':
                            # If the file type is parquet, use the
                            # _create_parquet_file() method, which uses
                            # Dataflow for file creation.
                            self._create_parquet_file(
                                blob_name,
                                staging_table_util,
                                destination_prefix,
                            )
                        else:
                            # Otherwise, use the _extract_tables_to_files()
                            # method, which uses BigQuery extract jobs.
                            destination_format = extract_formats[file_type]
                            self._extract_tables_to_files(
                                blob_name,
                                compression_type,
                                destination_format,
                                destination_prefix,
                                extension,
                                staging_table_util,
                            )
                    else:
                        # If the one file already exists, skip its creation.
                        logging.info(skip_message.format(blob_name))
                else:
                    # If the numFiles parameter in the current iteration is not
                    # 1, multiple files need to be created for the combination.
                    # In this case, obtain the file from the combination in
                    # which all parameters are identical to the current
                    # combination, except that numFiles=1. That file will be
                    # used to make copies for the combinations where numFiles
                    # > 1, since copying files is faster than running a new
                    # extraction or Dataflow job. For example, if the current
                    # combination is fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=100/tableSize=10MB/ # pylint: disable=line-too-long
                    # then fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=1/tableSize=10MB/file1.csv # pylint: disable=line-too-long
                    # will be used to make copies for the current combination.
                    file1_destination_path = dest_string.format(
                        file_type,
                        compression_type,
                        staging_table_util.num_columns,
                        staging_table_util.column_types,
                        1,
                        int(staging_table_util.table_size / 1000000),
                    )
                    file1_blob_name = '{0:s}{1:s}.{2:s}'.format(
                        file1_destination_path,
                        file_string,
                        extension,
                    )
                    # Before making copies for the current combination, check
                    # that the first file in the combination doesn't yet exist.
                    # If it doesn't, then proceed. If it does exist, assume that
                    # all other files in the combination already exist too.
                    # While this can be a risky assumption, it saves a lot of
                    # time, since combinations can contain as many as 10000
                    # files. If a combination is stopped in the middle of
                    # generating a large number of files, the
                    # restart_incomplete_combination() method can be used to
                    # ensure the combination gets completed without taking the
                    # time to check each file's existence here.
                    first_of_n_blobs = '{0:s}{1:s}.{2:s}'.format(
                        destination_path,
                        file_string,
                        extension,
                    )
                    if not storage.Blob(
                            bucket=self.bucket,
                            name=first_of_n_blobs,
                    ).exists(self.gcs_client):
                        # If the first file in the combination doesn't exist,
                        # run copy_blobs() to create each file in the
                        # combination.
                        start_num = 1
                        self.copy_blobs(
                            file1_blob_name,
                            destination_path,
                            extension,
                            start_num,
                            num_files,
                        )
                    else:
                        # Otherwise, skip creating the first file and all
                        # subsequent files in the combination.
                        logging.info(skip_message.format(first_of_n_blobs))
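
To make the nesting in create_files() easier to follow, the sketch below enumerates combinations the same way: itertools.product over file types and file counts, with an inner loop over the compression types valid for each file type. The parameter values are invented for the example; the real ones come from file_parameters.FILE_PARAMETERS and the staging tables.

    import itertools

    # Invented parameter values for illustration only.
    file_types = ['csv', 'json']
    file_compression_types = {'csv': ['none', 'gzip'], 'json': ['none']}
    file_counts = [1, 100]

    dest_string = ('fileType={0:s}/compression={1:s}/numColumns={2:d}/'
                   'columnTypes={3:s}/numFiles={4:d}/tableSize={5:d}MB/')

    for file_type, num_files in itertools.product(file_types, file_counts):
        for compression_type in file_compression_types[file_type]:
            # A real run would also vary numColumns, columnTypes, and tableSize
            # per staging table; they are fixed here to keep the sketch short.
            print(dest_string.format(
                file_type, compression_type, 10, '100_STRING', num_files, 10))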
Code example #5
def main(argv=None):
    args = parse_args(argv)

    create_results_table = args.create_results_table
    create_benchmark_schemas = args.create_benchmark_schemas
    benchmark_table_schemas_dir = args.benchmark_table_schemas_directory
    create_staging_tables = args.create_staging_tables
    create_files = args.create_files
    restart_file = args.restart_file
    create_benchmark_tables = args.create_benchmark_tables
    duplicate_benchmark_tables = args.duplicate_benchmark_tables
    bq_project_id = args.bq_project_id
    benchmark_dataset_id = args.benchmark_dataset_id
    staging_project_id = args.staging_project_id
    staging_dataset_id = args.staging_dataset_id
    resized_staging_dataset_id = args.resized_staging_dataset_id
    results_table_name = args.results_table_name
    results_dataset_id = args.results_dataset_id
    results_table_schema_path = args.results_table_schema_path
    gcs_project_id = args.gcs_project_id
    bucket_name = args.bucket_name
    dataflow_temp_location = args.dataflow_temp_location
    dataflow_staging_location = args.dataflow_staging_location
    bq_logs_dataset = args.bq_logs_dataset

    file_params = file_parameters.FILE_PARAMETERS

    # Run provided commands
    if create_results_table:
        logging.info('Creating results table {0:s} from schema in '
                     '{1:s}.'.format(
                         results_table_name,
                         results_table_schema_path,
                     ))
        results_table_util = table_util.TableUtil(
            table_id=results_table_name,
            dataset_id=results_dataset_id,
            json_schema_filename=results_table_schema_path,
        )
        results_table_util.create_table()
        logging.info('Done creating results table.')

    if create_benchmark_schemas:
        benchmark_schema_creator = schema_creator.SchemaCreator(
            schemas_dir=benchmark_table_schemas_dir, file_params=file_params)
        benchmark_schema_creator.create_schemas()

    if create_staging_tables:
        benchmark_staging_table_generator = (
            staging_table_generator.StagingTableGenerator(
                project=bq_project_id,
                staging_dataset_id=staging_dataset_id,
                resized_dataset_id=resized_staging_dataset_id,
                json_schema_path=benchmark_table_schemas_dir,
                file_params=file_params,
                num_rows=500))
        benchmark_staging_table_generator.create_staging_tables(
            dataflow_staging_location=dataflow_staging_location,
            dataflow_temp_location=dataflow_temp_location,
        )
        benchmark_staging_table_generator.create_resized_tables()

    if create_files:
        benchmark_file_generator = file_generator.FileGenerator(
            project_id=gcs_project_id,
            primitive_staging_dataset_id=resized_staging_dataset_id,
            bucket_name=bucket_name,
            file_params=file_params,
            dataflow_staging_location=dataflow_staging_location,
            dataflow_temp_location=dataflow_temp_location,
        )
        if restart_file:
            benchmark_file_generator.restart_incomplete_combination(
                restart_file)
        benchmark_file_generator.create_files()

    if create_benchmark_tables:
        benchmark_tables_processor = tables_processor.TablesProcessor(
            bq_project=bq_project_id,
            gcs_project=gcs_project_id,
            staging_project=staging_project_id,
            staging_dataset_id=staging_dataset_id,
            dataset_id=benchmark_dataset_id,
            bucket_name=bucket_name,
            results_table_name=results_table_name,
            results_table_dataset_id=results_dataset_id,
            duplicate_benchmark_tables=duplicate_benchmark_tables,
            file_params=file_params,
            bq_logs_dataset=bq_logs_dataset,
        )
        benchmark_tables_processor.create_benchmark_tables()
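
parse_args() is not shown in this excerpt. The fragment below is a hypothetical, partial reconstruction of it, assuming a standard argparse setup whose option names mirror the attributes main() reads (for example, --create_files becomes args.create_files); the real parser defines many more options, types, and defaults.

    import argparse

    def parse_args(argv=None):
        # Hypothetical, partial reconstruction for illustration only.
        parser = argparse.ArgumentParser()
        parser.add_argument('--create_files', action='store_true')
        parser.add_argument('--restart_file', default=None)
        parser.add_argument('--gcs_project_id')
        parser.add_argument('--bucket_name')
        parser.add_argument('--resized_staging_dataset_id')
        parser.add_argument('--dataflow_staging_location')
        parser.add_argument('--dataflow_temp_location')
        return parser.parse_args(argv)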