Example 1
    def gather_file_properties(self):
        """Gathers properties of the files loaded into the benchmark table.
        """
        # gather file properties from the files' path
        # pylint: disable=line-too-long
        benchmark_details_pattern = \
            r'fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\w+)'
        self.file_type, compression, self.num_columns, self.column_types, \
            num_files, table_size = \
            re.findall(benchmark_details_pattern, self.path)[0]

        self.compression_format = (
            file_constants.FILE_CONSTANTS['compressionFormats'][compression])

        # get schema from the staging table that the file was generated from
        source_staging_table_name = '{0:s}_{1:s}'.format(
            self.column_types, self.num_columns)

        source_staging_table_util = table_util.TableUtil(
            source_staging_table_name,
            self.staging_dataset_id,
            project=self.staging_project,
        )
        if self.file_type in ('parquet', 'avro'):
            self.bq_schema = None
        else:
            self.bq_schema = source_staging_table_util.table.schema

    def _set_job_properties(self):
        """Sets query specific properties."""
        query_properties = {}
        query_properties['query'] = self.bql
        query_properties['queryCategory'] = self.query_category

        # get properties from job
        query_properties['totalBytesBilled'] = self.job.total_bytes_billed
        query_properties[
            'totalBytesProcessed'] = self.job.total_bytes_processed

        # get main table properties
        main_table_properties = dict()
        main_table_properties['tableType'] = self.table_type
        main_table_properties['tableName'] = self.main_table_name
        main_table_util = table_util.TableUtil(self.main_table_name,
                                               self.table_dataset_id)
        main_table_util.set_table_properties()
        main_table_properties['equivalentBqTableSize'] = \
            main_table_util.size_in_mb
        main_table_properties['fileURI'] = self.file_uri

        # get properties from file path
        properties_from_file_path = self._get_properties_from_file_path(
            self.file_uri)
        main_table_properties.update(properties_from_file_path)
        del main_table_properties['stagingDataSize']

        query_properties['mainTable'] = main_table_properties
        self.results_dict['queryProperties'] = query_properties
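
For reference, here is a minimal standalone sketch of how the benchmark_details_pattern above extracts its six fields; the sample path is a made-up value that only mirrors the layout the regex expects.

import re

# Same pattern as in gather_file_properties(); the sample path below is
# hypothetical and simply follows the expected directory layout.
benchmark_details_pattern = (
    r'fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/'
    r'columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\w+)')
sample_path = ('gs://example-bucket/fileType=csv/compression=none/'
               'numColumns=10/columnTypes=100_STRING/numFiles=1/'
               'tableSize=10MB/file1.csv')

file_type, compression, num_columns, column_types, num_files, table_size = \
    re.findall(benchmark_details_pattern, sample_path)[0]
print(file_type, compression, num_columns, column_types, num_files, table_size)
# csv none 10 100_STRING 1 10MB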
Example 3
    def __init__(self, table_id, dataset_id):
        """Stores the table identifiers and loads the table's schema."""
        self.table_id = table_id
        self.dataset_id = dataset_id
        self.bq_table_util = table_util.TableUtil(self.table_id,
                                                  self.dataset_id)
        self.bq_table_util.set_table_properties()
        self.schema = self.bq_table_util.table.schema
        self.query_strings = {}
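
The constructor above boils down to fetching a table's metadata once so its schema is available to later query-building code. A rough standalone equivalent using the BigQuery client directly is sketched below; the project, dataset, and table IDs are placeholders, and TableUtil itself is not reproduced.

from google.cloud import bigquery

# Placeholder identifiers; substitute real ones.
client = bigquery.Client(project='my-project')
table = client.get_table('my-project.benchmark_staging.100_STRING_10_10MB')

# table.schema is a list of SchemaField objects, as used by the class above.
for field in table.schema:
    print(field.name, field.field_type)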
Example 4
    def run_federated_query(self, query_type, query):
        """Runs native queries on EXTERNAL files

        Args:
            query_type(str): Code for the category of the query to
                run (SIMPLE_SELECT_*, SELECT_ONE_STRING, SELECT_50_PERCENT).
            query(str): The query to run.
        """
        file_formats = file_constants.FILE_CONSTANTS['sourceFormats']
        source_format = file_formats[self.file_type]
        external_config = bigquery.ExternalConfig(source_format=source_format)
        external_config.source_uris = [self.file_uri + '/*']
        if source_format not in ('AVRO', 'PARQUET'):
            main_table_util = table_util.TableUtil(self.native_table_id,
                                                   self.dataset_id)
            external_config.schema = main_table_util.table.schema

        if source_format == 'CSV':
            external_config.options.skip_leading_rows = 1

        external_config.compression = self.compression.upper()
        table_id = self.native_table_id + '_external'
        results_destination = '{0:s}.{1:s}.{2:s}_query_results'.format(
            self.bq_project, self.dataset_id, table_id)
        logging.info(
            'Storing query results in {0:s}'.format(results_destination))
        job_config = bigquery.QueryJobConfig(
            table_definitions={table_id: external_config},
            use_legacy_sql=False,
            allow_large_results=True,
            destination=results_destination)
        bql = query.format(table_id)
        logging.info(bql)
        query_job = self.bq_client.query(bql, job_config=job_config)
        logging.info("Running external {0:s} query.".format(query_type))
        query_job.result()
        query_result = benchmark_result_util.QueryBenchmarkResultUtil(
            job=query_job,
            job_type=self.job_type,
            benchmark_name=self.benchmark_name,
            project_id=self.bq_project,
            results_table_name=self.results_table_name,
            results_dataset_id=self.results_table_dataset_id,
            bq_logs_dataset=self.bq_logs_dataset_id,
            bql=bql,
            query_category=query_type,
            main_table_name=self.native_table_id,
            table_dataset_id=self.dataset_id,
            table_type=EXTERNAL_TYPE_ID,
            file_uri=self.file_uri,
        )
        query_result.insert_results_row()
        logging.info('Deleting results destination table {0:s}'.format(
            results_destination))
        self.bq_client.delete_table(results_destination)
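
As a point of reference, here is a minimal, self-contained sketch of the same federated-query pattern against an external CSV source, using the google-cloud-bigquery client directly; the project, GCS path, and table alias are placeholders.

from google.cloud import bigquery

client = bigquery.Client(project='my-project')  # placeholder project

external_config = bigquery.ExternalConfig('CSV')
external_config.source_uris = ['gs://my-bucket/path/to/files/*']
external_config.options.skip_leading_rows = 1  # CSV files with a header row
external_config.autodetect = True  # or supply an explicit schema, as above

# The key in table_definitions is the name the query refers to.
job_config = bigquery.QueryJobConfig(
    table_definitions={'my_external_table': external_config})

query_job = client.query(
    'SELECT COUNT(*) AS row_count FROM my_external_table',
    job_config=job_config)
for row in query_job.result():
    print(row.row_count)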
Example 5
    def create_table(self):
        """Creates the bencmark table in BigQuery.

        The method creates an empty table using the schema from the staging
        table that the files were generated from. It uses the current
        timestamp to name the benchmark table to create a random, unique name.
        """
        self.job_destination_table = '{0:d}'.format(int(time.time()))
        self.benchmark_table_util = table_util.TableUtil(
            self.job_destination_table,
            self.dataset_id,
            bq_schema=self.bq_schema,
        )
        self.benchmark_table_util.create_table()

    def _set_job_properties(self):
        """Sets load specific properties."""
        load_properties = {}

        # get properties from benchmark table
        benchmark_table_util = table_util.TableUtil(self.load_table_id,
                                                    self.load_dataset_id)
        benchmark_table_util.set_table_properties()
        load_properties['numRows'] = benchmark_table_util.table.num_rows

        # get properties from the load job
        load_properties['numFiles'] = self.job.input_files
        load_properties['sourceFormat'] = self.job.source_format

        # get properties from file
        # pylint: disable=line-too-long
        benchmark_details_pattern = \
            r'gs://([\w\'-]+)/fileType=(\w+)/compression=(\w+)/numColumns=(\d+)/columnTypes=(\w+)/numFiles=(\d+)/tableSize=(\d+)(\w+)'
        bucket_name, file_type, compression, num_columns, column_types, \
            expected_num_files, staging_data_size, staging_data_unit = \
            re.findall(benchmark_details_pattern, self.job_source_uri)[0]
        compression_format = (
            file_constants.FILE_CONSTANTS['compressionFormats'][compression])
        file_name_prefix = 'fileType={0:s}/compression={1:s}/numColumns={2:s}/columnTypes={3:s}/numFiles={4:s}/tableSize={5:s}{6:s}'.format(
            file_type, compression, num_columns, column_types,
            expected_num_files, staging_data_size, staging_data_unit)
        bucket = self.storage_client.get_bucket(bucket_name)
        files_consts = file_constants.FILE_CONSTANTS
        if compression == 'none':
            file_ext = file_type
        else:
            file_ext = files_consts['compressionExtensions'][compression]

        file_name = '{0:s}/file1.{1:s}'.format(file_name_prefix, file_ext)

        file_size = float(bucket.get_blob(file_name).size) / BYTES_IN_MB

        load_properties['fileType'] = file_type
        load_properties['compressionType'] = compression_format
        load_properties['numColumns'] = num_columns
        load_properties['columnTypes'] = column_types
        load_properties['fileSize'] = file_size
        load_properties['stagingDataSize'] = staging_data_size
        load_properties['destinationTable'] = '{0:s}.{1:s}.{2:s}'.format(
            self.project_id, self.load_dataset_id, self.load_table_id)
        load_properties['sourceURI'] = self.job_source_uri

        self.results_dict['loadProperties'] = load_properties

    def _set_job_properties(self):
        """Sets load specific properties."""
        load_properties = {}
        load_properties['destinationTable'] = '{0:s}.{1:s}.{2:s}'.format(
            self.project_id, self.load_dataset_id, self.load_table_id)
        load_properties['sourceURI'] = self.job_source_uri

        # get properties from benchmark table
        benchmark_table_util = table_util.TableUtil(self.load_table_id,
                                                    self.load_dataset_id)
        benchmark_table_util.set_table_properties()
        load_properties['numRows'] = benchmark_table_util.table.num_rows

        # get properties from the load job
        load_properties['sourceFormat'] = self.job.source_format

        # get properties from file
        properties_from_file_path = self._get_properties_from_file_path(
            self.job_source_uri)
        load_properties.update(properties_from_file_path)

        self.results_dict['loadProperties'] = load_properties
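
A small sketch of the blob-size lookup used in the first _set_job_properties() variant, isolated from the rest of the method; the project, bucket, and file names are placeholders, and BYTES_IN_MB is assumed here to be 10**6, matching the /1000000 conversions elsewhere in these examples.

from google.cloud import storage

BYTES_IN_MB = 1000000  # assumed decimal megabytes

storage_client = storage.Client(project='my-project')  # placeholder project
bucket = storage_client.get_bucket('my-benchmark-bucket')  # placeholder bucket

file_name = ('fileType=csv/compression=none/numColumns=10/'
             'columnTypes=100_STRING/numFiles=1/tableSize=10MB/file1.csv')

blob = bucket.get_blob(file_name)  # returns None if the object does not exist
if blob is not None:
    file_size = float(blob.size) / BYTES_IN_MB
    print('{0:s} is {1:.2f} MB'.format(file_name, file_size))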
Example 8
    def create_files(self):
        """Creates all file combinations and store in GCS.

        Generates list of file combination from parameters in
        file_parameters.FILE_PARAMETERS dictionary, and creates each file in
        the list, as long as it doesn't yet exist in self.bucket.
        While each file is generated from a BigQuery staging table and stored
        in GCS, the method of creating the file varies depending on the
        parameters in the combination.
        """
        # Gather file parameters and constants.
        files_consts = file_constants.FILE_CONSTANTS
        file_types = self.file_params['fileType']
        extract_formats = files_consts['extractFormats']
        file_compression_types = self.file_params['fileCompressionTypes']
        file_counts = self.file_params['numFiles']

        # Begin the process of iterating through each combination.
        logging.info('Starting to create files by exporting staging tables to '
                     'bucket {0:s}'.format(self.bucket_name))
        skip_message = 'Skipped path and its subsequent files: {0:s}'

        # Gather a list of the staging tables. The staging tables already
        # include the columnTypes, numColumns, and stagingDataSizes
        # parameters (ex: the staging table 100_STRING_10_10MB has
        # columnType=100_STRING, numColumns=10, and stagingDataSizes=10MB).
        tables = self.primitive_staging_tables
        if not tables:
            logging.info('Dataset {0:s} contains no tables. Please create '
                         'staging tables in {0:s}.'.format(
                             self.primitive_staging_dataset_id))
        # For each staging table, extract to each fileType, each
        # compressionType, and copy each file so that the combination has
        # the correct numFiles.
        for (table_list_item, file_type, num_files) in \
                itertools.product(tables, file_types, file_counts):
            for compression_type in file_compression_types[file_type]:

                staging_table_util = table_util.TableUtil(
                    table_list_item.table_id,
                    table_list_item.dataset_id,
                )
                staging_table_util.set_table_properties()

                gcs_prefix = 'gs://{0:s}/'.format(self.bucket_name)
                dest_string = ('fileType={0:s}/'
                               'compression={1:s}/'
                               'numColumns={2:d}/'
                               'columnTypes={3:s}/'
                               'numFiles={4:d}/'
                               'tableSize={5:d}MB/')
                destination_path = dest_string.format(
                    file_type,
                    compression_type,
                    staging_table_util.num_columns,
                    staging_table_util.column_types,
                    num_files,
                    int(staging_table_util.table_size/1000000),
                )

                if compression_type == 'none':
                    extension = file_type
                else:
                    extensions = files_consts['compressionExtensions']
                    extension = extensions[compression_type]

                file_string = 'file1'

                destination_prefix = '{0:s}{1:s}{2:s}'.format(
                    gcs_prefix,
                    destination_path,
                    file_string,
                )

                if num_files == 1:
                    # If the number of files in the current combination is 1,
                    # check to see if the one file doesn't yet exist.
                    blob_name = '{0:s}{1:s}.{2:s}'.format(
                        destination_path,
                        file_string,
                        extension,
                    )
                    if not storage.Blob(
                            bucket=self.bucket,
                            name=blob_name,
                    ).exists(self.gcs_client):
                        # If the one file doesn't yet exist, it needs to be
                        # created. The method of creation depends on the file
                        # type.
                        if file_type == 'parquet':
                            # If the file type is parquet, use the
                            # _create_parquet_file() method, which uses
                            # Dataflow for file creation.
                            self._create_parquet_file(
                                blob_name,
                                staging_table_util,
                                destination_prefix,
                            )
                        else:
                            # Otherwise, use the _extract_tables_to_files()
                            # method, which uses BigQuery extract jobs.
                            destination_format = extract_formats[
                                file_type]
                            self._extract_tables_to_files(
                                blob_name,
                                compression_type,
                                destination_format,
                                destination_prefix,
                                extension,
                                staging_table_util,
                            )
                    else:
                        # If the one file already exists,
                        # skip its creation.
                        logging.info(skip_message.format(blob_name))
                else:
                    # If the numFiles parameter in the current iteration is not
                    # 1, multiple files need to be created for the combination.
                    # In this case, obtain the file from the combination whose
                    # parameters are identical to the current combination,
                    # except that numFiles=1. That file will be used to make
                    # copies for the combinations where numFiles > 1, since
                    # copying files is faster than running a new extraction or
                    # Dataflow job. For example, if the current combination is
                    # fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=100/tableSize=10MB/ # pylint: disable=line-too-long
                    # then fileType=csv/compression=none/numColumns=10/columnTypes=100_STRING/numFiles=1/tableSize=10MB/file1.csv # pylint: disable=line-too-long
                    # will be used to make copies for the current combination.
                    file1_destination_path = dest_string.format(
                        file_type,
                        compression_type,
                        staging_table_util.num_columns,
                        staging_table_util.column_types,
                        1,
                        int(staging_table_util.table_size / 1000000),
                    )
                    file1_blob_name = '{0:s}{1:s}.{2:s}'.format(
                        file1_destination_path,
                        file_string,
                        extension,
                    )
                    # Before making copies for the current combination, check
                    # that the first file in the combination doesn't yet exist.
                    # If it doesn't, then proceed. If it does exist, assume that
                    # all other files in the combination already exist too.
                    # While this can be a risky assumption, it saves a lot of
                    # time, since combinations can contain as many as 10000
                    # files. If a combination is stopped in the middle of
                    # generating a large number of files, the
                    # restart_incomplete_combination() method can be used to
                    # ensure the combination gets completed without taking the
                    # time to check each file's existence here.
                    first_of_n_blobs = '{0:s}{1:s}.{2:s}'.format(
                        destination_path,
                        file_string,
                        extension,
                    )
                    if not storage.Blob(
                            bucket=self.bucket,
                            name=first_of_n_blobs,
                    ).exists(self.gcs_client):
                        # If the first file in the combination doesn't exist,
                        # run copy_blobs() to create each file in the
                        # combination.
                        start_num = 1
                        self.copy_blobs(
                            file1_blob_name,
                            destination_path,
                            extension,
                            start_num,
                            num_files,
                        )
                    else:
                        # Otherwise, skip creating the first file and all
                        # subsequent files in the combination.
                        logging.info(skip_message.format(
                            first_of_n_blobs
                        ))
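
To make the combination enumeration in create_files() easier to follow, here is an isolated sketch of the same itertools.product loop with small made-up parameter lists; only the destination-path construction is shown.

import itertools

# Made-up parameters standing in for FILE_PARAMETERS and the staging tables.
file_types = ['csv', 'json']
file_compression_types = {'csv': ['none', 'gzip'], 'json': ['none']}
file_counts = [1, 100]
staging_tables = [('100_STRING', 10, 10)]  # (columnTypes, numColumns, size MB)

dest_string = ('fileType={0:s}/compression={1:s}/numColumns={2:d}/'
               'columnTypes={3:s}/numFiles={4:d}/tableSize={5:d}MB/')

for (table, file_type, num_files) in itertools.product(
        staging_tables, file_types, file_counts):
    column_types, num_columns, size_mb = table
    for compression_type in file_compression_types[file_type]:
        destination_path = dest_string.format(
            file_type, compression_type, num_columns,
            column_types, num_files, size_mb)
        print(destination_path)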
Example 9
def main(argv=None):
    args = parse_args(argv)

    create_results_table = args.create_results_table
    create_benchmark_schemas = args.create_benchmark_schemas
    benchmark_table_schemas_dir = args.benchmark_table_schemas_directory
    create_staging_tables = args.create_staging_tables
    create_files = args.create_files
    restart_file = args.restart_file
    create_benchmark_tables = args.create_benchmark_tables
    duplicate_benchmark_tables = args.duplicate_benchmark_tables
    bq_project_id = args.bq_project_id
    benchmark_dataset_id = args.benchmark_dataset_id
    staging_project_id = args.staging_project_id
    staging_dataset_id = args.staging_dataset_id
    resized_staging_dataset_id = args.resized_staging_dataset_id
    results_table_name = args.results_table_name
    results_dataset_id = args.results_dataset_id
    results_table_schema_path = args.results_table_schema_path
    gcs_project_id = args.gcs_project_id
    bucket_name = args.bucket_name
    dataflow_temp_location = args.dataflow_temp_location
    dataflow_staging_location = args.dataflow_staging_location
    bq_logs_dataset = args.bq_logs_dataset

    file_params = load_file_parameters.FILE_PARAMETERS

    # Run provided commands
    if create_results_table:
        logging.info('Creating results table {0:s} from schema in '
                     '{1:s}.'.format(
                         results_table_name,
                         results_table_schema_path,
                     ))
        results_table_util = table_util.TableUtil(
            table_id=results_table_name,
            dataset_id=results_dataset_id,
            json_schema_filename=results_table_schema_path,
        )
        results_table_util.create_table()
        logging.info('Done creating results table.')

    if create_benchmark_schemas:
        benchmark_schema_creator = schema_creator.SchemaCreator(
            schemas_dir=benchmark_table_schemas_dir, file_params=file_params)
        benchmark_schema_creator.create_schemas()

    if create_staging_tables:
        benchmark_staging_table_generator = (
            staging_table_generator.StagingTableGenerator(
                project=bq_project_id,
                staging_dataset_id=staging_dataset_id,
                resized_dataset_id=resized_staging_dataset_id,
                json_schema_path=benchmark_table_schemas_dir,
                file_params=file_params,
                num_rows=500))
        benchmark_staging_table_generator.create_staging_tables(
            dataflow_staging_location=dataflow_staging_location,
            dataflow_temp_location=dataflow_temp_location,
        )
        benchmark_staging_table_generator.create_resized_tables()

    if create_files:
        benchmark_load_file_generator = load_file_generator.FileGenerator(
            project_id=gcs_project_id,
            primitive_staging_dataset_id=resized_staging_dataset_id,
            bucket_name=bucket_name,
            file_params=file_params,
            dataflow_staging_location=dataflow_staging_location,
            dataflow_temp_location=dataflow_temp_location,
        )
        if restart_file:
            benchmark_load_file_generator.restart_incomplete_combination(
                restart_file)
        benchmark_load_file_generator.create_files()

    if create_benchmark_tables:
        benchmark_tables_processor = load_tables_processor.LoadTablesProcessor(
            benchmark_name=BENCHMARK_NAME,
            bq_project=bq_project_id,
            gcs_project=gcs_project_id,
            staging_project=staging_project_id,
            staging_dataset_id=staging_dataset_id,
            dataset_id=benchmark_dataset_id,
            bucket_name=bucket_name,
            results_table_name=results_table_name,
            results_table_dataset_id=results_dataset_id,
            duplicate_benchmark_tables=duplicate_benchmark_tables,
            file_params=file_params,
            bq_logs_dataset=bq_logs_dataset,
        )
        benchmark_tables_processor.create_benchmark_tables()
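
parse_args() is not included in this excerpt, so the real flag names are not shown; the sketch below is only a guess at how a few of the attributes read by main() might be declared with argparse, purely for illustration.

import argparse

def parse_args(argv=None):
    # Hypothetical flag definitions; the actual parse_args() may differ.
    parser = argparse.ArgumentParser(description='BigQuery benchmark driver.')
    parser.add_argument('--create_results_table', action='store_true')
    parser.add_argument('--create_files', action='store_true')
    parser.add_argument('--bq_project_id')
    parser.add_argument('--bucket_name')
    parser.add_argument('--dataflow_temp_location')
    parser.add_argument('--dataflow_staging_location')
    return parser.parse_args(argv)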