def _setup(self): """Download data and checkout git repository.""" # Acticate gcloud service start_time = time.time() utils.setup_python_path(self.site_packages_dir, self.config.python_path_str) utils.active_gcloud_service(self.config.gcloud_key_file_url, self.workspace_dir) # pylint: disable=line-too-long utils.make_dir_if_not_exist(self.root_output_dir) self.benchmark_execution_time['activate_gcloud_service'] = time.time() - start_time # pylint: disable=line-too-long # Download data start_time = time.time() utils.download_data(utils.parse_data_downloads_str(self.config.root_data_dir, self.config.gcs_downloads_str)) # pylint: disable=line-too-long utils.download_data(utils.parse_data_downloads_str(self.config.root_data_dir, self.config.data_downloads_str)) # pylint: disable=line-too-long self.benchmark_execution_time['download_data'] = time.time( ) - start_time # Checkout git repositories start_time = time.time() site_package_info = utils.checkout_git_repos( self.config.get_git_repos(self.site_packages_dir), self.config.force_update) self.benchmark_execution_time['checkout_repository'] = time.time() - start_time # pylint: disable=line-too-long self.stream_handler = logging.StreamHandler(sys.stdout) self.stream_handler.setFormatter( logging.Formatter('%(asctime)s %(levelname)s: %(message)s')) logging.getLogger().addHandler(self.stream_handler) return site_package_info
def _setup(self):
  utils.setup_python_path(self.site_packages_dir, self.config.python_path_str)
  utils.active_gcloud_service(self.auth_token_path)
  utils.make_dir_if_not_exist(self.output_root_dir)

  self.stream_handler = logging.StreamHandler(sys.stdout)
  self.stream_handler.setFormatter(
      logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
  logging.getLogger().addHandler(self.stream_handler)
def _setup(self): """Download data and checkout git repository.""" # Acticate gcloud service start_time = time.time() utils.setup_python_path(self.site_packages_dir, self.config.python_path_str) utils.active_gcloud_service(self.config.gcloud_key_file_url, self.workspace_dir) utils.make_dir_if_not_exist(self.root_output_dir) self.benchmark_execution_time['activate_gcloud_service'] = ( time.time() - start_time) # Download data start_time = time.time() utils.download_data( utils.parse_data_downloads_str(self.config.root_data_dir, self.config.gcs_downloads_str)) utils.download_data( utils.parse_data_downloads_str(self.config.root_data_dir, self.config.data_downloads_str)) self.benchmark_execution_time['download_data'] = time.time( ) - start_time # Checkout git repositories start_time = time.time() site_package_info = utils.checkout_git_repos( self.config.get_git_repos(self.site_packages_dir), self.config.use_cached_site_packages) self.benchmark_execution_time['checkout_repository'] = (time.time() - start_time) # Start cloud TPU. if self.config.tpu_parameters is not None: start_time = time.time() utils.setup_tpu(self.config.tpu_parameters) tpu_info = tpu_runtime_utils.configure_tpu( self.config.tpu_parameters) site_package_info['tpu_version'] = tpu_info self.benchmark_execution_time['start_tpu'] = time.time( ) - start_time self.stream_handler = logging.StreamHandler(sys.stdout) self.stream_handler.setFormatter( logging.Formatter('%(asctime)s %(levelname)s: %(message)s')) logging.getLogger().addHandler(self.stream_handler) return site_package_info
def _setup(self): """Download data and checkout git repository.""" # Set up the raid array. start_time = time.time() device_utils.create_drive_from_devices('/data', self.config.gce_nvme_raid_str) self.benchmark_execution_time['create_drive'] = time.time( ) - start_time start_time = time.time() utils.download_from_gcs([{ 'gcs_url': 'gs://tf-performance/auth_tokens', 'local_path': os.path.join(self.workspace_dir, 'auth_tokens') }]) self.benchmark_execution_time['download_token'] = time.time( ) - start_time # Acticate gcloud service start_time = time.time() utils.setup_python_path(self.site_packages_dir, self.config.python_path_str) utils.active_gcloud_service(self.auth_token_path) utils.make_dir_if_not_exist(self.output_root_dir) self.benchmark_execution_time['activate_gcloud_service'] = time.time( ) - start_time # Download data start_time = time.time() utils.download_from_gcs(self.config.get_gcs_downloads('/data')) self.benchmark_execution_time['download_data'] = time.time( ) - start_time # Checkout git repositories start_time = time.time() site_package_info = utils.checkout_git_repos( self.config.get_git_repos(self.site_packages_dir), self.config.force_update) self.benchmark_execution_time['checkout_repository'] = time.time( ) - start_time self.stream_handler = logging.StreamHandler(sys.stdout) self.stream_handler.setFormatter( logging.Formatter('%(asctime)s %(levelname)s: %(message)s')) logging.getLogger().addHandler(self.stream_handler) return site_package_info
def _start_profiler(output_dir):
  """Start profiler.

  Args:
    output_dir: log directory to place the profiler data
  """
  import tensorflow as tf  # pylint: disable=g-import-not-at-top
  profiler_data_dir = os.path.join(output_dir, 'profiler_data')
  utils.make_dir_if_not_exist(profiler_data_dir)
  logging.info('Starting TensorFlow profiler and saving data to dir %s',
               profiler_data_dir)
  try:
    tf.profiler.experimental.start(profiler_data_dir)
    logging.info('Started TensorFlow profiler')
  except Exception:  # pylint: disable=broad-except
    logging.error('TensorFlow profiler failed to start due to error:\n %s',
                  traceback.format_exc())
def _stop_and_save_profiler(output_dir):
  """Stop profiler and save profiler data.

  Args:
    output_dir: log directory to place the profiler data
  """
  from tensorflow.python.eager import profiler  # pylint: disable=g-import-not-at-top
  try:
    profiler_data_dir = os.path.join(output_dir, 'profiler_data')
    logging.info('Stopping TensorFlow profiler and saving data to dir %s',
                 profiler_data_dir)
    utils.make_dir_if_not_exist(profiler_data_dir)
    result = profiler.stop()
    with open(os.path.join(profiler_data_dir, 'local.trace'), 'wb') as f:
      f.write(result)
    logging.info('Stopped TensorFlow profiler.')
  except Exception:  # pylint: disable=broad-except
    logging.error('TensorFlow profiler failed to stop due to error:\n %s',
                  traceback.format_exc())
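# A minimal usage sketch (an assumption, not from the original file) showing
# how _start_profiler() and _stop_and_save_profiler() would bracket a
# benchmark run so the trace lands under <output_dir>/profiler_data. The
# helper name `_profile_benchmark_run` is hypothetical:
def _profile_benchmark_run(output_dir, benchmark_fn):
  """Run benchmark_fn with the TensorFlow profiler active."""
  _start_profiler(output_dir)
  try:
    benchmark_fn()  # the actual benchmark method, e.g. obtained via getattr()
  finally:
    # Always stop the profiler so a partial trace is still saved on failure.
    _stop_and_save_profiler(output_dir)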
def run_benchmark(self):
  """Run benchmark."""
  for benchmark_method in self._get_benchmark_methods():
    try:
      execution_id = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
      output_dir = os.path.join(self.output_root_dir, execution_id)
      utils.make_dir_if_not_exist(output_dir)

      # Setup per-method file logger
      filehandler = logging.FileHandler(
          filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
      filehandler.setFormatter(
          logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
      logging.getLogger().addHandler(filehandler)

      class_instance = self._instantiate_benchmark_class(output_dir)
      benchmark_name = '{}.{}'.format(class_instance.__class__.__name__,
                                      benchmark_method)

      # tf.test.Benchmark.report_benchmark() will write benchmark results to
      # the file whose path is benchmark_result_file_path_prefix +
      # benchmark_name
      benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
      os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix
      benchmark_result_file_path = (
          benchmark_result_file_path_prefix + benchmark_name)

      # Run benchmark method
      logging.info('Start benchmark: %s', benchmark_name)
      getattr(class_instance, benchmark_method)()
      logging.info('End benchmark: %s', benchmark_name)

      # Read and upload benchmark results
      benchmark_result = utils.read_benchmark_result(
          benchmark_result_file_path)
      self._upload_execution_summary(benchmark_result, execution_id,
                                     output_dir)
    finally:
      logging.getLogger().removeHandler(filehandler)
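# For context: run_benchmark() relies on tf.test.Benchmark.report_benchmark()
# honoring the TEST_REPORT_FILE_PREFIX environment variable, as the comment
# above notes. A minimal sketch of that contract; the class, prefix path, and
# numbers below are illustrative only:
import os
import tensorflow as tf


class DemoBenchmark(tf.test.Benchmark):

  def benchmark_demo(self):
    # With TEST_REPORT_FILE_PREFIX set, report_benchmark() serializes the
    # result to <prefix><ClassName>.benchmark_demo instead of only logging it.
    self.report_benchmark(wall_time=1.23, extras={'examples_per_sec': 456})


# os.environ['TEST_REPORT_FILE_PREFIX'] = '/tmp/perfzero/proto_'
# DemoBenchmark().benchmark_demo()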
def _run_internal(benchmark_method, harness_info, site_package_info,
                  root_output_dir, config, queue):
  """Run benchmark method and put result to the queue.

  Args:
    benchmark_method: Canonical path to the benchmark method
    harness_info: Description of the benchmark harness used in the benchmark
    site_package_info: Description of the site-package used in the benchmark
    root_output_dir: Directory under which to put the benchmark output
    config: An instance of perfzero_config
    queue: An interprocess queue to transfer benchmark result to the caller
  """
  start_timestamp = time.time()
  execution_timestamp = start_timestamp
  method_has_exception = False
  execution_id = (config.execution_id if config.execution_id else
                  datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f'))
  output_dir = os.path.join(root_output_dir, execution_id)
  if config.scratch_gcs_url:
    model_output_dir = os.path.join(config.scratch_gcs_url, execution_id)
  else:
    model_output_dir = output_dir
  utils.make_dir_if_not_exist(output_dir)

  benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
  benchmark_class_name = benchmark_class.rsplit('.', 1)[1]
  tensorflow_profiler = TensorFlowProfiler(config.profiler_enabled_time_str,
                                           output_dir)
  process_info_tracker = ProcessInfoTracker(output_dir)
  process_info = None

  # Setup per-method file logger
  filehandler = logging.FileHandler(
      filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
  filehandler.setFormatter(
      logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
  logging.getLogger().addHandler(filehandler)

  try:
    if config.tpu_parameters:
      tpu = config.tpu_parameters.get('name')
    else:
      tpu = None
    if config.perfzero_constructor_args:
      constructor_args = json.loads(config.perfzero_constructor_args)
    else:
      constructor_args = {}
    class_instance = utils.instantiate_benchmark_class(
        benchmark_class=benchmark_class,
        output_dir=model_output_dir,
        root_data_dir=config.root_data_dir,
        tpu=tpu,
        constructor_args=constructor_args)

    # tf.test.Benchmark.report_benchmark() writes results to a file with
    # path benchmark_result_file_path_prefix + benchmark_method
    benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
    os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix
    benchmark_result_file_path = '{}{}.{}'.format(
        benchmark_result_file_path_prefix, benchmark_class_name,
        benchmark_method_name)

    # Start background threads for profiler and system info tracker
    tensorflow_profiler.start()
    process_info_tracker.start()

    # Run benchmark method
    execution_timestamp = time.time()
    logging.info('Starting benchmark execution: %s', benchmark_method)
    getattr(class_instance, benchmark_method_name)()
    logging.info('Stopped benchmark: %s', benchmark_method)

    # Read and build benchmark results
    raw_benchmark_result = utils.read_benchmark_result(
        benchmark_result_file_path)
    # Explicitly overwrite the name to be the full path to benchmark method
    raw_benchmark_result['name'] = benchmark_method
  except Exception:  # pylint: disable=broad-except
    logging.error('Benchmark execution for %s failed due to error:\n %s',
                  benchmark_method, traceback.format_exc())
    method_has_exception = True
    raw_benchmark_result = {}
    raw_benchmark_result['name'] = benchmark_method
    raw_benchmark_result['wall_time'] = -1
    raw_benchmark_result['extras'] = {}
  finally:
    # Stop background threads for profiler and system info tracker
    process_info = process_info_tracker.stop()
    tensorflow_profiler.stop()

  upload_timestamp = time.time()
  benchmark_result = report_utils.build_benchmark_result(
      raw_benchmark_result, method_has_exception)
  execution_summary = report_utils.build_execution_summary(
      execution_timestamp, execution_id, config.ml_framework_build_label,
      config.execution_label, config.platform_name, config.system_name,
      config.output_gcs_url, benchmark_result, config.get_env_vars(),
      config.get_flags(), harness_info, site_package_info, process_info,
      method_has_exception)
  report_utils.upload_execution_summary(config.bigquery_project_name,
                                        config.bigquery_dataset_table_name,
                                        execution_summary)
  report_utils.execute_methods(config.result_upload_methods,
                               execution_summary)
  logging.info('Benchmark execution for %s completed with summary:\n %s',
               benchmark_method, json.dumps(execution_summary, indent=2))
  _set_file_contents(json.dumps(execution_summary, indent=2),
                     os.path.join(output_dir, 'perfzero_summary.json'))
  utils.maybe_upload_to_gcs(output_dir, config.output_gcs_url)
  logging.getLogger().removeHandler(filehandler)

  method_execution_time = {
      'class_initialization': execution_timestamp - start_timestamp,
      'method_execution': upload_timestamp - execution_timestamp,
      'log_upload': time.time() - upload_timestamp
  }

  if config.profiler_enabled_time_str:
    relative_output_dir = output_dir[output_dir.find('benchmark'):]
    print('\nExecute the command below to start the TensorBoard server using '
          'the collected profiler data:\ntensorboard --logdir={}\n\n'
          'Open localhost:6006 in your browser to access the TensorBoard '
          'GUI. Use ssh with port forwarding if TensorBoard is running on '
          'a remote machine.\n'.format(relative_output_dir))

  queue.put((method_has_exception, method_execution_time,
             benchmark_result['succeeded'], output_dir))
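# _run_internal() hands its result back through an interprocess queue. A
# minimal sketch of the calling side (the helper name and argument values are
# placeholders, not from the original file):
import multiprocessing


def _run_in_subprocess(benchmark_method, harness_info, site_package_info,
                       root_output_dir, config):
  """Run one benchmark method in a child process and collect its result."""
  queue = multiprocessing.Queue()
  process = multiprocessing.Process(
      target=_run_internal,
      args=(benchmark_method, harness_info, site_package_info,
            root_output_dir, config, queue))
  process.start()
  # Blocks until _run_internal() puts its 4-tuple on the queue.
  method_has_exception, method_execution_time, succeeded, output_dir = (
      queue.get())
  process.join()
  return method_has_exception, method_execution_time, succeeded, output_dir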
def run_benchmark(self):
  """Run benchmark."""
  site_package_info = self._setup()
  has_exception = False
  benchmark_success_results = {}
  benchmark_output_dirs = {}

  for benchmark_method in self._get_benchmark_methods():
    start_timestamp = time.time()
    execution_timestamp = start_timestamp
    method_has_exception = False
    execution_id = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    output_dir = os.path.join(self.root_output_dir, execution_id)
    utils.make_dir_if_not_exist(output_dir)
    benchmark_output_dirs[benchmark_method] = output_dir
    benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
    benchmark_class_name = benchmark_class.rsplit('.', 1)[1]
    tensorflow_profiler = TensorFlowProfiler(
        self.config.profiler_enabled_time_str, output_dir)
    process_info_tracker = ProcessInfoTracker(output_dir)
    process_info = None

    # Setup per-method file logger
    filehandler = logging.FileHandler(
        filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
    filehandler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(filehandler)

    try:
      class_instance = self._instantiate_benchmark_class(
          benchmark_class, output_dir, self.config.root_data_dir)

      # tf.test.Benchmark.report_benchmark() writes results to a file with
      # path benchmark_result_file_path_prefix + benchmark_method
      benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
      os.environ['TEST_REPORT_FILE_PREFIX'] = (
          benchmark_result_file_path_prefix)
      benchmark_result_file_path = '{}{}.{}'.format(
          benchmark_result_file_path_prefix, benchmark_class_name,
          benchmark_method_name)

      # Start background threads for profiler and system info tracker
      tensorflow_profiler.start()
      process_info_tracker.start()

      # Run benchmark method
      execution_timestamp = time.time()
      logging.info('Starting benchmark execution: %s', benchmark_method)
      getattr(class_instance, benchmark_method_name)()
      logging.info('Stopped benchmark: %s', benchmark_method)

      # Read and build benchmark results
      raw_benchmark_result = utils.read_benchmark_result(
          benchmark_result_file_path)
      # Explicitly overwrite the name to be the full path to benchmark method
      raw_benchmark_result['name'] = benchmark_method
    except Exception:  # pylint: disable=broad-except
      logging.error('Benchmark execution for %s failed due to error:\n %s',
                    benchmark_method, traceback.format_exc())
      method_has_exception = True
      has_exception = True
      raw_benchmark_result = {}
      raw_benchmark_result['name'] = benchmark_method
      raw_benchmark_result['wall_time'] = -1
      raw_benchmark_result['extras'] = {}
    finally:
      # Stop background threads for profiler and system info tracker
      process_info = process_info_tracker.stop()
      tensorflow_profiler.stop()

    upload_timestamp = time.time()
    benchmark_result = report_utils.build_benchmark_result(
        raw_benchmark_result, method_has_exception)
    benchmark_success_results[benchmark_method] = (
        benchmark_result['succeeded'])
    execution_summary = report_utils.build_execution_summary(
        execution_timestamp, execution_id,
        self.config.ml_framework_build_label, self.config.execution_label,
        self.config.platform_name, self.config.system_name,
        self.config.output_gcs_url, benchmark_result,
        self.config.get_env_vars(), self.config.get_flags(),
        site_package_info, process_info, method_has_exception)
    report_utils.upload_execution_summary(
        self.config.bigquery_project_name,
        self.config.bigquery_dataset_table_name, execution_summary)
    logging.info('Benchmark execution for %s completed with summary:\n %s',
                 benchmark_method, json.dumps(execution_summary, indent=2))
    utils.maybe_upload_to_gcs(output_dir, self.config.output_gcs_url)
    logging.getLogger().removeHandler(filehandler)

    self.benchmark_execution_time[benchmark_method] = {
        'class_initialization': execution_timestamp - start_timestamp,
        'method_execution': upload_timestamp - execution_timestamp,
        'log_upload': time.time() - upload_timestamp
    }

    if self.config.profiler_enabled_time_str:
      relative_output_dir = output_dir[output_dir.find('benchmark'):]
      print('\nExecute the command below to start the TensorBoard server '
            'using the collected profiler data:\ntensorboard --logdir={}\n\n'
            'Open localhost:6006 in your browser to access the TensorBoard '
            'GUI. Use ssh with port forwarding if TensorBoard is running on '
            'a remote machine.\n'.format(relative_output_dir))

  print('Benchmark execution time in seconds by operation:\n {}'.format(
      json.dumps(self.benchmark_execution_time, indent=2)))
  print('Benchmark success results:\n{}'.format(
      json.dumps(benchmark_success_results, indent=2)))
  print('Benchmark local output directories:\n{}'.format(
      json.dumps(benchmark_output_dirs, indent=2)))
  if has_exception:
    sys.exit(1)
def run_benchmark(self):
  """Run benchmark."""
  site_package_info = self._setup()
  has_exception = False
  benchmark_success_results = {}

  for benchmark_method in self._get_benchmark_methods():
    start_timestamp = time.time()
    method_has_exception = False
    execution_id = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
    execution_timestamp = time.time()
    output_dir = os.path.join(self.root_output_dir, execution_id)
    utils.make_dir_if_not_exist(output_dir)
    benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
    benchmark_class_name = benchmark_class.rsplit('.', 1)[1]

    # Setup per-method file logger
    filehandler = logging.FileHandler(
        filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
    filehandler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(filehandler)

    try:
      class_instance = self._instantiate_benchmark_class(
          benchmark_class, output_dir)

      # tf.test.Benchmark.report_benchmark() writes results to a file with
      # path benchmark_result_file_path_prefix + benchmark_method
      benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
      os.environ['TEST_REPORT_FILE_PREFIX'] = (
          benchmark_result_file_path_prefix)
      benchmark_result_file_path = '{}{}.{}'.format(
          benchmark_result_file_path_prefix, benchmark_class_name,
          benchmark_method_name)

      # Run benchmark method
      logging.info('Start benchmark: %s', benchmark_method)
      getattr(class_instance, benchmark_method_name)()
      logging.info('End benchmark: %s', benchmark_method)

      # Read and build benchmark results
      raw_benchmark_result = utils.read_benchmark_result(
          benchmark_result_file_path)
      # Explicitly overwrite the name to be the full path to benchmark method
      raw_benchmark_result['name'] = benchmark_method
    except Exception:  # pylint: disable=broad-except
      logging.error('Benchmark execution for %s failed due to error:\n %s',
                    benchmark_method, traceback.format_exc())
      method_has_exception = True
      has_exception = True
      raw_benchmark_result = {}
      raw_benchmark_result['name'] = benchmark_method
      raw_benchmark_result['wall_time'] = -1
      raw_benchmark_result['extras'] = {}

    upload_timestamp = time.time()
    benchmark_result = report_utils.build_benchmark_result(
        raw_benchmark_result, method_has_exception)
    benchmark_success_results[benchmark_method] = (
        benchmark_result['succeeded'])
    execution_summary = report_utils.build_execution_summary(
        execution_timestamp, execution_id,
        self.config.ml_framework_build_label_str,
        self.config.execution_label_str, self.config.platform_name_str,
        self.config.system_name_str, self.config.output_gcs_url_str,
        benchmark_result, self.config.get_env_vars(), self.config.get_flags(),
        site_package_info, method_has_exception)
    report_utils.upload_execution_summary(
        self.config.bigquery_project_name_str,
        self.config.bigquery_dataset_table_name_str, execution_summary)
    logging.info('Benchmark execution for %s completed with summary:\n %s',
                 benchmark_method, json.dumps(execution_summary, indent=2))
    utils.maybe_upload_to_gcs(output_dir, self.config.output_gcs_url_str)
    logging.getLogger().removeHandler(filehandler)

    self.benchmark_execution_time[benchmark_method] = {
        'benchmark_time': upload_timestamp - start_timestamp,
        'upload_time': time.time() - upload_timestamp
    }

  print('Benchmark execution time in seconds by operation:\n {}'.format(
      json.dumps(self.benchmark_execution_time, indent=2)))
  print('Benchmark success results:\n{}'.format(
      json.dumps(benchmark_success_results, indent=2)))
  if has_exception:
    sys.exit(1)