Example 1
  def _upload_execution_summary(self, benchmark_result, execution_id,
                                output_dir):
    """Report results and upload artifacts."""
    # Upload benchmark output
    output_gcs_dir_with_uid = ''
    if not self.config.output_gcs_url_str:
      logging.info(
          'Skipped uploading output because output_gcs_url_str is not set.')
    elif not os.listdir(output_dir):
      logging.info(
          'Skipped uploading output because there is no file in directory %s',
          output_dir)
    else:
      output_gcs_dir_with_uid = '{}/{}/'.format(self.config.output_gcs_url_str,
                                                execution_id)
      utils.upload_to_gcs(output_dir, self.config.output_gcs_url_str)

    execution_summary = report_utils.build_execution_summary(
        execution_id, self.config.test_env_str, self.config.platform_name_str,
        self.config.system_name_str, output_gcs_dir_with_uid, benchmark_result)

    logging.info('Benchmark summary is %s',
                 json.dumps(execution_summary, indent=2))

    report_utils.upload_execution_summary(self.config.bigquery_project_name_str,
                                          self.config.bigquery_table_name_str,
                                          execution_summary)
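
The helpers referenced above (utils.upload_to_gcs, report_utils.build_execution_summary, report_utils.upload_execution_summary) are defined elsewhere in the code base. As a rough, hypothetical sketch of the upload contract assumed here, a gsutil-based helper could look like the following; the flags and logging are illustrative, not the actual implementation:

import logging
import subprocess


def upload_to_gcs(local_dir, gcs_url):
  """Hypothetical sketch: recursively copy local_dir into gcs_url via gsutil."""
  logging.info('Uploading %s to %s', local_dir, gcs_url)
  # 'gsutil -m cp -r' uploads the directory (keeping its basename) in parallel.
  subprocess.check_call(['gsutil', '-m', 'cp', '-r', local_dir, gcs_url])
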
def _run_internal(benchmark_method, harness_info, site_package_info,
                  root_output_dir, config, queue):
    """Run benchmark method and put result to the queue.

  Args:
    benchmark_method: Canonical path to the benchmark method
    harness_info: Description of the benchmark harness used in the benchmark
    site_package_info: Description of the site-package used in the benchmark
    root_output_dir: Directory under which to put the benchmark output
    config: An instance of perfzero_config
    queue: An interprocess queue to transfer benchmark result to the caller
  """
    start_timestamp = time.time()
    execution_timestamp = start_timestamp
    method_has_exception = False
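    # Reuse the caller-provided execution_id when set; otherwise derive a
    # unique one from the current timestamp so each run gets its own
    # output directory.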
    execution_id = (config.execution_id if config.execution_id else
                    datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f'))
    output_dir = os.path.join(root_output_dir, execution_id)
    if config.scratch_gcs_url:
        model_output_dir = os.path.join(config.scratch_gcs_url, execution_id)
    else:
        model_output_dir = output_dir
    utils.make_dir_if_not_exist(output_dir)
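    # benchmark_method is a canonical path such as 'module.ClassName.method';
    # split off the method name and then the bare class name.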
    benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
    benchmark_class_name = benchmark_class.rsplit('.', 1)[1]

    tensorflow_profiler = TensorFlowProfiler(config.profiler_enabled_time_str,
                                             output_dir)
    process_info_tracker = ProcessInfoTracker(output_dir)
    process_info = None

    # Set up the per-method file logger
    filehandler = logging.FileHandler(
        filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
    filehandler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(filehandler)

    try:
        if config.tpu_parameters:
            tpu = config.tpu_parameters.get('name')
        else:
            tpu = None
        if config.perfzero_constructor_args:
            constructor_args = json.loads(config.perfzero_constructor_args)
        else:
            constructor_args = {}
        class_instance = utils.instantiate_benchmark_class(
            benchmark_class=benchmark_class,
            output_dir=model_output_dir,
            root_data_dir=config.root_data_dir,
            tpu=tpu,
            constructor_args=constructor_args)
        # tf.test.Benchmark.report_benchmark() writes results to a file with
        # path benchmark_result_file_path_prefix + benchmark_method
        benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
        os.environ[
            'TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix
        benchmark_result_file_path = '{}{}.{}'.format(
            benchmark_result_file_path_prefix, benchmark_class_name,
            benchmark_method_name)

        # Start background threads for profiler and system info tracker
        tensorflow_profiler.start()
        process_info_tracker.start()

        # Run benchmark method
        execution_timestamp = time.time()
        logging.info('Starting benchmark execution: %s', benchmark_method)
        getattr(class_instance, benchmark_method_name)()
        logging.info('Stopped benchmark: %s', benchmark_method)

        # Read and build benchmark results
        raw_benchmark_result = utils.read_benchmark_result(
            benchmark_result_file_path)
        # Explicitly overwrite the name to be the full path to benchmark method
        raw_benchmark_result['name'] = benchmark_method
    except Exception:  # pylint: disable=broad-except
        logging.error('Benchmark execution for %s failed due to error:\n %s',
                      benchmark_method, traceback.format_exc())
        method_has_exception = True
        raw_benchmark_result = {}
        raw_benchmark_result['name'] = benchmark_method
        raw_benchmark_result['wall_time'] = -1
        raw_benchmark_result['extras'] = {}
    finally:
        # Stop background threads for profiler and system info tracker
        process_info = process_info_tracker.stop()
        tensorflow_profiler.stop()

    upload_timestamp = time.time()
    benchmark_result = report_utils.build_benchmark_result(
        raw_benchmark_result, method_has_exception)
    execution_summary = report_utils.build_execution_summary(
        execution_timestamp, execution_id, config.ml_framework_build_label,
        config.execution_label, config.platform_name, config.system_name,
        config.output_gcs_url, benchmark_result, config.get_env_vars(),
        config.get_flags(), harness_info, site_package_info, process_info,
        method_has_exception)
    report_utils.upload_execution_summary(config.bigquery_project_name,
                                          config.bigquery_dataset_table_name,
                                          execution_summary)
    report_utils.execute_methods(config.result_upload_methods,
                                 execution_summary)
    logging.info('Benchmark execution for %s completed with summary:\n %s',
                 benchmark_method, json.dumps(execution_summary, indent=2))
    _set_file_contents(json.dumps(execution_summary, indent=2),
                       os.path.join(output_dir, 'perfzero_summary.json'))
    utils.maybe_upload_to_gcs(output_dir, config.output_gcs_url)
    logging.getLogger().removeHandler(filehandler)
    method_execution_time = {
        'class_initialization': execution_timestamp - start_timestamp,
        'method_execution': upload_timestamp - execution_timestamp,
        'log_upload': time.time() - upload_timestamp
    }

    if config.profiler_enabled_time_str:
        relative_output_dir = output_dir[output_dir.find('benchmark'):]
        print('\nExecute the command below to start the TensorBoard server '
              'using the collected profiler data:\ntensorboard --logdir={}\n\n'
              'Open localhost:6006 in your browser to access the TensorBoard '
              'GUI. Use ssh with port forwarding if TensorBoard is running on '
              'a remote machine.\n'.format(relative_output_dir))

    queue.put((method_has_exception, method_execution_time,
               benchmark_result['succeeded'], output_dir))
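
Because _run_internal reports its result through the queue argument rather than a return value, the caller is expected to run it in a separate process. Below is a minimal, hypothetical sketch of such a caller; the wrapper name is made up, and harness_info, site_package_info and config are assumed to come from harness setup code not shown here:

import multiprocessing


def _run_in_subprocess(benchmark_method, harness_info, site_package_info,
                       root_output_dir, config):
    """Hypothetical wrapper: run one benchmark method in its own process."""
    queue = multiprocessing.Queue()
    process = multiprocessing.Process(
        target=_run_internal,
        args=(benchmark_method, harness_info, site_package_info,
              root_output_dir, config, queue))
    process.start()
    # _run_internal puts exactly one tuple on the queue before returning.
    (method_has_exception, method_execution_time,
     succeeded, output_dir) = queue.get()
    process.join()
    return method_has_exception, method_execution_time, succeeded, output_dir
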
Example 3
    def run_benchmark(self):
        """Run benchmark."""
        site_package_info = self._setup()
        has_exception = False
        benchmark_success_results = {}
        benchmark_output_dirs = {}

        for benchmark_method in self._get_benchmark_methods():
            start_timestamp = time.time()
            execution_timestamp = start_timestamp
            method_has_exception = False
            execution_id = datetime.datetime.now().strftime(
                '%Y-%m-%d-%H-%M-%S-%f')
            output_dir = os.path.join(self.root_output_dir, execution_id)
            utils.make_dir_if_not_exist(output_dir)
            benchmark_output_dirs[benchmark_method] = output_dir
            benchmark_class, benchmark_method_name = benchmark_method.rsplit(
                '.', 1)
            benchmark_class_name = benchmark_class.rsplit('.', 1)[1]

            tensorflow_profiler = TensorFlowProfiler(
                self.config.profiler_enabled_time_str, output_dir)
            process_info_tracker = ProcessInfoTracker(output_dir)
            process_info = None

            # Set up the per-method file logger
            filehandler = logging.FileHandler(
                filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
            filehandler.setFormatter(
                logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
            logging.getLogger().addHandler(filehandler)

            try:
                class_instance = self._instantiate_benchmark_class(
                    benchmark_class, output_dir, self.config.root_data_dir)
                # tf.test.Benchmark.report_benchmark() writes results to a file with
                # path benchmark_result_file_path_prefix + benchmark_method
                benchmark_result_file_path_prefix = os.path.join(
                    output_dir, 'proto_')
                os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix  # pylint: disable=line-too-long
                benchmark_result_file_path = '{}{}.{}'.format(
                    benchmark_result_file_path_prefix, benchmark_class_name,
                    benchmark_method_name)

                # Start background threads for profiler and system info tracker
                tensorflow_profiler.start()
                process_info_tracker.start()

                # Run benchmark method
                execution_timestamp = time.time()
                logging.info('Starting benchmark execution: %s',
                             benchmark_method)
                getattr(class_instance, benchmark_method_name)()
                logging.info('Stopped benchmark: %s', benchmark_method)

                # Read and build benchmark results
                raw_benchmark_result = utils.read_benchmark_result(
                    benchmark_result_file_path)
                # Explicitly overwrite the name to be the full path to benchmark method
                raw_benchmark_result['name'] = benchmark_method
            except Exception:  # pylint: disable=broad-except
                logging.error(
                    'Benchmark execution for %s failed due to error:\n %s',
                    benchmark_method, traceback.format_exc())
                method_has_exception = True
                has_exception = True
                raw_benchmark_result = {}
                raw_benchmark_result['name'] = benchmark_method
                raw_benchmark_result['wall_time'] = -1
                raw_benchmark_result['extras'] = {}
            finally:
                # Stop background threads for profiler and system info tracker
                process_info = process_info_tracker.stop()
                tensorflow_profiler.stop()

            upload_timestamp = time.time()
            benchmark_result = report_utils.build_benchmark_result(
                raw_benchmark_result, method_has_exception)
            benchmark_success_results[benchmark_method] = benchmark_result['succeeded']  # pylint: disable=line-too-long
            execution_summary = report_utils.build_execution_summary(
                execution_timestamp, execution_id,
                self.config.ml_framework_build_label,
                self.config.execution_label, self.config.platform_name,
                self.config.system_name, self.config.output_gcs_url,
                benchmark_result, self.config.get_env_vars(),
                self.config.get_flags(), site_package_info, process_info,
                method_has_exception)
            report_utils.upload_execution_summary(
                self.config.bigquery_project_name,
                self.config.bigquery_dataset_table_name, execution_summary)
            logging.info(
                'Benchmark execution for %s completed with summary:\n %s',
                benchmark_method, json.dumps(execution_summary, indent=2))
            utils.maybe_upload_to_gcs(output_dir, self.config.output_gcs_url)
            logging.getLogger().removeHandler(filehandler)
            self.benchmark_execution_time[benchmark_method] = {
                'class_initialization': execution_timestamp - start_timestamp,
                'method_execution': upload_timestamp - execution_timestamp,
                'log_upload': time.time() - upload_timestamp
            }

            if self.config.profiler_enabled_time_str:
                relative_output_dir = output_dir[output_dir.find('benchmark'):]
                print(
                    '\nExecute the command below to start the TensorBoard server '
                    'using the collected profiler data:\ntensorboard --logdir={}\n\n'
                    'Open localhost:6006 in your browser to access the TensorBoard '
                    'GUI. Use ssh with port forwarding if TensorBoard is running on '
                    'a remote machine.\n'.format(relative_output_dir))

        print('Benchmark execution time in seconds by operation:\n {}'.format(
            json.dumps(self.benchmark_execution_time, indent=2)))
        print('Benchmark success results:\n{}'.format(
            json.dumps(benchmark_success_results, indent=2)))
        print('Benchmark local output directories:\n{}'.format(
            json.dumps(benchmark_output_dirs, indent=2)))
        if has_exception:
            sys.exit(1)
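
The try block above relies on tf.test.Benchmark writing its report to TEST_REPORT_FILE_PREFIX plus the benchmark name. The class below is a hypothetical benchmark shown only to illustrate that naming convention; SampleBenchmark and its metric values are made up:

import tensorflow as tf


class SampleBenchmark(tf.test.Benchmark):
    """Hypothetical benchmark used only to illustrate the result-file naming."""

    def benchmark_noop(self):
        # With TEST_REPORT_FILE_PREFIX set to '<output_dir>/proto_',
        # report_benchmark() writes its results to
        # '<output_dir>/proto_SampleBenchmark.benchmark_noop'.
        self.report_benchmark(wall_time=0.1,
                              extras={'examples_per_second': 0.0})
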
Example 4
    def run_benchmark(self):
        """Run benchmark."""
        site_package_info = self._setup()
        has_exception = False
        benchmark_success_results = {}

        for benchmark_method in self._get_benchmark_methods():
            start_timestamp = time.time()
            method_has_exception = False
            execution_id = datetime.datetime.now().strftime(
                '%Y-%m-%d-%H-%M-%S-%f')
            execution_timestamp = time.time()
            output_dir = os.path.join(self.root_output_dir, execution_id)
            utils.make_dir_if_not_exist(output_dir)
            benchmark_class, benchmark_method_name = benchmark_method.rsplit(
                '.', 1)
            benchmark_class_name = benchmark_class.rsplit('.', 1)[1]

            # Set up the per-method file logger
            filehandler = logging.FileHandler(
                filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
            filehandler.setFormatter(
                logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
            logging.getLogger().addHandler(filehandler)

            try:
                class_instance = self._instantiate_benchmark_class(
                    benchmark_class, output_dir)
                # tf.test.Benchmark.report_benchmark() writes results to a file with
                # path benchmark_result_file_path_prefix + benchmark_method
                benchmark_result_file_path_prefix = os.path.join(
                    output_dir, 'proto_')
                os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix  # pylint: disable=line-too-long
                benchmark_result_file_path = '{}{}.{}'.format(
                    benchmark_result_file_path_prefix, benchmark_class_name,
                    benchmark_method_name)
                # Run benchmark method
                logging.info('Start benchmark: %s', benchmark_method)
                getattr(class_instance, benchmark_method_name)()
                logging.info('End benchmark: %s', benchmark_method)
                # Read and build benchmark results
                raw_benchmark_result = utils.read_benchmark_result(benchmark_result_file_path)  # pylint: disable=line-too-long
                # Explicitly overwrite the name to be the full path to benchmark method
                raw_benchmark_result['name'] = benchmark_method
            except Exception:  # pylint: disable=W0703
                logging.error(
                    'Benchmark execution for %s failed due to error:\n %s',
                    benchmark_method, traceback.format_exc())
                method_has_exception = True
                has_exception = True
                raw_benchmark_result = {}
                raw_benchmark_result['name'] = benchmark_method
                raw_benchmark_result['wall_time'] = -1
                raw_benchmark_result['extras'] = {}

            upload_timestamp = time.time()
            benchmark_result = report_utils.build_benchmark_result(
                raw_benchmark_result, method_has_exception)
            benchmark_success_results[benchmark_method] = benchmark_result['succeeded']  # pylint: disable=line-too-long
            execution_summary = report_utils.build_execution_summary(
                execution_timestamp, execution_id,
                self.config.ml_framework_build_label_str,
                self.config.execution_label_str, self.config.platform_name_str,
                self.config.system_name_str, self.config.output_gcs_url_str,
                benchmark_result, self.config.get_env_vars(),
                self.config.get_flags(), site_package_info,
                method_has_exception)
            report_utils.upload_execution_summary(
                self.config.bigquery_project_name_str,
                self.config.bigquery_dataset_table_name_str, execution_summary)
            logging.info(
                'Benchmark execution for %s completed with summary:\n %s',
                benchmark_method, json.dumps(execution_summary, indent=2))
            utils.maybe_upload_to_gcs(output_dir,
                                      self.config.output_gcs_url_str)
            logging.getLogger().removeHandler(filehandler)
            self.benchmark_execution_time[benchmark_method] = {
                'benchmark_time': upload_timestamp - start_timestamp,
                'upload_time': time.time() - upload_timestamp
            }

        print('Benchmark execution time in seconds by operation:\n {}'.format(
            json.dumps(self.benchmark_execution_time, indent=2)))
        print('Benchmark success results:\n{}'.format(
            json.dumps(benchmark_success_results, indent=2)))
        if has_exception:
            sys.exit(1)
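
For reference, the except blocks above define the minimal dict shape that utils.read_benchmark_result is expected to return ('name', 'wall_time', 'extras'). A stubbed reader honoring that contract, with made-up values, could look like this; the real helper parses the report file written by report_benchmark():

def read_benchmark_result_stub(benchmark_result_file_path):
    """Hypothetical stand-in showing the dict shape the callers above expect."""
    del benchmark_result_file_path  # A real reader would parse this file.
    return {
        'name': 'module.SampleBenchmark.benchmark_noop',
        'wall_time': 12.3,  # Seconds; the except blocks use -1 on failure.
        'extras': {'examples_per_second': 456.0},
    }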