Example #1
    def _setup(self):
        """Download data and checkout git repository."""

        # Activate gcloud service
        start_time = time.time()
        utils.setup_python_path(self.site_packages_dir,
                                self.config.python_path_str)
        utils.active_gcloud_service(self.config.gcloud_key_file_url, self.workspace_dir)  # pylint: disable=line-too-long
        utils.make_dir_if_not_exist(self.root_output_dir)
        self.benchmark_execution_time['activate_gcloud_service'] = time.time() - start_time  # pylint: disable=line-too-long

        # Download data
        start_time = time.time()
        utils.download_data(utils.parse_data_downloads_str(self.config.root_data_dir, self.config.gcs_downloads_str))  # pylint: disable=line-too-long
        utils.download_data(utils.parse_data_downloads_str(self.config.root_data_dir, self.config.data_downloads_str))  # pylint: disable=line-too-long
        self.benchmark_execution_time['download_data'] = time.time(
        ) - start_time

        # Checkout git repositories
        start_time = time.time()
        site_package_info = utils.checkout_git_repos(
            self.config.get_git_repos(self.site_packages_dir),
            self.config.force_update)
        self.benchmark_execution_time['checkout_repository'] = time.time() - start_time  # pylint: disable=line-too-long

        self.stream_handler = logging.StreamHandler(sys.stdout)
        self.stream_handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
        logging.getLogger().addHandler(self.stream_handler)
        return site_package_info
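The recurring pattern in these _setup variants is recording the wall-clock time of each setup stage in a dictionary. A minimal, self-contained sketch of that pattern (the sleeps are placeholders standing in for the real setup calls, not part of the original code):

import time

benchmark_execution_time = {}

# Stage 1: stand-in for activating the gcloud service.
start_time = time.time()
time.sleep(0.05)
benchmark_execution_time['activate_gcloud_service'] = time.time() - start_time

# Stage 2: stand-in for downloading data.
start_time = time.time()
time.sleep(0.1)
benchmark_execution_time['download_data'] = time.time() - start_time

print(benchmark_execution_time)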
Example #2
  def _setup(self):
    utils.setup_python_path(self.site_packages_dir, config.python_path_str)
    utils.active_gcloud_service(self.auth_token_path)
    utils.make_dir_if_not_exist(self.output_root_dir)

    self.streamHandler = logging.StreamHandler(sys.stdout)
    self.streamHandler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(self.streamHandler)
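Every example also attaches a stdout StreamHandler with the same timestamped format to the root logger. Distilled into a standalone sketch (setting the root level is extra here so the demo actually prints; the originals rely on the logger's existing configuration):

import logging
import sys

def attach_stdout_logger():
    """Attach a stdout handler with the timestamp/level format used above."""
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)
    logging.getLogger().setLevel(logging.INFO)
    return handler

attach_stdout_logger()
logging.info('stdout logging is now configured')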
Example #3
    def _setup(self):
        """Download data and checkout git repository."""

        # Activate gcloud service
        start_time = time.time()
        utils.setup_python_path(self.site_packages_dir,
                                self.config.python_path_str)
        utils.active_gcloud_service(self.config.gcloud_key_file_url,
                                    self.workspace_dir)
        utils.make_dir_if_not_exist(self.root_output_dir)
        self.benchmark_execution_time['activate_gcloud_service'] = (
            time.time() - start_time)

        # Download data
        start_time = time.time()
        utils.download_data(
            utils.parse_data_downloads_str(self.config.root_data_dir,
                                           self.config.gcs_downloads_str))
        utils.download_data(
            utils.parse_data_downloads_str(self.config.root_data_dir,
                                           self.config.data_downloads_str))
        self.benchmark_execution_time['download_data'] = time.time(
        ) - start_time

        # Checkout git repositories
        start_time = time.time()
        site_package_info = utils.checkout_git_repos(
            self.config.get_git_repos(self.site_packages_dir),
            self.config.use_cached_site_packages)
        self.benchmark_execution_time['checkout_repository'] = (time.time() -
                                                                start_time)

        # Start cloud TPU.
        if self.config.tpu_parameters is not None:
            start_time = time.time()
            utils.setup_tpu(self.config.tpu_parameters)
            tpu_info = tpu_runtime_utils.configure_tpu(
                self.config.tpu_parameters)
            site_package_info['tpu_version'] = tpu_info
            self.benchmark_execution_time['start_tpu'] = time.time(
            ) - start_time

        self.stream_handler = logging.StreamHandler(sys.stdout)
        self.stream_handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
        logging.getLogger().addHandler(self.stream_handler)
        return site_package_info
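The repeated start_time = time.time() / elapsed bookkeeping above could also be expressed with a small context manager. A possible refactoring, not part of the original code:

import contextlib
import time

benchmark_execution_time = {}

@contextlib.contextmanager
def record_time(key):
    """Record the elapsed wall time of the with-block under the given key."""
    start_time = time.time()
    try:
        yield
    finally:
        benchmark_execution_time[key] = time.time() - start_time

# Usage equivalent to the explicit start_time bookkeeping above.
with record_time('download_data'):
    time.sleep(0.1)  # stand-in for utils.download_data(...)
print(benchmark_execution_time)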
Example #4
    def _setup(self):
        """Download data and checkout git repository."""
        # Set up the raid array.
        start_time = time.time()
        device_utils.create_drive_from_devices('/data',
                                               self.config.gce_nvme_raid_str)
        self.benchmark_execution_time['create_drive'] = time.time(
        ) - start_time

        start_time = time.time()
        utils.download_from_gcs([{
            'gcs_url':
            'gs://tf-performance/auth_tokens',
            'local_path':
            os.path.join(self.workspace_dir, 'auth_tokens')
        }])
        self.benchmark_execution_time['download_token'] = time.time(
        ) - start_time

        # Activate gcloud service
        start_time = time.time()
        utils.setup_python_path(self.site_packages_dir,
                                self.config.python_path_str)
        utils.active_gcloud_service(self.auth_token_path)
        utils.make_dir_if_not_exist(self.output_root_dir)
        self.benchmark_execution_time['activate_gcloud_service'] = time.time(
        ) - start_time

        # Download data
        start_time = time.time()
        utils.download_from_gcs(self.config.get_gcs_downloads('/data'))
        self.benchmark_execution_time['download_data'] = time.time(
        ) - start_time

        # Checkout git repositories
        start_time = time.time()
        site_package_info = utils.checkout_git_repos(
            self.config.get_git_repos(self.site_packages_dir),
            self.config.force_update)
        self.benchmark_execution_time['checkout_repository'] = time.time(
        ) - start_time

        self.stream_handler = logging.StreamHandler(sys.stdout)
        self.stream_handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
        logging.getLogger().addHandler(self.stream_handler)
        return site_package_info
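Example #4 passes utils.download_from_gcs a list of dicts with 'gcs_url' and 'local_path' keys. A sketch of building such a spec list; the bucket paths are placeholders and the helper's exact behavior is assumed from the call sites above:

import os

def build_gcs_download_specs(workspace_dir, gcs_urls):
    """Map each GCS URL to a local path under workspace_dir (illustrative helper)."""
    specs = []
    for gcs_url in gcs_urls:
        local_path = os.path.join(workspace_dir, os.path.basename(gcs_url))
        specs.append({'gcs_url': gcs_url, 'local_path': local_path})
    return specs

specs = build_gcs_download_specs(
    '/tmp/workspace', ['gs://my-bucket/auth_tokens', 'gs://my-bucket/data'])
print(specs)
# Each entry could then be handed to a downloader like utils.download_from_gcs(specs).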
Example #5
def _start_profiler(output_dir):
  """Start profiler.

  Args:
    output_dir: log directory to place the profiler data
  """
  import tensorflow as tf  # pylint: disable=g-import-not-at-top

  profiler_data_dir = os.path.join(output_dir, 'profiler_data')
  utils.make_dir_if_not_exist(profiler_data_dir)
  logging.info('Starting TensorFlow profiler and saving data to dir %s',
                 profiler_data_dir)
  try:
    tf.profiler.experimental.start(profiler_data_dir)
    logging.info('Started TensorFlow profiler')
  except Exception:  # pylint: disable=broad-except
    logging.error('TensorFlow profiler failed to start due to error:\n %s',
                  traceback.format_exc())
Example #6
def _stop_and_save_profiler(output_dir):
    """Stop profiler and save profiler data.

    Args:
      output_dir: log directory to place the profiler data
    """

    from tensorflow.python.eager import profiler  # pylint: disable=g-import-not-at-top

    try:
        profiler_data_dir = os.path.join(output_dir, 'profiler_data')
        logging.info('Stopping Tensorflow profiler and saving data to dir %s', profiler_data_dir)  # pylint: disable=line-too-long
        utils.make_dir_if_not_exist(profiler_data_dir)
        result = profiler.stop()
        with open(os.path.join(profiler_data_dir, 'local.trace'), 'wb') as f:
            f.write(result)
        logging.info('Stopped Tensorflow profiler.')
    except Exception:  # pylint: disable=W0703
        logging.error('Tensorflow profiler failed to stop due to error:\n %s',
                      traceback.format_exc())
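The first snippet uses the public tf.profiler.experimental API, while the second relies on the older tensorflow.python.eager.profiler module. A minimal sketch pairing start and stop with the public API (the directory name and workload are placeholders, and this assumes TensorFlow 2.x is installed):

import tensorflow as tf

profiler_data_dir = '/tmp/profiler_data'  # placeholder log directory
tf.profiler.experimental.start(profiler_data_dir)
try:
    # Stand-in for the benchmark workload being profiled.
    tf.reduce_sum(tf.random.uniform([1024, 1024]))
finally:
    # Writes the collected trace under profiler_data_dir for TensorBoard.
    tf.profiler.experimental.stop()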
Example #7
  def run_benchmark(self):
    """Run benchmark."""
    for benchmark_method in self._get_benchmark_methods():
      try:
        execution_id = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
        output_dir = os.path.join(self.output_root_dir, execution_id)
        utils.make_dir_if_not_exist(output_dir)

        # Setup per-method file logger
        filehandler = logging.FileHandler(
            filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
        filehandler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
        logging.getLogger().addHandler(filehandler)

        class_instance = self._instantiate_benchmark_class(output_dir)
        benchmark_name = '{}.{}'.format(class_instance.__class__.__name__,
                                        benchmark_method)

        # tf.test.Benchmark.report_benchmark() will write benchmark results to
        # the file whose path is benchmark_result_file_path_prefix +
        # benchmark_name
        benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
        os.environ[
            'TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix
        benchmark_result_file_path = benchmark_result_file_path_prefix + benchmark_name

        # Run benchmark method
        logging.info('Start benchmark: %s', benchmark_name)
        getattr(class_instance, benchmark_method)()
        logging.info('End benchmark: %s', benchmark_name)
        # Read and upload benchmark results
        benchmark_result = utils.read_benchmark_result(
            benchmark_result_file_path)
        self._upload_execution_summary(benchmark_result, execution_id,
                                       output_dir)

      finally:
        logging.getLogger().removeHandler(filehandler)
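The per-method file logger used above boils down to attaching a FileHandler for the duration of one benchmark run and always detaching it afterwards. A distilled, self-contained sketch:

import logging
import os
import tempfile

def run_with_method_logger(output_dir, method):
    """Attach a per-run perfzero.log handler, run method(), then detach the handler."""
    filehandler = logging.FileHandler(
        filename=os.path.join(output_dir, 'perfzero.log'), mode='w')
    filehandler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(filehandler)
    try:
        method()
    finally:
        logging.getLogger().removeHandler(filehandler)
        filehandler.close()

run_with_method_logger(tempfile.mkdtemp(), lambda: logging.warning('hello'))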
Example #8
def _run_internal(benchmark_method, harness_info, site_package_info,
                  root_output_dir, config, queue):
    """Run benchmark method and put result to the queue.

    Args:
      benchmark_method: Canonical path to the benchmark method
      harness_info: Description of the benchmark harness used in the benchmark
      site_package_info: Description of the site-package used in the benchmark
      root_output_dir: Directory under which to put the benchmark output
      config: An instance of perfzero_config
      queue: An interprocess queue to transfer benchmark result to the caller
    """
    start_timestamp = time.time()
    execution_timestamp = start_timestamp
    method_has_exception = False
    execution_id = (config.execution_id if config.execution_id else
                    datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f'))
    output_dir = os.path.join(root_output_dir, execution_id)
    if config.scratch_gcs_url:
        model_output_dir = os.path.join(config.scratch_gcs_url, execution_id)
    else:
        model_output_dir = output_dir
    utils.make_dir_if_not_exist(output_dir)
    benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
    benchmark_class_name = benchmark_class.rsplit('.', 1)[1]

    tensorflow_profiler = TensorFlowProfiler(config.profiler_enabled_time_str,
                                             output_dir)
    process_info_tracker = ProcessInfoTracker(output_dir)
    process_info = None

    # Setup per-method file logger
    filehandler = logging.FileHandler(filename=os.path.join(
        output_dir, 'perfzero.log'),
                                      mode='w')
    filehandler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
    logging.getLogger().addHandler(filehandler)

    try:
        if config.tpu_parameters:
            tpu = config.tpu_parameters.get('name')
        else:
            tpu = None
        if config.perfzero_constructor_args:
            constructor_args = json.loads(config.perfzero_constructor_args)
        else:
            constructor_args = {}
        class_instance = utils.instantiate_benchmark_class(
            benchmark_class=benchmark_class,
            output_dir=model_output_dir,
            root_data_dir=config.root_data_dir,
            tpu=tpu,
            constructor_args=constructor_args)
        # tf.test.Benchmark.report_benchmark() writes results to a file with
        # path benchmark_result_file_path_prefix + benchmark_method
        benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
        os.environ[
            'TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix
        benchmark_result_file_path = '{}{}.{}'.format(
            benchmark_result_file_path_prefix, benchmark_class_name,
            benchmark_method_name)

        # Start background threads for profiler and system info tracker
        tensorflow_profiler.start()
        process_info_tracker.start()

        # Run benchmark method
        execution_timestamp = time.time()
        logging.info('Starting benchmark execution: %s', benchmark_method)
        getattr(class_instance, benchmark_method_name)()
        logging.info('Stopped benchmark: %s', benchmark_method)

        # Read and build benchmark results
        raw_benchmark_result = utils.read_benchmark_result(
            benchmark_result_file_path)
        # Explicitly overwrite the name to be the full path to benchmark method
        raw_benchmark_result['name'] = benchmark_method
    except Exception:  # pylint: disable=broad-except
        logging.error('Benchmark execution for %s failed due to error:\n %s',
                      benchmark_method, traceback.format_exc())
        method_has_exception = True
        raw_benchmark_result = {}
        raw_benchmark_result['name'] = benchmark_method
        raw_benchmark_result['wall_time'] = -1
        raw_benchmark_result['extras'] = {}
    finally:
        # Stop background threads for profiler and system info tracker
        process_info = process_info_tracker.stop()
        tensorflow_profiler.stop()

    upload_timestamp = time.time()
    benchmark_result = report_utils.build_benchmark_result(
        raw_benchmark_result, method_has_exception)
    execution_summary = report_utils.build_execution_summary(
        execution_timestamp, execution_id, config.ml_framework_build_label,
        config.execution_label, config.platform_name, config.system_name,
        config.output_gcs_url, benchmark_result, config.get_env_vars(),
        config.get_flags(), harness_info, site_package_info, process_info,
        method_has_exception)
    report_utils.upload_execution_summary(config.bigquery_project_name,
                                          config.bigquery_dataset_table_name,
                                          execution_summary)
    report_utils.execute_methods(config.result_upload_methods,
                                 execution_summary)
    logging.info('Benchmark execution for %s completed with summary:\n %s',
                 benchmark_method, json.dumps(execution_summary, indent=2))
    _set_file_contents(json.dumps(execution_summary, indent=2),
                       os.path.join(output_dir, 'perfzero_summary.json'))
    utils.maybe_upload_to_gcs(output_dir, config.output_gcs_url)
    logging.getLogger().removeHandler(filehandler)
    method_execution_time = {
        'class_initialization': execution_timestamp - start_timestamp,
        'method_execution': upload_timestamp - execution_timestamp,
        'log_upload': time.time() - upload_timestamp
    }

    if config.profiler_enabled_time_str:
        relative_output_dir = output_dir[output_dir.find('benchmark'):]
        print('\nExecute the command below to start tensorboard server using '
              'the collected profiler data:\ntensorboard --logdir={}\n\n'
              'Open localhost:6006 in your browser to access the TensorBoard '
              'GUI. Use ssh with port forwarding if tensorboard is running on '
              'a remote machine.\n'.format(relative_output_dir))

    queue.put((method_has_exception, method_execution_time,
               benchmark_result['succeeded'], output_dir))
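Since _run_internal reports its result through a queue, the caller presumably runs it in a child process. A hedged sketch of such a driver, with a stub worker standing in for _run_internal and its configuration objects:

import multiprocessing

def _worker(benchmark_method, queue):
    """Stub following the same reporting convention as _run_internal."""
    method_has_exception = False
    method_execution_time = {'method_execution': 0.0}
    succeeded = True
    output_dir = '/tmp/output'
    queue.put((method_has_exception, method_execution_time, succeeded, output_dir))

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    process = multiprocessing.Process(
        target=_worker, args=('FooBenchmark.benchmark_bar', queue))
    process.start()
    has_exception, times, succeeded, output_dir = queue.get()
    process.join()
    print(has_exception, times, succeeded, output_dir)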
Example #9
    def run_benchmark(self):
        """Run benchmark."""
        site_package_info = self._setup()
        has_exception = False
        benchmark_success_results = {}
        benchmark_output_dirs = {}

        for benchmark_method in self._get_benchmark_methods():
            start_timestamp = time.time()
            execution_timestamp = start_timestamp
            method_has_exception = False
            execution_id = datetime.datetime.now().strftime(
                '%Y-%m-%d-%H-%M-%S-%f')
            output_dir = os.path.join(self.root_output_dir, execution_id)
            utils.make_dir_if_not_exist(output_dir)
            benchmark_output_dirs[benchmark_method] = output_dir
            benchmark_class, benchmark_method_name = benchmark_method.rsplit(
                '.', 1)
            benchmark_class_name = benchmark_class.rsplit('.', 1)[1]

            tensorflow_profiler = TensorFlowProfiler(
                self.config.profiler_enabled_time_str, output_dir)
            process_info_tracker = ProcessInfoTracker(output_dir)
            process_info = None

            # Setup per-method file logger
            filehandler = logging.FileHandler(filename=os.path.join(
                output_dir, 'perfzero.log'),
                                              mode='w')
            filehandler.setFormatter(
                logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
            logging.getLogger().addHandler(filehandler)

            try:
                class_instance = self._instantiate_benchmark_class(
                    benchmark_class, output_dir, self.config.root_data_dir)
                # tf.test.Benchmark.report_benchmark() writes results to a file with
                # path benchmark_result_file_path_prefix + benchmark_method
                benchmark_result_file_path_prefix = os.path.join(
                    output_dir, 'proto_')
                os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix  # pylint: disable=line-too-long
                benchmark_result_file_path = '{}{}.{}'.format(
                    benchmark_result_file_path_prefix, benchmark_class_name,
                    benchmark_method_name)

                # Start background threads for profiler and system info tracker
                tensorflow_profiler.start()
                process_info_tracker.start()

                # Run benchmark method
                execution_timestamp = time.time()
                logging.info('Starting benchmark execution: %s',
                             benchmark_method)
                getattr(class_instance, benchmark_method_name)()
                logging.info('Stopped benchmark: %s', benchmark_method)

                # Read and build benchmark results
                raw_benchmark_result = utils.read_benchmark_result(
                    benchmark_result_file_path)
                # Explicitly overwrite the name to be the full path to benchmark method
                raw_benchmark_result['name'] = benchmark_method
            except Exception:  # pylint: disable=broad-except
                logging.error(
                    'Benchmark execution for %s failed due to error:\n %s',
                    benchmark_method, traceback.format_exc())
                method_has_exception = True
                has_exception = True
                raw_benchmark_result = {}
                raw_benchmark_result['name'] = benchmark_method
                raw_benchmark_result['wall_time'] = -1
                raw_benchmark_result['extras'] = {}
            finally:
                # Stop background threads for profiler and system info tracker
                process_info = process_info_tracker.stop()
                tensorflow_profiler.stop()

            upload_timestamp = time.time()
            benchmark_result = report_utils.build_benchmark_result(
                raw_benchmark_result, method_has_exception)
            benchmark_success_results[benchmark_method] = benchmark_result['succeeded']  # pylint: disable=line-too-long
            execution_summary = report_utils.build_execution_summary(
                execution_timestamp, execution_id,
                self.config.ml_framework_build_label,
                self.config.execution_label, self.config.platform_name,
                self.config.system_name, self.config.output_gcs_url,
                benchmark_result, self.config.get_env_vars(),
                self.config.get_flags(), site_package_info, process_info,
                method_has_exception)
            report_utils.upload_execution_summary(
                self.config.bigquery_project_name,
                self.config.bigquery_dataset_table_name, execution_summary)
            logging.info(
                'Benchmark execution for %s completed with summary:\n %s',
                benchmark_method, json.dumps(execution_summary, indent=2))
            utils.maybe_upload_to_gcs(output_dir, self.config.output_gcs_url)
            logging.getLogger().removeHandler(filehandler)
            self.benchmark_execution_time[benchmark_method] = {
                'class_initialization': execution_timestamp - start_timestamp,
                'method_execution': upload_timestamp - execution_timestamp,
                'log_upload': time.time() - upload_timestamp
            }

            if self.config.profiler_enabled_time_str:
                relative_output_dir = output_dir[output_dir.find('benchmark'):]
                print(
                    '\nExecute the command below to start tensorboard server using '
                    'the collected profiler data:\ntensorboard --logdir={}\n\n'
                    'Open localhost:6006 in your browser to access the TensorBoard '
                    'GUI. Use ssh with port forwarding if tensorboard is running on '
                    'a remote machine.\n'.format(relative_output_dir))

        print('Benchmark execution time in seconds by operation:\n {}'.format(
            json.dumps(self.benchmark_execution_time, indent=2)))
        print('Benchmark success results:\n{}'.format(
            json.dumps(benchmark_success_results, indent=2)))
        print('Benchmark local output directories:\n{}'.format(
            json.dumps(benchmark_output_dirs, indent=2)))
        if has_exception:
            sys.exit(1)
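How the canonical method path is split and how the result file path is composed, as a small worked example (the method name and output directory here are hypothetical):

import os

benchmark_method = 'official.benchmark.FooBenchmark.benchmark_1_gpu'
benchmark_class, benchmark_method_name = benchmark_method.rsplit('.', 1)
benchmark_class_name = benchmark_class.rsplit('.', 1)[1]
# benchmark_class == 'official.benchmark.FooBenchmark'
# benchmark_class_name == 'FooBenchmark', benchmark_method_name == 'benchmark_1_gpu'

output_dir = '/tmp/output/2021-01-01-00-00-00-000000'
benchmark_result_file_path_prefix = os.path.join(output_dir, 'proto_')
benchmark_result_file_path = '{}{}.{}'.format(
    benchmark_result_file_path_prefix, benchmark_class_name,
    benchmark_method_name)
# -> /tmp/output/2021-01-01-00-00-00-000000/proto_FooBenchmark.benchmark_1_gpu
print(benchmark_result_file_path)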
Example #10
    def run_benchmark(self):
        """Run benchmark."""
        site_package_info = self._setup()
        has_exception = False
        benchmark_success_results = {}

        for benchmark_method in self._get_benchmark_methods():
            start_timestamp = time.time()
            method_has_exception = False
            execution_id = datetime.datetime.now().strftime(
                '%Y-%m-%d-%H-%M-%S-%f')
            execution_timestamp = time.time()
            output_dir = os.path.join(self.root_output_dir, execution_id)
            utils.make_dir_if_not_exist(output_dir)
            benchmark_class, benchmark_method_name = benchmark_method.rsplit(
                '.', 1)
            benchmark_class_name = benchmark_class.rsplit('.', 1)[1]

            # Setup per-method file logger
            filehandler = logging.FileHandler(filename=os.path.join(
                output_dir, 'perfzero.log'),
                                              mode='w')
            filehandler.setFormatter(
                logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
            logging.getLogger().addHandler(filehandler)

            try:
                class_instance = self._instantiate_benchmark_class(
                    benchmark_class, output_dir)
                # tf.test.Benchmark.report_benchmark() writes results to a file with
                # path benchmark_result_file_path_prefix + benchmark_method
                benchmark_result_file_path_prefix = os.path.join(
                    output_dir, 'proto_')
                os.environ['TEST_REPORT_FILE_PREFIX'] = benchmark_result_file_path_prefix  # pylint: disable=line-too-long
                benchmark_result_file_path = '{}{}.{}'.format(
                    benchmark_result_file_path_prefix, benchmark_class_name,
                    benchmark_method_name)
                # Run benchmark method
                logging.info('Start benchmark: %s', benchmark_method)
                getattr(class_instance, benchmark_method_name)()
                logging.info('End benchmark: %s', benchmark_method)
                # Read and build benchmark results
                raw_benchmark_result = utils.read_benchmark_result(benchmark_result_file_path)  # pylint: disable=line-too-long
                # Explicitly overwrite the name to be the full path to benchmark method
                raw_benchmark_result['name'] = benchmark_method
            except Exception:  # pylint: disable=W0703
                logging.error(
                    'Benchmark execution for %s failed due to error:\n %s',
                    benchmark_method, traceback.format_exc())
                method_has_exception = True
                has_exception = True
                raw_benchmark_result = {}
                raw_benchmark_result['name'] = benchmark_method
                raw_benchmark_result['wall_time'] = -1
                raw_benchmark_result['extras'] = {}

            upload_timestamp = time.time()
            benchmark_result = report_utils.build_benchmark_result(
                raw_benchmark_result, method_has_exception)
            benchmark_success_results[benchmark_method] = benchmark_result['succeeded']  # pylint: disable=line-too-long
            execution_summary = report_utils.build_execution_summary(
                execution_timestamp, execution_id,
                self.config.ml_framework_build_label_str,
                self.config.execution_label_str, self.config.platform_name_str,
                self.config.system_name_str, self.config.output_gcs_url_str,
                benchmark_result, self.config.get_env_vars(),
                self.config.get_flags(), site_package_info,
                method_has_exception)
            report_utils.upload_execution_summary(
                self.config.bigquery_project_name_str,
                self.config.bigquery_dataset_table_name_str, execution_summary)
            logging.info(
                'Benchmark execution for %s completed with summary:\n %s',
                benchmark_method, json.dumps(execution_summary, indent=2))
            utils.maybe_upload_to_gcs(output_dir,
                                      self.config.output_gcs_url_str)
            logging.getLogger().removeHandler(filehandler)
            self.benchmark_execution_time[benchmark_method] = {}
            self.benchmark_execution_time[benchmark_method]['benchmark_time'] = upload_timestamp - start_timestamp  # pylint: disable=line-too-long
            self.benchmark_execution_time[benchmark_method]['upload_time'] = time.time() - upload_timestamp  # pylint: disable=line-too-long

        print('Benchmark execution time in seconds by operation:\n {}'.format(
            json.dumps(self.benchmark_execution_time, indent=2)))
        print('benchmark success results:\n{}'.format(
            json.dumps(benchmark_success_results, indent=2)))
        if has_exception:
            sys.exit(1)
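For reference, the benchmark_execution_time dict printed at the end of Example #10 holds one entry per benchmark method with 'benchmark_time' and 'upload_time' keys. Illustrative output with made-up numbers:

import json

# Hypothetical values, purely to illustrate the structure being printed.
benchmark_execution_time = {
    'official.benchmark.FooBenchmark.benchmark_1_gpu': {
        'benchmark_time': 123.4,
        'upload_time': 2.1
    }
}
print('Benchmark execution time in seconds by operation:\n {}'.format(
    json.dumps(benchmark_execution_time, indent=2)))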