Esempio n. 1
0
 def run_with_xprof(self, enable_python_trace, run_benchmark, func,
                    num_iters_xprof, execution_mode, suid):
     """Run a benchmark inside a profiler session and report its timing.

     Args:
         enable_python_trace: when truthy, the profiler is created with
             ``python_tracer_level=1`` and the log directory name gets a
             ``_with_python`` suffix; otherwise level 0 and no suffix.
         run_benchmark: callable invoked as
             ``run_benchmark(func, num_iters_xprof, execution_mode)``; its
             return value is treated as the total elapsed time
             (presumably in seconds, given the 1e6 scaling below).
         func: forwarded to ``run_benchmark``.
         num_iters_xprof: iteration count, also used as the divisor.
         execution_mode: forwarded to ``run_benchmark``.
         suid: string used to name the log directory under
             ``flags.FLAGS.logdir``.

     Returns:
         A ``(logdir, us_per_example)`` tuple, the latter rounded to three
         decimal places.
     """
     tracer_level = 1 if enable_python_trace else 0
     options = profiler.ProfilerOptions(python_tracer_level=tracer_level)
     logdir = os.path.join(
         flags.FLAGS.logdir,
         suid + "_with_python" if enable_python_trace else suid)

     with profiler.Profile(logdir, options):
         total_time = run_benchmark(func, num_iters_xprof, execution_mode)

     # Round-trip through a formatted string to keep three decimals.
     per_iteration = total_time * 1e6 / num_iters_xprof
     us_per_example = float("{0:.3f}".format(per_iteration))
     return logdir, us_per_example
Esempio n. 2
0
    def test_single_worker_sampling_mode(self):
        """Test single worker sampling mode.

        A background thread starts a profiler server and trains a model,
        while this thread requests a trace from that server and then checks
        the log directory with ``_check_tools_pb_exist``.
        """
        def run_worker(server_port):
            # Start the profiler server, then train on this worker thread.
            logging.info('worker starting server on {}'.format(server_port))
            profiler.start_server(server_port)
            _, steps, train_ds, model = _model_setup()
            model.fit(x=train_ds, epochs=2, steps_per_epoch=steps)

        port = portpicker.pick_unused_port()
        worker = threading.Thread(target=run_worker, args=(port, ))
        worker.start()

        logdir = self.get_temp_dir()
        trace_options = profiler.ProfilerOptions(
            host_tracer_level=2,
            python_tracer_level=0,
            device_tracer_level=1,
        )

        # Request for 3 seconds of profile.
        address = 'localhost:{}'.format(port)
        profiler_client.trace(address, logdir, 3000, '', 3, trace_options)

        # Wait at most 30 seconds for the worker thread to finish.
        worker.join(30)
        self._check_tools_pb_exist(logdir)
Esempio n. 3
0
def collect_profile(port: int, duration_in_ms: int, host: str,
                    log_dir: Optional[str], host_tracer_level: int,
                    device_tracer_level: int, python_tracer_level: int,
                    no_perfetto_link: bool):
    """Capture a remote profile and convert it to a gzipped trace JSON.

    Args:
        port: port of the profiler server to connect to.
        duration_in_ms: duration passed to ``profiler_client.trace``.
        host: host name of the profiler server.
        log_dir: directory for the profile output; a fresh temporary
            directory is created when this is ``None``.
        host_tracer_level: forwarded to ``ProfilerOptions``.
        device_tracer_level: forwarded to ``ProfilerOptions``.
        python_tracer_level: forwarded to ``ProfilerOptions``.
        no_perfetto_link: when falsy, the log directory is handed to
            ``jax._src.profiler._host_perfetto_trace_file`` at the end.

    Raises:
        FileNotFoundError: if the profile output directory is missing, is
            empty, or the newest trace folder has no ``*.xplane.pb`` file.
    """
    options = profiler.ProfilerOptions(
        host_tracer_level=host_tracer_level,
        device_tracer_level=device_tracer_level,
        python_tracer_level=python_tracer_level,
    )
    log_dir_ = pathlib.Path(
        log_dir if log_dir is not None else tempfile.mkdtemp())
    profiler_client.trace(f"{host}:{port}",
                          str(log_dir_),
                          duration_in_ms,
                          options=options)
    print(f"Dumped profiling information in: {log_dir_}")
    # The profiler dumps `xplane.pb` to the logging directory. To upload it to
    # the Perfetto trace viewer, we need to convert it to a `trace.json` file.
    # We do this by first finding the `xplane.pb` file, then passing it into
    # tensorflow_profile_plugin's `xplane` conversion function.
    root_trace_folder = log_dir_.resolve() / "plugins" / "profile"
    # `iterdir()` already yields paths joined onto `root_trace_folder`.
    trace_folders = list(root_trace_folder.iterdir())
    if not trace_folders:
        raise FileNotFoundError(
            f"No trace folders found in: {root_trace_folder}")
    latest_folder = max(trace_folders, key=os.path.getmtime)
    # Give `next` a default so an empty glob surfaces as a clear error
    # instead of a bare StopIteration.
    xplane = next(latest_folder.glob("*.xplane.pb"), None)
    if xplane is None:
        raise FileNotFoundError(
            f"No *.xplane.pb file found in: {latest_folder}")
    result = convert.xspace_to_tool_data([xplane], "trace_viewer^", None)

    with gzip.open(str(latest_folder / "remote.trace.json.gz"), "wb") as fp:
        fp.write(result.encode("utf-8"))

    if not no_perfetto_link:
        jax._src.profiler._host_perfetto_trace_file(str(log_dir_))
Esempio n. 4
0
    def on_profile(port, logdir):
      """Request a short trace from the profiler server on localhost.

      Args:
        port: port of the profiler server to contact.
        logdir: directory handed to ``profiler_client.trace`` for output.
      """
      trace_options = profiler.ProfilerOptions(
          host_tracer_level=2,
          python_tracer_level=0,
          device_tracer_level=1,
      )

      # Request for 30 milliseconds of profile.
      address = 'localhost:{}'.format(port)
      profiler_client.trace(address, logdir, 30, '', 100, trace_options)
Esempio n. 5
0
def run_with_xprof(self,
                   func,
                   num_iters_xprof=100,
                   enable_python_trace=True,
                   logdir='/tmp/layer_benchmark_xprof/'):
    """Call ``func`` repeatedly inside a profiler session and time it.

    Args:
        func: zero-argument callable to benchmark.
        num_iters_xprof: number of times ``func`` is called; also the
            divisor for the per-call time.
        enable_python_trace: when truthy, the profiler is created with
            ``python_tracer_level=1`` and the run directory name gets a
            ``_with_python`` suffix; otherwise level 0 and no suffix.
        logdir: parent directory; a unique sub-directory is created per run.

    Returns:
        A ``(logdir, us_per_example)`` tuple: the run directory and the
        elapsed microseconds per call, rounded to three decimal places.
    """
    # One unique id per run, shared by both branches.
    suid = str(uuid.uuid4())
    if enable_python_trace:
        options = profiler.ProfilerOptions(python_tracer_level=1)
        logdir = os.path.join(logdir, suid + "_with_python")
    else:
        options = profiler.ProfilerOptions(python_tracer_level=0)
        logdir = os.path.join(logdir, suid)

    # perf_counter is monotonic, so the interval cannot be skewed by system
    # clock adjustments. The interval spans entering and leaving the
    # profiler context as well as the calls themselves.
    start = time.perf_counter()
    with profiler.Profile(logdir, options):
        for _ in range(num_iters_xprof):
            func()
    total_time = time.perf_counter() - start
    us_per_example = float("{0:.3f}".format(total_time * 1e6 /
                                            num_iters_xprof))
    return logdir, us_per_example
Esempio n. 6
0
    def test_context_manager_with_options(self):
        """Profile a traced multiplication and check the output file count.

        Runs ``3 * 5`` inside a named trace under ``profiler.Profile`` with
        explicit options, then expects exactly two entries in the log
        directory.
        """
        logdir = self.get_temp_dir()
        profile_options = profiler.ProfilerOptions(host_tracer_level=3,
                                                   python_tracer_level=1)

        with profiler.Profile(logdir, profile_options):
            with trace.Trace('three_times_five'):
                lhs = constant_op.constant(3)
                rhs = constant_op.constant(5)
                result = lhs * rhs
            self.assertAllEqual(15, result)

        entries = gfile.ListDirectory(logdir)
        self.assertEqual(len(entries), 2)
Esempio n. 7
0
        def on_profile(port, logdir, worker_start):
            """Wait for the worker, request a trace, then flag completion.

            Args:
                port: port of the profiler server on localhost.
                logdir: directory handed to ``profiler_client.trace``.
                worker_start: object whose ``wait()`` blocks until the
                    worker has started.
            """
            worker_start.wait()

            # `delay_ms` comes from the enclosing scope.
            trace_options = tf_profiler.ProfilerOptions(
                host_tracer_level=2,
                python_tracer_level=2,
                device_tracer_level=1,
                delay_ms=delay_ms,
            )

            # Request for 1000 milliseconds of profile.
            address = 'localhost:{}'.format(port)
            profiler_client.trace(address, logdir, 1000, '', 1000,
                                  trace_options)
            self.profile_done = True
Esempio n. 8
0
    def test_single_worker_programmatic_mode(self):
        """Test single worker programmatic mode.

        Starts the profiler explicitly, trains a model for two epochs, stops
        the profiler and checks the log directory with
        ``_check_tools_pb_exist``.
        """
        logdir = self.get_temp_dir()

        options = profiler.ProfilerOptions(
            host_tracer_level=2,
            python_tracer_level=0,
            device_tracer_level=1,
        )
        profiler.start(logdir, options)
        try:
            _, steps, train_ds, model = _model_setup()
            model.fit(x=train_ds, epochs=2, steps_per_epoch=steps)
        finally:
            # Stop the session even if setup or training raises, so a
            # failure here does not leave the profiler running.
            profiler.stop()
        self._check_tools_pb_exist(logdir)
Esempio n. 9
0
 def testTrace_ProfileIdleServerWithOptions(self):
     """Tracing an idle profiler server raises UnavailableError."""
     port = portpicker.pick_unused_port()
     profiler.start_server(port)
     # Test the profilers are successfully started and connected to profiler
     # service on the worker. Since there is no op running, it is expected to
     # return UnavailableError with no trace events collected string.
     with self.assertRaises(errors.UnavailableError) as raised:
         trace_options = profiler.ProfilerOptions(host_tracer_level=3,
                                                  device_tracer_level=0)
         profiler_client.trace(f'localhost:{port}',
                               self.get_temp_dir(),
                               duration_ms=10,
                               options=trace_options)
     self.assertEqual('No trace event is collected', str(raised.exception))
Esempio n. 10
0
 def trace(self):
     """Repeatedly request traces while ``self.alive`` is truthy.

     Output goes to a ``logs`` directory under ``env['dir']``, created if
     missing. Each request is made while holding ``self._lock``. A
     ``KeyboardInterrupt`` during a request clears ``self.alive``, prints a
     message and exits the process via ``sys.exit()``.
     """
     logs_path = os.path.join(env['dir'], 'logs')
     self.trace_dir = logs_path
     os.makedirs(logs_path, exist_ok=True)

     trace_options = profiler.ProfilerOptions(
         host_tracer_level=self.monitoring_level)

     while self.alive:
         with self._lock:
             try:
                 profiler_client.trace(
                     self.service_addr,
                     self.trace_dir,
                     self.duration_ms,
                     self.workers_list,
                     5,
                     trace_options,
                 )
             except KeyboardInterrupt:
                 self.alive = False
                 print('Closing Tracer')
                 sys.exit()
Esempio n. 11
0
    def capture_route(self, request):
        """Handle a capture-profile request and report the outcome as JSON.

        Query arguments read from ``request.args``:
            service_addr: profiler service address, or a TPU name when
                ``is_tpu_name`` is ``'true'``.
            duration: trace duration (default ``'1000'``).
            is_tpu_name: ``'true'`` to resolve ``service_addr`` as a TPU name.
            worker_list: workers to profile; when empty and a TPU name is
                given, it is filled in from ``get_worker_list``.
            num_retry: number of retries; attempts are ``num_retry + 1``.
            host_tracer_level / device_tracer_level / python_tracer_level:
                tracer levels (defaults ``'2'``, ``'1'``, ``'0'``).

        Returns:
            The value of ``respond(...)``: a ``{'result': ...}`` payload on
            success, or an ``{'error': ...}`` payload sent with ``code=200``
            on failure.
        """
        service_addr = request.args.get('service_addr')
        duration = int(request.args.get('duration', '1000'))
        is_tpu_name = request.args.get('is_tpu_name') == 'true'
        worker_list = request.args.get('worker_list')
        num_tracing_attempts = int(request.args.get('num_retry', '0')) + 1
        options = None
        try:
            options = profiler.ProfilerOptions(
                host_tracer_level=int(
                    request.args.get('host_tracer_level', '2')),
                device_tracer_level=int(
                    request.args.get('device_tracer_level', '1')),
                python_tracer_level=int(
                    request.args.get('python_tracer_level', '0')))
        except AttributeError:
            logger.warning(
                'ProfilerOptions are available after tensorflow 2.3')

        if is_tpu_name:
            try:
                tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                    service_addr)
                master_grpc_addr = tpu_cluster_resolver.get_master()
            except (ImportError, RuntimeError) as err:
                # Python 3 exceptions have no `.message` attribute; use
                # str(err) so this error path does not itself raise.
                return respond({'error': str(err)},
                               'application/json',
                               code=200)
            except (ValueError, TypeError):
                return respond(
                    {'error': 'no TPUs with the specified names exist.'},
                    'application/json',
                    code=200,
                )
            if not worker_list:
                worker_list = get_worker_list(tpu_cluster_resolver)
            # TPU cluster resolver always returns port 8470. Replace it with 8466
            # on which profiler service is running.
            master_ip = master_grpc_addr.replace('grpc://',
                                                 '').replace(':8470', '')
            service_addr = master_ip + ':8466'
            # Set the master TPU for streaming trace viewer.
            self.master_tpu_unsecure_channel = master_ip

        # Pass `options` only when it could be built, so the call still
        # works when ProfilerOptions is unavailable.
        extra_kwargs = {'options': options} if options is not None else {}
        try:
            profiler_client.trace(service_addr,
                                  self.logdir,
                                  duration,
                                  worker_list,
                                  num_tracing_attempts,
                                  **extra_kwargs)
            return respond(
                {'result': 'Capture profile successfully. Please refresh.'},
                'application/json',
            )
        except tf.errors.UnavailableError:
            return respond(
                {'error': 'empty trace result.'},
                'application/json',
                code=200,
            )
        except Exception as e:  # pylint: disable=broad-except
            return respond(
                {'error': str(e)},
                'application/json',
                code=200,
            )
Esempio n. 12
0
def main(unused_argv=None):
    """Entry point: capture a profile or show monitoring metrics.

    Resolves the profiler service address from ``--service_addr`` or
    ``--tpu``, then either calls ``monitoring_helper`` (when
    ``--monitoring_level`` is positive) or captures a trace into
    ``--logdir``. Exits the process with a message on invalid flag
    combinations, an unsupported TensorFlow version, or a TPU that cannot
    be resolved.

    Args:
        unused_argv: ignored.
    """
    logging.set_verbosity(logging.INFO)
    tf_version = versions.__version__
    print('TensorFlow version %s detected' % tf_version)
    print('Welcome to the Cloud TPU Profiler v%s' %
          profiler_version.__version__)

    if LooseVersion(tf_version) < LooseVersion('2.2.0'):
        sys.exit('You must install tensorflow >= 2.2.0 to use this plugin.')

    if not FLAGS.service_addr and not FLAGS.tpu:
        sys.exit('You must specify either --service_addr or --tpu.')

    tpu_cluster_resolver = None
    if FLAGS.service_addr:
        if FLAGS.tpu:
            logging.warning('Both --service_addr and --tpu are set. Ignoring '
                            '--tpu and using --service_addr.')
        service_addr = FLAGS.service_addr
    else:
        try:
            tpu_cluster_resolver = (resolver.TPUClusterResolver(
                [FLAGS.tpu], zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
            service_addr = tpu_cluster_resolver.get_master()
        except (ValueError, TypeError):
            sys.exit(
                'Failed to find TPU %s in zone %s project %s. You may use '
                '--tpu_zone and --gcp_project to specify the zone and project of'
                ' your TPU.' % (FLAGS.tpu, FLAGS.tpu_zone, FLAGS.gcp_project))
    # Drop the gRPC scheme and swap port 8470 for 8466 in the address.
    service_addr = service_addr.replace('grpc://',
                                        '').replace(':8470', ':8466')

    workers_list = ''
    if FLAGS.workers_list is not None:
        workers_list = FLAGS.workers_list
    elif tpu_cluster_resolver is not None:
        workers_list = get_workers_list(tpu_cluster_resolver)

    # If profiling duration was not set by user or set to a non-positive value,
    # we set it to a default value of 1000ms.
    duration_ms = FLAGS.duration_ms if FLAGS.duration_ms > 0 else 1000

    if FLAGS.monitoring_level > 0:
        # Report the effective duration, not the raw flag, so the message
        # matches what is passed to monitoring_helper.
        print('Since monitoring level is provided, profile', service_addr,
              ' for ', duration_ms, ' ms and show metrics for ',
              FLAGS.num_queries, ' time(s).')
        monitoring_helper(service_addr, duration_ms, FLAGS.monitoring_level,
                          FLAGS.num_queries)
    else:
        if not FLAGS.logdir:
            sys.exit('You must specify either --logdir or --monitoring_level.')

        if not gfile.Exists(FLAGS.logdir):
            gfile.MakeDirs(FLAGS.logdir)

        try:
            # ProfilerOptions is only passed for TensorFlow >= 2.3.0.
            if LooseVersion(tf_version) < LooseVersion('2.3.0'):
                profiler_client.trace(service_addr,
                                      os.path.expanduser(FLAGS.logdir),
                                      duration_ms, workers_list,
                                      FLAGS.num_tracing_attempts)
            else:
                options = profiler.ProfilerOptions(
                    host_tracer_level=FLAGS.host_tracer_level)
                profiler_client.trace(service_addr,
                                      os.path.expanduser(FLAGS.logdir),
                                      duration_ms, workers_list,
                                      FLAGS.num_tracing_attempts, options)
        except errors.UnavailableError:
            # An unavailable service ends the program with exit status 0.
            sys.exit(0)
 def testStartTracing_ProcessInvalidAddressWithOptions(self):
   """Tracing 'localhost:6006' with options raises UnavailableError."""
   with self.assertRaises(errors.UnavailableError):
     trace_options = profiler.ProfilerOptions(
         host_tracer_level=3, device_tracer_level=0)
     output_dir = tempfile.mkdtemp()
     profiler_client.trace(
         'localhost:6006', output_dir, 2000, options=trace_options)