Beispiel #1
0
 def capture_route(self, request):
     """Handle a capture-profile HTTP request.

     Reads ``service_addr``, ``duration`` and ``is_tpu_name`` from the query
     arguments, optionally resolves a TPU name to its master address, and
     starts tracing into ``self.logdir``.

     Args:
       request: The incoming request; its ``args`` mapping supplies the
         query parameters.

     Returns:
       The ``http_util.Respond`` result: a JSON success message, a 400
       plain-text response when ``duration`` is missing or is not an
       integer, or a 404 plain-text response when ``start_tracing`` raises
       ``tf.errors.UnavailableError``.
     """
     service_addr = request.args.get('service_addr')
     try:
         duration = int(request.args.get('duration'))
     except (TypeError, ValueError):
         # TypeError: the parameter is absent (int(None)).
         # ValueError: the parameter is present but not an integer literal.
         # Both are client mistakes and deserve the same 400 response.
         return http_util.Respond(request,
                                  'Invalid duration',
                                  'text/plain',
                                  code=400)
     is_tpu_name = request.args.get('is_tpu_name') == 'true'
     if is_tpu_name:
         tpu_cluster_resolver = (
             tf.distribute.cluster_resolver.TPUClusterResolver(
                 [service_addr]))
         service_addr = tpu_cluster_resolver.get_master()
         # TPU cluster resolver always returns port 8470. Replace it with 8466
         # on which profiler service is running.
         service_addr = service_addr.replace('grpc://',
                                             '').replace(':8470', ':8466')
     try:
         profiler_client.start_tracing(service_addr, self.logdir, duration)
         return http_util.Respond(
             request,
             {'result': 'Capture profile successfully. Please refresh'},
             'application/json')
     except tf.errors.UnavailableError:
         return http_util.Respond(request,
                                  'Empty trace result',
                                  'text/plain',
                                  code=404)
def main(unused_argv=None):
    """Command-line entry point: profile or monitor a TPU/profiler service.

    Validates the installed TensorFlow version and the command-line flags,
    derives the profiler service address (either from ``--service_addr`` or
    by resolving ``--tpu``), and then either runs ``monitoring_helper`` (when
    ``--monitoring_level`` is positive) or captures a trace into ``--logdir``.

    Args:
      unused_argv: Ignored.

    Exits the process via ``sys.exit`` on invalid configuration, when the TPU
    cannot be resolved, or (with status 0) when tracing raises
    ``errors.UnavailableError``.
    """
    logging.set_verbosity(logging.INFO)
    tf_version = versions.__version__
    print('TensorFlow version %s detected' % tf_version)
    print('Welcome to the Cloud TPU Profiler v%s' %
          profiler_version.__version__)

    if LooseVersion(tf_version) < LooseVersion('1.14.0'):
        sys.exit('You must install tensorflow >= 1.14.0 to use this plugin.')

    if not FLAGS.service_addr and not FLAGS.tpu:
        sys.exit('You must specify either --service_addr or --tpu.')

    tpu_cluster_resolver = None
    if FLAGS.service_addr:
        if FLAGS.tpu:
            # `warn` is a deprecated alias; `warning` is the supported name.
            logging.warning('Both --service_addr and --tpu are set. Ignoring '
                            '--tpu and using --service_addr.')
        service_addr = FLAGS.service_addr
    else:
        try:
            tpu_cluster_resolver = (resolver.TPUClusterResolver(
                [FLAGS.tpu], zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
            service_addr = tpu_cluster_resolver.get_master()
        except (ValueError, TypeError):
            sys.exit(
                'Failed to find TPU %s in zone %s project %s. You may use '
                '--tpu_zone and --gcp_project to specify the zone and project of'
                ' your TPU.' % (FLAGS.tpu, FLAGS.tpu_zone, FLAGS.gcp_project))
    # The resolver reports the TPU worker port (8470); the profiler service
    # listens on 8466, and the client expects a bare host:port.
    service_addr = service_addr.replace('grpc://',
                                        '').replace(':8470', ':8466')

    workers_list = ''
    if FLAGS.workers_list is not None:
        workers_list = FLAGS.workers_list
    elif tpu_cluster_resolver is not None:
        workers_list = get_workers_list(tpu_cluster_resolver)

    # If profiling duration was not set by user or set to a non-positive value,
    # we set it to a default value of 1000ms.
    duration_ms = FLAGS.duration_ms if FLAGS.duration_ms > 0 else 1000

    if FLAGS.monitoring_level > 0:
        # Report the effective duration (after defaulting), not the raw flag,
        # so the message matches what is passed to monitoring_helper.
        print('Since monitoring level is provided, profile', service_addr,
              ' for ', duration_ms, ' ms and show metrics for ',
              FLAGS.num_queries, ' time(s).')
        monitoring_helper(service_addr, duration_ms, FLAGS.monitoring_level,
                          FLAGS.display_timestamp, FLAGS.num_queries)
    else:
        if not FLAGS.logdir:
            sys.exit('You must specify either --logdir or --monitoring_level.')
        try:
            profiler_client.start_tracing(service_addr,
                                          os.path.expanduser(FLAGS.logdir),
                                          duration_ms, workers_list,
                                          FLAGS.include_dataset_ops,
                                          FLAGS.num_tracing_attempts)
        except errors.UnavailableError:
            sys.exit(0)
Beispiel #3
0
    def capture_route(self, request):
        """Handle a capture-profile HTTP request and report the outcome as JSON.

        Query parameters read from ``request.args``: ``service_addr``,
        ``duration`` (default ``"1000"``), ``is_tpu_name``, ``worker_list``,
        ``include_dataset_ops`` and ``num_retry`` (default ``"0"``; the number
        of tracing attempts is ``num_retry + 1``).

        When ``is_tpu_name`` is ``"true"``, ``service_addr`` is treated as a
        TPU name and resolved to its master address; the resolved host is also
        stored in ``self.master_tpu_unsecure_channel``.

        Args:
          request: The incoming request whose ``args`` carry the parameters.

        Returns:
          The ``http_util.Respond`` result. Every branch responds with JSON:
          ``{"result": ...}`` on success, ``{"error": ...}`` (with code 200)
          when TPU resolution fails or tracing raises
          ``tf.errors.UnavailableError``.
        """
        service_addr = request.args.get("service_addr")
        duration = int(request.args.get("duration", "1000"))
        is_tpu_name = request.args.get("is_tpu_name") == "true"
        worker_list = request.args.get("worker_list")
        include_dataset_ops = request.args.get("include_dataset_ops") == "true"
        num_tracing_attempts = int(request.args.get("num_retry", "0")) + 1

        if is_tpu_name:
            try:
                tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                    service_addr)
                master_grpc_addr = tpu_cluster_resolver.get_master()
            except (ImportError, RuntimeError) as err:
                # Exceptions have no `.message` attribute on Python 3; str()
                # yields the message on both Python 2 and 3.
                return http_util.Respond(
                    request,
                    {"error": str(err)},
                    "application/json",
                    code=200,
                )
            except (ValueError, TypeError):
                return http_util.Respond(
                    request,
                    {"error": "no TPUs with the specified names exist."},
                    "application/json",
                    code=200,
                )
            if not worker_list:
                worker_list = get_worker_list(tpu_cluster_resolver)
            # TPU cluster resolver always returns port 8470. Replace it with 8466
            # on which profiler service is running.
            master_ip = master_grpc_addr.replace("grpc://",
                                                 "").replace(":8470", "")
            service_addr = master_ip + ":8466"
            # Set the master TPU for streaming trace viewer.
            self.master_tpu_unsecure_channel = master_ip
        try:
            profiler_client.start_tracing(
                service_addr,
                self.logdir,
                duration,
                worker_list,
                include_dataset_ops,
                num_tracing_attempts,
            )
            return http_util.Respond(
                request,
                {"result": "Capture profile successfully. Please refresh."},
                "application/json",
            )
        except tf.errors.UnavailableError:
            return http_util.Respond(
                request,
                {"error": "empty trace result."},
                "application/json",
                code=200,
            )
Beispiel #4
0
    def capture_route(self, request):
        """Handle a capture-profile HTTP request and report the outcome as JSON.

        Query parameters read from ``request.args``: ``service_addr``,
        ``duration`` (default ``'1000'``), ``is_tpu_name``, ``worker_list``,
        ``include_dataset_ops`` and ``num_retry`` (default ``'0'``; the number
        of tracing attempts is ``num_retry + 1``).

        When ``is_tpu_name`` is ``'true'``, ``service_addr`` is treated as a
        TPU name and resolved to its master address; the resolved host is also
        stored in ``self.master_tpu_unsecure_channel``.

        Args:
          request: The incoming request whose ``args`` carry the parameters.

        Returns:
          The ``http_util.Respond`` result. Every branch responds with JSON:
          ``{'result': ...}`` on success, ``{'error': ...}`` (with code 200)
          when TPU resolution fails or tracing raises
          ``tf.errors.UnavailableError``.
        """
        service_addr = request.args.get('service_addr')
        duration = int(request.args.get('duration', '1000'))
        is_tpu_name = request.args.get('is_tpu_name') == 'true'
        worker_list = request.args.get('worker_list')
        include_dataset_ops = request.args.get('include_dataset_ops') == 'true'
        num_tracing_attempts = int(request.args.get('num_retry', '0')) + 1

        if is_tpu_name:
            try:
                tpu_cluster_resolver = (tf.distribute.cluster_resolver.
                                        TPUClusterResolver(service_addr))
                master_grpc_addr = tpu_cluster_resolver.get_master()
            except (ImportError, RuntimeError) as err:
                # Exceptions have no `.message` attribute on Python 3; str()
                # yields the message on both Python 2 and 3.
                return http_util.Respond(request, {'error': str(err)},
                                         'application/json',
                                         code=200)
            except (ValueError, TypeError):
                return http_util.Respond(
                    request,
                    {'error': 'no TPUs with the specified names exist.'},
                    'application/json',
                    code=200)
            if not worker_list:
                worker_list = get_worker_list(tpu_cluster_resolver)
            # TPU cluster resolver always returns port 8470. Replace it with 8466
            # on which profiler service is running.
            master_ip = master_grpc_addr.replace('grpc://',
                                                 '').replace(':8470', '')
            service_addr = master_ip + ':8466'
            # Set the master TPU for streaming trace viewer.
            self.master_tpu_unsecure_channel = master_ip
        try:
            profiler_client.start_tracing(service_addr, self.logdir, duration,
                                          worker_list, include_dataset_ops,
                                          num_tracing_attempts)
            return http_util.Respond(
                request,
                {'result': 'Capture profile successfully. Please refresh.'},
                'application/json')
        except tf.errors.UnavailableError:
            return http_util.Respond(request, {'error': 'empty trace result.'},
                                     'application/json',
                                     code=200)
Beispiel #5
0
def ProfileSubProcess(service_addr, path, duration_ms, workers_list,
                      include_dataset_ops, num_tracing_attempts):
    """Run one tracing request and signal failure through the exit status.

    Forwards its arguments to ``profiler_client.start_tracing`` (``path`` is
    passed as ``logdir`` and ``workers_list`` as ``worker_list``).

    Returns:
      None when tracing completes without raising.

    Raises:
      SystemExit: with code 1 when tracing raises
        ``errors.UnavailableError``, or with code -1 for any other
        ``Exception``.
    """
    # Local import keeps this block self-contained; `sys.exit` is preferred
    # over the `site`-provided `exit` helper, which is not always installed.
    import sys

    try:
        profiler_client.start_tracing(
            service_addr=service_addr,
            logdir=path,
            duration_ms=duration_ms,
            worker_list=workers_list,
            include_dataset_ops=include_dataset_ops,
            num_tracing_attempts=num_tracing_attempts)
    except errors.UnavailableError:
        sys.exit(1)
    except Exception:  # pylint: disable=broad-except
        # Any other failure maps to a distinct status. KeyboardInterrupt and
        # SystemExit are deliberately not intercepted.
        sys.exit(-1)
  def test_profiler_service_with_valid_trace_request(self):
    """Send a tracing request to the profiler service of a live model server.

    A background shell loop issues REST predict requests while
    `profiler_client.start_tracing` is invoked against the server's gRPC
    address. The shell loop's captured output is printed afterwards to help
    with debugging.
    """
    # Launch the model server; only its gRPC and REST addresses are needed.
    bundle_path = self._GetSavedModelBundlePath()
    _, grpc_addr, rest_addr = TensorflowModelServerTest.RunServer(
        'default', bundle_path)

    # Assemble a shell loop meant to fire a predict request each second for
    # three seconds.
    # NOTE(review): `{1..3}` relies on brace expansion; confirm the shell used
    # by `shell=True` supports it on the target platform.
    predict_url = 'http://{}/v1/models/default:predict'.format(rest_addr)
    payload = '{"instances": [2.0, 3.0, 4.0]}'
    wget_template = ("wget {} --content-on-error=on -O- --post-data  '{}' "
                     "--header='Content-Type:application/json'")
    single_request = wget_template.format(predict_url, payload)
    shell_loop = 'for n in {{1..3}}; do {} & sleep 1; done;'.format(
        single_request)
    traffic = subprocess.Popen(
        shell_loop, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Create the directory that will receive the trace.
    trace_dir = os.path.join(self.temp_dir, 'logs')
    os.makedirs(trace_dir)

    # Positional arguments after the log directory: duration in ms, worker
    # list (empty), include dataset ops, number of tracing attempts.
    profiler_client.start_tracing(grpc_addr, trace_dir, 1000, '', True, 3)

    # Surface what the request loop wrote, for debugging.
    captured_out, captured_err = traffic.communicate()
    print("stdout: '{}' | stderr: '{}'".format(captured_out, captured_err))
 def testStartTracing_ProcessInvalidAddress(self):
     """Tracing against 'localhost:6006' must raise UnavailableError."""
     target_addr = 'localhost:6006'
     trace_dir = '/tmp/'
     trace_ms = 2000
     with self.assertRaises(errors.UnavailableError):
         profiler_client.start_tracing(target_addr, trace_dir, trace_ms)