def _create_uploader(
    writer_client=_USE_DEFAULT,
    logdir=None,
    allowed_plugins=_USE_DEFAULT,
    logdir_poll_rate_limiter=_USE_DEFAULT,
    rpc_rate_limiter=_USE_DEFAULT,
    blob_rpc_rate_limiter=_USE_DEFAULT,
    name=None,
    description=None,
):
    if writer_client is _USE_DEFAULT:
        writer_client = _create_mock_client()
    if allowed_plugins is _USE_DEFAULT:
        allowed_plugins = _SCALARS_ONLY
    if logdir_poll_rate_limiter is _USE_DEFAULT:
        logdir_poll_rate_limiter = util.RateLimiter(0)
    if rpc_rate_limiter is _USE_DEFAULT:
        rpc_rate_limiter = util.RateLimiter(0)
    if blob_rpc_rate_limiter is _USE_DEFAULT:
        blob_rpc_rate_limiter = util.RateLimiter(0)
    return uploader_lib.TensorBoardUploader(
        writer_client,
        logdir,
        allowed_plugins=allowed_plugins,
        logdir_poll_rate_limiter=logdir_poll_rate_limiter,
        rpc_rate_limiter=rpc_rate_limiter,
        blob_rpc_rate_limiter=blob_rpc_rate_limiter,
        name=name,
        description=description,
    )

def _create_request_sender(
    experiment_id=None,
    api=None,
    allowed_plugins=_USE_DEFAULT,
    max_blob_size=_USE_DEFAULT,
    rpc_rate_limiter=_USE_DEFAULT,
    blob_rpc_rate_limiter=_USE_DEFAULT,
):
    if api is _USE_DEFAULT:
        api = _create_mock_client()
    if allowed_plugins is _USE_DEFAULT:
        allowed_plugins = _SCALARS_ONLY
    if max_blob_size is _USE_DEFAULT:
        max_blob_size = 12345
    if rpc_rate_limiter is _USE_DEFAULT:
        rpc_rate_limiter = util.RateLimiter(0)
    if blob_rpc_rate_limiter is _USE_DEFAULT:
        blob_rpc_rate_limiter = util.RateLimiter(0)
    return uploader_lib._BatchedRequestSender(
        experiment_id=experiment_id,
        api=api,
        allowed_plugins=allowed_plugins,
        max_blob_size=max_blob_size,
        rpc_rate_limiter=rpc_rate_limiter,
        blob_rpc_rate_limiter=blob_rpc_rate_limiter,
    )

def __init__(self, writer_client, logdir, rate_limiter=None):
    """Constructs a TensorBoardUploader.

    Args:
      writer_client: a TensorBoardWriterService stub instance
      logdir: path of the log directory to upload
      rate_limiter: a `RateLimiter` to use to limit upload cycle frequency
    """
    self._api = writer_client
    self._logdir = logdir
    self._request_builder = None
    if rate_limiter is None:
        self._rate_limiter = util.RateLimiter(_MIN_UPLOAD_CYCLE_DURATION_SECS)
    else:
        self._rate_limiter = rate_limiter
    active_filter = (
        lambda secs: secs + _EVENT_FILE_INACTIVE_SECS >= time.time())
    directory_loader_factory = functools.partial(
        directory_loader.DirectoryLoader,
        loader_factory=event_file_loader.TimestampedEventFileLoader,
        path_filter=io_wrapper.IsTensorFlowEventsFile,
        active_filter=active_filter,
    )
    self._logdir_loader = logdir_loader.LogdirLoader(
        self._logdir, directory_loader_factory)

def test_break_at_tag_boundary(self):
    mock_client = _create_mock_client()
    # Choose tag name sizes such that one tag fits, but not two. Note
    # that tag names appear in both `Tag.name` and the summary metadata.
    long_tag_1 = "a" * 384
    long_tag_2 = "b" * 384
    event = event_pb2.Event(step=1)
    event.summary.value.add(tag=long_tag_1, simple_value=1.0)
    event.summary.value.add(tag=long_tag_2, simple_value=2.0)
    run_to_events = {"train": [event]}

    builder = uploader_lib._BatchedRequestSender(
        "123", mock_client, util.RateLimiter(0))
    builder.send_requests(run_to_events)
    requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]
    for request in requests:
        _clear_wall_times(request)

    expected = [
        write_service_pb2.WriteScalarRequest(experiment_id="123"),
        write_service_pb2.WriteScalarRequest(experiment_id="123"),
    ]
    (expected[0].runs.add(name="train").tags.add(
        name=long_tag_1,
        metadata=test_util.scalar_metadata(long_tag_1)).points.add(
            step=1, value=1.0))
    (expected[1].runs.add(name="train").tags.add(
        name=long_tag_2,
        metadata=test_util.scalar_metadata(long_tag_2)).points.add(
            step=1, value=2.0))
    self.assertEqual(requests, expected)

def test_break_at_run_boundary(self):
    mock_client = _create_mock_client()
    # Choose run name sizes such that one run fits, but not two.
    long_run_1 = "A" * 768
    long_run_2 = "B" * 768
    event_1 = event_pb2.Event(step=1)
    event_1.summary.value.add(tag="foo", simple_value=1.0)
    event_2 = event_pb2.Event(step=2)
    event_2.summary.value.add(tag="bar", simple_value=-2.0)
    run_to_events = collections.OrderedDict(
        [(long_run_1, [event_1]), (long_run_2, [event_2])])

    builder = uploader_lib._BatchedRequestSender(
        "123", mock_client, util.RateLimiter(0))
    builder.send_requests(run_to_events)
    requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]
    for request in requests:
        _clear_wall_times(request)

    expected = [
        write_service_pb2.WriteScalarRequest(experiment_id="123"),
        write_service_pb2.WriteScalarRequest(experiment_id="123"),
    ]
    (expected[0].runs.add(name=long_run_1).tags.add(
        name="foo",
        metadata=test_util.scalar_metadata("foo")).points.add(
            step=1, value=1.0))
    (expected[1].runs.add(name=long_run_2).tags.add(
        name="bar",
        metadata=test_util.scalar_metadata("bar")).points.add(
            step=2, value=-2.0))
    self.assertEqual(requests, expected)

def test_prunes_tags_and_runs(self):
    mock_client = _create_mock_client()
    event_1 = event_pb2.Event(step=1)
    event_1.summary.value.add(tag="foo", simple_value=1.0)
    event_2 = event_pb2.Event(step=2)
    event_2.summary.value.add(tag="bar", simple_value=-2.0)
    run_to_events = collections.OrderedDict(
        [("train", [event_1]), ("test", [event_2])]
    )

    real_create_point = (
        uploader_lib._ScalarBatchedRequestSender._create_point
    )
    create_point_call_count_box = [0]

    def mock_create_point(uploader_self, *args, **kwargs):
        # Simulate out-of-space error the first time that we try to store
        # the second point.
        create_point_call_count_box[0] += 1
        if create_point_call_count_box[0] == 2:
            raise uploader_lib._OutOfSpaceError()
        return real_create_point(uploader_self, *args, **kwargs)

    with mock.patch.object(
        uploader_lib._ScalarBatchedRequestSender,
        "_create_point",
        mock_create_point,
    ):
        builder = uploader_lib._BatchedRequestSender(
            "123", mock_client, util.RateLimiter(0)
        )
        builder.send_requests(run_to_events)
    requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]
    for request in requests:
        _clear_wall_times(request)

    expected = [
        write_service_pb2.WriteScalarRequest(experiment_id="123"),
        write_service_pb2.WriteScalarRequest(experiment_id="123"),
    ]
    (
        expected[0]
        .runs.add(name="train")
        .tags.add(name="foo", metadata=test_util.scalar_metadata("foo"))
        .points.add(step=1, value=1.0)
    )
    (
        expected[1]
        .runs.add(name="test")
        .tags.add(name="bar", metadata=test_util.scalar_metadata("bar"))
        .points.add(step=2, value=-2.0)
    )
    self.assertEqual(expected, requests)

def test_no_room_for_single_point(self):
    mock_client = _create_mock_client()
    event = event_pb2.Event(step=1, wall_time=123.456)
    event.summary.value.add(tag="foo", simple_value=1.0)
    long_run_name = "A" * uploader_lib._MAX_REQUEST_LENGTH_BYTES
    run_to_events = {long_run_name: [event]}
    with self.assertRaises(RuntimeError) as cm:
        builder = uploader_lib._BatchedRequestSender(
            "123", mock_client, util.RateLimiter(0))
        builder.send_requests(run_to_events)
    self.assertEqual(str(cm.exception), "add_event failed despite flush")

def _populate_run_from_events(self, run_proto, events):
    mock_client = _create_mock_client()
    builder = uploader_lib._BatchedRequestSender(
        experiment_id="123",
        api=mock_client,
        rpc_rate_limiter=util.RateLimiter(0),
    )
    builder.send_requests({"": events})
    requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]
    if requests:
        self.assertLen(requests, 1)
        self.assertLen(requests[0].runs, 1)
        run_proto.MergeFrom(requests[0].runs[0])

def test_no_budget_for_experiment_id(self):
    mock_client = _create_mock_client()
    event = event_pb2.Event(step=1, wall_time=123.456)
    event.summary.value.add(tag="foo", simple_value=1.0)
    run_to_events = {"run_name": [event]}
    long_experiment_id = "A" * uploader_lib._MAX_REQUEST_LENGTH_BYTES
    with self.assertRaises(RuntimeError) as cm:
        builder = uploader_lib._BatchedRequestSender(
            long_experiment_id, mock_client, util.RateLimiter(0))
        builder.send_requests(run_to_events)
    self.assertEqual(
        str(cm.exception), "Byte budget too small for experiment ID")

def __init__(
    self,
    writer_client,
    logdir,
    rpc_rate_limiter=None,
    name=None,
    description=None,
):
    """Constructs a TensorBoardUploader.

    Args:
      writer_client: a TensorBoardWriterService stub instance
      logdir: path of the log directory to upload
      rpc_rate_limiter: a `RateLimiter` to use to limit write RPC frequency.
        Note this limit applies at the level of single RPCs in the Scalar
        and Tensor case, but at the level of an entire blob upload in the
        Blob case -- which may require a few preparatory RPCs and a stream
        of chunks. Note the chunk stream is internally rate-limited by
        backpressure from the server, so it is not a concern that we do not
        explicitly rate-limit within the stream here.
      name: String name to assign to the experiment.
      description: String description to assign to the experiment.
    """
    self._api = writer_client
    self._logdir = logdir
    self._name = name
    self._description = description
    self._request_sender = None
    if rpc_rate_limiter is None:
        self._rpc_rate_limiter = util.RateLimiter(
            _MIN_WRITE_RPC_INTERVAL_SECS)
    else:
        self._rpc_rate_limiter = rpc_rate_limiter
    active_filter = (
        lambda secs: secs + _EVENT_FILE_INACTIVE_SECS >= time.time())
    directory_loader_factory = functools.partial(
        directory_loader.DirectoryLoader,
        loader_factory=event_file_loader.TimestampedEventFileLoader,
        path_filter=io_wrapper.IsTensorFlowEventsFile,
        active_filter=active_filter,
    )
    self._logdir_loader = logdir_loader.LogdirLoader(
        self._logdir, directory_loader_factory)

def test_rate_limiting(self):
    rate_limiter = util.RateLimiter(10)
    fake_time = test_util.FakeTime(current=1000)
    with mock.patch.object(rate_limiter, "_time", fake_time):
        self.assertEqual(1000, fake_time.time())

        # No sleeping for initial tick.
        rate_limiter.tick()
        self.assertEqual(1000, fake_time.time())

        # Second tick requires a full sleep.
        rate_limiter.tick()
        self.assertEqual(1010, fake_time.time())

        # Third tick requires a sleep just to make up the remaining second.
        fake_time.sleep(9)
        self.assertEqual(1019, fake_time.time())
        rate_limiter.tick()
        self.assertEqual(1020, fake_time.time())

        # Fourth tick requires no sleep since we have no remaining seconds.
        fake_time.sleep(11)
        self.assertEqual(1031, fake_time.time())
        rate_limiter.tick()
        self.assertEqual(1031, fake_time.time())

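# The test above pins down the `tick()` contract of `util.RateLimiter`: the
# first tick returns immediately, and every later tick sleeps only as long as
# needed to keep successive ticks at least `interval_secs` apart. The class
# below is a minimal sketch that satisfies that contract; it is an
# illustrative assumption, not the actual implementation in `util`.
import time


class _SketchRateLimiter(object):
    """Minimal rate limiter sketch matching the behavior exercised above."""

    def __init__(self, interval_secs):
        self._interval_secs = interval_secs
        # Held as an attribute so tests can patch it with a fake clock, as
        # `mock.patch.object(rate_limiter, "_time", fake_time)` does above.
        self._time = time
        self._last_tick = None

    def tick(self):
        now = self._time.time()
        if self._last_tick is not None:
            # Sleep only for whatever part of the interval has not yet
            # elapsed; skip the sleep entirely once the interval has passed.
            wait = self._interval_secs - (now - self._last_tick)
            if wait > 0:
                self._time.sleep(wait)
        self._last_tick = self._time.time()
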
def test_break_at_scalar_point_boundary(self):
    mock_client = _create_mock_client()
    point_count = 2000  # comfortably saturates a single 1024-byte request
    events = []
    for step in range(point_count):
        summary = scalar_v2.scalar_pb("loss", -2.0 * step)
        if step > 0:
            summary.value[0].ClearField("metadata")
        events.append(event_pb2.Event(summary=summary, step=step))
    run_to_events = {"train": events}

    builder = uploader_lib._BatchedRequestSender(
        "123", mock_client, util.RateLimiter(0)
    )
    builder.send_requests(run_to_events)
    requests = [c[0][0] for c in mock_client.WriteScalar.call_args_list]
    for request in requests:
        _clear_wall_times(request)

    self.assertGreater(len(requests), 1)
    self.assertLess(len(requests), point_count)

    total_points_in_result = 0
    for request in requests:
        self.assertLen(request.runs, 1)
        run = request.runs[0]
        self.assertEqual(run.name, "train")
        self.assertLen(run.tags, 1)
        tag = run.tags[0]
        self.assertEqual(tag.name, "loss")
        for point in tag.points:
            self.assertEqual(point.step, total_points_in_result)
            self.assertEqual(point.value, -2.0 * point.step)
            total_points_in_result += 1
        self.assertLessEqual(
            request.ByteSize(), uploader_lib._MAX_REQUEST_LENGTH_BYTES
        )
    self.assertEqual(total_points_in_result, point_count)

def create_profile_request_sender() -> profile_uploader.ProfileRequestSender:
    """Creates the `ProfileRequestSender` for the profile plugin.

    A profile request sender is created for the plugin so that after
    profiling runs have finished, data can be uploaded to the TensorBoard
    backend.

    Returns:
        A ProfileRequestSender object.
    """
    api_client = _get_api_client()

    experiment_name = _get_or_create_experiment(
        api_client, training_utils.environment_variables.cloud_ml_job_id)

    upload_limits = _make_upload_limits()

    blob_rpc_rate_limiter = util.RateLimiter(
        upload_limits.min_blob_request_interval / 100)

    blob_storage_bucket, blob_storage_folder = _get_blob_items(api_client)

    source_bucket = uploader_utils.get_source_bucket(
        training_utils.environment_variables.tensorboard_log_dir)

    profile_request_sender = profile_uploader.ProfileRequestSender(
        experiment_name,
        api_client,
        upload_limits=upload_limits,
        blob_rpc_rate_limiter=blob_rpc_rate_limiter,
        blob_storage_bucket=blob_storage_bucket,
        blob_storage_folder=blob_storage_folder,
        source_bucket=source_bucket,
        tracker=upload_tracker.UploadTracker(verbosity=1),
        logdir=training_utils.environment_variables.tensorboard_log_dir,
    )

    return profile_request_sender

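# A hedged usage sketch for the factory above. Only the call to
# `create_profile_request_sender()` is taken from this file; how the caller
# then drives the sender is environment-specific, and the dispatch method and
# run name below are assumptions for illustration, not the library's
# confirmed API.
def _example_profile_upload():  # hypothetical driver, not part of the module
    sender = create_profile_request_sender()
    # The surrounding uploader is expected to hand profile runs discovered in
    # the logdir to this sender; a `send_request(run_name)`-style entry point
    # is assumed here.
    sender.send_request("profile_run_0")  # hypothetical run name
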
def __init__(
    self,
    writer_client,
    logdir,
    allowed_plugins,
    upload_limits,
    logdir_poll_rate_limiter=None,
    rpc_rate_limiter=None,
    tensor_rpc_rate_limiter=None,
    blob_rpc_rate_limiter=None,
    name=None,
    description=None,
    verbosity=None,
    one_shot=None,
):
    """Constructs a TensorBoardUploader.

    Args:
      writer_client: a TensorBoardWriterService stub instance
      logdir: path of the log directory to upload
      allowed_plugins: collection of string plugin names; events will only
        be uploaded if their time series's metadata specifies one of these
        plugin names
      upload_limits: instance of tensorboard.service.UploadLimits proto.
      logdir_poll_rate_limiter: a `RateLimiter` to use to limit logdir
        polling frequency, to avoid thrashing disks, especially on networked
        file systems
      rpc_rate_limiter: a `RateLimiter` to use to limit write RPC frequency.
        Note this limit applies at the level of single RPCs in the Scalar
        and Tensor case, but at the level of an entire blob upload in the
        Blob case -- which may require a few preparatory RPCs and a stream
        of chunks. Note the chunk stream is internally rate-limited by
        backpressure from the server, so it is not a concern that we do not
        explicitly rate-limit within the stream here.
      name: String name to assign to the experiment.
      description: String description to assign to the experiment.
      verbosity: Level of verbosity, an integer. Supported values:
        0 - No upload statistics are printed.
        1 - Print upload statistics while uploading data (default).
      one_shot: Once uploading starts, upload only the existing data in the
        logdir and then return immediately, instead of the default behavior
        of continuing to listen for new data in the logdir and uploading it
        as it appears.
    """
    self._api = writer_client
    self._logdir = logdir
    self._allowed_plugins = frozenset(allowed_plugins)
    self._upload_limits = upload_limits
    self._name = name
    self._description = description
    self._verbosity = 1 if verbosity is None else verbosity
    self._one_shot = False if one_shot is None else one_shot
    self._request_sender = None
    if logdir_poll_rate_limiter is None:
        self._logdir_poll_rate_limiter = util.RateLimiter(
            _MIN_LOGDIR_POLL_INTERVAL_SECS)
    else:
        self._logdir_poll_rate_limiter = logdir_poll_rate_limiter

    if rpc_rate_limiter is None:
        self._rpc_rate_limiter = util.RateLimiter(
            self._upload_limits.min_scalar_request_interval / 1000)
    else:
        self._rpc_rate_limiter = rpc_rate_limiter

    if tensor_rpc_rate_limiter is None:
        self._tensor_rpc_rate_limiter = util.RateLimiter(
            self._upload_limits.min_tensor_request_interval / 1000)
    else:
        self._tensor_rpc_rate_limiter = tensor_rpc_rate_limiter

    if blob_rpc_rate_limiter is None:
        self._blob_rpc_rate_limiter = util.RateLimiter(
            self._upload_limits.min_blob_request_interval / 1000)
    else:
        self._blob_rpc_rate_limiter = blob_rpc_rate_limiter

    active_filter = (
        lambda secs: secs + _EVENT_FILE_INACTIVE_SECS >= time.time())
    directory_loader_factory = functools.partial(
        directory_loader.DirectoryLoader,
        loader_factory=event_file_loader.TimestampedEventFileLoader,
        path_filter=io_wrapper.IsTensorFlowEventsFile,
        active_filter=active_filter,
    )
    self._logdir_loader = logdir_loader.LogdirLoader(
        self._logdir, directory_loader_factory)

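# A minimal construction sketch for the signature above, assuming a writer
# stub and an `upload_limits` proto have already been obtained elsewhere
# (e.g. from the server handshake). All literal values and the helper name
# are placeholders for illustration; only the parameter names come from the
# constructor itself.
def _example_create_one_shot_uploader(writer_client, upload_limits):  # hypothetical helper
    return TensorBoardUploader(
        writer_client,
        logdir="/tmp/logs",            # placeholder log directory
        allowed_plugins=("scalars",),  # restrict uploads to scalar summaries
        upload_limits=upload_limits,
        name="my-experiment",          # placeholder experiment name
        description="Example one-shot upload.",
        verbosity=0,                   # suppress upload statistics
        one_shot=True,                 # upload existing data, then return
    )
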
def __init__(
    self,
    writer_client,
    logdir,
    allowed_plugins,
    max_blob_size,
    logdir_poll_rate_limiter=None,
    rpc_rate_limiter=None,
    blob_rpc_rate_limiter=None,
    name=None,
    description=None,
):
    """Constructs a TensorBoardUploader.

    Args:
      writer_client: a TensorBoardWriterService stub instance
      logdir: path of the log directory to upload
      allowed_plugins: collection of string plugin names; events will only
        be uploaded if their time series's metadata specifies one of these
        plugin names
      max_blob_size: the maximum allowed size for blob uploads.
      logdir_poll_rate_limiter: a `RateLimiter` to use to limit logdir
        polling frequency, to avoid thrashing disks, especially on networked
        file systems
      rpc_rate_limiter: a `RateLimiter` to use to limit write RPC frequency.
        Note this limit applies at the level of single RPCs in the Scalar
        and Tensor case, but at the level of an entire blob upload in the
        Blob case -- which may require a few preparatory RPCs and a stream
        of chunks. Note the chunk stream is internally rate-limited by
        backpressure from the server, so it is not a concern that we do not
        explicitly rate-limit within the stream here.
      name: String name to assign to the experiment.
      description: String description to assign to the experiment.
    """
    self._api = writer_client
    self._logdir = logdir
    self._allowed_plugins = frozenset(allowed_plugins)
    self._max_blob_size = max_blob_size
    self._name = name
    self._description = description
    self._request_sender = None
    if logdir_poll_rate_limiter is None:
        self._logdir_poll_rate_limiter = util.RateLimiter(
            _MIN_LOGDIR_POLL_INTERVAL_SECS
        )
    else:
        self._logdir_poll_rate_limiter = logdir_poll_rate_limiter
    if rpc_rate_limiter is None:
        self._rpc_rate_limiter = util.RateLimiter(
            _MIN_WRITE_RPC_INTERVAL_SECS
        )
    else:
        self._rpc_rate_limiter = rpc_rate_limiter
    if blob_rpc_rate_limiter is None:
        self._blob_rpc_rate_limiter = util.RateLimiter(
            _MIN_BLOB_WRITE_RPC_INTERVAL_SECS
        )
    else:
        self._blob_rpc_rate_limiter = blob_rpc_rate_limiter
    active_filter = (
        lambda secs: secs + _EVENT_FILE_INACTIVE_SECS >= time.time()
    )
    directory_loader_factory = functools.partial(
        directory_loader.DirectoryLoader,
        loader_factory=event_file_loader.TimestampedEventFileLoader,
        path_filter=io_wrapper.IsTensorFlowEventsFile,
        active_filter=active_filter,
    )
    self._logdir_loader = logdir_loader.LogdirLoader(
        self._logdir, directory_loader_factory
    )