def test_get_tensorboard_log_watcher_from_path_with_file_path(
        self,
        mock_directorywatcher,
        mock_eventfileloader,
        mock_issummaryeventsfile):
    """Checks the watcher factory's behavior for a summary-events file path.

    With the summary-events-file check mocked to report True, the event
    file loader mock must be called with the path and the directory
    watcher mock must not be called at all.
    """
    # Arrange: make the path look like a summary events file.
    mock_issummaryeventsfile.return_value = True
    events_file = self._file_path

    # Act.
    tf_utils.get_tensorboard_log_watcher_from_path(events_file)

    # Assert: file loader used, directory watcher untouched.
    mock_eventfileloader.assert_called_with(events_file)
    mock_directorywatcher.assert_not_called()
def test_get_remote_training_metrics(self, mock_super_tuner):
    """Checks that epoch metrics written to TensorBoard logs are read back.

    Writes `epoch_loss` / `epoch_accuracy` scalars for steps 0..2 into the
    trial's log directory, then verifies that two completed epochs are
    reported and that the first one exposes "loss" and "accuracy" (i.e.
    without the "epoch_" prefix) with the loss that was written at step 0.
    """
    remote_tuner = self._remote_tuner(
        None, None, self._study_config, max_trials=10)
    remote_tuner.directory = self.get_temp_dir()
    log_dir = os.path.join(
        remote_tuner.directory, str(self._test_trial.trial_id), "logs")

    # One (loss, accuracy) pair per step; the position is the step index.
    scalars_per_step = ((0.1, 0.2), (0.3, 0.4), (0.5, 0.6))
    with tf.summary.create_file_writer(log_dir).as_default():
        for step, (loss, accuracy) in enumerate(scalars_per_step):
            tf.summary.scalar(name="epoch_loss", data=loss, step=step)
            tf.summary.scalar(
                name="epoch_accuracy", data=accuracy, step=step)

    log_reader = tf_utils.get_tensorboard_log_watcher_from_path(log_dir)
    results = remote_tuner._get_remote_training_metrics(log_reader, {})

    completed = results.completed_epoch_metrics
    self.assertLen(completed, 2)

    first_epoch = completed[0]
    self.assertIn("accuracy", first_epoch)
    self.assertIn("loss", first_epoch)
    self.assertEqual(first_epoch.get("loss"), tf.constant(0.1))
def _get_remote_training_metrics(
        self, trial_id: int) -> List[Mapping[Text, Union[int, float]]]:
    """Reads the per-epoch metrics of a trial from its TensorBoard logs.

    Only summary values whose tag starts with "epoch_" are considered; the
    prefix is stripped to obtain the metric name. A new epoch is started
    whenever a metric name that is already present in the current epoch is
    seen again.

    Arguments:
        trial_id: Id of the trial whose logs are read. The log location is
            resolved through `self._get_tensorboard_log_dir`.

    Returns:
        A list with one mapping of metric name to value per epoch, in the
        order the events were loaded. The last entry is the epoch that was
        still being accumulated when the events ran out, so the list always
        has at least one element and that element may be an empty mapping.
    """
    log_path = self._get_tensorboard_log_dir(trial_id)
    tf.get_logger().info(
        "Retrieving training logs for trial {} from {}".format(
            trial_id, log_path))
    log_reader = tf_utils.get_tensorboard_log_watcher_from_path(log_path)

    # Note tf.keras.callbacks.TensorBoard() with update_freq="epoch"
    # logs the epoch related metrics with a "epoch_" prefix. This is
    # not a requirement by tensorboard.
    epoch_prefix = "epoch_"

    results = []
    epoch_metrics = {}
    for event in log_reader.Load():
        for value in event.summary.value:
            if not value.tag.startswith(epoch_prefix):
                continue

            # Strip only the leading prefix: str.replace would also remove
            # any later "epoch_" occurrence inside the metric name.
            metric = value.tag[len(epoch_prefix):]

            # If we have already seen this metric, this is a new epoch.
            if metric in epoch_metrics:
                results.append(epoch_metrics)
                epoch_metrics = {}

            # Note this method captures all metrics even if they are not
            # part of the oracle objectives. We rely on oracle to ignore
            # the unrelated Objectives.
            # Read the tensor of the value being iterated, not of the first
            # value of the event, so that events carrying several values
            # do not all report the first value's data.
            epoch_metrics[metric] = tf.make_ndarray(value.tensor)

    # Always emit the epoch in progress, even when it is empty.
    results.append(epoch_metrics)
    return results
def run_trial(self, trial, *fit_args, **fit_kwargs):
    """Evaluates a set of hyperparameter values.

    This method is called during `search` to evaluate a set of
    hyperparameters using AI Platform training.

    The method builds the model for the trial, submits it through
    `cloud_fit_client.cloud_fit`, and then polls the job while it is
    reported as running, forwarding every completed epoch's metrics read
    from the trial's TensorBoard "train" logs to the oracle. Once the job
    has completed, any remaining completed epochs and the final partial
    epoch metrics are reported as well.

    Arguments:
        trial: A `Trial` instance that contains the information
            needed to run this trial. `Hyperparameters` can be accessed
            via `trial.hyperparameters`.
        *fit_args: Positional arguments passed by `search`.
        **fit_kwargs: Keyword arguments passed by `search`. A "callbacks"
            entry, if present, is removed from this dict and replaced in
            the forwarded kwargs by the processed callback list.

    Returns:
        None.

    Raises:
        RuntimeError: If AIP training job fails.
    """

    # Running the training remotely.
    # Shallow copy: the forwarded kwargs get their own "callbacks" entry
    # below.
    copied_fit_kwargs = copy.copy(fit_kwargs)

    # Handle any callbacks passed to `fit`.
    callbacks = fit_kwargs.pop("callbacks", [])
    callbacks = self._deepcopy_callbacks(callbacks)

    # Note: run_trial does not use `TunerCallback` calls, since
    # training is performed on AI Platform training remotely.

    # Handle TensorBoard/hyperparameter logging here. The TensorBoard
    # logs are used for passing metrics back from remote execution.
    self._add_logging(callbacks, trial)

    # Creating a save_model checkpoint callback with a saved model file path
    # specific to this trial. This is to prevent different trials from
    # overwriting each other.
    self._add_model_checkpoint_callback(
        callbacks, trial.trial_id)

    copied_fit_kwargs["callbacks"] = callbacks
    model = self.hypermodel.build(trial.hyperparameters)

    # Each trial gets its own sub-directory under the tuner directory.
    remote_dir = os.path.join(self.directory, str(trial.trial_id))

    # TODO(b/170687807) Switch from using "{}".format() to f-string
    job_id = "{}_{}".format(self._study_id, trial.trial_id)

    # Create job spec from worker count and config
    job_spec = self._get_job_spec_from_config(job_id)

    # Log the exact arguments about to be handed to cloud_fit.
    tf.get_logger().info("Calling cloud_fit with %s", {
        "model": model,
        "remote_dir": remote_dir,
        "region": self._region,
        "project_id": self._project_id,
        "image_uri": self._container_uri,
        "job_id": job_id,
        "*fit_args": fit_args,
        "job_spec": job_spec,
        "**copied_fit_kwargs": copied_fit_kwargs})

    # NOTE(review): `*fit_args` is still passed positionally although it is
    # written after the keyword arguments -- confirm non-empty fit_args
    # cannot collide with `model` / `remote_dir`.
    cloud_fit_client.cloud_fit(
        model=model,
        remote_dir=remote_dir,
        region=self._region,
        project_id=self._project_id,
        image_uri=self._container_uri,
        job_id=job_id,
        job_spec=job_spec,
        *fit_args,
        **copied_fit_kwargs)

    # Create an instance of tensorboard DirectoryWatcher to retrieve the
    # logs for this trial run
    log_path = os.path.join(
        self._get_tensorboard_log_dir(trial.trial_id), "train")

    # Tensorboard log watcher expects the path to exist
    tf.io.gfile.makedirs(log_path)

    # TODO(b/170687807) Switch from using "{}".format() to f-string
    tf.get_logger().info(
        "Retrieving training logs for trial {} from {}".format(
            trial.trial_id, log_path))
    log_reader = tf_utils.get_tensorboard_log_watcher_from_path(log_path)

    # Start with no completed epochs and an empty partial epoch; `epoch` is
    # the step index passed to the oracle.
    training_metrics = _TrainingMetrics([], {})
    epoch = 0

    # Poll for as long as the job is reported as running; the partial epoch
    # metrics from the previous read are handed back on each read.
    while google_api_client.is_api_training_job_running(
        job_id, self._project_id):

        time.sleep(_POLLING_INTERVAL_IN_SECONDS)

        # Retrieve available metrics if any
        training_metrics = self._get_remote_training_metrics(
            log_reader, training_metrics.partial_epoch_metrics)

        for epoch_metrics in training_metrics.completed_epoch_metrics:
            # TODO(b/169197272) Validate metrics contain oracle objective
            if epoch_metrics:
                # The oracle's answer becomes the trial status checked
                # below.
                trial.status = self.oracle.update_trial(
                    trial_id=trial.trial_id,
                    metrics=epoch_metrics,
                    step=epoch)
                # NOTE(review): nesting reconstructed from a
                # whitespace-collapsed source -- confirm the step only
                # advances for non-empty epochs.
                epoch += 1

        # NOTE(review): assumed to sit at while-loop level (after all
        # completed epochs were reported) -- confirm against the original
        # indentation.
        if trial.status == "STOPPED":
            # Stop the remote job as well and leave the polling loop.
            google_api_client.stop_aip_training_job(
                job_id, self._project_id)
            break

    # Ensure the training job has completed successfully.
    if not google_api_client.wait_for_api_training_job_completion(
        job_id, self._project_id):
        raise RuntimeError(
            "AIP Training job failed, see logs for details at "
            "https://console.cloud.google.com/ai-platform/jobs/"
            "{}/charts/cpu?project={}"
            .format(job_id, self._project_id))

    # Retrieve and report any remaining metrics
    training_metrics = self._get_remote_training_metrics(
        log_reader, training_metrics.partial_epoch_metrics)

    for epoch_metrics in training_metrics.completed_epoch_metrics:
        # TODO(b/169197272) Validate metrics contain oracle objective
        # TODO(b/170907612) Support submit partial results to Oracle
        if epoch_metrics:
            # Unlike in the polling loop, the returned status is not
            # stored here.
            self.oracle.update_trial(
                trial_id=trial.trial_id,
                metrics=epoch_metrics,
                step=epoch)
            epoch += 1

    # submit final epoch metrics
    if training_metrics.partial_epoch_metrics:
        self.oracle.update_trial(
            trial_id=trial.trial_id,
            metrics=training_metrics.partial_epoch_metrics,
            step=epoch)
def test_get_tensorboard_log_watcher_from_path_with_no_path(
        self, mock_issummaryeventsfile):
    """Checks that asking for a log watcher with a `None` path raises.

    The watcher factory is expected to raise `ValueError` when it is
    called with `None` instead of a path.
    """
    missing_path = None
    self.assertRaises(
        ValueError,
        tf_utils.get_tensorboard_log_watcher_from_path,
        missing_path)