    def test_get_tensorboard_log_watcher_from_path_with_file_path(
            self, mock_directorywatcher, mock_eventfileloader,
            mock_issummaryeventsfile):

        mock_issummaryeventsfile.return_value = True
        tf_utils.get_tensorboard_log_watcher_from_path(self._file_path)
        mock_eventfileloader.assert_called_with(self._file_path)
        mock_directorywatcher.assert_not_called()
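
The assertions above (an EventFileLoader for a summary-events file, a DirectoryWatcher otherwise) suggest a helper along the following lines. This is a minimal sketch inferred from the tests, not the actual tf_utils source; the tensorboard.backend.event_processing imports are an assumption.

from tensorboard.backend.event_processing import directory_watcher
from tensorboard.backend.event_processing import event_file_loader
from tensorboard.backend.event_processing import io_wrapper


def get_tensorboard_log_watcher_from_path(path):
    """Returns a reader for a single event file or a log directory (sketch)."""
    if not path:
        raise ValueError("path must be a valid string.")
    if io_wrapper.IsSummaryEventsFile(path):
        # A single summary events file: load it directly.
        return event_file_loader.EventFileLoader(path)
    # A directory: watch it for summary events files as they appear.
    return directory_watcher.DirectoryWatcher(
        path,
        event_file_loader.EventFileLoader,
        io_wrapper.IsSummaryEventsFile)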
Example #2
    def test_get_remote_training_metrics(self, mock_super_tuner):
        remote_tuner = self._remote_tuner(None,
                                          None,
                                          self._study_config,
                                          max_trials=10)

        remote_tuner.directory = self.get_temp_dir()
        log_dir = os.path.join(remote_tuner.directory,
                               str(self._test_trial.trial_id), "logs")

        with tf.summary.create_file_writer(log_dir).as_default():
            tf.summary.scalar(name="epoch_loss", data=0.1, step=0)
            tf.summary.scalar(name="epoch_accuracy", data=0.2, step=0)
            tf.summary.scalar(name="epoch_loss", data=0.3, step=1)
            tf.summary.scalar(name="epoch_accuracy", data=0.4, step=1)
            tf.summary.scalar(name="epoch_loss", data=0.5, step=2)
            tf.summary.scalar(name="epoch_accuracy", data=0.6, step=2)

        log_reader = tf_utils.get_tensorboard_log_watcher_from_path(log_dir)
        results = remote_tuner._get_remote_training_metrics(log_reader, {})

        self.assertLen(results.completed_epoch_metrics, 2)
        self.assertIn("accuracy", results.completed_epoch_metrics[0])
        self.assertIn("loss", results.completed_epoch_metrics[0])
        self.assertEqual(results.completed_epoch_metrics[0].get("loss"),
                         tf.constant(0.1))
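
For reference, the writer above logs three epochs of scalars but the test only expects two completed epochs. Assuming the flushing logic shown in the parsing loop of the next example (an epoch is only completed once one of its tags repeats), the result is roughly:

# Rough shape of `results` under that assumption; values are decoded from the
# summary tensors, which is why the test compares against tf.constant(0.1).
#   results.completed_epoch_metrics == [
#       {"loss": 0.1, "accuracy": 0.2},   # step 0
#       {"loss": 0.3, "accuracy": 0.4},   # step 1
#   ]
#   results.partial_epoch_metrics == {"loss": 0.5, "accuracy": 0.6}  # step 2
#   (the last epoch stays partial until the job is known to have finished)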
Example #3
    def _get_remote_training_metrics(
        self, trial_id: int) -> List[Mapping[Text, Union[int, float]]]:
        """Retrieves the per-epoch metrics logged for a trial to TensorBoard."""

        log_path = self._get_tensorboard_log_dir(trial_id)
        tf.get_logger().info(
            "Retrieving training logs for trial {} from {}".format(
                trial_id, log_path))

        log_reader = tf_utils.get_tensorboard_log_watcher_from_path(log_path)
        results = []
        epoch_metrics = {}
        for event in log_reader.Load():
            for value in event.summary.value:
                # Note: tf.keras.callbacks.TensorBoard() with update_freq="epoch"
                # logs epoch-level metrics with an "epoch_" prefix. This is a
                # Keras convention, not a TensorBoard requirement.
                if value.tag.startswith("epoch_"):
                    metric = value.tag.replace("epoch_", "")
                    # If we have already seen this metric, this is a new epoch
                    if metric in epoch_metrics:
                        results.append(epoch_metrics)
                        epoch_metrics = {}
                    # Note: this method captures all metrics, even those that
                    # are not part of the oracle objectives; we rely on the
                    # oracle to ignore the unrelated objectives.
                    # Decode the scalar from the summary value currently being
                    # processed, not always the first value in the event.
                    epoch_metrics[metric] = tf.make_ndarray(value.tensor)
        results.append(epoch_metrics)
        return results
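
The "epoch_" prefix that the loop strips comes from the Keras TensorBoard callback. A minimal, illustrative training-side setup that produces tags such as "epoch_loss" and "epoch_accuracy" (the model and data below are placeholders, not part of the original code):

import tensorflow as tf

# tf.keras.callbacks.TensorBoard with update_freq="epoch" writes per-epoch
# scalars (e.g. "epoch_loss", "epoch_accuracy") under <log_dir>/train, which
# is what the parsing loop above strips back to "loss" / "accuracy".
model = tf.keras.Sequential([
    tf.keras.Input(shape=(4,)),
    tf.keras.layers.Dense(1)])
model.compile(optimizer="adam", loss="mse", metrics=["accuracy"])

tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir="/tmp/logs", update_freq="epoch")

model.fit(
    tf.random.normal([32, 4]),
    tf.random.normal([32, 1]),
    epochs=2,
    callbacks=[tensorboard_cb])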
Example #4
    def run_trial(self, trial, *fit_args, **fit_kwargs):
        """Evaluates a set of hyperparameter values.

        This method is called during `search` to evaluate a set of
        hyperparameters using AI Platform training.
        Arguments:
            trial: A `Trial` instance that contains the information
              needed to run this trial. `Hyperparameters` can be accessed
              via `trial.hyperparameters`.
            *fit_args: Positional arguments passed by `search`.
            **fit_kwargs: Keyword arguments passed by `search`.
        Raises:
            RuntimeError: If the AIP training job fails.
        """

        # Running the training remotely.
        copied_fit_kwargs = copy.copy(fit_kwargs)

        # Handle any callbacks passed to `fit`.
        callbacks = fit_kwargs.pop("callbacks", [])
        callbacks = self._deepcopy_callbacks(callbacks)

        # Note: run_trial does not use `TunerCallback` calls, since
        # training is performed remotely on AI Platform Training.

        # Handle TensorBoard/hyperparameter logging here. The TensorBoard
        # logs are used for passing metrics back from remote execution.
        self._add_logging(callbacks, trial)

        # Create a model checkpoint callback with a saved-model file path
        # specific to this trial, so that different trials do not overwrite
        # each other's checkpoints.
        self._add_model_checkpoint_callback(
            callbacks, trial.trial_id)

        copied_fit_kwargs["callbacks"] = callbacks
        model = self.hypermodel.build(trial.hyperparameters)

        remote_dir = os.path.join(self.directory, str(trial.trial_id))

        # TODO(b/170687807) Switch from using "{}".format() to f-string
        job_id = "{}_{}".format(self._study_id, trial.trial_id)

        # Create job spec from worker count and config
        job_spec = self._get_job_spec_from_config(job_id)

        tf.get_logger().info("Calling cloud_fit with %s", {
            "model": model,
            "remote_dir": remote_dir,
            "region": self._region,
            "project_id": self._project_id,
            "image_uri": self._container_uri,
            "job_id": job_id,
            "*fit_args": fit_args,
            "job_spec": job_spec,
            "**copied_fit_kwargs": copied_fit_kwargs})

        cloud_fit_client.cloud_fit(
            model=model,
            remote_dir=remote_dir,
            region=self._region,
            project_id=self._project_id,
            image_uri=self._container_uri,
            job_id=job_id,
            job_spec=job_spec,
            *fit_args,
            **copied_fit_kwargs)

        # Create an instance of the TensorBoard DirectoryWatcher to retrieve
        # the logs for this trial run.
        log_path = os.path.join(
            self._get_tensorboard_log_dir(trial.trial_id), "train")

        # The TensorBoard log watcher expects the path to exist.
        tf.io.gfile.makedirs(log_path)

        # TODO(b/170687807) Switch from using "{}".format() to f-string
        tf.get_logger().info(
            "Retrieving training logs for trial {} from {}".format(
                trial.trial_id, log_path))
        log_reader = tf_utils.get_tensorboard_log_watcher_from_path(log_path)

        training_metrics = _TrainingMetrics([], {})
        epoch = 0

        while google_api_client.is_api_training_job_running(
            job_id, self._project_id):

            time.sleep(_POLLING_INTERVAL_IN_SECONDS)

            # Retrieve available metrics if any
            training_metrics = self._get_remote_training_metrics(
                log_reader, training_metrics.partial_epoch_metrics)

            for epoch_metrics in training_metrics.completed_epoch_metrics:
                # TODO(b/169197272) Validate metrics contain oracle objective
                if epoch_metrics:
                    trial.status = self.oracle.update_trial(
                        trial_id=trial.trial_id,
                        metrics=epoch_metrics,
                        step=epoch)
                    epoch += 1

            if trial.status == "STOPPED":
                google_api_client.stop_aip_training_job(
                    job_id, self._project_id)
                break

        # Ensure the training job has completed successfully.
        if not google_api_client.wait_for_api_training_job_completion(
            job_id, self._project_id):
            raise RuntimeError(
                "AIP Training job failed, see logs for details at "
                "https://console.cloud.google.com/ai-platform/jobs/"
                "{}/charts/cpu?project={}"
                .format(job_id, self._project_id))

        # Retrieve and report any remaining metrics
        training_metrics = self._get_remote_training_metrics(
            log_reader, training_metrics.partial_epoch_metrics)

        for epoch_metrics in training_metrics.completed_epoch_metrics:
            # TODO(b/169197272) Validate metrics contain oracle objective
            # TODO(b/170907612) Support submit partial results to Oracle
            if epoch_metrics:
                self.oracle.update_trial(
                    trial_id=trial.trial_id,
                    metrics=epoch_metrics,
                    step=epoch)
                epoch += 1

        # Submit the final (partial) epoch metrics.
        if training_metrics.partial_epoch_metrics:
            self.oracle.update_trial(
                trial_id=trial.trial_id,
                metrics=training_metrics.partial_epoch_metrics,
                step=epoch)
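
run_trial threads a _TrainingMetrics container through the polling loop. Its definition is not shown in this example; judging from the construction _TrainingMetrics([], {}) and the attribute accesses above, it is presumably something like this sketch (not the module's actual definition):

import collections

# Sketch only; field names are taken from the attribute accesses in run_trial.
#   completed_epoch_metrics: list of dicts, one per fully parsed epoch.
#   partial_epoch_metrics: dict of metrics for the epoch still being written.
_TrainingMetrics = collections.namedtuple(
    "_TrainingMetrics",
    ["completed_epoch_metrics", "partial_epoch_metrics"])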
Example #5
    def test_get_tensorboard_log_watcher_from_path_with_no_path(
        self, mock_issummaryeventsfile):

        with self.assertRaises(ValueError):
            tf_utils.get_tensorboard_log_watcher_from_path(None)
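
The mock_* parameters in these tests are supplied by patch decorators on the test methods. A sketch of the likely wiring follows; the patch targets and the tf_utils import path are assumptions and must point at the names the module under test actually references.

import tempfile
import unittest
from unittest import mock

from tensorboard.backend.event_processing import directory_watcher
from tensorboard.backend.event_processing import event_file_loader
from tensorboard.backend.event_processing import io_wrapper

# Assumed import path for the helper exercised by the tests above.
from tensorflow_cloud.utils import tf_utils


class TensorboardLogWatcherTest(unittest.TestCase):

    # Decorators apply bottom-up, so the bottom-most patch supplies the first
    # mock argument (mock_directorywatcher), matching the signatures above.
    @mock.patch.object(io_wrapper, "IsSummaryEventsFile", autospec=True)
    @mock.patch.object(event_file_loader, "EventFileLoader", autospec=True)
    @mock.patch.object(directory_watcher, "DirectoryWatcher", autospec=True)
    def test_with_directory_path(
            self, mock_directorywatcher, mock_eventfileloader,
            mock_issummaryeventsfile):
        mock_issummaryeventsfile.return_value = False
        tf_utils.get_tensorboard_log_watcher_from_path(tempfile.mkdtemp())
        mock_directorywatcher.assert_called_once()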