def save(
    self,
    trial: Trial,
    storage: str = _TuneCheckpoint.PERSISTENT,
    result: Optional[Dict] = None,
) -> _TuneCheckpoint:
    """Saves the trial's state to a checkpoint asynchronously.

    Args:
        trial: The trial to be saved.
        storage: Where to store the checkpoint. Defaults to PERSISTENT.
        result: The state of this trial as a dictionary to be saved.
            If result is None, the trial's last result will be used.

    Returns:
        Checkpoint object, or None if an Exception occurs.
    """
    logger.debug(f"saving trial {trial}")
    result = result or trial.last_result
    with self._change_working_directory(trial):
        if storage == _TuneCheckpoint.MEMORY:
            value = trial.runner.save_to_object.remote()
            checkpoint = _TuneCheckpoint(storage, value, result)
            trial.on_checkpoint(checkpoint)
        else:
            value = trial.runner.save.remote()
            checkpoint = _TuneCheckpoint(storage, value, result)
            trial.saving_to = checkpoint
            self._futures[value] = (ExecutorEventType.SAVING_RESULT, trial)
    return checkpoint
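# --- Illustrative sketch (not Ray Tune API) ---
# A minimal, self-contained model of the future bookkeeping `save` performs
# for PERSISTENT checkpoints: the remote save returns a handle right away,
# and the pending handle is recorded so the event loop can later pair the
# finished save with its trial. `ToyExecutor` and its members are assumed
# names for illustration only.
from typing import Any, Dict, Tuple


class ToyExecutor:
    def __init__(self) -> None:
        # handle -> (event type, trial), mirroring self._futures above.
        self._futures: Dict[int, Tuple[str, Any]] = {}
        self._next_handle = 0

    def save(self, trial: Any) -> int:
        # Stand-in for trial.runner.save.remote(), which is non-blocking.
        handle = self._next_handle
        self._next_handle += 1
        self._futures[handle] = ("SAVING_RESULT", trial)
        return handle

    def on_future_resolved(self, handle: int) -> Tuple[str, Any]:
        # The event loop pops the pending entry and recovers the owner.
        return self._futures.pop(handle)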
def _exploit(
    self,
    trial_executor: "trial_runner.RayTrialExecutor",
    trial: Trial,
    trial_to_clone: Trial,
):
    """Transfers perturbed state from trial_to_clone -> trial.

    If specified, also logs the updated hyperparam state.
    """
    trial_state = self._trial_state[trial]
    new_state = self._trial_state[trial_to_clone]
    logger.info(
        "[exploit] transferring weights from trial "
        "{} (score {}) -> {} (score {})".format(
            trial_to_clone, new_state.last_score, trial, trial_state.last_score
        )
    )

    new_config = self._get_new_config(trial, trial_to_clone)

    # Only log mutated hyperparameters and not entire config.
    old_hparams = {
        k: v
        for k, v in trial_to_clone.config.items()
        if k in self._hyperparam_mutations
    }
    new_hparams = {
        k: v for k, v in new_config.items() if k in self._hyperparam_mutations
    }
    logger.info(
        "[explore] perturbed config from {} -> {}".format(old_hparams, new_hparams)
    )

    if self._log_config:
        self._log_config_on_step(
            trial_state, new_state, trial, trial_to_clone, new_config
        )

    new_tag = _make_experiment_tag(
        trial_state.orig_tag, new_config, self._hyperparam_mutations
    )
    if trial.status == Trial.PAUSED:
        # If trial is paused we update it with a new checkpoint.
        # When the trial is started again, the new checkpoint is used.
        if not self._synch:
            raise TuneError(
                "Trials should be paused here only if in "
                "synchronous mode. If you encounter this error "
                "please raise an issue on Ray Github."
            )
    else:
        trial_executor.stop_trial(trial)
        trial_executor.set_status(trial, Trial.PAUSED)

    trial.set_experiment_tag(new_tag)
    trial.set_config(new_config)
    trial.on_checkpoint(new_state.last_checkpoint)

    self._num_perturbations += 1
    # Transfer over the last perturbation time as well
    trial_state.last_perturbation_time = new_state.last_perturbation_time
    trial_state.last_train_time = new_state.last_train_time
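# --- Illustrative sketch (plain dicts, not Trial objects) ---
# A self-contained model of the state transfer `_exploit` performs: the
# under-performing trial adopts the cloned trial's checkpoint and mutated
# config, and the perturbation/train clocks are copied so both trials stay
# in phase. All data below is made up.
good = {"config": {"lr": 0.02}, "checkpoint": "ckpt_good",
        "last_perturbation_time": 10, "last_train_time": 9}
bad = {"config": {"lr": 0.3}, "checkpoint": "ckpt_bad",
       "last_perturbation_time": 4, "last_train_time": 3}

bad["config"] = dict(good["config"])    # new_config (mutation step omitted)
bad["checkpoint"] = good["checkpoint"]  # trial.on_checkpoint(...)
bad["last_perturbation_time"] = good["last_perturbation_time"]
bad["last_train_time"] = good["last_train_time"]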
def write_checkpoint(trial: Trial, index: int):
    checkpoint_dir = TrainableUtil.make_checkpoint_dir(trial.logdir, index=index)
    result = {"training_iteration": index}
    with open(os.path.join(checkpoint_dir, "cp.json"), "w") as f:
        json.dump(result, f)
    tune_cp = _TuneCheckpoint(_TuneCheckpoint.PERSISTENT, checkpoint_dir, result)
    trial.saving_to = tune_cp
    trial.on_checkpoint(tune_cp)
    return checkpoint_dir
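# --- Illustrative sketch of the on-disk layout write_checkpoint produces ---
# Uses a temp dir in place of a real Trial. The zero-padded
# "checkpoint_000003" naming is an assumption about
# TrainableUtil.make_checkpoint_dir, shown here for illustration only.
import json
import os
import tempfile

logdir = tempfile.mkdtemp()
index = 3
checkpoint_dir = os.path.join(logdir, f"checkpoint_{index:06d}")  # assumed name
os.makedirs(checkpoint_dir, exist_ok=True)
with open(os.path.join(checkpoint_dir, "cp.json"), "w") as f:
    json.dump({"training_iteration": index}, f)
print(checkpoint_dir)  # e.g. /tmp/.../checkpoint_000003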
def on_trial_result(
    self, trial_runner: "trial_runner.TrialRunner", trial: Trial, result: Dict
) -> str:
    if TRAINING_ITERATION not in result:
        # No time reported
        return TrialScheduler.CONTINUE

    if not self._next_policy:
        # No more changes in the config
        return TrialScheduler.CONTINUE

    step = result[TRAINING_ITERATION]
    self._current_step = step

    change_at, new_config = self._next_policy

    if step < change_at:
        # Don't change the policy just yet
        return TrialScheduler.CONTINUE

    logger.info(
        "Population Based Training replay is now at step {}. "
        "Configuration will be changed to {}.".format(step, new_config)
    )

    checkpoint = trial_runner.trial_executor.save(
        trial, _TuneCheckpoint.MEMORY, result=result
    )

    new_tag = make_experiment_tag(self.experiment_tag, new_config, new_config)

    trial_executor = trial_runner.trial_executor
    trial_executor.stop_trial(trial)
    trial_executor.set_status(trial, Trial.PAUSED)
    trial.set_experiment_tag(new_tag)
    trial.set_config(new_config)
    trial.on_checkpoint(checkpoint)

    self.current_config = new_config
    self._num_perturbations += 1
    self._next_policy = next(self._policy_iter, None)

    return TrialScheduler.NOOP
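# --- Illustrative sketch of the replay schedule consumed above ---
# The policy is an iterator of (change_at, new_config) pairs, advanced with
# next(..., None) each time the reported step crosses a change point. The
# schedule data here is made up.
policy_iter = iter([(2, {"lr": 0.01}), (5, {"lr": 0.001})])
next_policy = next(policy_iter, None)

for step in range(1, 8):
    if next_policy is None:
        break  # no more changes in the config
    change_at, new_config = next_policy
    if step < change_at:
        continue  # don't change the policy just yet
    print(f"step {step}: changing configuration to {new_config}")
    next_policy = next(policy_iter, None)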
def _exploit(self, trial_executor: "trial_executor.TrialExecutor",
             trial: Trial, trial_to_clone: Trial):
    """Transfers perturbed state from trial_to_clone -> trial.

    If specified, also logs the updated hyperparam state.
    """
    trial_state = self._trial_state[trial]
    new_state = self._trial_state[trial_to_clone]
    logger.info("[exploit] transferring weights from trial "
                "{} (score {}) -> {} (score {})".format(
                    trial_to_clone, new_state.last_score, trial,
                    trial_state.last_score))

    new_config = self._get_new_config(trial, trial_to_clone)

    # Only log mutated hyperparameters and not entire config.
    old_hparams = {
        k: v
        for k, v in trial_to_clone.config.items()
        if k in self._hyperparam_mutations
    }
    new_hparams = {
        k: v
        for k, v in new_config.items() if k in self._hyperparam_mutations
    }
    logger.info("[explore] perturbed config from {} -> {}".format(
        old_hparams, new_hparams))

    if self._log_config:
        self._log_config_on_step(trial_state, new_state, trial,
                                 trial_to_clone, new_config)

    new_tag = make_experiment_tag(trial_state.orig_tag, new_config,
                                  self._hyperparam_mutations)
    if trial.status == Trial.PAUSED:
        # If trial is paused we update it with a new checkpoint.
        # When the trial is started again, the new checkpoint is used.
        if not self._synch:
            raise TuneError("Trials should be paused here only if in "
                            "synchronous mode. If you encounter this error"
                            " please raise an issue on Ray Github.")
        trial.set_experiment_tag(new_tag)
        trial.set_config(new_config)
        trial.on_checkpoint(new_state.last_checkpoint)
    else:
        # If trial is running, we first try to reset it.
        # If that is unsuccessful, then we have to stop it and start it
        # again with a new checkpoint.
        reset_successful = trial_executor.reset_trial(
            trial, new_config, new_tag)
        # TODO(ujvl): Refactor Scheduler abstraction to abstract
        # mechanism for trial restart away. We block on restore
        # and suppress train on start as a stop-gap fix to
        # https://github.com/ray-project/ray/issues/7258.
        if reset_successful:
            trial_executor.restore(
                trial, new_state.last_checkpoint, block=True)
        else:
            trial_executor.stop_trial(trial)
            trial.set_experiment_tag(new_tag)
            trial.set_config(new_config)
            trial_executor.start_trial(
                trial, new_state.last_checkpoint, train=False)

    self._num_perturbations += 1
    # Transfer over the last perturbation time as well
    trial_state.last_perturbation_time = new_state.last_perturbation_time
    trial_state.last_train_time = new_state.last_train_time
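# --- Illustrative sketch of the reset-or-restart fallback used above ---
# Prefer an in-place reset of the running actor; if the trainable does not
# support reset, fall back to a full stop/start with the cloned checkpoint.
# `executor`, `trial`, and `checkpoint` are assumed stand-ins, not concrete
# Ray Tune objects.
def apply_clone(executor, trial, new_config, new_tag, checkpoint):
    if executor.reset_trial(trial, new_config, new_tag):
        # In-place reset succeeded; only the cloned weights need restoring.
        executor.restore(trial, checkpoint, block=True)
    else:
        # Recreate the actor from scratch with the new config.
        executor.stop_trial(trial)
        trial.set_experiment_tag(new_tag)
        trial.set_config(new_config)
        executor.start_trial(trial, checkpoint, train=False)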
def _exploit(self, trial_executor: "trial_executor.TrialExecutor",
             trial: Trial, trial_to_clone: Trial):
    """Transfers perturbed state from trial_to_clone -> trial.

    If specified, also logs the updated hyperparam state.
    """
    trial_state = self._trial_state[trial]
    new_state = self._trial_state[trial_to_clone]
    if not new_state.last_checkpoint:
        logger.info("[pbt]: no checkpoint for trial."
                    " Skip exploit for Trial {}".format(trial))
        return
    new_config = explore(trial_to_clone.config, self._hyperparam_mutations,
                         self._resample_probability,
                         self._custom_explore_fn)

    # TODO: Apply new_config here and re-run with the changed parameters.
    old_bad_config = trial_to_clone.config
    old_good_config = trial.config

    logger.debug("-------- perturbation --------")
    logger.debug(f"trial_to_clone.trial_id = {trial_to_clone.trial_id}")
    logger.debug(f"trial.trial_id = {trial.trial_id}")

    if self._ucb is not None:
        if self._ucb.is_need_to_reflect_reward():
            # Some workers lag behind, so trials whose last_result does not
            # contain the metric yet are excluded; the mean of the remaining
            # scores is used as the reward instead.
            score = np.average([
                t.last_result[self._metric] for t in self._trial_state
                if self._metric in t.last_result.keys()
            ])
            self._ucb.reflect_reward(score)
        selected = self._ucb.pull()
        masks = self._ucb.bitfield(selected)
        logger.debug(f"explore: ucb_state n: {self._ucb.n}, "
                     f"selected: {self._ucb.selected}, masks: {masks}")
        # Revert hyperparameters the bandit did not select (mask bit 0)
        # back to this trial's pre-perturbation values.
        for i in range(self._ucb.n_params):
            if masks[i] == 0:
                key = list(new_config.keys())[i]
                new_config[key] = old_good_config[key]
        # TODO: Add logic to cancel the perturbation.
        logger.debug(new_config)
        logger.info("[explore] perturbed ucb config from {} -> {}".format(
            old_good_config, new_config))

    logger.info("[exploit] transferring weights from trial "
                "{} (score {}) -> {} (score {})".format(
                    trial_to_clone, new_state.last_score, trial,
                    trial_state.last_score))

    # Only log mutated hyperparameters and not entire config.
    old_hparams = {
        k: v
        for k, v in trial_to_clone.config.items()
        if k in self._hyperparam_mutations
    }
    new_hparams = {
        k: v
        for k, v in new_config.items() if k in self._hyperparam_mutations
    }
    logger.info("[explore] perturbed config from {} -> {}".format(
        old_hparams, new_hparams))

    if self._log_config:
        self._log_config_on_step(trial_state, new_state, trial,
                                 trial_to_clone, new_config)

    new_tag = make_experiment_tag(trial_state.orig_tag, new_config,
                                  self._hyperparam_mutations)
    if trial.status == Trial.PAUSED:
        # If trial is paused we update it with a new checkpoint.
        # When the trial is started again, the new checkpoint is used.
        if not self._synch:
            raise TuneError("Trials should be paused here only if in "
                            "synchronous mode. If you encounter this error"
                            " please raise an issue on Ray Github.")
        trial.config = new_config
        trial.experiment_tag = new_tag
        trial.on_checkpoint(new_state.last_checkpoint)
    else:
        # If trial is running, we first try to reset it.
        # If that is unsuccessful, then we have to stop it and start it
        # again with a new checkpoint.
        reset_successful = trial_executor.reset_trial(
            trial, new_config, new_tag)
        # TODO(ujvl): Refactor Scheduler abstraction to abstract
        # mechanism for trial restart away. We block on restore
        # and suppress train on start as a stop-gap fix to
        # https://github.com/ray-project/ray/issues/7258.
        if reset_successful:
            trial_executor.restore(
                trial, new_state.last_checkpoint, block=True)
        else:
            trial_executor.stop_trial(trial, stop_logger=False)
            trial.config = new_config
            trial.experiment_tag = new_tag
            trial_executor.start_trial(
                trial, new_state.last_checkpoint, train=False)

    self._num_perturbations += 1
    # Transfer over the last perturbation time as well
    trial_state.last_perturbation_time = new_state.last_perturbation_time
    trial_state.last_train_time = new_state.last_train_time
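# --- Illustrative sketch of the UCB masking step above ---
# A bandit selects which hyperparameters keep their perturbed values (mask
# bit 1) and which revert to the pre-perturbation values (mask bit 0). The
# configs and mask below are made up; the real mask comes from
# self._ucb.bitfield(self._ucb.pull()).
old_config = {"lr": 0.01, "momentum": 0.9, "batch_size": 64}
perturbed = {"lr": 0.02, "momentum": 0.8, "batch_size": 128}
masks = [1, 0, 1]

new_config = dict(perturbed)
for i, key in enumerate(new_config):
    if masks[i] == 0:
        new_config[key] = old_config[key]
print(new_config)  # {'lr': 0.02, 'momentum': 0.9, 'batch_size': 128}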