def save(self, trial, storage=Checkpoint.PERSISTENT, result=None):
    """Saves the trial's state to a checkpoint asynchronously.

    Args:
        trial (Trial): The trial to be saved.
        storage (str): Where to store the checkpoint. Defaults to
            PERSISTENT.
        result (dict): The state of this trial as a dictionary to be
            saved. If result is None, the trial's last result will be
            used.

    Returns:
        The newly created Checkpoint object.
    """
    state = result if result else trial.last_result
    with self._change_working_directory(trial):
        if storage != Checkpoint.MEMORY:
            # Persistent save runs asynchronously; remember the pending
            # future so its completion can be processed later.
            future = trial.runner.save.remote()
            checkpoint = Checkpoint(storage, future, state)
            trial.saving_to = checkpoint
            self._running[future] = trial
        else:
            future = trial.runner.save_to_object.remote()
            checkpoint = Checkpoint(storage, future, state)
            trial.on_checkpoint(checkpoint)
    return checkpoint
def save(
    self, trial, storage=Checkpoint.PERSISTENT, result: Optional[Dict] = None
) -> Checkpoint:
    """Saves the trial's state to a checkpoint asynchronously.

    Args:
        trial (Trial): The trial to be saved.
        storage (str): Where to store the checkpoint. Defaults to
            PERSISTENT.
        result (dict): The state of this trial as a dictionary to be
            saved. If result is None, the trial's last result will be
            used.

    Returns:
        The newly created Checkpoint object. The save itself completes
        asynchronously (there is no ``None`` return path; the previous
        docstring claim was inaccurate).
    """
    # Use lazy %-style args so the message is only rendered when the
    # INFO level is enabled (was an eagerly-evaluated f-string).
    logger.info("saving trial %s", trial)
    result = result or trial.last_result
    with self._change_working_directory(trial):
        if storage == Checkpoint.MEMORY:
            value = trial.runner.save_to_object.remote()
            checkpoint = Checkpoint(storage, value, result)
            trial.on_checkpoint(checkpoint)
        else:
            value = trial.runner.save.remote()
            checkpoint = Checkpoint(storage, value, result)
            trial.saving_to = checkpoint
            # Register the in-flight future so the event loop can emit a
            # SAVING_RESULT event for this trial when it resolves.
            self._futures[value] = (ExecutorEventType.SAVING_RESULT, trial)
    return checkpoint
def save(self, trial, storage=Checkpoint.DISK, result=None):
    """Saves the trial's state to a checkpoint."""
    state = result if result else trial.last_result
    if storage != Checkpoint.MEMORY:
        # Disk checkpointing blocks until the remote save completes.
        with warn_if_slow("save_checkpoint_to_disk"):
            value = ray.get(trial.runner.save.remote())
        checkpoint = Checkpoint(storage, value, state)
    else:
        checkpoint = Checkpoint(
            storage, trial.runner.save_to_object.remote(), state)
    with warn_if_slow("on_checkpoint", DEFAULT_GET_TIMEOUT) as profile:
        try:
            trial.on_checkpoint(checkpoint)
        except Exception:
            logger.exception("Trial %s: Error handling checkpoint %s",
                             trial, checkpoint.value)
            return None
    if profile.too_slow and trial.sync_on_checkpoint:
        logger.warning(
            "Consider turning off forced head-worker trial checkpoint "
            "syncs by setting sync_on_checkpoint=False. Note that this "
            "might result in faulty trial restoration for some worker "
            "failure modes.")
    return checkpoint.value
def _exploit_trial(self, trial_executor: RayTrialExecutor, trial: Trial,
                   trial_to_clone: Trial):
    """Transfers perturbed state from trial_to_clone -> trial.

    If specified, also logs the updated hyperparam state.
    """
    own_state = self._trials_states_dict[trial]
    clone_state = self._trials_states_dict[trial_to_clone]
    if not clone_state.last_checkpoint:
        logger.info(
            "[pbt]: no checkpoint for trial. Skip exploit for Trial {}".
            format(trial))
        return
    mutated_config = explore(trial_to_clone.config,
                             self._hyperparam_mutations,
                             self._hyperparam_mutate_probability,
                             self._explore_func)
    logger.info(
        "[exploit] transferring weights from trial {} (score {}) -> {} (score {})"
        .format(trial_to_clone, clone_state.last_score, trial,
                own_state.last_score))
    if self._log_config:
        self._log_config_on_step(own_state, clone_state, trial,
                                 trial_to_clone, mutated_config)
    tag = make_experiment_tag(own_state.orig_tag, mutated_config,
                              self._hyperparam_mutations)
    if trial_executor.reset_trial(trial, mutated_config, tag):
        # In-place reset worked; only the clone's weights need loading.
        trial_executor.restore(
            trial, Checkpoint.from_object(clone_state.last_checkpoint))
    else:
        # Reset unsupported: relaunch the trial with the mutated config,
        # restoring from the clone's checkpoint on start.
        trial_executor.stop_trial(trial, stop_logger=False)
        trial.config = mutated_config
        trial.experiment_tag = tag
        trial_executor.start_trial(
            trial, Checkpoint.from_object(clone_state.last_checkpoint))
    # TODO: move to Exploiter
    clone_state.num_steps = 0
    own_state.num_steps = 0
    clone_state.num_explorations = 0
    own_state.num_explorations += 1
    self._num_explorations += 1
    # Transfer over the last perturbation time as well
    own_state.last_perturbation_time = clone_state.last_perturbation_time
def _exploit(self, trial_executor, trial, trial_to_clone):
    """Transfers perturbed state from trial_to_clone -> trial."""
    own_state = self._trial_state[trial]
    clone_state = self._trial_state[trial_to_clone]
    if not clone_state.last_checkpoint:
        print("[pbt] warn: no checkpoint for trial, skip exploit", trial)
        return
    mutated_config = explore(trial_to_clone.config,
                             self._hyperparam_mutations,
                             self._resample_probability,
                             self._custom_explore_fn)
    print("[exploit] transferring weights from trial "
          "{} (score {}) -> {} (score {})".format(trial_to_clone,
                                                  clone_state.last_score,
                                                  trial,
                                                  own_state.last_score))
    # TODO(ekl) restarting the trial is expensive. We should implement a
    # lighter way reset() method that can alter the trial config.
    trial_executor.stop_trial(trial, stop_logger=False)
    trial.config = mutated_config
    trial.experiment_tag = make_experiment_tag(own_state.orig_tag,
                                               mutated_config,
                                               self._hyperparam_mutations)
    trial_executor.start_trial(
        trial, Checkpoint.from_object(clone_state.last_checkpoint))
    self._num_perturbations += 1
    # Transfer over the last perturbation time as well
    own_state.last_perturbation_time = clone_state.last_perturbation_time
def _exploit(self, trial_executor, trial, trial_to_clone):
    """Transfers perturbed state from trial_to_clone -> trial.

    Args:
        trial_executor: Executor used to reset/stop/start/restore trials.
        trial (Trial): The underperforming trial to overwrite.
        trial_to_clone (Trial): The better-performing trial to copy from.
    """
    trial_state = self._trial_state[trial]
    new_state = self._trial_state[trial_to_clone]
    if not new_state.last_checkpoint:
        logger.warning("[pbt]: no checkpoint for trial."
                       " Skip exploit for Trial {}".format(trial))
        return
    new_config = explore(trial_to_clone.config, self._hyperparam_mutations,
                         self._resample_probability,
                         self._custom_explore_fn)
    logger.warning("[exploit] transferring weights from trial "
                   "{} (score {}) -> {} (score {})".format(
                       trial_to_clone, new_state.last_score, trial,
                       trial_state.last_score))
    # TODO(ekl) restarting the trial is expensive. We should implement a
    # lighter way reset() method that can alter the trial config.
    new_tag = make_experiment_tag(trial_state.orig_tag, new_config,
                                  self._hyperparam_mutations)
    reset_successful = trial_executor.reset_trial(trial, new_config,
                                                  new_tag)
    if reset_successful:
        # BUG FIX: previously a successful in-place reset skipped the
        # checkpoint restore entirely, so the trial kept its OLD weights
        # and only the config was cloned — the exploit was a no-op.
        # Restore the clone's checkpoint on this path too (matches the
        # stop/start fallback below).
        trial_executor.restore(
            trial, Checkpoint.from_object(new_state.last_checkpoint))
    else:
        trial_executor.stop_trial(trial, stop_logger=False)
        trial.config = new_config
        trial.experiment_tag = new_tag
        trial_executor.start_trial(
            trial, Checkpoint.from_object(new_state.last_checkpoint))
    self._num_perturbations += 1
    # Transfer over the last perturbation time as well
    trial_state.last_perturbation_time = new_state.last_perturbation_time
def save(self, trial, storage=Checkpoint.PERSISTENT, result=None):
    """Saves the trial's state to a checkpoint.

    Args:
        trial (Trial): The state of this trial to be saved.
        storage (str): Where to store the checkpoint. Defaults to
            PERSISTENT.
        result (dict): The state of this trial as a dictionary to be
            saved. If result is None, the trial's last result will be
            used.

    Returns:
        Checkpoint future, or None if an Exception occurs.
    """
    state = result if result else trial.last_result
    with self._change_working_directory(trial):
        if storage == Checkpoint.MEMORY:
            value = trial.runner.save_to_object.remote()
        else:
            with warn_if_slow("save_checkpoint_to_storage"):
                # TODO(ujvl): Make this asynchronous.
                value = ray.get(trial.runner.save.remote())
        checkpoint = Checkpoint(storage, value, state)
    with warn_if_slow("on_checkpoint", DEFAULT_GET_TIMEOUT) as profile:
        try:
            trial.on_checkpoint(checkpoint)
        except Exception:
            logger.exception("Trial %s: Error handling checkpoint %s",
                             trial, checkpoint.value)
            return None
    if profile.too_slow and trial.sync_on_checkpoint:
        logger.warning(
            "Consider turning off forced head-worker trial checkpoint "
            "syncs by setting sync_on_checkpoint=False. Note that this "
            "might result in faulty trial restoration for some worker "
            "failure modes.")
    return checkpoint.value
def checkpoint(self):
    """Returns a placeholder in-memory checkpoint carrying no state."""
    dummy = Checkpoint(Checkpoint.MEMORY, "None", {})
    return dummy
def save(self, trial, type=Checkpoint.PERSISTENT, result=None):
    """Mock save: reports a persistent checkpoint for the trial.

    NOTE(review): the ``type`` argument is accepted only for interface
    compatibility (it also shadows the builtin); the returned checkpoint
    is always PERSISTENT regardless of its value.
    """
    fake_checkpoint = Checkpoint(Checkpoint.PERSISTENT,
                                 trial.trainable_name, result)
    return fake_checkpoint
def _exploit(self, trial_executor, trial, trial_to_clone):
    """Transfers perturbed state from trial_to_clone -> trial.

    If specified, also logs the updated hyperparam state.
    """
    trial_state = self._trial_state[trial]
    new_state = self._trial_state[trial_to_clone]
    # Exploiting requires a checkpoint of the better trial to copy from.
    if not new_state.last_checkpoint:
        logger.info("[pbt]: no checkpoint for trial."
                    " Skip exploit for Trial {}".format(trial))
        return
    # if we are at a new timestep, we dont want to penalise for trials still going
    # NOTE(review): self.data appears to be a pandas-like frame with a 'T'
    # (timestep) column — confirm against where self.data is built.
    if self.data['T'].max() > self.latest:
        self.current = None
    print("\n\n\n\n Copying: \n{} \n with:{} \n\n".format(
        str(trial), str(trial_to_clone)))
    # This explore() variant also returns GP diagnostics (lengthscale,
    # min/mean distance) and an updated data frame alongside the config.
    new_config, lengthscale, mindist, meandist, data = explore(
        self.data, self.bounds, self.current, trial_to_clone, trial,
        trial_to_clone.config, self._hyperparam_mutations,
        self._resample_probability)
    # important to replace the old values, since we are copying across
    self.data = data.copy()
    # if the current guy youre selecting is at a point youve already done,
    # then append the data to the "current" which is the points in the current batch
    # Build a (1, num_hyperparams) row of the newly selected config values.
    new = []
    for key in self._hyperparam_mutations.keys():
        new.append(new_config[key])
    new = np.array(new)
    new = new.reshape(1, new.size)
    if self.data['T'].max() > self.latest:
        # New timestep: start a fresh batch of in-flight points.
        self.latest = self.data['T'].max()
        self.current = new.copy()
    else:
        # Same timestep: extend the current in-flight batch.
        self.current = np.concatenate((self.current, new), axis=0)
    print("\n\n\n\n\n Currently Evaluating \n\n\n\n\n")
    print(self.current)
    print("\n\n\n\n\n")
    # log the lengthscale
    self.meta['timesteps'].append(self.data['T'].values[-1])
    self.meta['lengthscales'].append(lengthscale)
    self.meta['closest'].append(mindist)
    self.meta['meandist'].append(meandist)
    # Side effect: dumps the diagnostics to a CSV in the working directory
    # on every exploit call.
    meta = pd.DataFrame({
        'timesteps': self.meta['timesteps'],
        'lengthscales': self.meta['lengthscales'],
        'closest': self.meta['closest'],
        'meandist': self.meta['meandist']
    })
    meta.to_csv('meta_data.csv')
    logger.info("[exploit] transferring weights from trial "
                "{} (score {}) -> {} (score {})".format(
                    trial_to_clone, new_state.last_score, trial,
                    trial_state.last_score))
    if self._log_config:
        self._log_config_on_step(trial_state, new_state, trial,
                                 trial_to_clone, new_config)
    new_tag = make_experiment_tag(trial_state.orig_tag, new_config,
                                  self._hyperparam_mutations)
    reset_successful = trial_executor.reset_trial(trial, new_config,
                                                  new_tag)
    if reset_successful:
        # In-place reset succeeded: load the clone's weights directly.
        trial_executor.restore(
            trial, Checkpoint.from_object(new_state.last_checkpoint))
    else:
        # Fallback: relaunch the trial with the mutated config and restore
        # from the clone's checkpoint.
        trial_executor.stop_trial(trial, stop_logger=False)
        trial.config = new_config
        trial.experiment_tag = new_tag
        trial_executor.start_trial(
            trial, Checkpoint.from_object(new_state.last_checkpoint))
    self._num_perturbations += 1
    # Transfer over the last perturbation time as well
    trial_state.last_perturbation_time = new_state.last_perturbation_time