def _on_step(self):
    if self.n_calls % self._check_freq == 0:
        sync_envs_normalization(self.training_env, self._eval_env)
        video, rewards = render_trajectory(self._eval_env, self.model)
        self.logger.record("trajectory/video", video)
        self.logger.record("trajectory/return", sum(rewards))
    return True

def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        # self.n_eval_episodes = len(self.eval_env.get_attr("config"))
        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)
            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")

        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        dists = [10 / reward for reward in episode_rewards]
        mean_dist = np.mean(dists)
        mean_reward = 1 / mean_dist
        self.last_mean_reward = mean_reward

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
                print(f"New best mean reward: {mean_reward:.4f} with mean distance: {mean_dist:.4f}")
            self.best_mean_reward = mean_reward
            if self.best_model_save_path is not None and self.best_mean_reward > self.best_reward:
                self.best_reward = self.best_mean_reward
                self.model.save(os.path.join(self.best_model_save_path, "best_model"))
                with open(f"{self.best_model_save_path}/best_reward.npy", 'wb') as f:
                    np.save(f, np.array([self.best_reward]))

            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True

def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)
    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
                            clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)

    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))

def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)
            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")

        # Add prefix
        # Add to current Logger
        self.logger.record(f"eval/{self.prefix}_mean_reward", float(mean_reward))
        # self.logger.record(f"eval/{self.prefix}_mean_ep_length", mean_ep_length)

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, "best_model"))
            self.best_mean_reward = mean_reward
            self.update_best_reward(self.eval_env.envs[0].robot.robot_id, self.best_mean_reward)

            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True

def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
                            clip_obs=100.0, clip_reward=100.0)

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()

    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))

def do_eval(self):
    # Sync training and eval env if there is VecNormalize
    sync_envs_normalization(self.training_env, self.eval_env)
    return evaluate_policy(
        self.model,
        self.eval_env,
        n_eval_episodes=self.n_eval_episodes,
        deterministic=self.deterministic,
        global_step=self.num_timesteps,
        commit_logs=None,
        debug=self.debug,
        file_log_path=self.log_path,
        main_prefix=self.prefix,
        verbose=self.verbose,
    )

def _on_step(self) -> bool:
    if not (self.eval_freq > 0 and self.n_calls % self.eval_freq == 0):
        return True

    # Sync training and eval env if there is VecNormalize
    sync_envs_normalization(self.training_env, self.eval_env)

    if self.model is None or self.model.policy is None:
        raise ValueError("Model/policy is None.")

    env = self.eval_env.envs[0]
    results = []
    for _ in range(self.n_eval_episodes):
        env.reset()
        ep_results = util.eval_episode(
            self.model.policy,
            self.model.policy.mlp_extractor,
            env,
            self.cfg != "none",
        )
        results.append(ep_results)

    episode_rewards = [r["total_reward"] for r in results]
    episode_lengths = [r["steps"] for r in results]
    bn_activations = np.concatenate([r["bn_activations"] for r in results])

    entropy = util.get_entropy(bn_activations)
    self.writer.add_scalar("entropy", entropy, self.num_timesteps)

    if self.log_path is not None:
        self.evaluations_timesteps.append(self.num_timesteps)
        self.evaluations_results.append(sum(episode_rewards) / len(episode_rewards))
        self.evaluations_length.append(np.mean(episode_lengths))
        np.savez(
            self.log_path / "data",
            timesteps=self.evaluations_timesteps,
            results=self.evaluations_results,
            ep_lengths=self.evaluations_length,
        )

    mean_reward, _ = np.mean(episode_rewards), np.std(episode_rewards)
    mean_ep_length, _ = np.mean(episode_lengths), np.std(episode_lengths)

    self.writer.add_scalar("mean_reward", float(mean_reward), self.num_timesteps)
    self.writer.add_scalar("mean_ep_length", mean_ep_length, self.num_timesteps)
    self.writer.add_scalar("rate", self.num_timesteps, self.num_timesteps)

    torch.save(
        self.model.policy.state_dict(),
        self.log_path / f"model-{self.num_timesteps}.pt",
    )

    return True

def record_video(env_name, train_env, model, videoLength=500, prefix='', videoPath='videos/'):
    print('record_video function')
    # Wrap the env in a Vec Video Recorder
    local_eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
    local_eval_env = VecNormalize(local_eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    sync_envs_normalization(train_env, local_eval_env)
    local_eval_env = VecVideoRecorder(local_eval_env, video_folder=videoPath,
                                      record_video_trigger=lambda step: step == 0,
                                      video_length=videoLength,
                                      name_prefix=prefix)

    obs = local_eval_env.reset()
    for _ in range(videoLength):
        action, _ = model.predict(obs)
        obs, _, _, _ = local_eval_env.step(action)

    # Close the video recorder
    local_eval_env.close()

def _on_step(self):
    sync_envs_normalization(self.training_env, self.eval_env)
    episode_rewards, episode_lengths = evaluate_policy(
        self.model,
        self.eval_env,
        n_eval_episodes=self.n_eval_episodes,
        render=False,
        deterministic=self.deterministic,
        return_episode_rewards=True,
    )
    episode_reward_mean, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
    mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
    report(
        episode_reward_mean=episode_reward_mean,
        std_reward=std_reward,
        mean_ep_length=mean_ep_length,
        std_ep_length=std_ep_length,
    )

def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        # Reset success rate buffer
        self._is_success_buffer = []

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            warn=self.warn,
            callback=self._log_success_callback,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")

        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, "best_model"))
            self.best_mean_reward = mean_reward

            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True

def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        if self.model.get_vec_normalize_env() is not None:
            try:
                sync_envs_normalization(self.training_env, self.eval_env)
            except AttributeError:
                raise AssertionError(
                    "Training and eval env are not wrapped the same way, "
                    "see https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html#evalcallback "
                    "and warning above.")

        # Reset success rate buffer
        self._is_success_buffer = []

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            warn=self.warn,
            callback=self._log_success_callback,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")

        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        # Dump log so the evaluation results are printed with the correct timestep
        self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
        self.logger.dump(self.num_timesteps)

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, "best_model"))
            self.best_mean_reward = mean_reward

            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True

def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        # Reset success rate buffer
        self._is_success_buffer = []

        # episodes_rewards, episodes_lengths, episodes_powers, episodes_comfort_violations,
        # episodes_comfort_penalties, episodes_power_penalties
        episodes_data = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            callback=None,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episodes_data['episodes_rewards'])
            self.evaluations_length.append(episodes_data['episodes_lengths'])
            self.evaluations_power_consumption.append(episodes_data['episodes_powers'])
            self.evaluations_comfort_violation.append(episodes_data['episodes_comfort_violations'])
            self.evaluations_comfort_penalty.append(episodes_data['episodes_comfort_penalties'])
            self.evaluations_power_penalty.append(episodes_data['episodes_power_penalties'])

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                ep_powers=self.evaluations_power_consumption,
                ep_comfort_violations=self.evaluations_comfort_violation,
                episodes_comfort_penalties=self.evaluations_comfort_penalty,
                episodes_power_penalties=self.evaluations_power_penalty,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episodes_data['episodes_rewards']), np.std(episodes_data['episodes_rewards'])
        mean_ep_length, std_ep_length = np.mean(episodes_data['episodes_lengths']), np.std(episodes_data['episodes_lengths'])

        self.evaluation_metrics['mean_rewards'] = mean_reward
        self.evaluation_metrics['std_rewards'] = std_reward
        self.evaluation_metrics['mean_ep_length'] = mean_ep_length
        self.evaluation_metrics['mean_power_consumption'] = np.mean(episodes_data['episodes_powers'])
        self.evaluation_metrics['comfort_violation(%)'] = np.mean(episodes_data['episodes_comfort_violations'])
        self.evaluation_metrics['comfort_penalty'] = np.mean(episodes_data['episodes_comfort_penalties'])
        self.evaluation_metrics['power_penalty'] = np.mean(episodes_data['episodes_power_penalties'])

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")

        # Add to current Logger
        for key, metric in self.evaluation_metrics.items():
            self.logger.record('eval/' + key, metric)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, 'model.zip'))
            self.best_mean_reward = mean_reward

            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True

def _on_step(self) -> bool:
    if self.n_calls > 0 and (self.n_calls - 1) % self.eval_freq == 0:
        self.old_params = [param.clone() for param in self.model.policy.parameters()]

    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        # Reset success rate buffer
        self._is_success_buffer = []

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            warn=self.warn,
            callback=self._log_success_callback,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)

            kwargs = {}
            # Save success log if present
            if len(self._is_success_buffer) > 0:
                self.evaluations_successes.append(self._is_success_buffer)
                kwargs = dict(successes=self.evaluations_successes)

            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
                **kwargs,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"Eval num_timesteps={self.num_timesteps}, "
                  f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
            print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")

        # Add to current Logger
        self.logger.record("eval/mean_reward", float(mean_reward))
        self.logger.record("eval/mean_ep_length", mean_ep_length)

        if len(self._is_success_buffer) > 0:
            success_rate = np.mean(self._is_success_buffer)
            if self.verbose > 0:
                print(f"Success rate: {100 * success_rate:.2f}%")
            self.logger.record("eval/success_rate", success_rate)

        # Dump log so the evaluation results are printed with the correct timestep
        self.logger.record("time/total timesteps", self.num_timesteps, exclude="tensorboard")
        self.logger.dump(self.num_timesteps)

        if mean_reward >= self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, "checkpoint"))
            self.best_mean_reward = mean_reward

            os.makedirs(self.log_path, exist_ok=True)
            save_path = os.path.join(self.log_path, "checkpoint")
            self.save_folders.append(save_path)

            model_parameters = self.model.policy.state_dict()
            grads = OrderedDict([(name, param.grad) for name, param in model_parameters.items()])
            torch.save(model_parameters, os.path.join(self.log_path, "parameters.th"))
            torch.save(grads, os.path.join(self.log_path, "grads.th"))

            if self.old_params is not None:
                delta = OrderedDict([
                    (name, param - old_param)
                    for old_param, (name, param) in zip(self.old_params, model_parameters.items())
                ])
                torch.save(delta, os.path.join(self.log_path, "prev_step.th"))

            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True

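A minimal sketch of how a callback like the ones above is typically attached to training, with the training and eval envs wrapped identically so sync_envs_normalization can copy the running statistics between them. It assumes the standard stable-baselines3 EvalCallback API; the env id and hyperparameters are illustrative only.

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# Both envs get the same DummyVecEnv + VecNormalize wrapping, which is what
# sync_envs_normalization(self.training_env, self.eval_env) relies on.
train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]),
                         norm_obs=True, norm_reward=True, clip_obs=10.)
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]),
                        training=False, norm_obs=True, norm_reward=False, clip_obs=10.)

eval_callback = EvalCallback(eval_env, eval_freq=1000, n_eval_episodes=5,
                             deterministic=True, best_model_save_path="./best_model/")

model = PPO("MlpPolicy", train_env, verbose=0)
model.learn(total_timesteps=10_000, callback=eval_callback)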