def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        # sync_envs_normalization(self.training_env, self.eval_env)
        new_seed = 0  # seeding.create_seed()

        # Evaluate the learned policy on its dedicated eval env.
        episode_rewards, episode_lengths = evaluate_policy(
            self.model, self.eval_env_list[0],
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            plot_before_reset=self.plot_results,
            env_seed=new_seed)

        # Evaluate the MCQI, RR and PF baseline schedulers on their own
        # eval envs, reusing the same seed for a fair comparison.
        MCQI_episode_rewards, MCQI_episode_lengths = evaluate_baseline(
            self.eval_env_list[1], C_ALGO='MCQI',
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            plot_before_reset=False,
            env_seed=new_seed)
        RR_episode_rewards, RR_episode_lengths = evaluate_baseline(
            self.eval_env_list[2], C_ALGO='RR',
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            plot_before_reset=False,
            env_seed=new_seed)
        PF_episode_rewards, PF_episode_lengths = evaluate_baseline(
            self.eval_env_list[3], C_ALGO='PF',
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
            plot_before_reset=False,
            env_seed=new_seed)

        if self.log_path is not None:
            # Append this evaluation round and persist the full history.
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)
            self.MCQI_eval_results.append(MCQI_episode_rewards)
            self.RR_eval_results.append(RR_episode_rewards)
            self.PF_eval_results.append(PF_episode_rewards)
            np.savez(self.log_path,
                     timesteps=self.evaluations_timesteps,
                     results=self.evaluations_results,
                     ep_lengths=self.evaluations_length,
                     mcqi_results=self.MCQI_eval_results,
                     rr_results=self.RR_eval_results,
                     pf_results=self.PF_eval_results)

        mean_reward, std_reward = np.mean(episode_rewards), np.std(
            episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(
            episode_lengths)
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print(f"episode_reward={mean_reward:.2f} "
                  f"Eval num_timesteps={self.num_timesteps}")
            print(f"MCQI_reward ={np.mean(MCQI_episode_rewards):.2f} ")
            print(f"RR_reward ={np.mean(RR_episode_rewards):.2f} ")
            print(f"PF_reward ={np.mean(PF_episode_rewards):.2f} ")

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(
                    os.path.join(self.best_model_save_path, 'best_model'))
            self.best_mean_reward = mean_reward
            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()
    return True
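# ------------------------------------------------------------------------
# Usage sketch (illustrative only): how a callback like the one above might
# be wired into training. BaselineEvalCallback, train_env and the exact
# constructor arguments are assumptions for this sketch; only A2C, make_env
# and the four-entry eval_env_list (agent, MCQI, RR, PF) appear in this
# file. Kept commented out so it does not execute as part of the module.
#
# from stable_baselines3 import A2C  # assumed SB3-style API
#
# eval_env_list = [DummyVecEnv([make_env('ransim-v0', 0, log_dir=log_dir,
#                                        env_kwargs=env_kwargs)])
#                  for _ in range(4)]  # agent, MCQI, RR, PF
# callback = BaselineEvalCallback(eval_env_list=eval_env_list,  # hypothetical class name
#                                 eval_freq=10_000,
#                                 n_eval_episodes=5,
#                                 log_path=log_dir,
#                                 best_model_save_path=log_dir,
#                                 verbose=1)
# model = A2C('MlpPolicy', train_env, verbose=1)
# model.learn(total_timesteps=100_000, callback=callback)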
eval_env = DummyVecEnv(
    [make_env('ransim-v0', 0, log_dir=log_dir, env_kwargs=env_kwargs)])
for i in range(len(user_list_list)):
    # One evaluation run per user configuration, seeded by its index.
    seed = i
    no_of_users_list = user_list_list[i]
    episode_rewards, episode_lengths = evaluate_baseline(
        env=eval_env, C_ALGO='Random',
        n_eval_episodes=n_eval_episodes,
        render=False,
        deterministic=True,
        return_episode_rewards=True,
        plot_before_reset=plot_results,
        env_seed=seed,
        no_of_users_list=no_of_users_list)

# Fresh env for the MCQI baseline, evaluated over the same configurations.
eval_env = DummyVecEnv(
    [make_env('ransim-v0', 0, log_dir=log_dir, env_kwargs=env_kwargs)])
for i in range(len(user_list_list)):
    seed = i
    no_of_users_list = user_list_list[i]
    episode_rewards, episode_lengths = evaluate_baseline(
        env=eval_env, C_ALGO='MCQI',
        n_eval_episodes=n_eval_episodes,
        render=False,
        deterministic=True,
        return_episode_rewards=True,
        plot_before_reset=plot_results,
        env_seed=seed,
        no_of_users_list=no_of_users_list)
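# ------------------------------------------------------------------------
# make_env is used above but defined elsewhere in the repo. A plausible
# stable-baselines-style factory is sketched below; the Monitor wrapper and
# the per-rank log file name are assumptions, not the repo's actual helper.
# Kept commented out to avoid shadowing the real make_env.
#
# import os
# import gym
# from stable_baselines3.common.monitor import Monitor
#
# def make_env(env_id, rank, log_dir=None, env_kwargs=None):
#     def _init():
#         env = gym.make(env_id, **(env_kwargs or {}))  # build the simulator env
#         if log_dir is not None:
#             env = Monitor(env, os.path.join(log_dir, str(rank)))  # episode logging
#         return env
#     return _init  # DummyVecEnv expects a list of zero-argument constructors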
# model = A2C.load("best_model", env=eval_env)
# model = A2C('MlpPolicy', eval_env_list[0], verbose=1)
# model.learn(total_timesteps=1000)
# episode_rewards, episode_lengths = evaluate_policy(
#     model, eval_env_list[1],
#     n_eval_episodes=n_eval_episodes,
#     render=False,
#     deterministic=True,
#     return_episode_rewards=True,
#     plot_before_reset=plot_results,
#     env_seed=seed)

episode_rewards, episode_lengths = evaluate_baseline(
    env=eval_env_list[2], C_ALGO='MCQI',
    n_eval_episodes=n_eval_episodes,
    render=False,
    deterministic=True,
    return_episode_rewards=True,
    plot_before_reset=plot_results,
    env_seed=seed)

episode_rewards, episode_lengths = evaluate_baseline(
    env=eval_env_list[3], C_ALGO='RR',
    n_eval_episodes=n_eval_episodes,
    render=False,
    deterministic=True,
    return_episode_rewards=True,
    plot_before_reset=plot_results,
    env_seed=seed)
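# ------------------------------------------------------------------------
# Reading back the evaluation log: _on_step saves its history with np.savez
# under the keys used below. The 'evaluations.npz' file name is an
# assumption (np.savez appends '.npz' to log_path if it is missing).
# Commented out so it does not run as part of this script.
#
# import numpy as np
# data = np.load('evaluations.npz')
# for name, key in [('agent', 'results'), ('MCQI', 'mcqi_results'),
#                   ('RR', 'rr_results'), ('PF', 'pf_results')]:
#     rewards = np.asarray(data[key])  # shape (n_evals, n_eval_episodes)
#     print(f"{name}: last-eval mean reward = {rewards[-1].mean():.2f}")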