Code Example #1
    def _on_step(self) -> bool:
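        # Runs on every environment step: every `eval_freq` calls, evaluate the
        # learned policy and the MCQI, RR and PF baseline schedulers on their
        # own eval environments, log the results, and keep the best model so
        # far (when a save path is configured).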

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            # sync_envs_normalization(self.training_env, self.eval_env)
            new_seed = 0  # seeding.create_seed()
            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env_list[0],
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                plot_before_reset=self.plot_results,
                env_seed=new_seed)
            MCQI_episode_rewards, MCQI_episode_lengths = evaluate_baseline(
                self.eval_env_list[1],
                C_ALGO='MCQI',
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                plot_before_reset=False,
                env_seed=new_seed)
            RR_episode_rewards, RR_episode_lengths = evaluate_baseline(
                self.eval_env_list[2],
                C_ALGO='RR',
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                plot_before_reset=False,
                env_seed=new_seed)
            PF_episode_rewards, PF_episode_lengths = evaluate_baseline(
                self.eval_env_list[3],
                C_ALGO='PF',
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                plot_before_reset=False,
                env_seed=new_seed)
            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)
                self.MCQI_eval_results.append(MCQI_episode_rewards)
                self.RR_eval_results.append(RR_episode_rewards)
                self.PF_eval_results.append(PF_episode_rewards)
                np.savez(self.log_path,
                         timesteps=self.evaluations_timesteps,
                         results=self.evaluations_results,
                         ep_lengths=self.evaluations_length,
                         mcqi_results=self.MCQI_eval_results,
                         rr_results=self.RR_eval_results,
                         pf_results=self.PF_eval_results)

            mean_reward = np.mean(episode_rewards)
            std_reward = np.std(episode_rewards)
            mean_ep_length = np.mean(episode_lengths)
            std_ep_length = np.std(episode_lengths)
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print(f"episode_reward={mean_reward:.2f}  "
                      f"Eval num_timesteps={self.num_timesteps}")
                print(f"MCQI_reward   ={np.mean(MCQI_episode_rewards):.2f} ")
                print(f"RR_reward     ={np.mean(RR_episode_rewards):.2f} ")
                print(f"PF_reward     ={np.mean(PF_episode_rewards):.2f} ")

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(
                        os.path.join(self.best_model_save_path, 'best_model'))
                self.best_mean_reward = mean_reward
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
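
The .npz file written by the callback can be inspected offline with plain NumPy. A minimal sketch, assuming the callback's log_path was set to something like "logs/evaluations" (the path is only an illustration; the key names match the np.savez call above):

import numpy as np

# Load the evaluation log written by the callback and print the mean reward of
# the learned policy and of each baseline at every evaluation point.
data = np.load("logs/evaluations.npz")  # assumed path; substitute your log_path

for t, agent, mcqi, rr, pf in zip(data["timesteps"], data["results"],
                                  data["mcqi_results"], data["rr_results"],
                                  data["pf_results"]):
    print(f"step {t}: agent={np.mean(agent):.2f}  MCQI={np.mean(mcqi):.2f}  "
          f"RR={np.mean(rr):.2f}  PF={np.mean(pf):.2f}")
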
Code Example #2

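# Evaluate the 'Random' baseline once per user-count configuration, using the
# configuration index as the environment seed.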
eval_env = DummyVecEnv(
    [make_env('ransim-v0', 0, log_dir=log_dir, env_kwargs=env_kwargs)])
for i in range(len(user_list_list)):
    seed = i
    no_of_users_list = user_list_list[i]
    episode_rewards, episode_lengths = evaluate_baseline(
        env=eval_env,
        C_ALGO='Random',
        n_eval_episodes=n_eval_episodes,
        render=False,
        deterministic=True,
        return_episode_rewards=True,
        plot_before_reset=plot_results,
        env_seed=seed,
        no_of_users_list=no_of_users_list)

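# Repeat on a fresh vectorized environment for the MCQI baseline.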
eval_env = DummyVecEnv(
    [make_env('ransim-v0', 0, log_dir=log_dir, env_kwargs=env_kwargs)])
for i in range(len(user_list_list)):
    seed = i
    no_of_users_list = user_list_list[i]
    episode_rewards, episode_lengths = evaluate_baseline(
        env=eval_env,
        C_ALGO='MCQI',
        n_eval_episodes=n_eval_episodes,
        render=False,
        deterministic=True,
        return_episode_rewards=True,
        plot_before_reset=plot_results,
        env_seed=seed,
        no_of_users_list=no_of_users_list)
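
The make_env helper used to build the DummyVecEnv instances above is not shown in these snippets. A minimal sketch of such a factory, assuming the usual Stable-Baselines pattern of returning a thunk and wrapping the environment in a Monitor (the Monitor import and the log-file naming are assumptions, not the project's actual implementation):

import os
import gym
from stable_baselines3.common.monitor import Monitor  # assumption: SB3-style Monitor

def make_env(env_id, rank, log_dir=None, env_kwargs=None):
    # Hypothetical factory matching the calls above: builds one environment
    # instance per call and tags its Monitor file with the worker rank.
    env_kwargs = env_kwargs or {}

    def _init():
        env = gym.make(env_id, **env_kwargs)
        if log_dir is not None:
            env = Monitor(env, os.path.join(log_dir, str(rank)))
        return env

    return _init
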
Code Example #3
# model = A2C.load("best_model", env=eval_env)
# model = A2C('MlpPolicy', eval_env_list[0], verbose=1)
# model.learn(total_timesteps = 1000)
# episode_rewards, episode_lengths = evaluate_policy(model,  eval_env_list[1],
#                                                                n_eval_episodes=n_eval_episodes,
#                                                                render=False,
#                                                                deterministic=True,
#                                                                return_episode_rewards=True,
#                                                                plot_before_reset=plot_results,
#                                                                env_seed= seed)
episode_rewards, episode_lengths = evaluate_baseline(
    env=eval_env_list[2],
    C_ALGO='MCQI',
    n_eval_episodes=n_eval_episodes,
    render=False,
    deterministic=True,
    return_episode_rewards=True,
    plot_before_reset=plot_results,
    env_seed=seed)

episode_rewards, episode_lengths = evaluate_baseline(
    env=eval_env_list[3],
    C_ALGO='RR',
    n_eval_episodes=n_eval_episodes,
    render=False,
    deterministic=True,
    return_episode_rewards=True,
    plot_before_reset=plot_results,
    env_seed=seed)
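
Because each call above overwrites episode_rewards and episode_lengths, only the optional plots survive once the next baseline runs. A minimal sketch of collecting the results per scheduler for a side-by-side summary, reusing the environments and parameters from the calls above (the dictionary and the final print loop are additions for illustration):

import numpy as np

baseline_rewards = {}
for env_idx, algo in ((2, 'MCQI'), (3, 'RR')):
    rewards, _lengths = evaluate_baseline(
        env=eval_env_list[env_idx],
        C_ALGO=algo,
        n_eval_episodes=n_eval_episodes,
        render=False,
        deterministic=True,
        return_episode_rewards=True,
        plot_before_reset=plot_results,
        env_seed=seed)
    baseline_rewards[algo] = rewards

for algo, rewards in baseline_rewards.items():
    print(f"{algo}: mean={np.mean(rewards):.2f}  std={np.std(rewards):.2f}")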