def test_model(self) -> Tuple[List[float], list]:
    """Evaluate the trained DQN agent over several test episodes.

    Restores the saved model weights into a fresh trainer, plays the
    configured number of episodes on ``self.env``, and records each
    episode's cumulative reward together with the rendered frames of the
    longest episode seen.

    Returns:
        A pair ``(rewards, longest_screens)``: the cumulative reward of
        every completed episode, and the RGB-frame list of the episode
        that produced the most frames.
    """
    ray.init(logging_level=logging.INFO, ignore_reinit_error=True)
    agent = DQNTrainer(self.config, env=custom_env_name)

    # Load weights onto CPU regardless of the device they were saved from.
    checkpoint = torch.load(
        self.params.model_dir / "trained_model.pt",
        map_location=lambda storage, loc: storage,
    )
    agent.set_weights({"default_policy": checkpoint})

    episode_rewards: List[float] = []
    longest_screens: list = []
    for episode in range(self.params.num_testing_episodes):
        frames = []
        try:
            logger.info("Iteration: {}", episode)
            observation = self.env.reset()
            done = False
            total_reward = 0
            while not done:
                action = agent.compute_action(observation)
                observation, reward, done, _ = self.env.step(action)
                frames.append(self.env.render(mode="rgb_array"))
                total_reward += reward
                # Slow the rollout slightly so rendering stays watchable.
                time.sleep(0.01)
            logger.info("Iteration: {}, Reward: {}", episode, total_reward)
            episode_rewards.append(total_reward)
        except KeyboardInterrupt:
            # Ctrl-C stops testing; the partially played episode is dropped.
            logger.info("Testing was interrupted")
            break
        if len(frames) > len(longest_screens):
            longest_screens = frames

    self.env.close()
    ray.shutdown()
    return episode_rewards, longest_screens
# Expected reward keys produced per iteration:
# dqn_policy: X
# ppo_policy: Y
for i in range(args.stop_iters):
    print("== Iteration", i, "==")

    # Improve the DQN policy.
    print("-- DQN --")
    result_dqn = dqn_trainer.train()
    print(pretty_print(result_dqn))

    # Improve the PPO policy.
    print("-- PPO --")
    result_ppo = ppo_trainer.train()
    print(pretty_print(result_ppo))

    # Test passed gracefully: both agents exceed the requested reward.
    if (args.as_test
            and result_dqn["episode_reward_mean"] > args.stop_reward
            and result_ppo["episode_reward_mean"] > args.stop_reward):
        print("test passed (both agents above requested reward)")
        # FIX: `quit()` is a `site`-module REPL helper and is not available
        # under `python -S` or in frozen/embedded interpreters. Raising
        # SystemExit(0) is the portable way to exit a script with status 0.
        raise SystemExit(0)

    # Swap weights so each trainer trains against the other's latest policy.
    dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
    ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))

# Desired reward not reached within the iteration budget.
if args.as_test:
    raise ValueError("Desired reward ({}) not reached!".format(
        args.stop_reward))