if args.test:
    env_conf["episodic_life"] = False
reward_type = "LIFE" if env_conf["episodic_life"] else "GAME"

# Use the game-specific "useful region" of the screen if one is defined for
# this environment, otherwise fall back to the default crop.
custom_region_available = False
for key, value in env_conf["useful_region"].items():
    if key in args.env:
        env_conf["useful_region"] = value
        custom_region_available = True
        break
if custom_region_available is not True:
    env_conf["useful_region"] = env_conf["useful_region"]["Default"]
print("Environment configuration in use:", env_conf)

# Build the environment with the Atari wrappers when the requested game is an
# Atari one; otherwise resize/reshape the frames of a regular Gym environment.
atari_env = False
for game in Atari.get_games_list():
    if game.replace("_", "") in args.env.lower():
        atari_env = True
if atari_env:
    environment = Atari.make_env(args.env, env_conf)
else:
    environment = env_utils.ResizeReshapeFrames(gym.make(args.env))

obs_shape = environment.observation_space.shape
action_shape = environment.action_space.n
agent_params = manager.get_agent_params()
agent_params["test"] = args.test
agent_params["clip_reward"] = env_conf["clip_reward"]
agent = DeepQLearner(obs_shape, action_shape, agent_params)
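# A minimal sketch of how the `args` and `env_conf` objects consumed by the
# setup block above could be produced: an argparse CLI exposing the --env,
# --test and --render flags that the scripts read, plus a JSON parameter file.
# The file name "parameters.json" and its top-level "environment" key are
# hypothetical; the keys inside it ("episodic_life", "useful_region",
# "clip_reward") are the ones the block above expects.
import json
from argparse import ArgumentParser

parser = ArgumentParser(description="Deep Q-Learner")
parser.add_argument("--env", default="SeaquestNoFrameskip-v4",
                    help="Gym environment ID to train or test on")
parser.add_argument("--test", action="store_true",
                    help="Run in test mode (no learning, no episodic-life reset)")
parser.add_argument("--render", action="store_true",
                    help="Render the environment on screen")
args = parser.parse_args()

with open("parameters.json", "r") as f:     # hypothetical config file name
    env_conf = json.load(f)["environment"]  # hypothetical top-level key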
def run(self):
    ## Load the configuration of the environment to train in
    custom_region_available = False
    for key, value in self.env_conf["useful_region"].items():
        if key in args.env:
            self.env_conf["useful_region"] = value
            custom_region_available = True
            break
    if custom_region_available is not True:
        self.env_conf["useful_region"] = self.env_conf["useful_region"]["Default"]

    atari_env = False
    for game in Atari.get_games_list():
        if game.replace("_", "") in args.env.lower():
            atari_env = True
    if atari_env:
        self.env = Atari.make_env(self.env_name, self.env_conf)
    else:
        self.env = gym.make(self.env_name)

    ## Configure the policy and the parameters of the actor and the critic
    self.state_shape = self.env.observation_space.shape
    if isinstance(self.env.action_space.sample(), int):  # Discrete action space
        self.action_shape = self.env.action_space.n
        self.policy = self.discrete_policy
        self.continuous_action_space = False
    else:  # Continuous action space
        self.action_shape = self.env.action_space.shape[0]
        self.policy = self.multi_variate_gaussian_policy
    self.critic_shape = 1

    if len(self.state_shape) == 3:  # Screen image as input to the actor and the critic
        if self.continuous_action_space:  # Continuous action space
            self.actor = DeepActor(self.state_shape, self.action_shape, device).to(device)
        else:  # Discrete action space
            self.actor = DeepDiscreteActor(self.state_shape, self.action_shape, device).to(device)
        self.critic = DeepCritic(self.state_shape, self.critic_shape, device).to(device)
    else:  # Low-dimensional vector as input to the actor and the critic
        if self.continuous_action_space:  # Continuous action space
            # self.actor_critic = SwallowActorCritic(self.state_shape, self.action_shape, self.critic_shape, device).to(device)
            self.actor = SwallowActor(self.state_shape, self.action_shape, device).to(device)
        else:  # Discrete action space
            self.actor = SwallowDiscreteActor(self.state_shape, self.action_shape, device).to(device)
        self.critic = SwallowCritic(self.state_shape, self.critic_shape, device).to(device)

    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.params["learning_rate"])
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.params["learning_rate"])

    ## Training phase of the agent with A2C
    episode_rewards = list()
    previous_checkpoint_mean_ep_rew = self.best_mean_reward
    num_improved_episodes_before_checkpoint = 0
    if self.params["load_trained_model"]:
        try:
            self.load()
            previous_checkpoint_mean_ep_rew = self.best_mean_reward
        except FileNotFoundError:
            print("ERROR: no trained model exists for this environment. Starting from scratch.")
            if args.test:
                print("FATAL: there is no saved model, so testing mode cannot proceed. Press any key to start over")
                input()
            else:
                print("WARNING: there is no saved model for this environment. Press any key to start over...")

    for episode in range(self.params["max_num_episodes"]):
        obs = self.env.reset()
        done = False
        ep_reward = 0.0
        step_num = 0
        while not done:
            action = self.get_action(obs)
            next_obs, reward, done, _ = self.env.step(action)
            self.rewards.append(reward)
            ep_reward += reward
            step_num += 1
            # Update the actor and the critic every `learning_step_thresh` steps
            # or at the end of the episode (only when not in test mode).
            if not args.test and (step_num > self.params["learning_step_thresh"] or done):
                self.learn(next_obs, done)
                step_num = 0
                # Track performance and checkpoint the agent while the mean
                # episode reward keeps improving.
                if done:
                    episode_rewards.append(ep_reward)
                    if ep_reward > self.best_reward:
                        self.best_reward = ep_reward
                    if np.mean(episode_rewards) > previous_checkpoint_mean_ep_rew:
                        num_improved_episodes_before_checkpoint += 1
                    if num_improved_episodes_before_checkpoint >= self.params["save_freq"]:
                        previous_checkpoint_mean_ep_rew = np.mean(episode_rewards)
                        self.best_mean_reward = np.mean(episode_rewards)
                        self.save()
                        num_improved_episodes_before_checkpoint = 0
            obs = next_obs
            self.global_step_num += 1
            if args.render:
                self.env.render()
        print("\n{}: Episode #{}: reward = {}, mean reward = {:.2f}, best reward = {}".format(
            self.actor_name, episode, ep_reward, np.mean(episode_rewards), self.best_reward))
        writer.add_scalar(self.actor_name + "/reward", reward, self.global_step_num)
        writer.add_scalar(self.actor_name + "/ep_reward", ep_reward, self.global_step_num)
        writer.add_scalar(self.actor_name + "/mean_ep_reward", np.mean(episode_rewards), self.global_step_num)
        writer.add_scalar(self.actor_name + "/max_ep_reward", self.best_reward, self.global_step_num)
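# A hedged usage sketch: since the method above is named run() and tags its
# logs with self.actor_name, one plausible launch pattern is to implement the
# agent as a multiprocessing.Process subclass and start several actor-learners
# in parallel. The class name DeepActorCriticAgent, its constructor signature
# and the hyperparameter values below are assumptions for illustration only;
# the dictionary keys are the ones run() actually reads.
if __name__ == "__main__":
    agent_params = {
        "learning_rate": 1e-4,        # used for both Adam optimizers above
        "max_num_episodes": 10000,
        "learning_step_thresh": 100,  # steps between actor/critic updates
        "load_trained_model": False,
        "save_freq": 10,              # improved episodes required before a checkpoint
    }
    num_agents = 4  # assumed number of parallel actor-learner processes
    agents = [DeepActorCriticAgent(actor_id, args.env, agent_params)
              for actor_id in range(num_agents)]
    for agent in agents:
        agent.start()  # each child process executes its run() method
    for agent in agents:
        agent.join()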