def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)

        if str(self.device) == "cuda":
            td_target =  reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                    torch.max(self.Q(next_obs_batch).detach(),1)[0].data.tolist()
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()

        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
Example #2
    def learn_from_batch_experience(self, experiences):
        """
        Actualiza la red neuronal profunda en base a lo aprendido en el conjunto de experiencias anteriores
        :param experiences: fragmento de recuerdos anteriores
        :return: 
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)

        td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
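
All of the variants in this listing unpack the minibatch with Experience(*zip(*experiences)), which presupposes an Experience namedtuple and a replay memory exposing store() and sample(). A minimal sketch of what those pieces could look like, with field names inferred from the attribute accesses above; the buffer class, its name, and its capacity handling are assumptions, not taken from the source:

import random
from collections import namedtuple

# Field names follow the accesses above (batch_xp.obs, batch_xp.action, ...).
Experience = namedtuple("Experience", ["obs", "action", "reward", "next_obs", "done"])


class ExperienceMemory:
    """Hypothetical fixed-size replay buffer; the actual class used by these agents may differ."""

    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.buffer = []

    def store(self, experience):
        # Drop the oldest experience once the buffer is full.
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniform random minibatch; learn_from_batch_experience expects a list of Experience tuples.
        return random.sample(self.buffer, batch_size)

A learner could then call self.learn_from_batch_experience(self.memory.sample(batch_size)) once enough transitions have been stored.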
Example #3
    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0  # scale to [0, 1], matching next_obs_batch below
        action_batch = np.array(batch_xp.action)

        reward_batch = np.array(batch_xp.reward)
        if self.params['clip_rewards']:  # Clip the rewards
            reward_batch = np.sign(reward_batch)

        next_obs_batch = np.array(batch_xp.next_obs)/255.0
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            if self.step_num % self.params['target_network_update_freq'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q_target(next_obs_batch).max(1)[0].data.cpu().numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q(next_obs_batch).detach().max(1)[0].data.cpu().numpy()

        td_target = torch.from_numpy(td_target).to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1)), td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.writer.add_scalar('DQL/td_error', td_error.mean(), self.step_num)
        self.Q_optimizer.step()
Example #4
    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0  # Scale/Divide by max limit of obs's dtype. 255 for uint8
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        # Clip the rewards
        if self.params["clip_rewards"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0  # Scale/Divide by max limit of obs' dtype. 255 for uint8
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            #if self.training_steps_completed % self.params['target_network_update_freq'] == 0:
            if self.step_num % self.params['target_network_update_freq'] == 0:
                # The *update_freq is the Num steps after which target net is updated.
                # A schedule can be used instead to vary the update freq.
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q_target(next_obs_batch).max(1)[0].data.cpu().numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q(next_obs_batch).detach().max(1)[0].data.cpu().numpy()

        td_target = torch.from_numpy(td_target).to(device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
        self.Q_optimizer.step()
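
The comments in the snippet above note that the target network is refreshed every target_network_update_freq steps and that a schedule could be used to vary that frequency instead. A rough sketch of such a schedule, purely illustrative (the function name, the default values, and the linear annealing are assumptions, not part of the source):

def target_update_freq_schedule(step_num, start_freq=500, end_freq=10000, anneal_steps=1000000):
    # Illustrative linear schedule (assumed values): sync the target network
    # often early in training and progressively less often as Q stabilizes.
    fraction = min(step_num / anneal_steps, 1.0)
    return int(start_freq + fraction * (end_freq - start_freq))

# Possible use inside learn_from_batch_experience, replacing the fixed frequency:
#     freq = target_update_freq_schedule(self.step_num)
#     if self.step_num % freq == 0:
#         self.Q_target.load_state_dict(self.Q.state_dict())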
Example #5
    def learn_from_batch_experience(self, experiences):

        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)

        if self.params["clip_reward"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0
        done_batch = np.array(batch_xp.done)

        if torch.cuda.is_available():
            if self.params['use_target_network']:
                if self.step_num % self.params['target_network_update_frequency'] == 0:
                    self.Q_target.load_state_dict(self.Q.state_dict())
                td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    torch.max(self.Q_target(next_obs_batch).detach(), 1)[0].data.tolist()
            else:
                td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    torch.max(self.Q(next_obs_batch).detach(), 1)[0].data.tolist()
        else:
            if self.params['use_target_network']:
                if self.step_num % self.params['target_network_update_frequency'] == 0:
                    self.Q_target.load_state_dict(self.Q.state_dict())
                td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q_target(next_obs_batch).max(1)[0].data.numpy()
            else:
                td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()

        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
Example #6
    def learn_from_batch_experience(self, experiences):
        """
        Actualiza la red neuronal profunda en base a lo aprendido en el conjunto de experiencias anteriores
        :param experiences: fragmento de recuerdos anteriores
        :return: 
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)

        if self.params["clip_reward"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            if self.step_num % self.params[
                    'target_network_update_frequency'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch *\
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        torch.max(self.Q_target(next_obs_batch),1)[0].data.tolist()
            td_target = torch.from_numpy(td_target)

        else:
            td_target = reward_batch + ~done_batch * \
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        torch.max(self.Q(next_obs_batch).detach(),1)[0].data.tolist()
            td_target = torch.from_numpy(td_target)

        td_target = td_target.to(device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
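
To make the docstring above concrete: for a minibatch of N transitions (s_i, a_i, r_i, s'_i, d_i), the gather/mse_loss pair in these snippets minimizes the mean squared TD error

L(\theta) = \frac{1}{N} \sum_{i=1}^{N} \left( r_i + \gamma \, (1 - d_i) \, \max_{a'} Q_{\text{target}}(s'_i, a') - Q(s_i, a_i; \theta) \right)^2

where gamma is self.gamma, (1 - d_i) corresponds to ~done_batch, and Q_target is replaced by Q itself (detached) when use_target_network is disabled.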
Example #7
    def learn_from_batch_experience(self, experiences):
        """
        Actualiza la red neuronal profunda en base a lo aprendido en el conjunto de experiencias anteriores
        :param experiences: fragmento de recuerdos anteriores
        :return: 
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)/255.0
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)

        if self.params["clip_reward"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs)/255.0
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:  # every 'target_network_update_frequency' steps the target network copies the main network's state dict, preserving what has been learned
            if self.step_num % self.params['target_network_update_frequency'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch *\
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q_target(next_obs_batch).max(1)[0].data.numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()  # ~done_batch: add the bootstrap term only if the episode has not finished

        td_target = torch.from_numpy(td_target)  # convert to a tensor so we can operate on it
        td_target = td_target.to(device)  # move it to the same device as the network outputs
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
                    self.Q(obs_batch).gather(1,action_idx.view(-1,1).long()),
                    td_target.float().unsqueeze(1))        
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()  # take an optimization step so the network learns
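
The comments above describe the role of ~done_batch: the discounted bootstrap term is added only when the episode has not terminated, so terminal transitions contribute just their immediate reward to the TD target. A small, self-contained illustration of that masking trick with made-up numbers (not from the source):

import numpy as np

reward_batch = np.array([1.0, 1.0, 1.0])
done_batch = np.array([False, True, False])   # the second transition ends the episode
gamma = 0.98
max_next_q = np.array([2.0, 5.0, 3.0])        # stand-in for max_a' Q(s', a')

# ~done_batch inverts the booleans, so terminal transitions multiply the
# bootstrap term by 0 and the TD target collapses to the immediate reward.
td_target = reward_batch + ~done_batch * gamma * max_next_q
print(td_target)  # [2.96 1.   3.94]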
Example #8
        self.Q_optimizer.step()


if __name__ == "__main__":
    environment = gym.make("CartPole-v0")
    agent = SwallowQLearner(environment)
    first_episode = True
    episode_rewards = list()
    for episode in range(MAX_NUM_EPISODES):
        obs = environment.reset()
        total_reward = 0.0
        for step in range(STEPS_PER_EPISODE):
            #environment.render()
            action = agent.get_action(obs)
            next_obs, reward, done, info = environment.step(action)
            agent.memory.store(Experience(obs, action, reward, next_obs, done))
            agent.learn(obs, action, reward, next_obs)

            obs = next_obs
            total_reward += reward

            if done is True:
                if first_episode:
                    max_reward = total_reward
                    first_episode = False
                episode_rewards.append(total_reward)
                if total_reward > max_reward:
                    max_reward = total_reward
                print(
                    "\nEpisode #{} finished after {} iterations. Reward = {}, mean reward = {}, best reward = {}"
                    .format(episode, step + 1, total_reward,
                            np.mean(episode_rewards), max_reward))
Example #9
            print("ERROR: no existe ningún modelo entrenado para este entorno. Empezamos desde cero")


    episode = 0
    while global_step_num < agent_params['max_training_steps']:
        obs = environment.reset()
        total_reward = 0.0
        done = False
        step = 0
        while not done:  # while the episode has not finished
            if env_conf['render'] or args.render:
                environment.render()
            
            action = agent.get_action(obs)
            next_obs, reward, done, info = environment.step(action)
            agent.memory.store(Experience(obs, action, reward, next_obs, done))  # store the experience in the replay memory
            
            obs = next_obs
            total_reward += reward
            step += 1
            global_step_num += 1
            if done is True:
                episode += 1
                episode_rewards.append(total_reward)
            
                if total_reward > agent.best_reward:  # if we obtained a better reward
                    agent.best_reward = total_reward
                
                if np.mean(episode_rewards) > previous_checkpoint_mean_ep_rew: 
                    num_improved_episodes_before_checkpoint += 1