Example #1
    def __init__(self, state_shape, action_shape, params):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['lr']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0

        if len(self.state_shape) == 1:
            self.DQN = SLP
        elif len(self.state_shape) == 3:
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape, device).to(device)
        self.Q.apply(utils.weights_initializer.xavier)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(state_shape, action_shape,
                                     device).to(device)

        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = params["epsilon_max"]
        self.epsilon_min = params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0
        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_capacity']))
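For reference, here is a minimal sketch of the `params` dictionary this constructor reads. The key names are taken from the code above; the concrete values are illustrative assumptions only.

# Hypothetical configuration matching the keys used by the constructor above
params = {
    'gamma': 0.98,                          # discount factor
    'lr': 5e-3,                             # learning rate for the Adam optimizer
    'use_target_network': True,             # whether to keep a separate Q_target network
    'epsilon_max': 1.0,                     # initial exploration rate
    'epsilon_min': 0.05,                    # final exploration rate
    'epsilon_decay_final_step': 100000,     # step at which epsilon reaches epsilon_min
    'experience_memory_capacity': 1000000,  # replay buffer capacity
}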
    def __init__(
        self,
        environment,
        learning_rate=0.005,
        gamma=0.98
    ):  # initialization method; self is a reference to the object itself
        self.obs_shape = environment.observation_space.shape  # keep the observation space info (shape, high and low bounds)

        self.action_shape = environment.action_space.n  # number of actions
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(
            self.Q.parameters(), lr=learning_rate)  # lr = learning rate

        self.gamma = gamma

        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q  # action-selection policy

        self.memory = ExperienceMemory(capacity=int(1e6))
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
Example #3
    def __init__(self, obs_shape, action_shape, hidden_shape, params):

        self.params = params
        self.gamma = self.params["gamma"]
        self.delta = self.params["delta"]
        self.learning_rate = self.params["learning_rate"]
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        self.action_shape = action_shape

        self.Q = CNN(obs_shape, action_shape, hidden_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)

        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = self.params["epsilon_max"]
        self.epsilon_min = self.params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params["epsilon_decay_final_step"])

        self.memory = ExperienceMemory(self.params["memory"])

        self.total_trainings = 0
        self.step_num = 0
Example #4
    def __init__(self, obs_shape, action_shape, params):  # initialization method; self is a reference to the object itself
       
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['learning_rate']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        #self.MAX_NUM_EPISODES = self.params['max_num_episodes']
        #self.STEPS_PER_EPISODE = self.params['steps_per_episode']
        self.action_shape = action_shape

        if len(obs_shape) == 1:  ## The observation space has a single dimension
            self.DQN = SLP
        elif len(obs_shape) == 3:  ## The observation/state is a 3D image
            self.DQN = CNN

        self.Q = self.DQN(obs_shape, action_shape, device).to(device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=self.learning_rate)  # lr = learning rate
        
        if self.params['use_target_network']:
            self.Q_target = self.DQN(obs_shape, action_shape, device).to(device)
     
        self.policy = self.epsilon_greedy_Q  # action-selection policy
        self.epsilon_max = self.params['epsilon_max']
        self.epsilon_min = self.params['epsilon_min']
        self.epsilon_decay = LinearDecaySchedule(initial_value = self.epsilon_max,
                                                 final_value = self.epsilon_min, 
                                                 max_steps = self.params['epsilon_decay_final_step'])
        self.step_num = 0
        

        self.memory = ExperienceMemory(capacity = int(self.params['experience_memory_size']))        
    def __init__(self, state_shape, action_shape, params):
        """
        self.Q is the Action-Value function. This agent represents Q using a Neural Network.
        If the input is a single-dimensional vector, a Single-Layer Perceptron is used; if the input is a
        3-dimensional image, a Convolutional Neural Network is used.

        :param state_shape: Shape (tuple) of the observation/state
        :param action_shape: Shape (number) of the discrete action space
        :param params: A dictionary containing various Agent configuration parameters and hyper-parameters
        """
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']  # Agent's discount factor
        self.learning_rate = self.params['lr']  # Agent's Q-learning rate
        self.best_mean_reward = -float("inf")  # Agent's personal best mean episode reward
        self.best_reward = -float("inf")
        self.training_steps_completed = 0  # Number of training batch steps completed so far

        if len(self.state_shape) == 1:  # Single dimensional observation/state space
            self.DQN = SLP
        elif len(self.state_shape) == 3:  # 3D/image observation/state
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape, device).to(device)
        self.Q.apply(utils.weights_initializer.xavier)

        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(state_shape, action_shape,
                                     device).to(device)
        # self.policy is the policy followed by the agent. This agents follows
        # an epsilon-greedy policy w.r.t it's Q estimate.
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = params["epsilon_max"]
        self.epsilon_min = params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0

        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_capacity']))  # Initialize an Experience memory with the configured capacity
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape
        self.action_shape = environment.action_space.n
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.Q = SLP(self.obs_shape, self.action_shape, self.device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)
        self.gamma = gamma
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEP_PER_EPISODE)

        self.step_num = 0
        self.policy = self.epsilon_greedy_Q
        self.memory = ExperienceMemory(capacity=int(1e5))
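SLP is the single-layer perceptron Q-network imported from the course utilities. A minimal sketch of a compatible module, assuming one hidden layer of 40 units (an arbitrary choice), the `(input_shape, output_shape, device)` constructor used above, and NumPy observations as input:

import torch

class SLP(torch.nn.Module):
    """A single-hidden-layer perceptron that maps an observation to one Q-value per action."""
    def __init__(self, input_shape, output_shape, device=torch.device("cpu")):
        super(SLP, self).__init__()
        self.device = device
        self.input_shape = input_shape[0]
        self.hidden_shape = 40
        self.linear1 = torch.nn.Linear(self.input_shape, self.hidden_shape)
        self.out = torch.nn.Linear(self.hidden_shape, output_shape)

    def forward(self, x):
        # Assumes x arrives as a NumPy array, as in the agents above
        x = torch.from_numpy(x).float().to(self.device)
        x = torch.nn.functional.relu(self.linear1(x))
        return self.out(x)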
Example #7
    def __init__(self, state_shape, action_shape, params, writer, device="cpu"):
        """
        self.Q is the Action-value function. This agent represents Q using a Neural Network.
        If the input is a single-dimensional vector, a Single Layer Perceptron is used; if the input is a 3-dimensional image, a Convolutional Neural Network is used.
        :param state_shape: Shape (tuple) of the observation/state
        :param action_shape: Shape (number) of the discrete action space
        :param params: A dictionary containing various Agent configuration parameters and hyper-parameters 
        """
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['lr']
        self.best_mean_reward = -float('inf')
        self.best_reward = -float('inf')
        self.training_steps_completed = 0
        self.writer = writer
        self.device = device

        if len(self.state_shape) == 1:
            self.DQN = SLP
        elif len(self.state_shape) == 3:
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape,
                          self.device).to(self.device)
        self.Q.apply(utils.weights_initializer.xavier)
        self.Q_optimizer = torch.optim.Adam(
            self.Q.parameters(), lr=self.learning_rate)

        if self.params['use_target_network']:
            self.Q_target = self.DQN(
                state_shape, action_shape, self.device).to(self.device)
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = params['epsilon_max']
        self.epsilon_min = params['epsilon_min']
        self.epsilon_decay = LinearDecayScheduler(
            initial_value=self.epsilon_max, final_value=self.epsilon_min, max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0

        self.memory = ExperienceMemory(capacity=int(
            self.params['experience_memory_capacity']))
Example #8
    def __init__(self, obs_shape, action_shape, params):

        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['learning_rate']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        self.action_shape = action_shape

        if len(obs_shape) == 1:  ## The observation space has a single dimension
            self.DQN = SLP
        elif len(obs_shape) == 3:  ## The observation/state is a 3D image
            self.DQN = CNN

        self.Q = self.DQN(obs_shape, action_shape, device).to(device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)

        if self.params['use_target_network']:
            self.Q_target = self.DQN(obs_shape, action_shape,
                                     device).to(device)

        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = self.params['epsilon_max']
        self.epsilon_min = self.params['epsilon_min']
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0

        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_size']))
Example #9
class DeepQLearner(object):
    def __init__(self, obs_shape, action_shape, params):  # initialization method; self is a reference to the object itself
       
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['learning_rate']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        #self.MAX_NUM_EPISODES = self.params['max_num_episodes']
        #self.STEPS_PER_EPISODE = self.params['steps_per_episode']
        self.action_shape = action_shape

        if len(obs_shape) == 1:  ## The observation space has a single dimension
            self.DQN = SLP
        elif len(obs_shape) == 3:  ## The observation/state is a 3D image
            self.DQN = CNN

        self.Q = self.DQN(obs_shape, action_shape, device).to(device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(), lr=self.learning_rate)  # lr = learning rate
        
        if self.params['use_target_network']:
            self.Q_target = self.DQN(obs_shape, action_shape, device).to(device)
     
        self.policy = self.epsilon_greedy_Q  # action-selection policy
        self.epsilon_max = self.params['epsilon_max']
        self.epsilon_min = self.params['epsilon_min']
        self.epsilon_decay = LinearDecaySchedule(initial_value = self.epsilon_max,
                                                 final_value = self.epsilon_min, 
                                                 max_steps = self.params['epsilon_decay_final_step'])
        self.step_num = 0
        

        self.memory = ExperienceMemory(capacity = int(self.params['experience_memory_size']))        
 
 
    def get_action(self, obs):
        obs = np.array(obs)
        obs = obs / 255.0  # scale pixel values into the range [0, 1]
        if len(obs.shape) == 3:  # we have an image
            if obs.shape[2] < obs.shape[0]:  # W x H x C -> reorder to C x H x W (PyTorch's convention)
                obs = obs.reshape(obs.shape[2], obs.shape[1], obs.shape[0])  # reorder the dimensions
            obs = np.expand_dims(obs, 0)  # add a batch dimension instead of a bare array
        return self.policy(obs)

        
    def epsilon_greedy_Q(self, obs):
        writer.add_scalar("DQL/epsilon", self.epsilon_decay(self.step_num), self.step_num)
        self.step_num +=1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params["test"]:
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).data.to(torch.device('cpu')).numpy())   
        return action

    def learn(self, obs, action, reward, next_obs, done):
        if done:
            td_target = reward + 0.0
        else:
            td_target = reward + self.gamma * torch.max(self.Q(next_obs))  # bootstrap with the max Q-value of the next state
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)  # loss between prediction and TD target
        # Gradient update:
        self.Q_optimizer.zero_grad()
        td_error.backward()  # backward pass
        writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
        self.Q_optimizer.step()  # update the network weights

    def replay_experience(self, batch_size=None):
        """
        Replays a randomly sampled batch of stored experience
        :param batch_size: Size of the sample to draw from memory
        :return:
        """
        batch_size = batch_size if batch_size is not None else self.params['replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1  # training-step counter
    
    def learn_from_batch_experience(self, experiences):
        """
        Updates the deep neural network based on a batch of previously stored experiences
        :param experiences: batch of past experiences
        :return:
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)/255.0
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)

        if self.params["clip_reward"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs)/255.0
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:  # every 'target_network_update_frequency' steps, copy Q's weights into Q_target (its state dict)
            if self.step_num % self.params['target_network_update_frequency'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch *\
                        np.tile(self.gamma, len(next_obs_batch)) * \
                        self.Q_target(next_obs_batch).max(1)[0].data.numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()  # ~done_batch: only bootstrap if the episode has not ended

        td_target = torch.from_numpy(td_target)  # convert to a tensor so we can operate on it
        #td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
                    self.Q(obs_batch).gather(1,action_idx.view(-1,1).long()),
                    td_target.float().unsqueeze(1))        
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()  # take an optimizer step so the network learns

    def save(self, env_name):
        file_name = self.params['save_dir']+"DQL_"+env_name+".ptm"
        agent_state = {"Q": self.Q.state_dict(),
                       "best_mean_reward": self.best_mean_reward,
                       "best_reward": self.best_reward}
        torch.save(agent_state, file_name)
        print("Estado del agente guardado en : ", file_name)

    def load(self, env_name):
        file_name = self.params['load_dir']+"DQL_"+env_name+".ptm"
        agent_state = torch.load(file_name, map_location = lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        self.Q.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Cargado del modelo Q desde ", file_name,
              "que hasta el momento tiene una mejor recompensa media de: ",self.best_mean_reward,
              " y una recompensa máxima de: ", self.best_reward)
Example #10
class DeepQLearner(object):
    def __init__(self, obs_shape, action_shape, params):

        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['learning_rate']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        self.action_shape = action_shape

        if len(obs_shape) == 1:  ## The observation space has a single dimension
            self.DQN = SLP
        elif len(obs_shape) == 3:  ## The observation/state is a 3D image
            self.DQN = CNN

        self.Q = self.DQN(obs_shape, action_shape, device).to(device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)

        if self.params['use_target_network']:
            self.Q_target = self.DQN(obs_shape, action_shape,
                                     device).to(device)

        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = self.params['epsilon_max']
        self.epsilon_min = self.params['epsilon_min']
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0

        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_size']))

    def get_action(self, obs):
        obs = np.array(obs)
        obs = obs / 255.0
        if len(obs.shape) == 3:  # we have an image
            if obs.shape[2] < obs.shape[0]:  # W x H x C -> C x H x W
                obs = obs.reshape(obs.shape[2], obs.shape[1], obs.shape[0])
            obs = np.expand_dims(obs, 0)
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        writer.add_scalar("DQL/epsilon", self.epsilon_decay(self.step_num),
                          self.step_num)
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params["test"]:
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(
                self.Q(obs).data.to(torch.device('cpu')).numpy())
        return action

    def learn(self, obs, action, reward, next_obs, done):
        if done:
            td_target = reward + 0.0
        else:
            td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)
        self.Q_optimizer.zero_grad()
        td_error.backward()
        writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):

        batch_size = batch_size if batch_size is not None else self.params[
            'replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1

    def learn_from_batch_experience(self, experiences):

        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)

        if self.params["clip_reward"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            # Every 'target_network_update_frequency' steps, copy Q's weights into Q_target
            if self.step_num % self.params['target_network_update_frequency'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q_target(next_obs_batch).detach().max(1)[0].data.cpu().numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q(next_obs_batch).detach().max(1)[0].data.cpu().numpy()

        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()

    def save(self, env_name):
        file_name = self.params['save_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = {
            "Q": self.Q.state_dict(),
            "best_mean_reward": self.best_mean_reward,
            "best_reward": self.best_reward
        }
        torch.save(agent_state, file_name)
        print("Estado del agente guardado en : ", file_name)

    def load(self, env_name):
        file_name = self.params['load_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = torch.load(file_name,
                                 map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        self.Q.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Cargado del modelo Q desde", file_name,
              "que hasta el momento tiene una mejor recompensa media de: ",
              self.best_mean_reward, " y una recompensa máxima de: ",
              self.best_reward)
class SwallowQLearner(object):
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape
        self.action_shape = environment.action_space.n
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.Q = SLP(self.obs_shape, self.action_shape, self.device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)
        self.gamma = gamma
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * MAX_STEP_PER_EPISODE)

        self.step_num = 0
        self.policy = self.epsilon_greedy_Q
        self.memory = ExperienceMemory(capacity=int(1e5))

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).data.cpu().numpy())
        return action

    def learn(self, obs, action, reward, next_obs):
        td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def replay_experience(self, batch_size):
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)

        if str(self.device) == "cuda":
            td_target =  reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                    torch.max(self.Q(next_obs_batch).detach(),1)[0].data.tolist()
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()

        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
class Deep_Q_Learner(object):
    def __init__(self, state_shape, action_shape, params):
        """
        self.Q is the Action-Value function. This agent represents Q using a Neural Network.
        If the input is a single-dimensional vector, a Single-Layer Perceptron is used; if the input is a
        3-dimensional image, a Convolutional Neural Network is used.

        :param state_shape: Shape (tuple) of the observation/state
        :param action_shape: Shape (number) of the discrete action space
        :param params: A dictionary containing various Agent configuration parameters and hyper-parameters
        """
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']  # Agent's discount factor
        self.learning_rate = self.params['lr']  # Agent's Q-learning rate
        self.best_mean_reward = -float("inf")  # Agent's personal best mean episode reward
        self.best_reward = -float("inf")
        self.training_steps_completed = 0  # Number of training batch steps completed so far

        if len(self.state_shape) == 1:  # Single dimensional observation/state space
            self.DQN = SLP
        elif len(self.state_shape) == 3:  # 3D/image observation/state
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape, device).to(device)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(state_shape, action_shape,
                                     device).to(device)
        # self.policy is the policy followed by the agent. This agents follows
        # an epsilon-greedy policy w.r.t it's Q estimate.
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0

        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_capacity']))  # Initialize an Experience memory with the configured capacity

    def get_action(self, observation):
        if len(observation.shape) == 3:  # Single image (not a batch)
            if observation.shape[2] < observation.shape[0]:  # Probably observation is in W x H x C format
                # Reshape to C x H x W format as per PyTorch's convention
                observation = observation.reshape(observation.shape[2],
                                                  observation.shape[1],
                                                  observation.shape[0])
            observation = np.expand_dims(observation, 0)  # Create a batch dimension
        return self.policy(observation)

    def epsilon_greedy_Q(self, observation):
        # Decay Epsilon/exploration as per schedule
        writer.add_scalar("DQL/epsilon", self.epsilon_decay(self.step_num),
                          self.step_num)
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([i for i in range(self.action_shape)])
        else:
            action = np.argmax(
                self.Q(observation).data.to(torch.device('cpu')).numpy())

        return action

    def learn(self, s, a, r, s_next, done):
        # TD(0) Q-learning
        if done:  # End of episode
            td_target = r + 0.0  # Set the value of terminal state to zero
        else:
            td_target = r + self.gamma * torch.max(self.Q(s_next))
        td_error = td_target - self.Q(s)[a]
        # Update Q estimate
        #self.Q(s)[a] = self.Q(s)[a] + self.learning_rate * td_error
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0  # Scale/Divide by max limit of obs's dtype. 255 for uint8
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0  # Scale/Divide by max limit of obs' dtype. 255 for uint8
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            if self.training_steps_completed % self.params[
                    'target_network_update_freq'] == 0:
                # The *update_freq is the Num steps after which target net is updated.
                # A schedule can be used instead to vary the update freq.
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q_target(next_obs_batch).max(1)[0].data
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q(next_obs_batch).detach().max(1)[0].data

        td_target = td_target.to(device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):
        batch_size = batch_size if batch_size is not None else self.params[
            'replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1  # Increment the number of training batch steps completed

    def save(self, env_name):
        file_name = self.params['save_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = {
            "Q": self.Q.state_dict(),
            "best_mean_reward": self.best_mean_reward,
            "best_reward": self.best_reward
        }
        torch.save(agent_state, file_name)
        print("Agent's state saved to ", file_name)

    def load(self, env_name):
        file_name = self.params['load_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = torch.load(file_name,
                                 map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        self.Q.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Loaded Q model state from", file_name,
              " which fetched a best mean reward of:", self.best_mean_reward,
              " and an all time best reward of:", self.best_reward)
Example #13
class DeepQLearner:
    def __init__(self, state_shape, action_shape, params, writer, device="cpu"):
        """
        self.Q is the Action-value function. This agent represents Q using a Neural Network.
        If the input is a single-dimensional vector, a Single Layer Perceptron is used; if the input is a 3-dimensional image, a Convolutional Neural Network is used.
        :param state_shape: Shape (tuple) of the observation/state
        :param action_shape: Shape (number) of the discrete action space
        :param params: A dictionary containing various Agent configuration parameters and hyper-parameters 
        """
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['lr']
        self.best_mean_reward = -float('inf')
        self.best_reward = -float('inf')
        self.training_steps_completed = 0
        self.writer = writer
        self.device = device

        if len(self.state_shape) == 1:
            self.DQN = SLP
        elif len(self.state_shape) == 3:
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape,
                          self.device).to(self.device)
        self.Q.apply(utils.weights_initializer.xavier)
        self.Q_optimizer = torch.optim.Adam(
            self.Q.parameters(), lr=self.learning_rate)

        if self.params['use_target_network']:
            self.Q_target = self.DQN(
                state_shape, action_shape, self.device).to(self.device)
        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = params['epsilon_max']
        self.epsilon_min = params['epsilon_min']
        self.epsilon_decay = LinearDecayScheduler(
            initial_value=self.epsilon_max, final_value=self.epsilon_min, max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0

        self.memory = ExperienceMemory(capacity=int(
            self.params['experience_memory_capacity']))

    def get_action(self, obs):
        obs = np.array(obs)
        obs = obs/255.0
        if len(obs.shape) == 3:
            if obs.shape[2] < obs.shape[0]:
                obs = obs.reshape(obs.shape[2], obs.shape[1], obs.shape[0])
            obs = np.expand_dims(obs, 0)
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        self.writer.add_scalar(
            'DQL/epsilon', self.epsilon_decay(self.step_num), self.step_num)
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params['test']:
            action = random.choice([i for i in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).data.cpu().numpy())
        return action

    def learn(self, obs, action, reward, next_obs, done):
        if done:
            td_target = reward + 0.0
        else:
            td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = td_target - self.Q(obs)[action]
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)/255.0  # scale to match next_obs_batch below
        action_batch = np.array(batch_xp.action)

        reward_batch = np.array(batch_xp.reward)
        if self.params['clip_rewards']:  # Clip the rewards
            reward_batch = np.sign(reward_batch)

        next_obs_batch = np.array(batch_xp.next_obs)/255.0
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            if self.step_num % self.params['target_network_update_freq'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q_target(next_obs_batch).max(
                    1)[0].data.to(self.device).numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q(next_obs_batch).detach().max(
                    1)[0].data.to(self.device).numpy()

        td_target = torch.from_numpy(td_target).to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1)), td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.writer.add_scalar('DQL/td_error', td_error.mean(), self.step_num)
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):
        batch_size = batch_size if batch_size is not None else self.params['replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1

    def save(self, env_name):
        if not exists(self.params['save_dir']):
            makedirs(self.params['save_dir'])
        file_name = join(self.params['save_dir'], 'DQL_' + env_name + '.ptm')
        agent_state = {'Q': self.Q.state_dict(),
                       'best_mean_reward': self.best_mean_reward,
                       'best_reward': self.best_reward
                       }
        torch.save(agent_state, file_name)
        print(f"Agent's state saved to {file_name}")

    def load(self, env_name):
        file_name = self.params['load_dir'] + 'DQL_' + env_name + '.ptm'
        agent_state = torch.load(
            file_name, map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state['Q'])
        self.Q.to(self.device)
        self.best_mean_reward = agent_state['best_mean_reward']
        self.best_reward = agent_state['best_reward']
        print(
            f'Loaded Q model state from {file_name} which fetched a best mean reward of {self.best_mean_reward:.3f} and an all time best reward of {self.best_reward}')
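Several of the examples initialize the Q-network with `self.Q.apply(utils.weights_initializer.xavier)`. A minimal sketch of such an initializer, assuming it applies Xavier (Glorot) uniform initialization to linear and convolutional layers; the module path comes from the examples, the body is an assumption:

import torch

def xavier(module):
    """Apply Xavier (Glorot) uniform initialization to Linear and Conv2d layers."""
    if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
        torch.nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)

`nn.Module.apply` walks every sub-module of `self.Q` and calls this function on each one, so only the layer types matched by the `isinstance` check are touched.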
Example #14
class DeepQLearner(object):
    def __init__(self, obs_shape, action_shape, hidden_shape, params):

        self.params = params
        self.gamma = self.params["gamma"]
        self.delta = self.params["delta"]
        self.learning_rate = self.params["learning_rate"]
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0
        self.action_shape = action_shape

        self.Q = CNN(obs_shape, action_shape, hidden_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)

        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = self.params["epsilon_max"]
        self.epsilon_min = self.params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params["epsilon_decay_final_step"])

        self.memory = ExperienceMemory(self.params["memory"])

        self.total_trainings = 0
        self.step_num = 0

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params["test"]:
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(self.Q(obs).detach().numpy())
        return action

    def learn(self, obs, action, reward, next_obs, done):
        if done:
            td_target = reward + torch.tensor(0.0, requires_grad=True)
        else:
            td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs)[0][action], td_target)
        #print(td_target.item(), self.Q(obs)[action].item(), td_error.item())
        #print(reward, td_target.item(), td_error.item())
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):
        """
        Replays a randomly sampled batch of stored experience
        :param batch_size: Size of the sample to draw from memory
        :return:
        """
        batch_size = batch_size if batch_size is not None else self.params[
            "replay_batch_size"]
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1
        #print("Replaying {} episodes".format(batch_size))

    def learn_from_batch_experience(self, experiences):
        """ 
        Updates the deep neural network based on a batch of previously stored experiences
        :param experiences: batch of past experiences
        :return:
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)

        td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
        td_target = torch.from_numpy(td_target)
        action_idx = torch.from_numpy(action_batch)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()

    def save(self, env_name):
        file_name = self.params["save_dir"] + "DQL_" + env_name + ".ptm"
        agent_state = {
            "Q": self.Q.state_dict(),
            "best_mean_reward": self.best_mean_reward,
            "best_reward": self.best_reward,
            "total_trainings": self.total_trainings
        }
        torch.save(agent_state, file_name)
        print("NN guardada en: ", file_name)

    def load(self, env_name):
        file_name = self.params["load_dir"] + "DQL_" + env_name + ".ptm"
        agent_state = torch.load(file_name,
                                 map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        #self.Q.eval()
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        self.total_trainings = agent_state["total_trainings"]
        print(
            "NN cargada desde: {} \nMejor recompensa media: {:.3f}\nMejor recompensa: {:.3f}\nTrains: {}"
            .format(file_name, self.best_mean_reward, self.best_reward,
                    self.total_trainings))
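This example builds its Q-network as `CNN(obs_shape, action_shape, hidden_shape)`, while earlier examples use a `CNN(obs_shape, action_shape, device)` form; both ship with the course utilities. A minimal sketch of a compatible convolutional Q-network for the three-argument form used here, assuming observations arrive batched in C x H x W order (the layer sizes are assumptions):

import torch

class CNN(torch.nn.Module):
    """A small convolutional Q-network: image observations in, one Q-value per action out."""
    def __init__(self, obs_shape, action_shape, hidden_shape):
        super(CNN, self).__init__()
        in_channels = obs_shape[0]  # assumes obs_shape is (C, H, W)
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            torch.nn.ReLU(),
            torch.nn.Conv2d(32, 64, kernel_size=4, stride=2),
            torch.nn.ReLU(),
        )
        # Work out the flattened size by pushing a dummy observation through the conv stack
        with torch.no_grad():
            dummy = torch.zeros(1, *obs_shape)
            conv_out_size = self.conv(dummy).view(1, -1).shape[1]
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(conv_out_size, hidden_shape),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_shape, action_shape),
        )

    def forward(self, x):
        if not torch.is_tensor(x):
            x = torch.from_numpy(x).float()  # the agents above pass NumPy arrays
        x = self.conv(x)
        return self.fc(x.view(x.size(0), -1))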
Example #15
class Deep_Q_Learner(object):
    def __init__(self, state_shape, action_shape, params):
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.params = params
        self.gamma = self.params['gamma']
        self.learning_rate = self.params['lr']
        self.best_mean_reward = -float("inf")
        self.best_reward = -float("inf")
        self.training_steps_completed = 0

        if len(self.state_shape) == 1:
            self.DQN = SLP
        elif len(self.state_shape) == 3:
            self.DQN = CNN

        self.Q = self.DQN(state_shape, action_shape, device).to(device)
        self.Q.apply(utils.weights_initializer.xavier)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.learning_rate)
        if self.params['use_target_network']:
            self.Q_target = self.DQN(state_shape, action_shape,
                                     device).to(device)

        self.policy = self.epsilon_greedy_Q
        self.epsilon_max = params["epsilon_max"]
        self.epsilon_min = params["epsilon_min"]
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=self.params['epsilon_decay_final_step'])
        self.step_num = 0
        self.memory = ExperienceMemory(
            capacity=int(self.params['experience_memory_capacity']))

    def get_action(self, observation):
        observation = np.array(observation)
        observation = observation / 255
        if len(observation.shape) == 3:
            if observation.shape[2] < observation.shape[0]:
                observation = observation.reshape(observation.shape[2],
                                                  observation.shape[1],
                                                  observation.shape[0])
            observation = np.expand_dims(observation, 0)
        return self.policy(observation)

    def epsilon_greedy_Q(self, observation):
        writer.add_scalar("DQL/epsilon", self.epsilon_decay(self.step_num),
                          self.step_num)
        self.step_num += 1
        if random.random() < self.epsilon_decay(self.step_num) and not self.params["test"]:
            action = random.choice([i for i in range(self.action_shape)])
        else:
            action = np.argmax(
                self.Q(observation).data.to(torch.device('cpu')).numpy())
        return action

    def learn(self, obs, action, reward, obs_next, done):
        if done:
            td_target = reward + 0.0
        else:
            td_target = reward + (self.gamma * torch.max(self.Q(obs_next)))
        td_error = td_target - self.Q(obs)[action]
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def learn_from_batch_experience(self, experiences):
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs) / 255.0
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        if self.params["clip_rewards"]:
            reward_batch = np.sign(reward_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.0
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            if self.step_num % self.params['target_network_update_freq'] == 0:
                self.Q_target.load_state_dict(self.Q.state_dict())
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q_target(next_obs_batch).max(1)[0].data.cpu().numpy()
        else:
            td_target = reward_batch + ~done_batch * \
                np.tile(self.gamma, len(next_obs_batch)) * \
                self.Q(next_obs_batch).max(1)[0].data.cpu().numpy()

        td_target = torch.from_numpy(td_target).to(device)
        action_idx = torch.from_numpy(action_batch).to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, action_idx.view(-1, 1)),
            td_target.float().unsqueeze(1))
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        writer.add_scalar("DQL/td_error", td_error.mean(), self.step_num)
        self.Q_optimizer.step()

    def replay_experience(self, batch_size=None):
        batch_size = batch_size if batch_size is not None else self.params[
            'replay_batch_size']
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)
        self.training_steps_completed += 1

    def save(self, env_name):
        file_name = self.params['save_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = {
            "Q": self.Q.state_dict(),
            "best_mean_reward": self.best_mean_reward,
            "best_reward": self.best_reward
        }
        torch.save(agent_state, file_name)
        print("Agent's state saved to ", file_name)

    def load(self, env_name):
        file_name = self.params['load_dir'] + "DQL_" + env_name + ".ptm"
        agent_state = torch.load(file_name,
                                 map_location=lambda storage, loc: storage)
        self.Q.load_state_dict(agent_state["Q"])
        self.Q.to(device)
        self.best_mean_reward = agent_state["best_mean_reward"]
        self.best_reward = agent_state["best_reward"]
        print("Loaded Q model state from", file_name,
              " which fetched a best mean reward of:", self.best_mean_reward,
              " and an all time best reward of:", self.best_reward)
Example #16
class SwallowQLearner(object):
    def __init__(self, environment, learning_rate=0.005, gamma=0.98):
        self.obs_shape = environment.observation_space.shape

        self.action_shape = environment.action_space.n
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=learning_rate)

        self.gamma = gamma

        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q

        self.memory = ExperienceMemory(capacity=int(1e5))
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):
            action = random.choice([a for a in range(self.action_shape)])
        else:
            action = np.argmax(
                self.Q(obs).data.to(torch.device('cpu')).numpy())
        self.step_num += 1  ## IN THE VIDEO WE FORGOT TO INCREMENT THE STEP BY ONE UNIT
        return action

    def learn(self, obs, action, reward, next_obs):
        td_target = reward + self.gamma * torch.max(self.Q(next_obs))
        td_error = torch.nn.functional.mse_loss(self.Q(obs)[action], td_target)
        self.Q_optimizer.zero_grad()
        td_error.backward()
        self.Q_optimizer.step()

    def replay_experience(self, batch_size):
        """
        Replays a randomly sampled batch of stored experience
        :param batch_size: Size of the sample to draw from memory
        :return:
        """
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)

    def learn_from_batch_experience(self, experiences):
        """
        Updates the deep neural network based on a batch of previously stored experiences
        :param experiences: batch of past experiences
        :return:
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)

        td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
        td_target = torch.from_numpy(td_target)
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))

        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()
class SwallowQLearner(object):
    def __init__(
        self,
        environment,
        learning_rate=0.005,
        gamma=0.98
    ):  # initialization method; self is a reference to the object itself
        self.obs_shape = environment.observation_space.shape  # keep the observation space info (shape, high and low bounds)

        self.action_shape = environment.action_space.n  # number of actions
        self.Q = SLP(self.obs_shape, self.action_shape)
        self.Q_optimizer = torch.optim.Adam(
            self.Q.parameters(), lr=learning_rate)  # lr = learning rate

        self.gamma = gamma

        self.epsilon_max = 1.0
        self.epsilon_min = 0.05
        self.epsilon_decay = LinearDecaySchedule(
            initial_value=self.epsilon_max,
            final_value=self.epsilon_min,
            max_steps=0.5 * MAX_NUM_EPISODES * STEPS_PER_EPISODE)
        self.step_num = 0
        self.policy = self.epsilon_greedy_Q  # action-selection policy

        self.memory = ExperienceMemory(capacity=int(1e6))
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

    def get_action(self, obs):
        return self.policy(obs)

    def epsilon_greedy_Q(self, obs):
        if random.random() < self.epsilon_decay(self.step_num):  # random number
            action = random.choice([a for a in range(self.action_shape)])  # pick one of the available actions at random
        else:
            action = np.argmax(
                self.Q(obs).data.to(torch.device('cpu')).numpy())
        return action

    def learn(self, obs, action, reward, next_obs):
        #discrete_obs = self.discretize(obs)
        #discrete_next_obs = self.discretize(next_obs)
        td_target = reward + self.gamma * torch.max(
            self.Q(next_obs))  # bootstrap with the max Q-value of the next state
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs)[action], td_target)  # loss between prediction and TD target
        # Gradient update:
        self.Q_optimizer.zero_grad()
        td_error.backward()  # backward pass
        self.Q_optimizer.step()  # update the network weights

    def replay_experience(self, batch_size):
        """
        Replays a randomly sampled batch of stored experience
        :param batch_size: Size of the sample to draw from memory
        :return:
        """
        experience_batch = self.memory.sample(batch_size)
        self.learn_from_batch_experience(experience_batch)

    def learn_from_batch_experience(self, experiences):
        """
        Updates the deep neural network based on a batch of previously stored experiences
        :param experiences: batch of past experiences
        :return:
        """
        batch_xp = Experience(*zip(*experiences))
        obs_batch = np.array(batch_xp.obs)
        action_batch = np.array(batch_xp.action)
        reward_batch = np.array(batch_xp.reward)
        next_obs_batch = np.array(batch_xp.next_obs)
        done_batch = np.array(batch_xp.done)

        td_target = reward_batch + ~done_batch * \
                    np.tile(self.gamma, len(next_obs_batch)) * \
                    self.Q(next_obs_batch).detach().max(1)[0].data.numpy()
        #           ~done_batch: only bootstrap if the episode has not ended
        td_target = torch.from_numpy(td_target)  # convert to a tensor so we can operate on it
        td_target = td_target.to(self.device)
        action_idx = torch.from_numpy(action_batch).to(self.device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1,
                                     action_idx.view(-1, 1).long()),
            td_target.float().unsqueeze(1))
        self.Q_optimizer.zero_grad()
        td_error.mean().backward()
        self.Q_optimizer.step()  # take an optimizer step so the network learns