Example #1
class DQNAgent:

    def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05, epsilon_decay=1, epsilon_min=0.05, tau=1, game='cartpole', exploration="epsilon_greedy", history_length=0):  #, load_data=False
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q      
        self.Q_target = Q_target
        self.game = game

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.tau = tau
        self.epsilon_min = epsilon_min

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        self.exploration = exploration

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()


    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update: 
        #       2.1 compute td targets: 
        #              td_target =  reward + discount * max_a Q_target(next_state_batch, a)
        #       2.2 update the Q network
        #              self.Q.update(...)
        #       2.3 call soft update for target network
        #              self.Q_target.update(...)
   
        '''
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)        
        states, actions, next_states, rewards, dones = self.replay_buffer.next_batch (self.batch_size)
 
        target_f = np.zeros((self.batch_size))

        for i in range(self.batch_size):
            if dones[i]:
                target_f[i] = rewards[i]
            else:
                target_f[i] = rewards[i] + self.discount_factor * np.max(self.Q_target.predict(self.sess, [next_states[i]]), 1)
                

        loss = self.Q.update(self.sess, states, actions, target_f)

        self.Q_target.update(self.sess)
        '''
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)
        batch_state, batch_action, batch_next_state, batch_rewards, batch_done = self.replay_buffer.next_batch(self.batch_size)

        # Double DQN style target: the online network selects the best next action,
        # the target network evaluates it; terminal transitions keep only the reward.
        td_target = batch_rewards

        best_action = np.argmax(self.Q.predict(self.sess, batch_next_state)[np.logical_not(batch_done)], 1)

        td_target[np.logical_not(batch_done)] += self.discount_factor * self.Q_target.predict(self.sess, batch_next_state)[np.logical_not(batch_done), best_action]

        loss = self.Q.update(self.sess, batch_state, batch_action, td_target)
        self.Q_target.update(self.sess)


        return loss


    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        '''
        r = np.random.uniform()

        if deterministic or r > self.epsilon:
            # TODO: take greedy action (argmax)
            # action_id = ...
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
        
            if self.game == 'cartpole':
                action_id = random.randrange(self.num_actions)
            elif self.game == 'carracing':

            # TODO: sample random action
            # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. 
            # You can sample the agent's actions with different probabilities (they need to sum up to 1) so that the agent will prefer to accelerate or go straight.
            # To see how the agent explores, turn on the rendering during training and watch what the agent is doing.
            # action_id = ...
                
                probabilities = [0.1, 0.2, 0.2, 0.45, 0.05]
                
                action_id = np.random.choice (self.num_actions, p=probabilities)
        '''

        if deterministic:
            
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
            if self.exploration == "greedy":
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay
                r = np.random.uniform()
                if r > self.epsilon:
                    # TODO: take greedy action (argmax)
                    
                    action_id = np.argmax(self.Q.predict(self.sess, [state]))
                else:
                    # TODO: sample random action
                    # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work. 
                    # You can sample the agent's actions with different probabilities (they need to sum up to 1) so that the agent will prefer to accelerate or go straight.
                    # To see how the agent explores, turn on the rendering during training and watch what the agent is doing.
                    if self.game == "cartpole" :
                        action_id = np.random.randint(self.num_actions)
                    elif self.game == "carracing":
                        probabilities = [0.15, 0.15, 0.15, 0.3, 0.05, 0.1, 0.1]
                
                        action_id = np.random.choice (self.num_actions, p=probabilities)
                    else:
                        print("Invalid game")
            elif self.exploration == "boltzmann":
                action_value = self.Q.predict(self.sess, [state])[0]
                prob = self.softmax(action_value/self.tau)
                action_id = np.random.choice(self.num_actions, p=prob)
            else:
                print("Invalid Exploration Type")


        return action_id


    def softmax(self, input):
        """
        Safe Softmax function to avoid overflow
        Args:
            input: input vector
        Returns:
            prob: softmax of input
        """
        input_max = np.max(input)
        e = np.exp(input-input_max)
        prob = e / np.sum(e)
        return prob


    def load(self, file_name):
        self.saver.restore(self.sess, file_name)


    def check_early_stop(self, reward, totalreward):
        return self.Q_target.check_early_stop (reward, totalreward)
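All of these examples rely on a ReplayBuffer class with add_transition and next_batch methods that is not shown on this page (its constructor arguments vary slightly between the examples). Below is a minimal sketch of such a buffer under those assumptions; the internal storage layout is illustrative only and does not reproduce the _data attribute some examples poke into.

import numpy as np
from collections import deque

class ReplayBuffer:
    """Minimal FIFO replay buffer sketch matching the calls used in the examples above."""

    def __init__(self, capacity=100000):
        # transitions are stored as (state, action, next_state, reward, done) tuples
        self._data = deque(maxlen=capacity)

    def add_transition(self, state, action, next_state, reward, done):
        self._data.append((state, action, next_state, reward, done))

    def next_batch(self, batch_size):
        # sample uniformly with replacement and return five parallel arrays
        indices = np.random.randint(0, len(self._data), size=batch_size)
        batch = [self._data[i] for i in indices]
        states, actions, next_states, rewards, dones = map(np.array, zip(*batch))
        return states, actions, next_states, rewards, dones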
Example #2
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 game,
                 exploration,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.2,
                 epsilon_decay=0.99,
                 epsilon_min=0.03):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        self.exploration = exploration

        self.game = game

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, done):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #       2.1 compute td targets:
        #              td_target =  reward + discount * max_a Q_target(next_state_batch, a)
        #       2.2 update the Q network
        #              self.Q.update(...)
        #       2.3 call soft update for target network
        #              self.Q_target.update(...)

        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          done)
        batch_state, batch_action, batch_next_state, batch_rewards, batch_done = self.replay_buffer.next_batch(
            self.batch_size)

        td_target = batch_rewards
        #td_target += self.discount_factor * np.amax(self.Q_target.predict(self.sess, batch_next_state)) #use this or think of something better

        # Double DQN: the online network picks the index of the best next action (argmax, not amax)
        best_action = np.argmax(
            self.Q.predict(self.sess,
                           batch_next_state)[np.logical_not(batch_done)], 1)
        td_target[np.logical_not(
            batch_done)] += self.discount_factor * self.Q_target.predict(
                self.sess, batch_next_state)[np.logical_not(batch_done),
                                             best_action]

        self.Q.update(self.sess, batch_state, batch_action, td_target)
        self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()

        if deterministic:

            action_id = np.argmax(self.Q.predict(self.sess, [state]))

        else:

            if self.exploration == "greedy":

                if self.epsilon > self.epsilon_min:

                    self.epsilon *= self.epsilon_decay

                r = np.random.uniform()

                if r > self.epsilon:
                    # TODO: take greedy action (argmax)
                    action_id = np.argmax(self.Q.predict(self.sess, [state]))

                else:

                    # TODO: sample random action
                    # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
                    # You can sample the agent's actions with different probabilities (they need to sum up to 1) so that the agent will prefer to accelerate or go straight.
                    # To see how the agent explores, turn on the rendering during training and watch what the agent is doing.
                    # action_id = ...

                    if self.game == "cartpole":
                        action_id = np.random.randint(
                            self.num_actions)  #define number of actions

                    #else if self.game == "CarRacing" :

                    #action_id = ....

                    else:
                        print('Please enter a valid game.')

    # if exploration == "boltzmann":

    #  else:

        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
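The vectorized TD-target computation used throughout these examples masks out terminal transitions with a boolean array, and the double-Q variant splits action selection (online network) from action evaluation (target network). A small self-contained NumPy illustration of both patterns with toy numbers, no networks involved:

import numpy as np

rewards  = np.array([1.0, 1.0, -1.0, 1.0])
dones    = np.array([False, True, False, True])
gamma    = 0.99
q_target = np.array([[0.5, 2.0], [0.1, 0.3], [1.5, 0.2], [0.0, 0.9]])  # Q_target(s', .)
q_online = np.array([[2.1, 0.4], [0.2, 0.1], [0.3, 1.0], [0.5, 0.4]])  # Q(s', .)
not_done = np.logical_not(dones)

# vanilla DQN: td = r + gamma * max_a Q_target(s', a) for non-terminal transitions
td_vanilla = rewards.copy()
td_vanilla[not_done] += gamma * np.max(q_target, axis=1)[not_done]
print(td_vanilla)  # values: 2.98, 1.0, 0.485, 1.0

# double DQN: select a* with the online network, evaluate it with the target network
best_a = np.argmax(q_online, axis=1)
td_double = rewards.copy()
td_double[not_done] += gamma * q_target[np.arange(len(rewards)), best_a][not_done]
print(td_double)   # values: 1.495, 1.0, -0.802, 1.0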
Example #3
class Agent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.95,
                 epsilon_min=0.05,
                 epsilon_decay=0.995,
                 exploration_type='e-annealing',
                 learning_type='dq',
                 replay_buffer_size=1e5):
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        self.exploration_type = exploration_type
        self.learning_type = learning_type

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(replay_buffer_size)

        # start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    # add transition to the replay buffer
    def add(self, state, action, next_state, reward, terminal):
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)

    # train network
    def train(self):
        # sample batch from the replay buffer
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)

        # compute td targets using q- or double q-learning
        if self.learning_type == 'q':  # q learning
            batch_rewards[np.logical_not(
                batch_dones)] += self.discount_factor * np.max(
                    self.Q_target.predict(self.sess, batch_next_states),
                    axis=1)[np.logical_not(batch_dones)]
        else:  # double q learning
            q_actions = np.argmax(self.Q.predict(self.sess, batch_next_states),
                                  axis=1)
            batch_rewards[np.logical_not(
                batch_dones)] += self.discount_factor * self.Q_target.predict(
                    self.sess,
                    batch_next_states)[np.arange(self.batch_size),
                                       q_actions][np.logical_not(batch_dones)]

        # update network and target network
        loss = self.Q.update(self.sess, batch_states, batch_actions,
                             batch_rewards)
        self.Q_target.update(self.sess)

        return loss

    # get action for state
    def act(self, state, deterministic):
        r = np.random.uniform()
        if deterministic or (self.exploration_type != 'boltzmann'
                             and r > self.epsilon):
            # take greedy action (argmax)
            a_pred = self.Q.predict(self.sess, [state])
            action_id = np.argmax(a_pred)
        else:
            if self.exploration_type == 'boltzmann':
                actions = self.Q.predict(self.sess, [state])[0]

                # softmax calculation, subtracting max for stability
                actions = np.exp((actions - max(actions)) / self.epsilon)
                actions /= np.sum(actions)

                # sample the action index directly from the softmax probabilities
                action_id = np.random.choice(self.num_actions, p=actions)
            else:
                # sample random action
                action_id = np.random.randint(0, self.num_actions)
        return action_id

    # anneal epsilon
    def anneal(self, e=0):
        self.epsilon = max(self.epsilon_min,
                           self.epsilon * self.epsilon_decay)  # multiplicative decay
        #self.epsilon = max(self.epsilon_min, self.epsilon * np.exp(-(1 - self.epsilon_decay) * e))

    # load trained network
    def load(self, folder):
        self.saver.restore(self.sess, tf.train.latest_checkpoint(folder))
Example #4
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.995,
                 batch_size=64,
                 epsilon=0.05):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.99

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        self.neg_reward_counter = 0
        self.max_neg_rewards = 100

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)
        # 2. sample next batch and perform batch update:
        #self.gas_actions = np.array([a == 3 for a in self.replay_buffer._data.actions])
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)

        td_target = batch_rewards
        td_target[np.logical_not(
            batch_dones)] += self.discount_factor * np.amax(
                self.Q_target.predict(self.sess, batch_next_states),
                1)[np.logical_not(batch_dones)]
        #print(batch_actions)
        loss = self.Q.update(self.sess, batch_states, batch_actions, td_target)

        self.Q_target.update(self.sess)

        #if self.epsilon > self.epsilon_min:
        #   self.epsilon *= self.epsilon_decay
        #print(self.epsilon)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            act_values = self.Q.predict(self.sess, [state])
            action_id = np.argmax(act_values)
            #print("I PREDICTED")
            #print("action_id_predicted: ", action_id)
            return action_id
        else:
            action_id = np.random.choice(
                [0, 1, 2, 3, 4],
                p=[0.3, 0.1, 0.1, 0.49,
                   0.01])  #straight, left, right, accelerate, brake
            # TODO: sample random action
            # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
            # You can sample the agent's actions with different probabilities (they need to sum up to 1) so that the agent will prefer to accelerate or go straight.
            # To see how the agent explores, turn on the rendering during training and watch what the agent is doing.
            # print("action_id: ", action_id)
            #print("action_id_random: ", action_id)
            return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
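Every example finishes a training step by calling self.Q_target.update(self.sess), i.e. a soft update of the target network toward the online network, but the Q-network classes themselves are not listed on this page. A possible TF1-style sketch of how such a soft update could be wired up, assuming both networks build identical variables under two variable scopes (the scope names and the tau value here are assumptions, not the actual API):

import tensorflow as tf

def build_soft_update_ops(online_scope, target_scope, tau=0.01):
    # pair up the variables of the two (identically built) networks by name order
    online_vars = sorted(tf.trainable_variables(scope=online_scope), key=lambda v: v.name)
    target_vars = sorted(tf.trainable_variables(scope=target_scope), key=lambda v: v.name)
    # theta_target <- tau * theta_online + (1 - tau) * theta_target
    return [tf.assign(t, tau * o + (1.0 - tau) * t)
            for o, t in zip(online_vars, target_vars)]

# Build the ops once after constructing both networks, then run them after every Q update:
#   soft_update_ops = build_soft_update_ops("Q", "Q_target", tau=0.01)
#   sess.run(soft_update_ops)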
Example #5
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.05,
                 act_probabilities=None,
                 double_q=False,
                 buffer_capacity=100000,
                 prefill_bs_percentage=5):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer(capacity=buffer_capacity,
                                          min_fill=prefill_bs_percentage *
                                          batch_size)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

        # <JAB>
        if act_probabilities is None:
            self.act_probabilities = np.ones(num_actions) / num_actions
        else:
            self.act_probabilities = act_probabilities

        self.double_dqn = double_q

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #       2.1 compute td targets:
        #              td_target = reward + discount * max_a Q_target(next_state_batch, a)
        #       2.2 update the Q network
        #              self.Q.update(...)
        #       2.3 call soft update for target network
        #              self.Q_target.update(...)

        # <JAB>
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)

        # Let the buffer fill up, otherwise we will burn up a lot of $#!+¥ states early on
        if self.replay_buffer.has_min_items():
            buffer = self.replay_buffer.next_batch(self.batch_size)
            batch_states = buffer[0]
            batch_actions = buffer[1]
            batch_next_states = buffer[2]
            batch_rewards = buffer[3]
            batch_dones = buffer[4]

            non_terminal_states = np.logical_not(batch_dones)

            if self.double_dqn:
                a_predictions = self.Q.predict(self.sess, batch_next_states)
                a_predictions = np.argmax(a_predictions, axis=1)
                action_indexes = (np.arange(len(a_predictions)), a_predictions)
                q_predictions = self.Q_target.predict(self.sess,
                                                      batch_next_states)
                q_predictions = q_predictions[action_indexes]

            else:
                q_predictions = self.Q_target.predict(self.sess,
                                                      batch_next_states)
                q_predictions = np.max(q_predictions, axis=1)

            td_target = batch_rewards
            # If episode is not finished, add predicted Q values to the current rewards
            td_target[
                non_terminal_states] += self.discount_factor * q_predictions[
                    non_terminal_states]

            # Update Step
            self.Q.update(self.sess, batch_states, batch_actions, td_target)
            self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # <JAB>
            action_id = np.argmax(self.Q.predict(self.sess, state))
            # </JAB>

        else:

            # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
            # You can sample the agent's actions with different probabilities (they need to sum up to 1) so that the agent will prefer to accelerate or go straight.
            # To see how the agent explores, turn on the rendering during training and watch what the agent is doing.
            # action_id = ...
            # <JAB>
            action_id = np.random.choice(np.arange(self.num_actions),
                                         p=self.act_probabilities)
            # </JAB>

        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
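Example #5 additionally delays training until the buffer holds a minimum number of transitions (the has_min_items / min_fill calls). Building on the buffer sketch after Example #1, a possible extension, with the same caveat that the real class is not shown here:

class ReplayBufferWithPrefill(ReplayBuffer):
    # adds the capacity / min_fill behaviour that Example #5 expects (names assumed from the calls above)
    def __init__(self, capacity=100000, min_fill=320):
        super().__init__(capacity=capacity)
        self.min_fill = min_fill

    def has_min_items(self):
        # only start sampling batches once enough transitions have been collected
        return len(self._data) >= self.min_fill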
Example #6
class DQNAgent:
    def __init__(self,
                 name,
                 Q_current,
                 Q_target,
                 num_actions,
                 discount_factor,
                 batch_size,
                 epsilon,
                 epsilon_decay,
                 boltzmann,
                 double_q,
                 buffer_capacity,
                 random_probs=None):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        # save hyperparameters in folder

        self.name = name  # probably useless
        self.Q_current = Q_current
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.boltzmann = boltzmann

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        self.buffer_capacity = buffer_capacity

        self.double_q = double_q

        self.random_probs = random_probs

        # define replay buffer
        self.replay_buffer = ReplayBuffer(capacity=buffer_capacity)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)

        # 2. sample next batch
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)

        # find optimal actions for the sampled s' states
        if self.double_q:
            # double Q learning (select actions using current network, rather than target network)
            # ...in order to decorrelate noise between selection and evaluation
            # (Q(state,action) is still evaluated using target network in any case)
            action_selector = self.Q_current
        else:
            action_selector = self.Q_target

        # as usual, the Q network returns a vector of... predicted values for every possible action
        a_prime = np.argmax(action_selector.predict(self.sess,
                                                    batch_next_states),
                            axis=1)

        # pick the a'-th value from each row of the Q_target prediction
        # note, this will include action predictions for "done" states, but we'll kill them later
        q_values_next = self.Q_target.predict(
            self.sess, batch_next_states)[np.arange(self.batch_size), a_prime]

        # 2.1 compute td targets:
        # if done, there will be no next state
        td_targets = batch_rewards + np.where(
            batch_dones, 0, self.discount_factor * q_values_next)

        # 2.2 update the Q (current) network
        self.Q_current.update(self.sess, batch_states, batch_actions,
                              td_targets)

        # 2.3 call soft update for target network
        # this is done by the dodgy associate_method therein
        self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """

        # get action probabilities from current network
        Q_values = np.squeeze(
            self.Q_current.predict(self.sess, np.expand_dims(state, axis=0)))

        argmax_a = np.argmax(Q_values)

        if deterministic:
            # take greedy action
            return argmax_a

        if self.boltzmann:
            # implementing an interaction here between boltzmann exploration and epsilon:
            # viz. that epsilon controls the temperature of the softmax function
            # so that as before, higher eps -> higher exploration
            action_probs = softmax(Q_values,
                                   temperature=1 / (1 - self.epsilon)**2)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)

        else:
            action_probs = np.zeros_like(Q_values)

            if np.random.uniform() > self.epsilon:
                # choose the best action
                action = argmax_a
            else:
                # explore
                if self.random_probs is None:
                    action = np.random.randint(self.num_actions, size=1)[0]

                else:
                    action = np.random.choice(np.arange(self.num_actions),
                                              p=self.random_probs)

        # we decay epsilon AFTER we've checked it
        # (nb: if deterministic, epsilon will never decay, but of course this doesn't matter)
        if self.epsilon_decay > 0:
            self.epsilon *= (1 - self.epsilon_decay)

        return action

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
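Example #6 calls an external softmax(values, temperature) helper that is not included in the snippet; its signature is inferred from the call above and is therefore an assumption. A numerically safe sketch:

import numpy as np

def softmax(values, temperature=1.0):
    # higher temperature -> flatter distribution -> more exploration;
    # subtract the maximum before exponentiating to avoid overflow
    scaled = np.asarray(values, dtype=np.float64) / temperature
    e = np.exp(scaled - np.max(scaled))
    return e / np.sum(e)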
Example #7
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.95,
                 batch_size=64,
                 epsilon=1):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)
        # 2. sample next batch and perform batch update:
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)
        for i in range(self.batch_size):
            # print("next state: ", batch_next_states[i])
            td_target = batch_rewards[i]
            if not batch_dones[i]:
                td_target = batch_rewards[i] + self.discount_factor * np.amax(
                    self.Q_target.predict(self.sess, [batch_next_states[i]]))
            target_f = self.Q_target.predict(self.sess, [batch_states[i]])

            target_f[0][batch_actions[i]] = td_target
            loss = self.Q.update(self.sess, [batch_states[i]],
                                 [batch_actions[i]], target_f[0])
            self.Q_target.update(self.sess)
        #print("loss:", loss)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        #print("epsilon: ", self.epsilon)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # TODO: take greedy action (argmax)
            #state = np.reshape(state, (1,4))
            act_values = self.Q.predict(self.sess, [state])  #it was q target
            # we should be using act_values[0], i guess
            # print("act values: ", act_values)             # act values:  [[0.05641035 0.06138265]]
            # print("act values[0]: ", act_values[0])       # act values[0]:  [0.05641035 0.06138265]

            action_id = np.argmax(act_values[0])

            #print("predicted action. deterministic: {}. epsilon cond: {}. action_id: {}."
            #.format(deterministic, (r > self.epsilon), action_id))
        else:
            action_id = random.randrange(self.num_actions)
            #print("random action. deterministic: {}. epsilon cond.: {}. action_id: {}."
            #.format(deterministic, (r > self.epsilon), action_id))
            # TODO: sample random action
            # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
            # You can sample the agent's actions with different probabilities (they need to sum up to 1) so that the agent will prefer to accelerate or go straight.
            # To see how the agent explores, turn on the rendering during training and watch what the agent is doing.
        # print("action_id: ", action_id)
        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
Example #8
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 game="cartpole",
                 explore_type="epsilon_greedy",
                 epsilon_decay=1,
                 epsilon_min=0.05,
                 tau=1,
                 method="CQL",
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.05):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        # now support cartpole or carracing two games
        self.game = game
        # self.state_dim = Q.
        self.epsilon = epsilon
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        # now support CQL(classical Q) or DQL(Double Q)
        self.method = method
        self.explore_type = explore_type
        # for epsilon annealing
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        # for boltzmann exploration
        self.tau = tau
        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #       2.1 compute td targets:
        #              td_target = reward + discount * max_a Q_target(next_state_batch, a)
        #       2.2 update the Q network
        #              self.Q.update(...)
        #       2.3 call soft update for target network
        #              self.Q_target.update(...)
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = self.replay_buffer.next_batch(
            self.batch_size)
        td_target = batch_rewards
        if self.method == "CQL":
            td_target[np.logical_not(
                batch_dones)] += self.discount_factor * np.max(
                    self.Q_target.predict(self.sess, batch_next_states),
                    1)[np.logical_not(batch_dones)]
            self.Q.update(self.sess, batch_states, batch_actions, td_target)
            self.Q_target.update(self.sess)
        elif self.method == "DQL":
            best_action = np.argmax(
                self.Q.predict(self.sess,
                               batch_next_states)[np.logical_not(batch_dones)],
                1)
            td_target[np.logical_not(
                batch_dones)] += self.discount_factor * self.Q_target.predict(
                    self.sess, batch_next_states)[np.logical_not(batch_dones),
                                                  best_action]
            self.Q.update(self.sess, batch_states, batch_actions, td_target)
            self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)    
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        if deterministic:
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
            if self.explore_type == "epsilon_greedy":
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay
                r = np.random.uniform()
                if r > self.epsilon:
                    # TODO: take greedy action (argmax)
                    action_id = np.argmax(self.Q.predict(self.sess, [state]))
                else:
                    # TODO: sample random action
                    # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
                    # You can sample the agent's actions with different probabilities (they need to sum up to 1) so that the agent will prefer to accelerate or go straight.
                    # To see how the agent explores, turn on the rendering during training and watch what the agent is doing.
                    if self.game == "cartpole" or self.game == "mountaincar":
                        action_id = np.random.randint(self.num_actions)
                    elif self.game == "carracing":
                        # action_probability = np.array([1, 2, 2, 10, 1, 1, 1])
                        action_probability = np.array([2, 5, 5, 10, 1])
                        action_probability = action_probability / np.sum(
                            action_probability)
                        action_id = np.random.choice(self.num_actions,
                                                     p=action_probability)
                    else:
                        print("Invalid game")
            elif self.explore_type == "boltzmann":
                action_value = self.Q.predict(self.sess, [state])[0]
                prob = self.softmax(action_value / self.tau)
                action_id = np.random.choice(self.num_actions, p=prob)
            else:
                print("Invalid Exploration Type")
        return action_id

    def softmax(self, input):
        """
        Safe Softmax function to avoid overflow
        Args:
            input: input vector
        Returns:
            prob: softmax of input
        """
        input_max = np.max(input)
        e = np.exp(input - input_max)
        prob = e / np.sum(e)
        return prob

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
Example #9
class DQNAgent:
    def __init__(self,
                 Q,
                 Q_target,
                 num_actions,
                 discount_factor=0.99,
                 batch_size=64,
                 epsilon=0.05):
        """
         Q-Learning agent for off-policy TD control using Function Approximation.
         ########################################################################
         TD: the update target used here is R + discount_factor * Q(S', A')
         off-policy -> old data collected under a different policy can be used, too
         #######################################################################
         Finds the optimal greedy policy while following an epsilon-greedy policy.

         Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target

        self.epsilon = epsilon

        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer(use_manual_data=False)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self,
              state,
              action,
              next_state,
              reward,
              terminal,
              collect_data_first=False):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """

        # add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward,
                                          terminal)
        # if the ReplayBuffer should be filled up first, then the train step is done here
        if collect_data_first and len(
                self.replay_buffer._data.states) < self.batch_size:
            print("No training yet. Filling up replay buffer..")

            # return 0 for loss and q_values
            return 0, [0, 0]

        # If the ReplayBuffer should not be filled up or is full enough, do the following
        else:
            # get a random batch from the ReplayBuffer
            batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
                self.replay_buffer.next_batch(self.batch_size)

            batch_targets = np.zeros((self.batch_size))

            for i in range(self.batch_size):
                # if a state is a final state, only use the direct reward
                if batch_dones[i]:
                    batch_targets[i] = batch_rewards[i]
                # otherwise compute the td_target
                else:
                    td_target = batch_rewards[i] + self.discount_factor * \
                        np.max(self.Q_target.predict(self.sess, [batch_next_states[i]]))
                    batch_targets[i] = td_target

            # update Q network
            loss = self.Q.update(self.sess, batch_states, batch_actions,
                                 batch_targets)
            # get predictions to check q-values -> e.g. are they diverging?
            q_preds = self.Q.predict(self.sess, batch_states)

            # update target network
            self.Q_target.update(self.sess)

        return loss, q_preds

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and epsilon (probability to select a random action)
        Args:
            state: current state input
            deterministic:  if True, the agent should execute the argmax action (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # take greedy action (argmax)
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
            # print("Deterministic action:", action_id)
        else:

            # sample random action
            # Hint for the exploration in CarRacing: sampling the action from a uniform distribution will probably not work.
            # You can sample the agent's actions with different probabilities (they need to sum up to 1) so that the agent will prefer to accelerate or go straight.
            # To see how the agent explores, turn on the rendering during training and watch what the agent is doing.
            # for carracing:
            if self.num_actions == 5:
                action_id = np.random.choice(range(5),
                                             p=[0.32, 0.09, 0.09, 0.4, 0.1])
            # for cartpole
            else:
                action_id = np.random.randint(self.num_actions)
            # print("Explorative action:", action_id)

        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)
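None of the examples show the surrounding training loop. A minimal sketch of how such an agent is typically driven on CartPole, assuming the old OpenAI Gym API from the same era as this TF1 code (environment name, episode counts and the printout are illustrative, not taken from the original projects):

import gym

def train_online(agent, num_episodes=1000, max_timesteps=1000):
    # the Q networks, the TensorFlow session and any state preprocessing
    # are assumed to be set up by the caller before the agent is passed in
    env = gym.make("CartPole-v0")
    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_timesteps):
            action = agent.act(state, deterministic=False)
            next_state, reward, done, _ = env.step(action)
            agent.train(state, action, next_state, reward, done)
            episode_reward += reward
            state = next_state
            if done:
                break
        print("episode %d: reward %.1f" % (episode, episode_reward))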