class DDPGAgent(Agent):
    ''' stevenpjg's implementation of the DDPG algorithm '''

    REPLAY_MEMORY_SIZE = 10000
    BATCH_SIZE = 64
    GAMMA = 0.99

    def __init__(self,
                 env,
                 dir,
                 is_batch_norm=False,
                 is_grad_inverter=True,
                 training_flag=True):
        super().__init__(env, dir)
        assert isinstance(env.action_space,
                          Box), "action space must be continuous"

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.observation_space_size,
                                           self.action_space_size)
            self.actor_net = ActorNet_bn(self.observation_space_size,
                                         self.action_space_size)

        else:
            self.critic_net = CriticNet(self.observation_space_size,
                                        self.action_space_size)
            self.actor_net = ActorNet(self.observation_space_size,
                                      self.action_space_size)

        self.is_grad_inverter = is_grad_inverter
        self.training_flag = training_flag
        self.replay_memory = deque()

        self.time_step = 0

        action_max = np.array(self.high).tolist()
        action_min = np.array(self.low).tolist()
        action_bounds = [action_max, action_min]
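        # The gradient inverter (cf. Hausknecht & Stone's "inverting gradients"
        # trick) uses these bounds to scale/invert dQ/da so actor updates keep
        # actions inside [low, high].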
        self.grad_inv = grad_inverter(action_bounds)

        self.data_fetch = None

    def add_data_fetch(self, df):
        self.data_fetch = df

    def get_short_name(self):
        return 'DDPG'

    def act(self, state):
        state = self._np_shaping(state, True)
        result = self.actor_net.evaluate_actor(state).astype(float)

        if self.data_fetch:
            self.data_fetch.set_actors_action(result[0].tolist())

        return result

    def observe(self, episode):
        episode['obs'] = self._np_shaping(episode['obs'], True)
        episode['action'] = self._np_shaping(episode['action'], False)
        episode['obs2'] = self._np_shaping(episode['obs2'], True)

        self.add_experience(episode)

    def add_experience(self, episode):
        self.replay_memory.append(episode)

        self.time_step += 1
        if len(self.replay_memory) > type(self).REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()

        if len(self.replay_memory) > type(self).BATCH_SIZE:
            res = self.train()
            return res
        else:
            return None

    def minibatches(self):
        batch = random.sample(self.replay_memory, type(self).BATCH_SIZE)
        # state t
        state = self._np_shaping(np.array([item['obs'] for item in batch]),
                                 True)
        # action
        action = self._np_shaping(np.array([item['action'] for item in batch]),
                                  False)
        # reward
        reward = np.array([item['reward'] for item in batch])
        # state t+1
        state_2 = self._np_shaping(np.array([item['obs2'] for item in batch]),
                                   True)
        # done
        done = np.array([item['done'] for item in batch])

        return state, action, reward, state_2, done

    def train(self):
        if not self.training_flag:
            return

        # sample a random minibatch of N transitions from R
        state, action, reward, state_2, done = self.minibatches()

        actual_batch_size = len(state)

        target_action = self.actor_net.evaluate_target_actor(state)

        # Q'(s_i+1,a_i+1)
        q_t = self.critic_net.evaluate_target_critic(state_2, target_action)

        y = []
        for i in range(actual_batch_size):
            if done[i]:
                y.append(reward[i])
            else:
                # bootstrap with the target critic: r_i + GAMMA * Q'(s_{i+1}, a_{i+1})
                y.append(reward[i] + type(self).GAMMA * q_t[i][0])

        y = np.reshape(np.array(y), [len(y), 1])

        # Update critic by minimizing the loss
        self.critic_net.train_critic(state, action, y)

        # Update the actor in the direction of the critic's action gradients.
        # The raw actor output is used here; the wrapped (Wolpertinger) action
        # from self.act() is not needed for this step.
        action_for_delQ = self.actor_net.evaluate_actor(state)

        if self.is_grad_inverter:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)
            del_Q_a = self.grad_inv.invert(del_Q_a, action_for_delQ)
        else:
            del_Q_a = self.critic_net.compute_delQ_a(state, action_for_delQ)[0]

        # Train the actor using dQ/da chained with d(actor output)/d(actor parameters):
        self.actor_net.train_actor(state, del_Q_a)

        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()

    def save_agent(self, force=False, comment="default"):
        path = "{}/weights/{}".format(self.get_dir(), comment)
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
            print("Saving agent in", path)
        elif force:
            print("Overwriting old agent in", path)
        else:
            print("Save aborted. An agent is already saved in", path)
            return
        self.actor_net.save_model(path + '/actor.ckpt')
        self.critic_net.save_model(path + '/critic.ckpt')

    def load_agent(self, agent_name=None, comment="default"):
        if agent_name is None:
            path = "{}/weights/{}".format(self.get_dir(), comment)
        else:
            path = "{}/{}/{}/weights/{}".format(self.result_dir, agent_name,
                                                self.env.spec.id, comment)
        if os.path.exists(path):
            print("Loading agent saved in", path)
            self.actor_net.load_model(path + '/actor.ckpt')
            self.critic_net.load_model(path + '/critic.ckpt')
        else:
            print("Agent not found in", path)

    def close_session(self):
        self.actor_net.close()
        self.critic_net.close()
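
# Hypothetical usage sketch (not part of the original source): drives the
# DDPGAgent above through one classic-gym episode. Assumes the old four-tuple
# gym step API, a placeholder env id and results dir, and that _np_shaping
# adds the batch axis to a raw observation.
import gym

env = gym.make('Pendulum-v0')          # assumed env id; any Box-action env works
agent = DDPGAgent(env, 'results')      # 'results' is a placeholder results dir

observation = env.reset()
done = False
while not done:
    action = agent.act(observation)                            # actor forward pass
    observation_2, reward, done, _ = env.step(action[0])
    agent.observe({'obs': observation, 'action': action,       # store transition; trains
                   'reward': reward, 'obs2': observation_2,    # once the buffer exceeds
                   'done': done})                              # BATCH_SIZE
    observation = observation_2

agent.save_agent(force=True)
agent.close_session()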
Example #2
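
# Module-level hyperparameters assumed by this example; the original source
# defines them elsewhere. The values mirror the class constants of the
# DDPGAgent example above.
REPLAY_MEMORY_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.99
is_grad_inverter = True   # toggles the gradient-inverter branch in train()
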
class DDPG:
    """ Deep Deterministic Policy Gradient Algorithm """

    def __init__(self, env, is_batch_norm=False):
        self.env = env
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        if is_batch_norm:
            self.critic_net = CriticNet_bn(self.num_states, self.num_actions) 
            self.actor_net = ActorNet_bn(self.num_states, self.num_actions)
            
        else:
            self.critic_net = CriticNet(self.num_states, self.num_actions) 
            self.actor_net = ActorNet(self.num_states, self.num_actions)
        
        # Initialize the replay buffer:
        self.replay_memory = deque()
        
        # Initialize time step and counter:
        self.time_step = 0
        self.counter = 0
        
        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()        
        action_bounds = [action_max,action_min] 
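        # The gradient inverter (cf. Hausknecht & Stone's "inverting gradients"
        # trick) uses these bounds to scale/invert dQ/da so actor updates keep
        # actions inside [low, high].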
        self.grad_inv = grad_inverter(action_bounds)
        
    def load_model(self):
        try:
            self.critic_net.load_model()
            self.actor_net.load_model()
        except Exception as ex:
            print(ex)

    def save_model(self):    
        self.critic_net.save_model()
        self.actor_net.save_model()
        

    def evaluate_actor(self, state_t):
        return self.actor_net.evaluate_actor(state_t)
    
    def add_experience(self, observation_1, observation_2, action, reward, done):
        self.observation_1 = observation_1
        self.observation_2 = observation_2
        self.action = action
        self.reward = reward
        self.done = done
        self.replay_memory.append((self.observation_1, self.observation_2, self.action, self.reward,self.done))
        self.time_step = self.time_step + 1
        if len(self.replay_memory) > REPLAY_MEMORY_SIZE:
            self.replay_memory.popleft()
            
        
    def minibatches(self):
        batch = random.sample(self.replay_memory, BATCH_SIZE)
        #state t
        self.state_t_batch = [item[0] for item in batch]
        self.state_t_batch = np.array(self.state_t_batch)
        #state t+1        
        self.state_t_1_batch = [item[1] for item in batch]
        self.state_t_1_batch = np.array( self.state_t_1_batch)
        self.action_batch = [item[2] for item in batch]
        self.action_batch = np.array(self.action_batch)
        self.action_batch = np.reshape(self.action_batch,[len(self.action_batch),self.num_actions])
        self.reward_batch = [item[3] for item in batch]
        self.reward_batch = np.array(self.reward_batch)
        self.done_batch = [item[4] for item in batch]
        self.done_batch = np.array(self.done_batch)  
                  
                 
    def train(self):
        # Sample a random minibatch of N transitions from R
        self.minibatches()
        self.action_t_1_batch = self.actor_net.evaluate_target_actor(self.state_t_1_batch)
        # Q'(s_i+1, a_i+1)
        q_t_1 = self.critic_net.evaluate_target_critic(self.state_t_1_batch, self.action_t_1_batch)
        self.y_i_batch = []
        for i in range(BATCH_SIZE):
            if self.done_batch[i]:
                self.y_i_batch.append(self.reward_batch[i])
            else:
                self.y_i_batch.append(self.reward_batch[i] + GAMMA * q_t_1[i][0])

        self.y_i_batch = np.array(self.y_i_batch)
        self.y_i_batch = np.reshape(self.y_i_batch, [len(self.y_i_batch), 1])
        
        # Update critic by minimizing the loss
        self.critic_net.train_critic(self.state_t_batch, self.action_batch, self.y_i_batch)

        # Update actor proportional to the gradients:
        action_for_delQ = self.evaluate_actor(self.state_t_batch)

        if is_grad_inverter:
            self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch, action_for_delQ)
            self.del_Q_a = self.grad_inv.invert(self.del_Q_a, action_for_delQ)
        else:
            self.del_Q_a = self.critic_net.compute_delQ_a(self.state_t_batch, action_for_delQ)[0]
        
        # Train the actor using dQ/da chained with d(actor output)/d(actor parameters):
        self.actor_net.train_actor(self.state_t_batch, self.del_Q_a)
 
        # Update target Critic and actor network
        self.critic_net.update_target_critic()
        self.actor_net.update_target_actor()
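
# Hypothetical usage sketch (not in the original source): a bare-bones training
# loop for the DDPG class above. Assumes the classic gym API, numpy imported as
# np, and placeholder settings for num_episodes/max_steps.
def run_training(env, num_episodes=100, max_steps=200):
    agent = DDPG(env)
    for _ in range(num_episodes):
        observation = env.reset()
        for _ in range(max_steps):
            state = np.reshape(observation, [1, agent.num_states])
            action = agent.evaluate_actor(state)[0]    # deterministic policy; add exploration noise in practice
            observation_2, reward, done, _ = env.step(action)
            agent.add_experience(observation, observation_2, action, reward, done)
            if len(agent.replay_memory) > BATCH_SIZE:  # train once enough samples are stored
                agent.train()
            observation = observation_2
            if done:
                break
    agent.save_model()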