Example #1
import os
import sys

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras.optimizers import Adam

import utils                  # project helper module (normal_sample, logprob, entropy, GAE, get_time_date, ...)
from buffer import Buffer     # experience buffer used by the agent; module path assumed


class PPOAgent:
    def __init__(self, state_dim, action_dim, env, epsilon, mujoco):
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.env = env
        self.env_name = self.env.unwrapped.spec.id
        self.isMujoco = mujoco
        
        self.actor_model = self.init_actor_network()
        self.critic_model = self.init_critic_network()
        self.actor_old_model = self.init_actor_network()
        self.critic_old_model = self.init_critic_network()
		      
        self.policy_clip = 0.2      # PPO clipping range for the probability ratio
        self.value_clip = 0.2       # clipping range for the value-function update
        self.entropy_coef = 0.0     # entropy bonus weight (disabled here)
        self.vf_loss_coef = 0.5     # weight of the critic loss in the combined objective
        self.minibatch = 25         # number of minibatches the buffer is split into per epoch
        self.ppo_epochs = 10        # optimization epochs per PPO update
        self.action_std = 1.0       # fixed standard deviation of the Gaussian policy
        self.epsilon = epsilon      # exposed via get_epsilon / set_epsilon
        
        self.cov_mat = tf.fill((action_dim,), self.action_std * self.action_std)  # diagonal variance vector of the Gaussian policy
        self.optimizer = Adam(learning_rate = 3e-4)
        self.buffer = Buffer()

        self.actor_loss_metric = tf.keras.metrics.Mean(name='actor_loss_metric')
        self.critic_loss_metric = tf.keras.metrics.Mean(name='critic_loss_metric')
        self.entropy_metric = tf.keras.metrics.Mean(name='entropy_metric')
        self.advantages_metric = tf.keras.metrics.Mean(name='advantages_metric')
        self.returns_metric = tf.keras.metrics.Mean(name='returns_metric')
        
        # tensorboard --logdir logs/gradient_tape
        self.train_log_dir = 'logs/gradient_tape/'+ self.env_name + utils.get_time_date() + '/train'
        self.train_summary_writer = tf.summary.create_file_writer(self.train_log_dir)
    
    def set_epsilon(self,epsilon):
        self.epsilon = epsilon  

    def get_epsilon(self):
        return self.epsilon  

    def get_summary_writer(self):
        return self.train_summary_writer  

    def init_actor_network(self):
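        # MLP actor: a deeper stack for MuJoCo tasks, a smaller one otherwise; the tanh output serves as the Gaussian action mean.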
        actor_model = tf.keras.Sequential()
        if self.isMujoco:
            actor_model.add(Dense(512, input_shape=self.state_dim, activation='relu', kernel_initializer=RandomUniform(seed=69)))
            actor_model.add(Dense(256, activation='relu', kernel_initializer=RandomUniform(seed=69)))
            actor_model.add(Dense(256, activation='relu', kernel_initializer=RandomUniform(seed=69)))
            actor_model.add(Dense(128, activation='relu', kernel_initializer=RandomUniform(seed=69)))
        else:
            actor_model.add(Dense(128, input_shape=self.state_dim, activation='relu', kernel_initializer=RandomUniform(seed=69)))
        actor_model.add(Dense(64, activation='relu', kernel_initializer=RandomUniform(seed=69)))
        actor_model.add(Dense(self.action_dim, activation = 'tanh', kernel_initializer=RandomUniform(seed=69)))
        return actor_model
    
    def init_critic_network(self):
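        # MLP critic: mirrors the actor's hidden layers but ends in a single linear state-value output.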
        critic_model = tf.keras.Sequential()
        if self.isMujoco:
            critic_model.add(Dense(512, input_shape=self.state_dim, activation='relu', kernel_initializer=RandomUniform(seed=69)))
            critic_model.add(Dense(256, activation='relu', kernel_initializer=RandomUniform(seed=69)))
            critic_model.add(Dense(256, activation='relu', kernel_initializer=RandomUniform(seed=69)))
            critic_model.add(Dense(128, activation='relu', kernel_initializer=RandomUniform(seed=69)))
        else:
            critic_model.add(Dense(128, input_shape=self.state_dim, activation='relu', kernel_initializer=RandomUniform(seed=69)))
        critic_model.add(Dense(64, activation='relu', kernel_initializer=RandomUniform(seed=69)))
        critic_model.add(Dense(1, activation = 'linear', kernel_initializer=RandomUniform(seed=69)))
        return critic_model

    @tf.function
    def get_action(self, state):        
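        # Add a batch dimension, run the actor to get the action mean, then sample from a diagonal Gaussian with variance action_std**2.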
        state = tf.expand_dims(tf.cast(state, dtype = tf.float32), 0)        
        action_mean = self.actor_model(state)
        action = utils.normal_sample(action_mean, self.cov_mat)
        return tf.squeeze(action)   

    def save_eps(self, state, reward, action, done, next_state):
        self.buffer.save_eps(state, reward, action, done, next_state)

    def get_ppo_loss(self, states, actions, rewards, dones, next_states):        
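        # Combined PPO objective: clipped surrogate policy loss, clipped value loss, and an entropy bonus.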
        action_mean, values  = self.actor_model(states), self.critic_model(states)
        old_action_mean, old_values = self.actor_old_model(states), self.critic_old_model(states)
        next_values  = self.critic_model(next_states)  
                
        # Block the contribution of the old value in backpropagation
        old_values = tf.stop_gradient(old_values)

        # Calculate the entropy of the Gaussian action distribution
        dist_entropy = tf.math.reduce_mean(utils.entropy(action_mean, self.cov_mat))

        # Calculate the ratio (pi_theta / pi_theta_old)        
        logprobs = tf.expand_dims(utils.logprob(action_mean, self.cov_mat, actions), 1)         
        old_logprobs = tf.stop_gradient(tf.expand_dims(utils.logprob(old_action_mean, self.cov_mat, actions), 1))
		
        # Compute GAE advantages and TD value targets, detached from the computation graph
        advantages = tf.stop_gradient(utils.generalized_advantage_estimation(values, rewards, next_values, dones))
        returns = tf.stop_gradient(utils.temporal_difference(rewards, next_values, dones))
                        
        # Calculate the critic loss with clipped value estimates (PPO2-style value clipping)
        vpredclipped = old_values + tf.clip_by_value(values - old_values, -self.value_clip, self.value_clip) # Minimize the difference between old value and new value
        vf_losses1 = tf.math.square(returns - values) # Mean Squared Error
        vf_losses2 = tf.math.square(returns - vpredclipped) # Mean Squared Error        
        critic_loss = tf.math.reduce_mean(tf.math.maximum(vf_losses1, vf_losses2)) * 0.5        
        
        # Calculate Surrogate Loss
        ratios = tf.math.exp(logprobs - old_logprobs) # ratio = pi_theta(a|s) / pi_theta_old(a|s)        
        surr1 = ratios * advantages
        surr2 = tf.clip_by_value(ratios, 1 - self.policy_clip, 1 + self.policy_clip) * advantages
        pg_loss = tf.math.reduce_mean(tf.math.minimum(surr1, surr2))         
                        
        loss = (critic_loss * self.vf_loss_coef) - (dist_entropy * self.entropy_coef) - pg_loss
        return loss, critic_loss, dist_entropy, advantages, returns
    
    # Get loss and perform backpropagation
    @tf.function
    def train_ppo(self, states, actions, rewards, dones, next_states):     
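        # Compute the combined loss under a gradient tape, apply one Adam step to actor and critic jointly, and update the logging metrics.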
        with tf.GradientTape() as ppo_tape:
            loss, critic_loss, dist_entropy, advantages, returns = self.get_ppo_loss(states, actions, rewards, dones, next_states)
                    
        gradients = ppo_tape.gradient(loss, self.actor_model.trainable_variables + self.critic_model.trainable_variables)        

        self.optimizer.apply_gradients(zip(gradients, self.actor_model.trainable_variables + self.critic_model.trainable_variables)) 
    
        self.actor_loss_metric(loss)
        self.critic_loss_metric(critic_loss)
        self.entropy_metric(dist_entropy)
        self.advantages_metric(advantages)
        self.returns_metric(returns)
    
    # Update the model
    def update_ppo(self):        
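        # Run ppo_epochs optimization passes over the stored rollout, split into self.minibatch chunks, then clear the buffer and sync the old networks.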
        batch_size = max(1, int(self.buffer.length() / self.minibatch))  # size of each minibatch (buffer is split into self.minibatch chunks)
        
        # Optimize policy using K epochs:
        for epoch in range(self.ppo_epochs):       
            for states, actions, rewards, dones, next_states in self.buffer.get_all().batch(batch_size):
                self.train_ppo(states, actions, rewards, dones, next_states)

            with self.train_summary_writer.as_default():
                tf.summary.scalar('actor_loss', self.actor_loss_metric.result(), step=epoch)
                tf.summary.scalar('critic_loss', self.critic_loss_metric.result(), step=epoch)
                tf.summary.scalar('entropy', self.entropy_metric.result(), step=epoch)
                tf.summary.scalar('advantages', self.advantages_metric.result(), step=epoch)
                tf.summary.scalar('returns', self.returns_metric.result(), step=epoch)

        self.buffer.clean_buffer()

        self.actor_loss_metric.reset_states()
        self.critic_loss_metric.reset_states()
        self.entropy_metric.reset_states()
        self.advantages_metric.reset_states()
        self.returns_metric.reset_states()
                
        # Copy new weights into old policy:
        self.actor_old_model.set_weights(self.actor_model.get_weights())
        self.critic_old_model.set_weights(self.critic_model.get_weights())

    def save_models(self,episode,identifier):
        time = utils.get_time_date()
        os.makedirs(self.env_name+'/models', exist_ok=True)
        print('Saving models as -{}- {}'.format(identifier,time))
        self.actor_model.save(self.env_name+'/models/actor_model_'+str(episode)+identifier+time+'.h5')
        self.actor_old_model.save(self.env_name+'/models/actor_old_model_'+str(episode)+identifier+time+'.h5')
        self.critic_model.save(self.env_name+'/models/critic_model_'+str(episode)+identifier+time+'.h5')
        self.critic_old_model.save(self.env_name+'/models/critic_old_model_'+str(episode)+identifier+time+'.h5')

    def load_models(self,path):
        if os.path.exists(path+'/actor_model.h5') and os.path.exists(path+'/critic_model.h5'):
            print('Loading models . . . ')
            self.actor_model = tf.keras.models.load_model(path+'/actor_model.h5')
            self.critic_model = tf.keras.models.load_model(path+'/critic_model.h5')
        else:
            print('Models not found . . . ')
            sys.exit()
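
Below is a minimal usage sketch, not part of the original listing. It assumes the classic Gym reset/step API, a continuous-control environment ('Pendulum-v1' is only an example), and the project's utils and Buffer helpers; the episode budget, epsilon value, and checkpoint interval are illustrative.

if __name__ == '__main__':
    import gym          # classic Gym API assumed (reset returns obs; step returns obs, reward, done, info)
    import numpy as np

    env = gym.make('Pendulum-v1')                   # example environment, not prescribed by the class
    state_dim = env.observation_space.shape         # tuple, e.g. (3,)
    action_dim = env.action_space.shape[0]

    agent = PPOAgent(state_dim, action_dim, env, epsilon=0.2, mujoco=False)

    for episode in range(1, 301):                   # illustrative episode budget
        state = env.reset()
        done = False
        while not done:
            action = np.reshape(agent.get_action(state).numpy(), env.action_space.shape)
            next_state, reward, done, _ = env.step(action)
            agent.save_eps(state, reward, action, float(done), next_state)  # done stored as a float for the GAE computation
            state = next_state
        agent.update_ppo()                          # one PPO update per collected episode

        if episode % 100 == 0:
            agent.save_models(episode, '_checkpoint')   # checkpoints under '<env_name>/models/'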