Example #1
import tensorflow as tf

# Actor, Critic and MemoryBuffer are assumed to be defined elsewhere in the
# project; the agent follows the DDPG pattern (deterministic actor, single
# critic, target networks with soft updates, Gaussian exploration noise).


class Agent:
    def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None, gamma=0.99,
                n_actions=2, max_size=1000000, tau=0.005, hd1=400, hd2=300, 
                batch_size=64, noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.memory = MemoryBuffer(max_size)
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.actor = Actor(n_actions=n_actions)
        self.critic = Critic()
        self.target_actor = Actor(n_actions=n_actions)
        self.target_critic = Critic()

        self.actor.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=alpha))
        self.critic.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=beta))
        # The target networks are never trained directly; compiling them just
        # keeps Keras happy (the critic networks use beta, the actors alpha).
        self.target_actor.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=beta))

        # Hard-copy the online weights into the target networks at start-up.
        self.update_weights(tau=1)

    def remember(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

    def train(self):
        # learn() returns (None, None) while the replay buffer is still filling.
        cl, al = self.learn()
        if cl is not None:
            self.update_weights()

        return cl, al

    def update_weights(self, tau=None):
        # Polyak soft update: theta_target <- tau * theta + (1 - tau) * theta_target.
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_actor.weights
        for i, weight in enumerate(self.actor.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_actor.set_weights(weights)

        weights = []
        targets = self.target_critic.weights
        for i, weight in enumerate(self.critic.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_critic.set_weights(weights)

    def choose_action(self, observation, evaluate=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)

        if not evaluate:
            # Add Gaussian exploration noise during training.
            actions += tf.random.normal(shape=[self.n_actions], mean=0.0, stddev=self.noise)
        
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)

        return actions[0]

    # @tf.function
    def learn(self):
        if len(self.memory) < self.batch_size:
            return None, None

        states, actions, rewards, next_states, done = self.memory.sample(self.batch_size)

        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)

        # Critic update: regress Q(s, a) toward the one-step TD target.
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(next_states)
            critic_value_ = tf.squeeze(self.target_critic(next_states, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            target = rewards + self.gamma * critic_value_ * (1 - done)
            critic_loss = tf.keras.losses.MSE(target, critic_value)

        critic_gradient = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_gradient, self.critic.trainable_variables)
        )

        # Actor update: maximize Q(s, mu(s)) by minimizing its negation.
        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)
            
        actor_gradient = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_gradient, self.actor.trainable_variables)
        )

        # train() applies the soft target update once it sees the losses, so
        # they are returned here instead of updating the targets a second time.
        return critic_loss, actor_loss
        
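A minimal training-loop sketch for the agent above, assuming the classic Gym API (observation from env.reset(), 4-tuple env.step() return), a continuous-control task such as Pendulum-v1, and the Actor, Critic and MemoryBuffer classes referenced in the snippet; the episode count is purely illustrative.

import gym

env = gym.make('Pendulum-v1')
agent = Agent(input_dims=env.observation_space.shape, env=env,
              n_actions=env.action_space.shape[0])

for episode in range(250):                      # illustrative episode count
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation).numpy()
        next_observation, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, next_observation, done)
        agent.train()                           # no-op until a full batch is buffered
        score += reward
        observation = next_observation
    print(f'episode {episode}  score {score:.1f}')
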
Example #2
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

# Actor and Critic are assumed to be defined elsewhere in the project.


class PPOAgent:
    def __init__(self,
                 rows,
                 columns,
                 num_actions,
                 l_rate=1e-4,
                 gamma=0.99,
                 lam=0.95,
                 policy_kl_range=0.0008,
                 policy_params=20,
                 value_clip=1.0,
                 loss_coefficient=1.0,
                 entropy_coefficient=0.05):
        self.rows = rows
        self.columns = columns
        self.num_actions = num_actions

        self.actor = Actor(self.num_actions)
        self.critic = Critic()
        self.actor_old = Actor(self.num_actions)
        self.critic_old = Critic()

        self.optimizer = tf.keras.optimizers.Adam(l_rate)

        self.gamma = gamma
        self.lam = lam
        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.loss_coefficient = loss_coefficient
        self.entropy_coefficient = entropy_coefficient

    @tf.function
    def fit(self, states, actions, rewards, next_states, dones):
        # One PPO update step over a batch of transitions.
        with tf.GradientTape() as tape:
            action_probabilities = self.actor(states)
            values = self.critic(states)
            old_action_probabilities = self.actor_old(states)
            old_values = self.critic_old(states)
            next_values = self.critic(next_states)
            loss = self._get_loss(action_probabilities, values,
                                  old_action_probabilities, old_values,
                                  next_values, actions, rewards, dones)
        trainable_variables = (self.actor.trainable_variables +
                               self.critic.trainable_variables)
        grads = tape.gradient(loss, trainable_variables)
        self.optimizer.apply_gradients(zip(grads, trainable_variables))

    def select_action(self, state, training=False):
        state_in = tf.expand_dims(state, axis=0)
        probabilities = self.actor(state_in)

        if training:
            distribution = tfp.distributions.Categorical(probs=probabilities)
            action = distribution.sample()
            action = int(action[0])
        else:
            action = tf.argmax(probabilities[0]).numpy()

        return action

    def get_model(self):
        return self.actor

    def update_networks(self):
        self.actor_old.set_weights(self.actor.get_weights())
        self.critic_old.set_weights(self.critic.get_weights())

    def save_model_weights(self, actor_filename, critic_filename):
        self.actor.save_weights(actor_filename)
        self.critic.save_weights(critic_filename)

    def load_model_weights(self, actor_filename, critic_filename=None):
        # Run a dummy forward pass so the subclassed models build their
        # variables before the saved weights are loaded.
        self.actor(np.zeros((1, self.rows, self.columns, 1)))
        self.actor.load_weights(actor_filename)
        self.actor_old(np.zeros((1, self.rows, self.columns, 1)))
        self.actor_old.load_weights(actor_filename)

        if critic_filename is not None:
            self.critic(np.zeros((1, self.rows, self.columns, 1)))
            self.critic.load_weights(critic_filename)
            self.critic_old(np.zeros((1, self.rows, self.columns, 1)))
            self.critic_old.load_weights(critic_filename)

    def save_optimizer_weights(self, filename):
        np.save(filename, self.optimizer.get_weights())

    def load_optimizer_weights(self, filename):
        optimizer_weights = np.load(filename, allow_pickle=True)
        # Apply a dummy zero-gradient step so the optimizer creates its slot
        # variables before the saved state is restored.
        model_weights = self.actor.trainable_variables + self.critic.trainable_variables
        zero_grads = [tf.zeros_like(w) for w in model_weights]
        self.optimizer.apply_gradients(zip(zero_grads, model_weights))
        self.optimizer.set_weights(optimizer_weights)

    def _get_loss(self, action_probabilities, values, old_action_probabilities,
                  old_values, next_values, actions, rewards, dones):
        old_values = tf.stop_gradient(old_values)

        advantages = self._generalized_advantages_estimation(
            values, rewards, next_values, dones)
        returns = tf.stop_gradient(advantages + values)
        # Normalize the advantages for a more stable policy update.
        advantages = tf.stop_gradient(
            (advantages - tf.math.reduce_mean(advantages)) /
            (tf.math.reduce_std(advantages) + 1e-7))

        log_probabilities = self._log_probabilities(action_probabilities,
                                                    actions)
        old_log_probabilities = tf.stop_gradient(
            self._log_probabilities(old_action_probabilities, actions))
        ratios = tf.math.exp(log_probabilities - old_log_probabilities)

        kl_divergence = self._kl_divergence(old_action_probabilities,
                                            action_probabilities)

        # Trust-region style policy objective: once the KL divergence leaves the
        # allowed range and the ratio has moved past 1, a KL penalty replaces
        # the plain ratio-weighted advantage.
        policy_gradient_loss = tf.where(
            tf.logical_and(kl_divergence >= self.policy_kl_range, ratios > 1),
            ratios * advantages - self.policy_params * kl_divergence,
            ratios * advantages)
        policy_gradient_loss = tf.math.reduce_mean(policy_gradient_loss)

        entropy = tf.math.reduce_mean(self._entropy(action_probabilities))

        # Clipped value loss, analogous to PPO's clipped policy objective.
        clipped_values = old_values + tf.clip_by_value(
            values - old_values, -self.value_clip, self.value_clip)
        values_losses = tf.math.square(returns - values) * 0.5
        clipped_values_losses = tf.math.square(returns - clipped_values) * 0.5

        critic_loss = tf.math.reduce_mean(
            tf.math.maximum(values_losses, clipped_values_losses))
        # The policy term and the entropy bonus are maximized, hence subtracted.
        loss = (critic_loss * self.loss_coefficient) - policy_gradient_loss - (
            entropy * self.entropy_coefficient)

        return loss

    def _generalized_advantages_estimation(self, values, rewards, next_values,
                                           dones):
        # GAE(lambda): accumulate discounted TD errors backwards through time.
        gae = 0
        advantages = []
        delta = rewards + (1.0 - dones) * self.gamma * next_values - values
        for i in reversed(range(len(rewards))):
            gae = delta[i] + (1.0 - dones[i]) * self.gamma * self.lam * gae
            advantages.insert(0, gae)

        return tf.stack(advantages)

    def _log_probabilities(self, action_probabilities, actions):
        distribution = tfp.distributions.Categorical(
            probs=action_probabilities)
        return tf.expand_dims(distribution.log_prob(actions), axis=1)

    def _kl_divergence(self, probabilities1, probabilities2):
        distribution1 = tfp.distributions.Categorical(probs=probabilities1)
        distribution2 = tfp.distributions.Categorical(probs=probabilities2)
        return tf.expand_dims(
            tfp.distributions.kl_divergence(distribution1, distribution2),
            axis=1)

    def _entropy(self, probabilities):
        distribution = tfp.distributions.Categorical(probs=probabilities)
        return distribution.entropy()
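
A rough driving-loop sketch for the PPOAgent above, assuming a board-style observation of shape (rows, columns, 1), a discrete action space, and a Critic that returns a (batch, 1) value tensor; the board size, rollout length and the zero-filled placeholder transitions stand in for a real environment and are not part of the snippet.

import numpy as np
import tensorflow as tf

rows, columns, num_actions = 6, 7, 7            # illustrative board dimensions
agent = PPOAgent(rows, columns, num_actions)

# Collect one short rollout with the current policy ...
states, actions, rewards, next_states, dones = [], [], [], [], []
state = np.zeros((rows, columns, 1), dtype=np.float32)   # placeholder start state
for step in range(128):
    action = agent.select_action(state, training=True)
    # A real environment would supply next_state, reward and done here.
    next_state = np.zeros((rows, columns, 1), dtype=np.float32)
    reward, done = 0.0, step == 127
    states.append(state)
    actions.append(action)
    rewards.append([reward])
    next_states.append(next_state)
    dones.append([float(done)])
    state = next_state

# ... snapshot the rollout policy into the "old" networks, then update.
agent.update_networks()
agent.fit(tf.stack(states),
          tf.constant(actions),
          tf.constant(rewards, dtype=tf.float32),
          tf.stack(next_states),
          tf.constant(dones, dtype=tf.float32))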