Example no. 1
# Assumed imports for the examples in this listing; ActorNetwork, CriticNetwork,
# ValueNetwork, PPOMemory and ReplayBuffer are defined in companion modules that
# are not shown here.
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

class Agent:
    def __init__(self,
                 n_actions,
                 input_dims,
                 gamma=0.99,
                 alpha=0.0003,
                 gae_lambda=0.95,
                 policy_clip=0.2,
                 batch_size=64,
                 n_epochs=10,
                 chkpt_dir='models/'):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        self.chkpt_dir = chkpt_dir

        self.actor = ActorNetwork(n_actions)
        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic = CriticNetwork()
        self.critic.compile(optimizer=Adam(learning_rate=alpha))
        self.memory = PPOMemory(batch_size)

    def store_transition(self, state, action, probs, vals, reward, done):
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save(self.chkpt_dir + 'actor')
        self.critic.save(self.chkpt_dir + 'critic')

    def load_models(self):
        print('... loading models ...')
        self.actor = keras.models.load_model(self.chkpt_dir + 'actor')
        self.critic = keras.models.load_model(self.chkpt_dir + 'critic')

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])

        probs = self.actor(state)
        # the first positional argument of Categorical is logits; assuming the actor
        # ends in a softmax layer, pass its output explicitly as probs=
        # (use logits=probs instead if the network returns unnormalized scores)
        dist = tfp.distributions.Categorical(probs=probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        value = self.critic(state)

        action = action.numpy()[0]
        value = value.numpy()[0]
        log_prob = log_prob.numpy()[0]

        return action, log_prob, value

    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr,\
                reward_arr, dones_arr, batches = \
                self.memory.generate_batches()

            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

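            # Generalized Advantage Estimation (GAE): for each timestep t, sum the
            # discounted TD errors delta_k = r_k + gamma * V(s_{k+1}) * (1 - done_k) - V(s_k),
            # weighted by (gamma * lambda)^(k - t)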
            for t in range(len(reward_arr) - 1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr) - 1):
                    a_t += discount * (reward_arr[k] +
                                       self.gamma * values[k + 1] *
                                       (1 - int(dones_arr[k])) - values[k])
                    discount *= self.gamma * self.gae_lambda
                advantage[t] = a_t

            for batch in batches:
                with tf.GradientTape(persistent=True) as tape:
                    states = tf.convert_to_tensor(state_arr[batch])
                    old_probs = tf.convert_to_tensor(old_prob_arr[batch])
                    actions = tf.convert_to_tensor(action_arr[batch])

                    probs = self.actor(states)
                    # pass softmax outputs explicitly as probs= (see note in choose_action)
                    dist = tfp.distributions.Categorical(probs=probs)
                    new_probs = dist.log_prob(actions)

                    critic_value = self.critic(states)

                    critic_value = tf.squeeze(critic_value, 1)

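                    # PPO clipped surrogate objective: the probability ratio
                    # r_t(theta) = exp(log pi_new - log pi_old) is clipped to
                    # [1 - epsilon, 1 + epsilon] and the pessimistic minimum is taken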
                    prob_ratio = tf.math.exp(new_probs - old_probs)
                    weighted_probs = advantage[batch] * prob_ratio
                    clipped_probs = tf.clip_by_value(prob_ratio,
                                                     1 - self.policy_clip,
                                                     1 + self.policy_clip)
                    weighted_clipped_probs = clipped_probs * advantage[batch]
                    actor_loss = -tf.math.minimum(weighted_probs,
                                                  weighted_clipped_probs)
                    actor_loss = tf.math.reduce_mean(actor_loss)

                    returns = advantage[batch] + values[batch]
                    # critic_loss = tf.math.reduce_mean(tf.math.pow(
                    #                                  returns-critic_value, 2))
                    critic_loss = keras.losses.MSE(critic_value, returns)

                actor_params = self.actor.trainable_variables
                actor_grads = tape.gradient(actor_loss, actor_params)
                critic_params = self.critic.trainable_variables
                critic_grads = tape.gradient(critic_loss, critic_params)
                self.actor.optimizer.apply_gradients(
                    zip(actor_grads, actor_params))
                self.critic.optimizer.apply_gradients(
                    zip(critic_grads, critic_params))

        self.memory.clear_memory()
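
A minimal training-loop sketch for the PPO agent above, assuming a classic Gym-style environment with a discrete action space and the old step API that returns (obs, reward, done, info); the environment name, episode count and rollout length N are placeholders, not part of the example:

import gym  # assumed dependency

env = gym.make('CartPole-v1')  # hypothetical environment
agent = Agent(n_actions=env.action_space.n,
              input_dims=env.observation_space.shape)
N = 20          # learn every N environment steps (placeholder value)
n_steps = 0

for episode in range(300):
    observation = env.reset()
    done = False
    while not done:
        action, log_prob, value = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, log_prob, value,
                               reward, done)
        n_steps += 1
        if n_steps % N == 0:
            agent.learn()
        observation = observation_
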
Example no. 2
class Agent:
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(n_actions=n_actions,
                                  name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(name='value')
        self.target_value = ValueNetwork(name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

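        # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target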
        weights = []
        targets = self.target_value.weights
        for i, weight in enumerate(self.value.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))

        self.target_value.set_weights(weights)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.critic_1.save_weights(self.critic_1.checkpoint_file)
        self.critic_2.save_weights(self.critic_2.checkpoint_file)
        self.value.save_weights(self.value.checkpoint_file)
        self.target_value.save_weights(self.target_value.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.critic_1.load_weights(self.critic_1.checkpoint_file)
        self.critic_2.load_weights(self.critic_2.checkpoint_file)
        self.value.load_weights(self.value.checkpoint_file)
        self.target_value.load_weights(self.target_value.checkpoint_file)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

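        # value-network update: V(s) is regressed towards the soft value target
        # min(Q1(s, a), Q2(s, a)) - log pi(a|s), with actions sampled from the current policy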
        with tf.GradientTape() as tape:
            value = tf.squeeze(self.value(states), 1)
            value_ = tf.squeeze(self.target_value(states_), 1)

            current_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=False)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, current_policy_actions)
            q2_new_policy = self.critic_2(states, current_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            value_target = critic_value - log_probs
            value_loss = 0.5 * keras.losses.MSE(value, value_target)

        value_network_gradient = tape.gradient(value_loss,
                                               self.value.trainable_variables)
        self.value.optimizer.apply_gradients(
            zip(value_network_gradient, self.value.trainable_variables))

        with tf.GradientTape() as tape:
            # actions are sampled with the reparameterization trick
            # (reparameterize=True) so that gradients can flow through the sampling step
            new_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=True)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, new_policy_actions)
            q2_new_policy = self.critic_2(states, new_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            actor_loss = log_probs - critic_value
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss,
                                               self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

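        # critic update: both Q networks are regressed towards the scaled-reward target
        # q_hat = scale * r + gamma * V_target(s') * (1 - done)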
        with tf.GradientTape(persistent=True) as tape:
            # value_ from the target value network above is reused here as a constant target
            q_hat = self.scale * rewards + self.gamma * value_ * (1 - done)
            q1_old_policy = tf.squeeze(self.critic_1(states, actions), 1)
            q2_old_policy = tf.squeeze(self.critic_2(states, actions), 1)
            critic_1_loss = 0.5 * keras.losses.MSE(q1_old_policy, q_hat)
            critic_2_loss = 0.5 * keras.losses.MSE(q2_old_policy, q_hat)

        critic_1_network_gradient = tape.gradient(
            critic_1_loss, self.critic_1.trainable_variables)
        critic_2_network_gradient = tape.gradient(
            critic_2_loss, self.critic_2.trainable_variables)

        self.critic_1.optimizer.apply_gradients(
            zip(critic_1_network_gradient, self.critic_1.trainable_variables))
        self.critic_2.optimizer.apply_gradients(
            zip(critic_2_network_gradient, self.critic_2.trainable_variables))

        self.update_network_parameters()
Example no. 3
class Agent:
    def __init__(self,
                 input_dims,
                 alpha=0.001,
                 beta=0.002,
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 fc1=400,
                 fc2=300,
                 batch_size=64,
                 noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.actor = ActorNetwork(n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(name='critic')
        self.target_actor = ActorNetwork(n_actions=n_actions,
                                         name='target_actor')
        self.target_critic = CriticNetwork(name='target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        self.update_network_parameters(tau=1)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_actor.weights
        for i, weight in enumerate(self.actor.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_actor.set_weights(weights)

        weights = []
        targets = self.target_critic.weights
        for i, weight in enumerate(self.critic.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_critic.set_weights(weights)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.target_actor.save_weights(self.target_actor.checkpoint_file)
        self.critic.save_weights(self.critic.checkpoint_file)
        self.target_critic.save_weights(self.target_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.target_actor.load_weights(self.target_actor.checkpoint_file)
        self.critic.load_weights(self.critic.checkpoint_file)
        self.target_critic.load_weights(self.target_critic.checkpoint_file)

    def choose_action(self, observation, evaluate=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions],
                                        mean=0.0,
                                        stddev=self.noise)
        # note: if the env's action bound is greater than 1, the actor output
        # would have to be multiplied by max_action at some point
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)

        return actions[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

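        # critic update: regress Q(s, a) towards the TD target
        # y = r + gamma * Q_target(s', mu_target(s')) * (1 - done)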
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(states_)
            critic_value_ = tf.squeeze(
                self.target_critic(states_, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            target = rewards + self.gamma * critic_value_ * (1 - done)
            critic_loss = keras.losses.MSE(target, critic_value)

        critic_network_gradient = tape.gradient(
            critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_network_gradient, self.critic.trainable_variables))

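        # actor update: deterministic policy gradient, i.e. maximize Q(s, mu(s))
        # by minimizing its negative mean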
        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss,
                                               self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

        self.update_network_parameters()
Example no. 4
class Agent:
    """ 2019 State-of-the-Art Implementation of SAC with optimized temperature

    """
    def __init__(self,
                 env,
                 lr_Q=3e-4,
                 lr_actor=3e-4,
                 lr_a=3e-4,
                 gamma=0.99,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 max_size=1000000,
                 warmup=1000,
                 policy_delay=1,
                 minimum_entropy=None):

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]

        self.n_states = env.observation_space.shape[0]
        self.n_actions = env.action_space.shape[0]

        self.min_action = env.action_space.low
        self.max_action = env.action_space.high

        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.warmup = warmup
        self.time_step = 0
        self.update_step = 0
        self.policy_delay = policy_delay

        self.policy_net = ActorNetwork(n_states=self.n_states,
                                       n_actions=self.n_actions,
                                       fc1_dims=layer1_size,
                                       fc2_dims=layer2_size,
                                       network_name='Actor')

        self.q_net1 = CriticNetwork(n_states=self.n_states,
                                    n_actions=self.n_actions,
                                    hidden_neurons_1=layer1_size,
                                    hidden_neurons_2=layer2_size,
                                    network_name='Critic_1')

        self.q_net2 = CriticNetwork(n_states=self.n_states,
                                    n_actions=self.n_actions,
                                    hidden_neurons_1=layer1_size,
                                    hidden_neurons_2=layer2_size,
                                    network_name='Critic_2')

        self.target_q_net1 = CriticNetwork(n_states=self.n_states,
                                           n_actions=self.n_actions,
                                           hidden_neurons_1=layer1_size,
                                           hidden_neurons_2=layer2_size,
                                           network_name='Target_Critic_1')

        self.target_q_net2 = CriticNetwork(n_states=self.n_states,
                                           n_actions=self.n_actions,
                                           hidden_neurons_1=layer1_size,
                                           hidden_neurons_2=layer2_size,
                                           network_name='Target_Critic_2')

        self.replay_buffer = ReplayBuffer(n_actions=self.n_actions,
                                          n_states=self.n_states,
                                          memory_size=max_size)

        self.policy_net.compile(optimizer=tf.keras.optimizers.Adam(
            learning_rate=lr_actor))
        self.q_net1.compile(optimizer=tf.keras.optimizers.Adam(
            learning_rate=lr_Q))
        self.q_net2.compile(optimizer=tf.keras.optimizers.Adam(
            learning_rate=lr_Q))

        self.update_target_networks(
            tau=1)  # copy parameters to target networks

        # entropy temperature parameter alpha
        # self.log_alpha = tf.Variable(0.0, dtype=tf.float32)
        self.log_alpha = tf.Variable(tf.zeros(1), trainable=True)
        # target entropy defaults to -dim(action space), as recommended in the SAC paper
        self.minimum_entropy = -tf.reduce_prod(
            tf.convert_to_tensor(env.action_space.shape, dtype=tf.float32)
        ) if minimum_entropy is None else minimum_entropy
        print('Minimum Entropy set to: ', self.minimum_entropy)
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_a)
        self.alpha = tf.exp(self.log_alpha).numpy()
        print('alpha: ', self.alpha)

    def choose_action(self, state):
        if self.time_step < self.warmup:
            # a batch dimension is added so that actions[0] below is the full action vector
            actions = np.random.uniform(
                low=-1.0, high=1.0, size=(1, self.n_actions)
            )  # "random uniform distribution over all valid actions"
            actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        else:
            state = tf.convert_to_tensor(state, dtype=tf.float32)
            state = tf.expand_dims(state, axis=0)
            actions, _ = self.policy_net(state)

        self.time_step += 1
        if self.time_step == self.warmup:
            print('No warmup anymore!')
        a = self.rescale_action(actions[0].numpy())
        return a

    def scale_action(self, action):
        """ Scale all actions to [-1., +1.]

        :param action: unscaled actions
        :return: scaled actions all in range -1. .. +1.
        """
        # old = 2 * (action - self.min_action) / (self.max_action - self.min_action) - 1.0
        scale = (2 * action - (self.action_range[1] + self.action_range[0])) / \
                    (self.action_range[1] - self.action_range[0])
        return scale

    def rescale_action(self, action):
        """ Rescale all scaled actions to environment actionspace values

        :param action: scaled actions
        :return: rescaled actions all in range min_action .. max_action
        """
        # old = (action + 1.0) * (self.max_action - self.min_action) / 2.0 + self.min_action
        rescale = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
                  (self.action_range[1] + self.action_range[0]) / 2.0
        return rescale

    def remember(self, state, action, reward, new_state, done):
        action = self.scale_action(action)  # CHANGE: store the scaled action -- does this work correctly?
        self.replay_buffer.store_environment_transition(
            state, action, reward, new_state, done)

    def update_target_networks(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        for theta_target, theta in zip(self.target_q_net1.get_weights(),
                                       self.q_net1.get_weights()):
            theta_target = tau * theta + (1 - tau) * theta_target
            weights.append(theta_target)
        self.target_q_net1.set_weights(weights)

        weights = []
        for theta_target, theta in zip(self.target_q_net2.get_weights(),
                                       self.q_net2.get_weights()):
            theta_target = tau * theta + (1 - tau) * theta_target
            weights.append(theta_target)
        self.target_q_net2.set_weights(weights)

        # weights = []
        # theta_target = self.target_q_net1.weights
        # for i, theta in enumerate(self.q_net1.weights):
        #    weights.append(tau*theta + (1-tau)*theta_target[i])
        # self.target_q_net1.set_weights(weights)
        #
        # weights = []
        # theta_target = self.target_q_net2.weights
        # for i, theta in enumerate(self.q_net2.weights):
        #    weights.append(tau*theta + (1-tau)*theta_target[i])
        # self.target_q_net2.set_weights(weights)

    def save_models(self):
        print('models saved')  # To Do!

    def load_models(self):
        print('models loaded')  # To Do!

    def learn(self):
        if self.replay_buffer.count < self.batch_size:
            return
        elif self.replay_buffer.count == self.batch_size:
            print('Buffer Size equals batch Size! - Learning begins!')
            return

        # sample batch from replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample_from_buffer(
            batch_size=self.batch_size)

        # convert batches from 2D numpy arrays to TensorFlow tensors
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)

        # expand rewards and dones from 1D numpy arrays to 2D tensors and reshape them
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        rewards = tf.expand_dims(rewards, axis=0)
        rewards = tf.reshape(rewards, [self.batch_size, 1])
        dones = tf.convert_to_tensor(dones, dtype=tf.float32)
        dones = tf.expand_dims(dones, axis=0)
        dones = tf.reshape(dones, [self.batch_size, 1])

        ## Update critic networks Q1 & Q2
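        # soft Bellman backup:
        # y = r + gamma * (1 - done) * (min(Q1_target, Q2_target) - alpha * log pi(a'|s'))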
        with tf.GradientTape(persistent=True) as tape_Q:
            next_actions, next_log_pi = self.policy_net(next_states)
            Q1_next = self.target_q_net1(next_states, next_actions)
            Q2_next = self.target_q_net2(next_states, next_actions)
            next_q_target = tf.minimum(Q1_next,
                                       Q2_next) - self.alpha * next_log_pi
            expected_q = tf.stop_gradient(rewards + (1 - dones) * self.gamma *
                                          next_q_target)

            curr_q1 = self.q_net1(states, actions)
            curr_q2 = self.q_net2(states, actions)

            q1_loss = tf.reduce_mean((curr_q1 - expected_q)**2)
            q2_loss = tf.reduce_mean((curr_q2 - expected_q)**2)  # tf.square()
            q_loss = q1_loss + q2_loss

        grad_Q1 = tape_Q.gradient(q_loss, self.q_net1.trainable_variables)
        grad_Q2 = tape_Q.gradient(q_loss, self.q_net2.trainable_variables)

        self.q_net1.optimizer.apply_gradients(
            zip(grad_Q1, self.q_net1.trainable_variables))
        self.q_net2.optimizer.apply_gradients(
            zip(grad_Q2, self.q_net2.trainable_variables))

        ## Update policy network and polyak update target Q networks less frequently (like in TD3 --> "Delayed SAC")
        if self.update_step % self.policy_delay == 0:
            with tf.GradientTape() as tape_policy:
                new_actions, log_pi = self.policy_net(states)
                Q1 = self.q_net1(states, new_actions)
                Q2 = self.q_net2(states, new_actions)
                Q_min = tf.minimum(Q1, Q2)
                loss_policy = tf.reduce_mean(self.alpha * log_pi - Q_min)

            grad_policy = tape_policy.gradient(
                loss_policy, self.policy_net.trainable_variables)
            self.policy_net.optimizer.apply_gradients(
                zip(grad_policy, self.policy_net.trainable_variables))

            self.update_target_networks(
            )  # update target networks with polyak averaging

        ## Update temperature parameter alpha
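        # temperature update: minimize log_alpha * (-log pi(a|s) - target_entropy),
        # which drives the policy entropy towards self.minimum_entropy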
        with tf.GradientTape() as tape:
            _, log_pi_a = self.policy_net(states)
            alpha_loss = tf.reduce_mean(self.log_alpha *
                                        (-log_pi_a - self.minimum_entropy))

        grads = tape.gradient(alpha_loss, [self.log_alpha])
        self.alpha_optimizer.apply_gradients(zip(grads, [self.log_alpha]))
        self.alpha = tf.exp(self.log_alpha).numpy()

        self.update_step += 1  # Keep track of the number of network updates
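
A minimal interaction-loop sketch for the SAC agent above, assuming a continuous-control Gym environment with the old step API (the environment name and episode count are placeholders). Unlike the on-policy PPO example, the agent stores every transition in the replay buffer and calls learn() at every step:

import gym  # assumed dependency

env = gym.make('Pendulum-v1')  # hypothetical continuous-action environment
agent = Agent(env=env)

for episode in range(250):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.remember(observation, action, reward, observation_, done)
        agent.learn()
        score += reward
        observation = observation_
    print('episode', episode, 'score %.1f' % score)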