import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp

# ActorNetwork, CriticNetwork and PPOMemory are assumed to be defined in the
# project's accompanying network and memory modules.


class Agent:
    def __init__(self, n_actions, input_dims, gamma=0.99, alpha=0.0003,
                 gae_lambda=0.95, policy_clip=0.2, batch_size=64,
                 n_epochs=10, chkpt_dir='models/'):
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda
        self.chkpt_dir = chkpt_dir

        self.actor = ActorNetwork(n_actions)
        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic = CriticNetwork()
        self.critic.compile(optimizer=Adam(learning_rate=alpha))
        self.memory = PPOMemory(batch_size)

    def store_transition(self, state, action, probs, vals, reward, done):
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save(self.chkpt_dir + 'actor')
        self.critic.save(self.chkpt_dir + 'critic')

    def load_models(self):
        print('... loading models ...')
        self.actor = keras.models.load_model(self.chkpt_dir + 'actor')
        self.critic = keras.models.load_model(self.chkpt_dir + 'critic')

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])

        probs = self.actor(state)
        # note: Categorical's first positional argument is `logits`; if the
        # actor ends in a softmax, pass probs=probs instead
        dist = tfp.distributions.Categorical(probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        value = self.critic(state)

        action = action.numpy()[0]
        value = value.numpy()[0]
        log_prob = log_prob.numpy()[0]

        return action, log_prob, value

    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr, \
                reward_arr, dones_arr, batches = \
                self.memory.generate_batches()

            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

            # generalized advantage estimation (GAE)
            for t in range(len(reward_arr) - 1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr) - 1):
                    a_t += discount * (reward_arr[k] +
                                       self.gamma * values[k + 1] *
                                       (1 - int(dones_arr[k])) - values[k])
                    discount *= self.gamma * self.gae_lambda
                advantage[t] = a_t

            for batch in batches:
                with tf.GradientTape(persistent=True) as tape:
                    states = tf.convert_to_tensor(state_arr[batch])
                    old_probs = tf.convert_to_tensor(old_prob_arr[batch])
                    actions = tf.convert_to_tensor(action_arr[batch])

                    probs = self.actor(states)
                    dist = tfp.distributions.Categorical(probs)
                    new_probs = dist.log_prob(actions)

                    critic_value = self.critic(states)
                    critic_value = tf.squeeze(critic_value, 1)

                    # clipped surrogate objective
                    prob_ratio = tf.math.exp(new_probs - old_probs)
                    weighted_probs = advantage[batch] * prob_ratio
                    clipped_probs = tf.clip_by_value(prob_ratio,
                                                     1 - self.policy_clip,
                                                     1 + self.policy_clip)
                    weighted_clipped_probs = clipped_probs * advantage[batch]
                    actor_loss = -tf.math.minimum(weighted_probs,
                                                  weighted_clipped_probs)
                    actor_loss = tf.math.reduce_mean(actor_loss)

                    returns = advantage[batch] + values[batch]
                    # critic_loss = tf.math.reduce_mean(tf.math.pow(
                    #     returns - critic_value, 2))
                    critic_loss = keras.losses.MSE(critic_value, returns)

                actor_params = self.actor.trainable_variables
                actor_grads = tape.gradient(actor_loss, actor_params)
                critic_params = self.critic.trainable_variables
                critic_grads = tape.gradient(critic_loss, critic_params)
                self.actor.optimizer.apply_gradients(
                    zip(actor_grads, actor_params))
                self.critic.optimizer.apply_gradients(
                    zip(critic_grads, critic_params))

        self.memory.clear_memory()
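# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original listing):
# a minimal training loop for the PPO Agent above. It assumes the classic
# `gym` API (env.step returns four values) and a discrete-action
# environment; the environment id, episode count and the update interval N
# are illustrative choices, not values taken from the source.
# ---------------------------------------------------------------------------
import gym

if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    agent = Agent(n_actions=env.action_space.n,
                  input_dims=env.observation_space.shape)
    N = 20  # run a learning phase every N environment steps
    n_steps = 0
    for episode in range(300):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action, log_prob, value = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, log_prob,
                                   value, reward, done)
            n_steps += 1
            score += reward
            if n_steps % N == 0:
                agent.learn()
            observation = observation_
        print(f'episode {episode}  score {score:.1f}')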
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam

# ActorNetwork, CriticNetwork, ValueNetwork and ReplayBuffer are assumed to
# be defined in the project's accompanying network and buffer modules.


class Agent:
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 layer1_size=256, layer2_size=256, batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(n_actions=n_actions, name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(name='value')
        self.target_value = ValueNetwork(name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))

        self.scale = reward_scale
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        state = tf.convert_to_tensor([observation])
        actions, _ = self.actor.sample_normal(state, reparameterize=False)

        return actions[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_value.weights
        for i, weight in enumerate(self.value.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))

        self.target_value.set_weights(weights)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.critic_1.save_weights(self.critic_1.checkpoint_file)
        self.critic_2.save_weights(self.critic_2.checkpoint_file)
        self.value.save_weights(self.value.checkpoint_file)
        self.target_value.save_weights(self.target_value.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.critic_1.load_weights(self.critic_1.checkpoint_file)
        self.critic_2.load_weights(self.critic_2.checkpoint_file)
        self.value.load_weights(self.value.checkpoint_file)
        self.target_value.load_weights(self.target_value.checkpoint_file)

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        # update the value network
        with tf.GradientTape() as tape:
            value = tf.squeeze(self.value(states), 1)
            value_ = tf.squeeze(self.target_value(states_), 1)

            current_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=False)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, current_policy_actions)
            q2_new_policy = self.critic_2(states, current_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            value_target = critic_value - log_probs
            value_loss = 0.5 * keras.losses.MSE(value, value_target)

        value_network_gradient = tape.gradient(
            value_loss, self.value.trainable_variables)
        self.value.optimizer.apply_gradients(
            zip(value_network_gradient, self.value.trainable_variables))

        # update the actor network
        with tf.GradientTape() as tape:
            # in the original paper the actions are reparameterized here;
            # this implementation does not do that, so it is just the usual
            # sampled action (the reparameterize flag is passed regardless)
            new_policy_actions, log_probs = self.actor.sample_normal(
                states, reparameterize=True)
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1(states, new_policy_actions)
            q2_new_policy = self.critic_2(states, new_policy_actions)
            critic_value = tf.squeeze(
                tf.math.minimum(q1_new_policy, q2_new_policy), 1)

            actor_loss = log_probs - critic_value
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(
            actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

        # update both critic networks
        with tf.GradientTape(persistent=True) as tape:
            # value_ from the value-network update above is still in scope:
            # a `with` block does not introduce a new Python scope, so the
            # tensor computed under the first tape can be reused here
            q_hat = self.scale * reward + self.gamma * value_ * (1 - done)
            q1_old_policy = tf.squeeze(self.critic_1(state, action), 1)
            q2_old_policy = tf.squeeze(self.critic_2(state, action), 1)
            critic_1_loss = 0.5 * keras.losses.MSE(q1_old_policy, q_hat)
            critic_2_loss = 0.5 * keras.losses.MSE(q2_old_policy, q_hat)

        critic_1_network_gradient = tape.gradient(
            critic_1_loss, self.critic_1.trainable_variables)
        critic_2_network_gradient = tape.gradient(
            critic_2_loss, self.critic_2.trainable_variables)
        self.critic_1.optimizer.apply_gradients(
            zip(critic_1_network_gradient, self.critic_1.trainable_variables))
        self.critic_2.optimizer.apply_gradients(
            zip(critic_2_network_gradient, self.critic_2.trainable_variables))

        self.update_network_parameters()
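# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not from the original listing): driving the
# SAC agent above on a continuous-control task with one gradient update per
# environment step. Assumes the classic `gym` API; the environment id and
# episode count are arbitrary choices.
# ---------------------------------------------------------------------------
import gym
import numpy as np

if __name__ == '__main__':
    env = gym.make('Pendulum-v1')
    agent = Agent(input_dims=env.observation_space.shape, env=env,
                  n_actions=env.action_space.shape[0])
    for episode in range(250):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(np.array(action))
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()
            score += reward
            observation = observation_
        print(f'episode {episode}  score {score:.1f}')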
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam

# ActorNetwork, CriticNetwork and ReplayBuffer are assumed to be defined in
# the project's accompanying network and buffer modules.


class Agent:
    def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 fc1=400, fc2=300, batch_size=64, noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.actor = ActorNetwork(n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(name='critic')
        self.target_actor = ActorNetwork(n_actions=n_actions,
                                         name='target_actor')
        self.target_critic = CriticNetwork(name='target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        self.update_network_parameters(tau=1)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_actor.weights
        for i, weight in enumerate(self.actor.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_actor.set_weights(weights)

        weights = []
        targets = self.target_critic.weights
        for i, weight in enumerate(self.critic.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_critic.set_weights(weights)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save_weights(self.actor.checkpoint_file)
        self.target_actor.save_weights(self.target_actor.checkpoint_file)
        self.critic.save_weights(self.critic.checkpoint_file)
        self.target_critic.save_weights(self.target_critic.checkpoint_file)

    def load_models(self):
        print('... loading models ...')
        self.actor.load_weights(self.actor.checkpoint_file)
        self.target_actor.load_weights(self.target_actor.checkpoint_file)
        self.critic.load_weights(self.critic.checkpoint_file)
        self.target_critic.load_weights(self.target_critic.checkpoint_file)

    def choose_action(self, observation, evaluate=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions],
                                        mean=0.0, stddev=self.noise)
        # note that if the env has an action bound > 1, the actions have to
        # be multiplied by max_action at some point
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)

        return actions[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(state, dtype=tf.float32)
        states_ = tf.convert_to_tensor(new_state, dtype=tf.float32)
        rewards = tf.convert_to_tensor(reward, dtype=tf.float32)
        actions = tf.convert_to_tensor(action, dtype=tf.float32)

        # update the critic
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(states_)
            critic_value_ = tf.squeeze(
                self.target_critic(states_, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            target = rewards + self.gamma * critic_value_ * (1 - done)
            critic_loss = keras.losses.MSE(target, critic_value)

        critic_network_gradient = tape.gradient(
            critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_network_gradient, self.critic.trainable_variables))

        # update the actor
        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(
            actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_network_gradient, self.actor.trainable_variables))

        self.update_network_parameters()
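# ---------------------------------------------------------------------------
# Evaluation sketch (illustrative, not from the original listing): loading
# saved weights for the DDPG agent above and running the deterministic
# policy by passing evaluate=True, so no exploration noise is added in
# choose_action. Assumes the classic `gym` API; the environment id is
# arbitrary. Note that with subclassed Keras models the networks may need to
# be built (e.g. by one forward pass or a short training phase) before
# load_weights restores cleanly.
# ---------------------------------------------------------------------------
import gym
import numpy as np

if __name__ == '__main__':
    env = gym.make('Pendulum-v1')
    agent = Agent(input_dims=env.observation_space.shape, env=env,
                  n_actions=env.action_space.shape[0])
    agent.load_models()

    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation, evaluate=True)
        observation, reward, done, info = env.step(np.array(action))
        score += reward
    print(f'evaluation score {score:.1f}')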
class Agent: """ 2019 State-of-the-Art Implementation of SAC with optimized temperature """ def __init__(self, env, lr_Q=3e-4, lr_actor=3e-4, lr_a=3e-4, gamma=0.99, tau=0.005, layer1_size=256, layer2_size=256, batch_size=256, max_size=1000000, warmup=1000, policy_delay=1, minimum_entropy=None): self.env = env self.action_range = [env.action_space.low, env.action_space.high] self.n_states = env.observation_space.shape[0] self.n_actions = env.action_space.shape[0] self.min_action = env.action_space.low self.max_action = env.action_space.high self.gamma = gamma self.tau = tau self.batch_size = batch_size self.warmup = warmup self.time_step = 0 self.update_step = 0 self.policy_delay = policy_delay self.policy_net = ActorNetwork(n_states=self.n_states, n_actions=self.n_actions, fc1_dims=layer1_size, fc2_dims=layer2_size, network_name='Actor') self.q_net1 = CriticNetwork(n_states=self.n_states, n_actions=self.n_actions, hidden_neurons_1=layer1_size, hidden_neurons_2=layer2_size, network_name='Critic_1') self.q_net2 = CriticNetwork(n_states=self.n_states, n_actions=self.n_actions, hidden_neurons_1=layer1_size, hidden_neurons_2=layer2_size, network_name='Critic_2') self.target_q_net1 = CriticNetwork(n_states=self.n_states, n_actions=self.n_actions, hidden_neurons_1=layer1_size, hidden_neurons_2=layer2_size, network_name='Target_Critic_1') self.target_q_net2 = CriticNetwork(n_states=self.n_states, n_actions=self.n_actions, hidden_neurons_1=layer1_size, hidden_neurons_2=layer2_size, network_name='Target_Critic_2') self.replay_buffer = ReplayBuffer(n_actions=self.n_actions, n_states=self.n_states, memory_size=max_size) self.policy_net.compile(optimizer=tf.keras.optimizers.Adam( lr=lr_actor)) self.q_net1.compile(optimizer=tf.keras.optimizers.Adam(lr=lr_Q)) self.q_net2.compile(optimizer=tf.keras.optimizers.Adam(lr=lr_Q)) self.update_target_networks( tau=1) # copy parameters to target networks # entropy temperature parameter alpha # self.log_alpha = tf.Variable(0.0, dtype=tf.float32) print(-tf.constant(env.action_space.shape[0], dtype=tf.float32)) self.log_alpha = tf.Variable(tf.zeros(1), trainable=True) self.minimum_entropy = -tf.reduce_prod( tf.convert_to_tensor(env.action_space.shape, dtype=tf.float32)) self.minimum_entropy = -tf.reduce_prod( tf.convert_to_tensor(env.action_space.shape, dtype=tf.float32) ) if minimum_entropy is None else minimum_entropy print('Minimum Entropy set to: ', self.minimum_entropy) self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_a) self.alpha = tf.exp(self.log_alpha).numpy() print('alpha: ', self.alpha) def choose_action(self, state): if self.time_step < self.warmup: actions = np.random.uniform( low=-1.0, high=1.0, size=self.n_actions ) # "random uniform distribution over all valid actions" actions = tf.convert_to_tensor(actions, dtype=tf.float32) else: state = tf.convert_to_tensor(state, dtype=tf.float32) state = tf.expand_dims(state, axis=0) actions, _ = self.policy_net(state) self.time_step += 1 if self.time_step == self.warmup: print('No warmup anymore!') a = self.rescale_action(actions[0].numpy()) return a def scale_action(self, action): """ Scale all actions to [-1., +1.] :param action: unscaled actions :return: scaled actions all in range -1. .. +1. 
""" # old = 2 * (action - self.min_action) / (self.max_action - self.min_action) - 1.0 scale = (2 * action - (self.action_range[1] + self.action_range[0])) / \ (self.action_range[1] - self.action_range[0]) return scale def rescale_action(self, action): """ Rescale all scaled actions to environment actionspace values :param action: scaled actions :return: rescaled actions all in range min_action .. max_action """ # old = (action + 1.0) * (self.max_action - self.min_action) / 2.0 + self.min_action rescale = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \ (self.action_range[1] + self.action_range[0]) / 2.0 return rescale def remember(self, state, action, reward, new_state, done): action = self.scale_action(action) # ÄNDERUNG! Funktioniert das mit? self.replay_buffer.store_environment_transition( state, action, reward, new_state, done) def update_target_networks(self, tau=None): if tau is None: tau = self.tau weights = [] for theta_target, theta in zip(self.target_q_net1.get_weights(), self.q_net1.get_weights()): theta_target = tau * theta + (1 - tau) * theta_target weights.append(theta_target) self.target_q_net1.set_weights(weights) weights = [] for theta_target, theta in zip(self.target_q_net2.get_weights(), self.q_net2.get_weights()): theta_target = tau * theta + (1 - tau) * theta_target weights.append(theta_target) self.target_q_net2.set_weights(weights) # weights = [] # theta_target = self.target_q_net1.weights # for i, theta in enumerate(self.q_net1.weights): # weights.append(tau*theta + (1-tau)*theta_target[i]) # self.target_q_net1.set_weights(weights) # # weights = [] # theta_target = self.target_q_net2.weights # for i, theta in enumerate(self.q_net2.weights): # weights.append(tau*theta + (1-tau)*theta_target[i]) # self.target_q_net2.set_weights(weights) def save_models(self): print('models saved') # To Do! def load_models(self): print('models loaded') # To Do! def learn(self): if self.replay_buffer.count < self.batch_size: return elif self.replay_buffer.count == self.batch_size: print('Buffer Size equals batch Size! 
- Learning begins!') return # sample batch from replay buffer states, actions, rewards, next_states, dones = self.replay_buffer.sample_from_buffer( batch_size=self.batch_size) # convert batchs from 2D numpy arrays to tensorflow tensors states = tf.convert_to_tensor(states, dtype=tf.float32) actions = tf.convert_to_tensor(actions, dtype=tf.float32) next_states = tf.convert_to_tensor(next_states, dtype=tf.float32) # expand rewards and dones from 1D numpy arrays to 2D tensors and reshape them rewards = tf.convert_to_tensor(rewards, dtype=tf.float32) rewards = tf.expand_dims(rewards, axis=0) rewards = tf.reshape(rewards, [self.batch_size, 1]) dones = tf.convert_to_tensor(dones, dtype=tf.float32) dones = tf.expand_dims(dones, axis=0) dones = tf.reshape(dones, [self.batch_size, 1]) ## Update critic networks Q1 & Q2 with tf.GradientTape(persistent=True) as tape_Q: next_actions, next_log_pi = self.policy_net(next_states) Q1_next = self.target_q_net1(next_states, next_actions) Q2_next = self.target_q_net2(next_states, next_actions) next_q_target = tf.minimum(Q1_next, Q2_next) - self.alpha * next_log_pi expected_q = tf.stop_gradient(rewards + (1 - dones) * self.gamma * next_q_target) curr_q1 = self.q_net1(states, actions) curr_q2 = self.q_net2(states, actions) q1_loss = tf.reduce_mean((curr_q1 - expected_q)**2) q2_loss = tf.reduce_mean((curr_q2 - expected_q)**2) # tf.square() q_loss = q1_loss + q2_loss grad_Q1 = tape_Q.gradient(q_loss, self.q_net1.trainable_variables) grad_Q2 = tape_Q.gradient(q_loss, self.q_net2.trainable_variables) self.q_net1.optimizer.apply_gradients( zip(grad_Q1, self.q_net1.trainable_variables)) self.q_net2.optimizer.apply_gradients( zip(grad_Q2, self.q_net2.trainable_variables)) ## Update policy network and polyak update target Q networks less frequently (like in TD3 --> "Delayed SAC") if self.update_step % self.policy_delay == 0: with tf.GradientTape() as tape_policy: new_actions, log_pi = self.policy_net(states) Q1 = self.q_net1(states, new_actions) Q2 = self.q_net2(states, new_actions) Q_min = tf.minimum(Q1, Q2) loss_policy = tf.reduce_mean(self.alpha * log_pi - Q_min) grad_policy = tape_policy.gradient( loss_policy, self.policy_net.trainable_variables) self.policy_net.optimizer.apply_gradients( zip(grad_policy, self.policy_net.trainable_variables)) self.update_target_networks( ) # update target networks with polyak averaging ## Update temperature parameter alpha with tf.GradientTape() as tape: _, log_pi_a = self.policy_net(states) alpha_loss = tf.reduce_mean(self.log_alpha * (-log_pi_a - self.minimum_entropy)) grads = tape.gradient(alpha_loss, [self.log_alpha]) self.alpha_optimizer.apply_gradients(zip(grads, [self.log_alpha])) self.alpha = tf.exp(self.log_alpha).numpy() self.update_step += 1 # Keep track of the number of network updates
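# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not from the original listing): training loop
# for the temperature-tuning SAC agent above. choose_action handles the
# uniform-random warmup phase and rescales actions to the environment's
# bounds, while remember() stores the action scaled back to [-1, +1].
# Assumes the classic `gym` API; environment id and episode count are
# arbitrary.
# ---------------------------------------------------------------------------
import gym

if __name__ == '__main__':
    env = gym.make('Pendulum-v1')
    agent = Agent(env=env)
    for episode in range(250):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()
            score += reward
            observation = observation_
        print(f'episode {episode}  score {score:.1f}  '
              f'alpha {agent.alpha[0]:.4f}')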