from os.path import join
from typing import List, Tuple

import numpy as np
import tensorflow as tf
from tensorflow.keras import optimizers

# Actor, Critic, OUNoise and ReplayBuffer, as well as the hyperparameter
# constants (LR_ACTOR, LR_CRITIC, BUFFER_SIZE, BATCH_SIZE, MIN_MEM_SIZE,
# UPDATE_STEPS, GAMMA, TAU, CKPTS_PATH, ACTOR_CKPTS, CRITIC_CKPTS), are
# assumed to be defined in the surrounding project.


class Agent(object):
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device

        self.actor_local = Actor(state_space.shape, action_space.high.size, max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size, max_action)
        self.actor_optimizer = optimizers.Adam(learning_rate=LR_ACTOR)
        # Start the target network as an exact copy of the local network.
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(learning_rate=LR_CRITIC)
        # Start the target network as an exact copy of the local network.
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.current_steps = 0

    def step(self, state, action, reward, done, next_state, train=True) -> None:
        self.memory.store(state, action, reward, done, next_state)
        if train and self.memory.count > max(BATCH_SIZE, MIN_MEM_SIZE):
            if self.current_steps % UPDATE_STEPS == 0:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)
            self.current_steps += 1

    @tf.function
    def critic_train(self, states, actions, rewards, dones, next_states):
        with tf.device(self.device):
            # TD targets y_i = r + gamma * (1 - done) * Q'(s', mu'(s')),
            # computed outside the tape so no gradient flows into the targets.
            u_t = self.actor_target(next_states)
            q_t = tf.cast(self.critic_target([next_states, u_t]), dtype=tf.float64)
            rewards = tf.cast(rewards, dtype=tf.float64)
            dones = tf.cast(dones, dtype=tf.float64)
            yi = rewards + GAMMA * (1.0 - dones) * q_t

            # Update the critic by minimizing the mean-squared TD error.
            with tf.GradientTape() as tape:
                q_l = tf.cast(self.critic_local([states, actions]), dtype=tf.float64)
                loss = tf.reduce_mean(tf.square(q_l - yi))
            grads = tape.gradient(loss, self.critic_local.trainable_weights)
            self.critic_optimizer.apply_gradients(
                zip(grads, self.critic_local.trainable_weights))

    @tf.function
    def actor_train(self, states):
        with tf.device(self.device):
            # Deterministic policy gradient: ascend Q(s, mu(s)) by
            # minimizing its negation.
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(self.actor_local.trainable_variables)
                u_l = self.actor_local(states)
                actor_loss = -tf.reduce_mean(self.critic_local([states, u_l]))
            grads = tape.gradient(actor_loss, self.actor_local.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(grads, self.actor_local.trainable_variables))

    def learn(self, experiences, gamma) -> None:
        # gamma is unused here; critic_train reads the GAMMA constant directly.
        states, actions, rewards, dones, next_states = experiences
        states = tf.convert_to_tensor(
            np.array(states).reshape(BATCH_SIZE, self.state_size))
        actions = tf.convert_to_tensor(
            np.array(actions).reshape(BATCH_SIZE, self.action_size))
        rewards = np.array(rewards).reshape(BATCH_SIZE, 1)
        next_states = np.array(next_states).reshape(BATCH_SIZE, self.state_size)
        dones = np.array(dones).reshape(BATCH_SIZE, 1)

        self.critic_train(states, actions, rewards, dones, next_states)
        self.actor_train(states)
        self.update_local()

    def update_local(self) -> None:
        # Soft update: theta_target <- tau * theta_local + (1 - tau) * theta_target.
        # Blended per layer, since the per-layer weight arrays have different
        # shapes and cannot be stacked into a single ndarray.
        def soft_updates(local_model: tf.keras.Model,
                         target_model: tf.keras.Model) -> List[np.ndarray]:
            local_weights = local_model.get_weights()
            target_weights = target_model.get_weights()
            assert len(local_weights) == len(target_weights)
            return [TAU * lw + (1 - TAU) * tw
                    for lw, tw in zip(local_weights, target_weights)]

        self.actor_target.set_weights(soft_updates(self.actor_local, self.actor_target))
        self.critic_target.set_weights(soft_updates(self.critic_local, self.critic_target))

    def store_weights(self, episode: int) -> None:
        self.actor_target.save_weights(join(CKPTS_PATH, ACTOR_CKPTS, f'cp-{episode}'))
        self.critic_target.save_weights(join(CKPTS_PATH, CRITIC_CKPTS, f'cp-{episode}'))

    def act(self, state, add_noise=True) -> Tuple[np.ndarray, np.ndarray]:
        state = np.array(state).reshape(1, self.state_size)
        pure_action = self.actor_local.predict(state)[0]
        # Only perturb the action with exploration noise when requested.
        action = self.noise.get_action(pure_action) if add_noise else pure_action
        return action, pure_action

    def reset(self) -> None:
        self.noise.reset()
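# The Agent above depends on OUNoise and ReplayBuffer helpers that are not
# shown. A minimal sketch of both follows; the class and method names mirror
# how the Agent calls them (store/count/sample, get_action/reset), but the
# internals (the mu/theta/sigma parameters and the deque storage) are
# assumptions, not the original implementation.
import random
from collections import deque


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, action_size, mu=0.0, theta=0.15, sigma=0.2):
        self.action_size = action_size
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal noise state to the long-run mean.
        self.state = np.full(self.action_size, self.mu)

    def get_action(self, action):
        # dx = theta * (mu - x) + sigma * N(0, 1); perturb the action with x.
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(self.action_size)
        self.state = self.state + dx
        return action + self.state


class ReplayBuffer:
    """Fixed-size FIFO buffer of (s, a, r, done, s') transitions."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=int(buffer_size))

    @property
    def count(self):
        return len(self.buffer)

    def store(self, state, action, reward, done, next_state):
        self.buffer.append((state, action, reward, done, next_state))

    def sample(self, batch_size):
        # Returns a tuple of five aligned lists, in the order learn() unpacks.
        batch = random.sample(self.buffer, batch_size)
        return tuple(map(list, zip(*batch)))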
# Alternative Agent implementation: hyperparameters are injected through the
# constructor instead of being read from module-level constants.
class Agent(object):
    def __init__(self, state_size, action_size, max_action, minibatch_size,
                 a_lr, c_lr, gamma, tau):
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action
        self.critic_lr = c_lr
        self.actor_lr = a_lr

        self.actor_network = Actor(self.state_size, self.action_size,
                                   self.max_action, self.actor_lr)
        self.actor_target_network = Actor(self.state_size, self.action_size,
                                          self.max_action, self.actor_lr)
        self.critic_network = Critic(self.state_size, self.action_size,
                                     self.critic_lr)
        self.critic_target_network = Critic(self.state_size, self.action_size,
                                            self.critic_lr)

        # Start the target networks as exact copies of the online networks.
        self.actor_target_network.set_weights(self.actor_network.get_weights())
        self.critic_target_network.set_weights(self.critic_network.get_weights())

        self.critic_optimizer = optimizers.Adam(learning_rate=self.critic_lr)
        self.actor_optimizer = optimizers.Adam(learning_rate=self.actor_lr)

        self.replay_buffer = ReplayBuffer(int(1e6))
        self.MINIBATCH_SIZE = minibatch_size
        self.GAMMA = tf.cast(gamma, dtype=tf.float64)
        self.TAU = tau
        self.noise = OUNoise(self.action_size)

    def step(self, s, a, r, s_1, t, train=True):
        self.replay_buffer.add(s, a, r, s_1, t)
        if train and self.replay_buffer.size() >= self.MINIBATCH_SIZE:
            minibatch = self.replay_buffer.sample_batch(self.MINIBATCH_SIZE)
            self.learn(minibatch)

    @tf.function
    def critic_train(self, minibatch):
        s_batch, a_batch, r_batch, s_1_batch, t_batch = minibatch
        # TD targets y = r + gamma * (1 - t) * Q'(s', mu'(s')), computed
        # outside the tape so no gradient flows into the target networks.
        mu_prime = self.actor_target_network(s_1_batch)
        q_prime = self.critic_target_network([s_1_batch, mu_prime])
        ys = r_batch + self.GAMMA * (1 - t_batch) * q_prime

        # Update the critic by minimizing the mean-squared TD error.
        with tf.GradientTape() as tape:
            predicted_qs = self.critic_network([s_batch, a_batch])
            loss = tf.reduce_mean(tf.square(predicted_qs - ys))
        dloss = tape.gradient(loss, self.critic_network.trainable_weights)
        self.critic_optimizer.apply_gradients(
            zip(dloss, self.critic_network.trainable_weights))

    @tf.function
    def actor_train(self, minibatch):
        s_batch, _, _, _, _ = minibatch
        # Deterministic policy gradient: ascend Q(s, mu(s)) by minimizing
        # its negation.
        with tf.GradientTape() as tape:
            next_action = self.actor_network(s_batch)
            actor_loss = -tf.reduce_mean(
                self.critic_network([s_batch, next_action]))
        actor_grad = tape.gradient(actor_loss,
                                   self.actor_network.trainable_weights)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_network.trainable_weights))

    def learn(self, minibatch):
        s, a, r, s_1, t = minibatch
        s = tf.convert_to_tensor(
            np.array(s, dtype=np.float64).reshape(self.MINIBATCH_SIZE, self.state_size))
        a = tf.convert_to_tensor(
            np.array(a, dtype=np.float64).reshape(self.MINIBATCH_SIZE, self.action_size))
        r = np.array(r, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)
        s_1 = tf.convert_to_tensor(
            np.array(s_1, dtype=np.float64).reshape(self.MINIBATCH_SIZE, self.state_size))
        t = np.array(t, dtype=np.float64).reshape(self.MINIBATCH_SIZE, 1)

        minibatch = (s, a, r, s_1, t)
        self.critic_train(minibatch)
        self.actor_train(minibatch)
        self.update_target_networks()

    def act(self, state, t=0):
        state = np.array(state).reshape(1, self.state_size)
        # Returns the deterministic action alongside its noisy counterpart.
        action = self.actor_network(state)[0]
        noisy = self.noise.get_action(action, t)
        return action, noisy

    def update_target_networks(self):
        # Soft update theta' <- tau * theta + (1 - tau) * theta', blended per
        # layer because the per-layer weight arrays have different shapes.
        self.actor_target_network.set_weights(
            [self.TAU * w + (1 - self.TAU) * w_t
             for w, w_t in zip(self.actor_network.get_weights(),
                               self.actor_target_network.get_weights())])
        self.critic_target_network.set_weights(
            [self.TAU * w + (1 - self.TAU) * w_t
             for w, w_t in zip(self.critic_network.get_weights(),
                               self.critic_target_network.get_weights())])
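# A minimal sketch of how the parameterized Agent above might be driven,
# assuming a Gym-style continuous-control environment with the classic API
# (reset() -> obs, step() -> (obs, reward, done, info)). The environment name
# and hyperparameter values are illustrative, not from the original code.
import gym

env = gym.make('Pendulum-v1')
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              max_action=float(env.action_space.high[0]),
              minibatch_size=64, a_lr=1e-4, c_lr=1e-3,
              gamma=0.99, tau=1e-3)

for episode in range(200):
    state, done = env.reset(), False
    while not done:
        # act() returns (deterministic action, noisy action); explore with
        # the noisy one and store it as the executed action.
        action, noisy = agent.act(state)
        next_state, reward, done, _ = env.step(noisy)
        # step() stores the transition and learns once enough samples exist.
        agent.step(state, noisy, reward, next_state, done)
        state = next_state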