    def test_append(self):
        """Appending past max_length should evict the oldest entries first."""
        count = 100
        start_length = count // 2
        max_length = count
        buffer = ReplayBuffer(start_length=start_length, max_length=max_length)
        for append_count in range(max_length * 2):
            buffer.append(append_count)
            self.assertEqual(len(buffer.buffer), min(append_count + 1, max_length),
                             "Incorrect buffer size.")
            self.assertEqual(buffer.buffer[0], max(0, (append_count + 1) - max_length),
                             "Incorrect first value.")
            self.assertEqual(buffer.buffer[-1], append_count,
                             "Incorrect last value.")
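
# A minimal sketch of the ReplayBuffer semantics the test above exercises -- an
# assumption for illustration, not the repo's actual implementation. A deque
# with maxlen already provides the FIFO eviction that test_append asserts;
# start_length is kept only as a "warmed up enough to sample" threshold.
from collections import deque


class SketchReplayBuffer:
    """Hypothetical stand-in matching the append/eviction behaviour under test."""

    def __init__(self, start_length, max_length):
        self.start_length = start_length
        self.buffer = deque(maxlen=max_length)  # oldest entries drop off the front

    def append(self, entry):
        self.buffer.append(entry)

    def ready(self):
        # sampling only makes sense once the buffer has filled past start_length
        return len(self.buffer) >= self.start_length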
class Brain:
    """
    The Brain that contains all the models
    """

    def __init__(self, num_states, num_actions, action_high, action_low,
                 gamma=GAMMA, rho=RHO, std_dev=STD_DEV):
        # initialize everything
        self.actor_network = ActorNetwork(num_states, num_actions, action_high)
        self.critic_network = CriticNetwork(num_states, num_actions, action_high)
        self.actor_target = ActorNetwork(num_states, num_actions, action_high)
        self.critic_target = CriticNetwork(num_states, num_actions, action_high)

        # Making the weights equal initially
        self.actor_target.set_weights(self.actor_network.get_weights())
        self.critic_target.set_weights(self.critic_network.get_weights())

        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)

        self.gamma = tf.constant(gamma)
        self.rho = rho
        self.action_high = action_high
        self.action_low = action_low
        self.num_states = num_states
        self.num_actions = num_actions
        self.noise = OUActionNoise(mean=np.zeros(1),
                                   std_deviation=float(std_dev) * np.ones(1))

        # optimizers
        self.critic_optimizer = tf.keras.optimizers.Adam(CRITIC_LR, amsgrad=True)
        self.actor_optimizer = tf.keras.optimizers.Adam(ACTOR_LR, amsgrad=True)

        # temporary variable for side effects
        self.cur_action = None

        # define update_weights with tf.function for improved performance
        @tf.function(input_signature=[
            tf.TensorSpec(shape=(None, num_states), dtype=tf.float32),
            tf.TensorSpec(shape=(None, num_actions), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(None, num_states), dtype=tf.float32),
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32),
        ])
        def update_weights(s, a, r, sn, d):
            """
            Function to update weights with optimizer
            """
            with tf.GradientTape() as tape:
                # define target: y = r + gamma * (1 - done) * Q'(s', mu'(s'))
                y = r + self.gamma * (1 - d) * self.critic_target(
                    [sn, self.actor_target(sn)])
                # define the delta Q (mean absolute TD error)
                critic_loss = tf.math.reduce_mean(
                    tf.math.abs(y - self.critic_network([s, a])))
            critic_grad = tape.gradient(
                critic_loss, self.critic_network.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic_network.trainable_variables))

            with tf.GradientTape() as tape:
                # define the delta mu: ascend the critic's value of the actor's action
                actor_loss = -tf.math.reduce_mean(
                    self.critic_network([s, self.actor_network(s)]))
            actor_grad = tape.gradient(
                actor_loss, self.actor_network.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor_network.trainable_variables))
            return critic_loss, actor_loss

        self.update_weights = update_weights

    def act(self, state, _notrandom=True, noise=True):
        """
        Run action by the actor network

        Args:
            state: the current state
            _notrandom: whether the greedy (actor) action is used instead of a
                uniformly random one
            noise: whether noise is to be added to the resulting action
                (this improves exploration)

        Returns:
            the resulting action, clipped to [action_low, action_high]
        """
        # add exploration noise (when enabled) to the chosen action, then clip
        # to the valid action range
        self.cur_action = ((self.actor_network(state)[0].numpy()
                            if _notrandom
                            else np.random.uniform(self.action_low, self.action_high,
                                                   self.num_actions))
                           + (self.noise() if noise else 0))
        self.cur_action = np.clip(self.cur_action, self.action_low, self.action_high)
        return self.cur_action

    def remember(self, prev_state, reward, state, done):
        """
        Store states, reward and done value to the buffer
        """
        # record it in the buffer based on its reward
        self.buffer.append(prev_state, self.cur_action, reward, state, done)

    def learn(self, entry):
        """
        Run update for all networks (for training)
        """
        s, a, r, sn, d = zip(*entry)

        c_l, a_l = self.update_weights(
            tf.convert_to_tensor(s, dtype=tf.float32),
            tf.convert_to_tensor(a, dtype=tf.float32),
            tf.convert_to_tensor(r, dtype=tf.float32),
            tf.convert_to_tensor(sn, dtype=tf.float32),
            tf.convert_to_tensor(d, dtype=tf.float32))

        update_target(self.actor_target, self.actor_network, self.rho)
        update_target(self.critic_target, self.critic_network, self.rho)

        return c_l, a_l

    def save_weights(self, path):
        """
        Save weights to `path`
        """
        parent_dir = os.path.dirname(path)
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        # Save the weights
        self.actor_network.save_weights(path + "an.h5")
        self.critic_network.save_weights(path + "cn.h5")
        self.critic_target.save_weights(path + "ct.h5")
        self.actor_target.save_weights(path + "at.h5")

    def load_weights(self, path):
        """
        Load weights from `path`
        """
        try:
            self.actor_network.load_weights(path + "an.h5")
            self.critic_network.load_weights(path + "cn.h5")
            self.critic_target.load_weights(path + "ct.h5")
            self.actor_target.load_weights(path + "at.h5")
        except OSError as err:
            logging.warning("Weights files cannot be found, %s", err)
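
# update_target is defined elsewhere in the repo; below is a hedged sketch of
# the Polyak ("soft") update such a helper usually performs. The exact
# weighting convention for rho is an assumption here -- the repo may weight
# the two terms the other way around.
def update_target_sketch(model_target, model_ref, rho):
    """Soft update: target <- rho * target + (1 - rho) * reference."""
    model_target.set_weights(
        [rho * t + (1.0 - rho) * r
         for t, r in zip(model_target.get_weights(), model_ref.get_weights())])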
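
# Hedged usage sketch (not from the repo): one way Brain could be driven by a
# Gym-style environment loop. `env`, `episodes` and the buffer's `sample_batch`
# method are assumptions for illustration; learn() only requires an iterable of
# (state, action, reward, next_state, done) transitions, and the sampler is
# assumed to shape reward/done as length-1 vectors so the batch matches
# update_weights' (None, 1) input signature.
def train_sketch(env, brain, episodes, batch_size):
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            # act() expects a batched float32 state and caches cur_action
            # internally for the subsequent remember() call
            action = brain.act(np.expand_dims(state, 0).astype(np.float32))
            state_next, reward, done, _ = env.step(action)
            brain.remember(state, reward, state_next, int(done))
            batch = brain.buffer.sample_batch(batch_size)  # hypothetical sampler
            if batch:
                brain.learn(batch)
            state = state_next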