class RecurrentAgent(AbstractAgent):
    def __init__(self, obs_space_n, act_space_n, agent_index, batch_size, buff_size,
                 lr, num_layer, num_units, gamma, tau, prioritized_replay=False,
                 alpha=0.6, max_step=None, initial_beta=0.6, prioritized_replay_eps=1e-6,
                 policy_update_freq=2, target_policy_smoothing_eps=0.0, _run=None):
        """
        An object containing critic, actor and training functions for Multi-Agent TD3.
        """
        self._run = _run

        assert isinstance(obs_space_n[0], Space)
        obs_shape_n = space_n_to_shape_n(obs_space_n)
        act_shape_n = space_n_to_shape_n(act_space_n)
        super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay,
                         alpha, max_step, initial_beta, prioritized_replay_eps=prioritized_replay_eps)

        act_type = type(act_space_n[0])
        self.critic_1 = MADDPGCriticNetwork(2, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic_1_target = MADDPGCriticNetwork(2, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic_1_target.model.set_weights(self.critic_1.model.get_weights())
        self.critic_2 = MADDPGCriticNetwork(2, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic_2_target = MADDPGCriticNetwork(2, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic_2_target.model.set_weights(self.critic_2.model.get_weights())
        self.policy = MADDPGPolicyNetwork(2, num_units, lr, obs_shape_n, act_shape_n[agent_index],
                                          act_type, 1, self.critic_1, agent_index)
        self.policy_target = MADDPGPolicyNetwork(2, num_units, lr, obs_shape_n, act_shape_n[agent_index],
                                                 act_type, 1, self.critic_1, agent_index)
        self.policy_target.model.set_weights(self.policy.model.get_weights())

        self.batch_size = batch_size
        self.decay = gamma
        self.tau = tau
        self.policy_update_freq = policy_update_freq
        self.target_policy_smoothing_eps = target_policy_smoothing_eps
        self.update_counter = 0
        self.agent_index = agent_index

    def action(self, obs):
        """
        Get an action from the non-target policy
        """
        return self.policy.get_action(obs[None])[0]

    def target_action(self, obs):
        """
        Get an action from the target policy
        """
        return self.policy_target.get_action(obs)

    def preupdate(self):
        pass

    def update_target_networks(self, tau):
        """
        Implements the soft (Polyak) updates of the target networks, which slowly track the online networks.
        """
        def update_target_network(net: tf.keras.Model, target_net: tf.keras.Model):
            net_weights = np.array(net.get_weights())
            target_net_weights = np.array(target_net.get_weights())
            new_weights = tau * net_weights + (1.0 - tau) * target_net_weights
            target_net.set_weights(new_weights)

        update_target_network(self.critic_1.model, self.critic_1_target.model)
        update_target_network(self.critic_2.model, self.critic_2_target.model)
        update_target_network(self.policy.model, self.policy_target.model)

    def update(self, agents, step):
        """
        Update the agent by first updating the two critics and then the policy.
        Requires the list of the other agents as input, to determine the target actions.
""" assert agents[self.agent_index] is self self.update_counter += 1 if self.prioritized_replay: obs_n, acts_n, rew_n, next_obs_n, done_n, weights, indices = \ self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(step)) self._run.log_scalar( 'agent_{}.train.mean_weight'.format(self.agent_index), np.mean(weights), step) self._run.log_scalar( 'agent_{}.train.max_weight'.format(self.agent_index), np.max(weights), step) else: obs_n, acts_n, rew_n, next_obs_n, done_n = self.replay_buffer.sample( self.batch_size) weights = tf.ones(rew_n.shape) # Train the critic, using the target actions in the target critic network, to determine the # training target (i.e. target in MSE loss) for the critic update. target_act_next = [ ag.target_action(obs) for ag, obs in zip(agents, next_obs_n) ] noise = np.random.normal(0, self.target_policy_smoothing_eps, size=target_act_next[self.agent_index].shape) noise = np.clip(noise, -0.5, 0.5) target_act_next[self.agent_index] += noise critic_outputs = np.empty( [2, self.batch_size], dtype=np.float32 ) # this is a lot faster than python list plus minimum critic_outputs[0] = self.critic_1_target.predict( next_obs_n, target_act_next)[:, 0] critic_outputs[1] = self.critic_2_target.predict( next_obs_n, target_act_next)[:, 0] target_q_next = np.min(critic_outputs, 0)[:, None] q_train_target = rew_n[:, None] + self.decay * target_q_next td_loss = np.empty([2, self.batch_size], dtype=np.float32) td_loss[0] = self.critic_1.train_step(obs_n, acts_n, q_train_target, weights).numpy()[:, 0] td_loss[1] = self.critic_2.train_step(obs_n, acts_n, q_train_target, weights).numpy()[:, 0] max_loss = np.max(td_loss, 0) # Update priorities if using prioritized replay if self.prioritized_replay: self.replay_buffer.update_priorities( indices, max_loss + self.prioritized_replay_eps) if self.update_counter % self.policy_update_freq == 0: # delayed policy updates # Train the policy. policy_loss = self.policy.train(obs_n, acts_n) self._run.log_scalar( 'agent_{}.train.policy_loss'.format(self.agent_index), policy_loss.numpy(), step) # Update target networks. self.update_target_networks(self.tau) else: policy_loss = None self._run.log_scalar('agent_{}.train.q_loss0'.format(self.agent_index), np.mean(td_loss[0]), step) self._run.log_scalar('agent_{}.train.q_loss1'.format(self.agent_index), np.mean(td_loss[1]), step) return [td_loss, policy_loss] def save(self, fp): self.critic_1.model.save_weights(fp + 'critic_1.h5', ) self.critic_1_target.model.save_weights(fp + 'critic_1_target.h5') self.critic_2.model.save_weights(fp + 'critic_2.h5', ) self.critic_2_target.model.save_weights(fp + 'critic_2_target.h5') self.policy.model.save_weights(fp + 'policy.h5') self.policy_target.model.save_weights(fp + 'policy_target.h5') def load(self, fp): self.critic_1.model.load_weights(fp + 'critic_1.h5', ) self.critic_1_target.model.load_weights(fp + 'critic_1_target.h5') self.critic_2.model.load_weights(fp + 'critic_2.h5', ) self.critic_2_target.model.load_weights(fp + 'critic_2_target.h5') self.policy.model.load_weights(fp + 'policy.h5') self.policy_target.model.load_weights(fp + 'policy_target.h5')
class MASACAgent(AbstractAgent):
    def __init__(self, obs_space_n, act_space_n, agent_index, batch_size, buff_size,
                 lr, num_layer, num_units, gamma, tau, prioritized_replay=False,
                 alpha=0.6, max_step=None, initial_beta=0.6, prioritized_replay_eps=1e-6,
                 entropy_coeff=0.2, use_gauss_policy=False, use_gumbel=True,
                 policy_update_freq=1, _run=None, multi_step=1):
        """
        Implementation of Multi-Agent Soft Actor-Critic, with additional delayed policy updates.

        The implementation here deviates from standard soft actor-critic by not using a value
        function and target value function; instead it uses two Q functions, each with its own
        target. Using the value function could also be tested. Learning the entropy temperature
        is not yet implemented, so setting the entropy coefficient appropriately is very important.
        todo: entropy temperature learning
        todo: gaussian policy
        note: does not use a value function but only two q functions
        note: ensure gumbel softmax entropy is calculated correctly
        """
        self._run = _run

        assert isinstance(obs_space_n[0], Space)
        obs_shape_n = space_n_to_shape_n(obs_space_n)
        act_shape_n = space_n_to_shape_n(act_space_n)
        super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay,
                         alpha, max_step, initial_beta, prioritized_replay_eps=prioritized_replay_eps)

        act_type = type(act_space_n[0])
        self.critic_1 = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic_1_target = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic_1_target.model.set_weights(self.critic_1.model.get_weights())
        self.critic_2 = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic_2_target = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic_2_target.model.set_weights(self.critic_2.model.get_weights())

        # A separate value function was proposed in the original SAC paper but dropped again in the later version.
        self.v_network = ValueFunctionNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)  # unused
        self.v_network_target = ValueFunctionNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)  # unused
        self.v_network_target.model.set_weights(self.v_network.model.get_weights())  # unused

        self.policy = MASACPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index],
                                         act_type, 1, entropy_coeff, agent_index, self.critic_1,
                                         use_gauss_policy, use_gumbel, prioritized_replay_eps)
        self.policy_target = MASACPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index],
                                                act_type, 1, entropy_coeff, agent_index, self.critic_1,
                                                use_gauss_policy, use_gumbel, prioritized_replay_eps)
        self.policy_target.model.set_weights(self.policy.model.get_weights())

        self.use_gauss_policy = use_gauss_policy
        self.use_gumbel = use_gumbel
        self.policy_update_freq = policy_update_freq
        self.batch_size = batch_size
        self.decay = gamma
        self.tau = tau
        self.entropy_coeff = entropy_coeff
        self.update_counter = 0
        self.agent_index = agent_index
        self.multi_step = multi_step

    def action(self, obs):
        """
        Get an action from the non-target policy
        """
        return self.policy.get_action(obs[None])[0]

    def target_action(self, obs):
        """
        Get an action from the target policy
        """
        return self.policy_target.get_action(obs)

    def preupdate(self):
        pass

    def update_target_networks(self, tau):
        """
        Implements the soft (Polyak) updates of the target networks, which slowly track the
        online networks.
        """
        def update_target_network(net: tf.keras.Model, target_net: tf.keras.Model):
            net_weights = np.array(net.get_weights())
            target_net_weights = np.array(target_net.get_weights())
            new_weights = tau * net_weights + (1.0 - tau) * target_net_weights
            target_net.set_weights(new_weights)

        update_target_network(self.v_network.model, self.v_network_target.model)
        update_target_network(self.critic_1.model, self.critic_1_target.model)
        update_target_network(self.critic_2.model, self.critic_2_target.model)
        update_target_network(self.policy.model, self.policy_target.model)

    def update(self, agents, step):
        """
        Update the agent by first updating the two critics and then the policy.
        Requires the list of the other agents as input, to determine the target actions.
        """
        assert agents[self.agent_index] is self
        self.update_counter += 1

        if self.prioritized_replay:
            obs_n, acts_n, rew_n, next_obs_n, done_n, weights, indices = \
                self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(step))
        else:
            obs_n, acts_n, rew_n, next_obs_n, done_n = self.replay_buffer.sample(self.batch_size)
            weights = tf.ones(rew_n.shape)

        # Train the critics, using sampled target actions in the target critic networks to determine the
        # training target (i.e. the target in the MSE loss) for the critic update.
        next_act_sampled_n = [ag.target_action(next_obs) for ag, next_obs in zip(agents, next_obs_n)]

        if self.use_gauss_policy:
            logact_probs = self.policy.action_logprob(next_obs_n[self.agent_index],
                                                      next_act_sampled_n[self.agent_index])[:, None]  # only our own entropy is 'controllable'
            entropy = -logact_probs
        elif self.use_gumbel:
            action_probs = self.policy.get_all_action_probs(next_obs_n[self.agent_index])
            action_log_probs = np.log(action_probs + self.prioritized_replay_eps)
            buff = -action_probs * action_log_probs
            entropy = np.sum(buff, 1)[:, None]  # keep shape (batch_size, 1) to match q_min below

        critic_outputs = np.empty([2, self.batch_size], dtype=np.float32)  # this is a lot faster than a python list plus minimum
        critic_outputs[0] = self.critic_1_target.predict(next_obs_n, next_act_sampled_n)[:, 0]
        critic_outputs[1] = self.critic_2_target.predict(next_obs_n, next_act_sampled_n)[:, 0]
        q_min = np.min(critic_outputs, 0)[:, None]

        target_q = rew_n[:, None] + self.decay * (q_min + self.entropy_coeff * entropy)

        #### Separate Value Function version ####
        # target_q = rew_n[:, None] + self.decay * self.v_network_target.predict(next_obs_n)
        #
        # # sac does this "cross" updating between Q and V functions
        # critic_outputs = np.empty([2, self.batch_size], dtype=np.float32)  # this is a lot faster than a python list plus minimum
        # critic_outputs[0] = self.critic_1.predict(obs_n, act_sampled_n)[:, 0]
        # critic_outputs[1] = self.critic_2.predict(obs_n, act_sampled_n)[:, 0]
        # q_min = np.min(critic_outputs, 0)[:, None]
        # target_v = q_min + self.entropy_coeff * entropy

        td_loss = np.empty([2, self.batch_size], dtype=np.float32)
        td_loss[0] = self.critic_1.train_step(obs_n, acts_n, target_q, weights).numpy()[:, 0]
        td_loss[1] = self.critic_2.train_step(obs_n, acts_n, target_q, weights).numpy()[:, 0]
        # v_loss = self.v_network.train_step(obs_n, target_v, weights).numpy()[:, 0]

        td_loss_max = np.max(td_loss, 0)

        # Update priorities if using prioritized replay
        if self.prioritized_replay:
            self.replay_buffer.update_priorities(indices, td_loss_max + self.prioritized_replay_eps)

        # Train the policy.
        if self.update_counter % self.policy_update_freq == 0:  # delayed policy updates
            policy_loss = self.policy.train(obs_n, acts_n)
            # Update target networks.
            self.update_target_networks(self.tau)
            self._run.log_scalar('agent_{}.train.policy_loss'.format(self.agent_index), policy_loss.numpy(), step)
        else:
            policy_loss = None

        self._run.log_scalar('agent_{}.train.q_loss0'.format(self.agent_index), np.mean(td_loss[0]), step)
        self._run.log_scalar('agent_{}.train.q_loss1'.format(self.agent_index), np.mean(td_loss[1]), step)
        self._run.log_scalar('agent_{}.train.entropy'.format(self.agent_index), np.mean(entropy), step)

        return [td_loss, policy_loss]

    def save(self, fp):
        self.critic_1.model.save_weights(fp + 'critic_1.h5')
        self.critic_2.model.save_weights(fp + 'critic_2.h5')
        self.critic_1_target.model.save_weights(fp + 'critic_target_1.h5')
        self.critic_2_target.model.save_weights(fp + 'critic_target_2.h5')
        self.v_network.model.save_weights(fp + 'value.h5')
        self.v_network_target.model.save_weights(fp + 'value_target.h5')
        self.policy.model.save_weights(fp + 'policy.h5')
        self.policy_target.model.save_weights(fp + 'policy_target.h5')

    def load(self, fp):
        self.critic_1.model.load_weights(fp + 'critic_1.h5')
        self.critic_2.model.load_weights(fp + 'critic_2.h5')
        self.critic_1_target.model.load_weights(fp + 'critic_target_1.h5')
        self.critic_2_target.model.load_weights(fp + 'critic_target_2.h5')
        self.v_network.model.load_weights(fp + 'value.h5')
        self.v_network_target.model.load_weights(fp + 'value_target.h5')
        self.policy.model.load_weights(fp + 'policy.h5')
        self.policy_target.model.load_weights(fp + 'policy_target.h5')
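# --- Illustrative sketch (not part of MASACAgent above) ----------------------------------
# A plain-numpy example of how MASACAgent.update builds its soft critic target for the
# Gumbel/categorical case: the policy entropy H = -sum_a p(a) * log p(a) is added to the
# clipped double-Q estimate before discounting. The function name and all rewards,
# probabilities and Q values below are toy assumptions chosen only for this example.
def _soft_target_sketch(entropy_coeff=0.2, gamma=0.95, eps=1e-6):
    rew = np.array([1.0, 0.0])                                # batch of 2 toy rewards
    action_probs = np.array([[0.7, 0.2, 0.1],                 # toy categorical policies
                             [0.4, 0.3, 0.3]])
    entropy = -np.sum(action_probs * np.log(action_probs + eps), 1)[:, None]
    q1_next = np.array([[1.0], [0.5]])                        # stand-ins for the two target
    q2_next = np.array([[0.8], [0.6]])                        # critics' predictions
    q_min = np.minimum(q1_next, q2_next)                      # clipped double-Q estimate
    return rew[:, None] + gamma * (q_min + entropy_coeff * entropy)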