def __init__(self, obs_space_n, act_space_n, agent_index, batch_size, buff_size, lr, num_layer, num_units,
             gamma, tau, prioritized_replay=False, alpha=0.6, max_step=None, initial_beta=0.6,
             prioritized_replay_eps=1e-6, _run=None):
    """
    An object containing the critic, actor and training functions for Multi-Agent DDPG.
    """
    self._run = _run

    assert isinstance(obs_space_n[0], Space)
    obs_shape_n = space_n_to_shape_n(obs_space_n)
    act_shape_n = space_n_to_shape_n(act_space_n)
    super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay, alpha, max_step,
                     initial_beta, prioritized_replay_eps=prioritized_replay_eps)

    act_type = type(act_space_n[0])
    # Centralised critic and its target network, initialised with identical weights.
    self.critic = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
    self.critic_target = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
    self.critic_target.model.set_weights(self.critic.model.get_weights())

    # Decentralised policy (actor) and its target network, also starting from identical weights.
    self.policy = MADDPGPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index], act_type,
                                      1, self.critic, agent_index)
    self.policy_target = MADDPGPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index],
                                             act_type, 1, self.critic, agent_index)
    self.policy_target.model.set_weights(self.policy.model.get_weights())

    self.batch_size = batch_size
    self.agent_index = agent_index
    self.decay = gamma
    self.tau = tau
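# --- Illustrative sketch (not part of the original class) ---------------------------------------
# The `tau` stored above is typically used for Polyak averaging of the target networks after each
# training step. The helper name `polyak_update` and the direct use of the Keras
# get_weights/set_weights API are assumptions for illustration only.
def polyak_update(source_model, target_model, tau):
    """Blend source weights into target weights: theta_target <- tau * theta + (1 - tau) * theta_target."""
    new_weights = [tau * w + (1.0 - tau) * w_t
                   for w, w_t in zip(source_model.get_weights(), target_model.get_weights())]
    target_model.set_weights(new_weights)

# Example usage, assuming the attribute layout set up in __init__ above:
#     polyak_update(self.critic.model, self.critic_target.model, self.tau)
#     polyak_update(self.policy.model, self.policy_target.model, self.tau)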
def __init__(self, obs_space_n, act_space_n, agent_index, batch_size, buff_size, lr, num_layer, num_units,
             gamma, tau, prioritized_replay=False, alpha=0.6, max_step=None, initial_beta=0.6,
             prioritized_replay_eps=1e-6, _run=None, num_atoms=51, min_val=-150, max_val=0):
    """
    Implementation of a Multi-Agent version of D3PG, i.e. Deep Deterministic Policy Gradient with a
    distributional (categorical) critic. num_atoms, min_val and max_val control the parametrization
    of the categorical value distribution.
    """
    self._run = _run

    assert isinstance(obs_space_n[0], Space)
    obs_shape_n = space_n_to_shape_n(obs_space_n)
    act_shape_n = space_n_to_shape_n(act_space_n)
    super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay, alpha, max_step,
                     initial_beta, prioritized_replay_eps=prioritized_replay_eps)

    act_type = type(act_space_n[0])
    # Categorical distributional critic and its target network. Note that the networks here are built
    # with 2 hidden layers, so the num_layer argument is not used.
    self.critic = CatDistCritic(2, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index,
                                num_atoms, min_val, max_val)
    self.critic_target = CatDistCritic(2, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index,
                                       num_atoms, min_val, max_val)
    self.critic_target.model.set_weights(self.critic.model.get_weights())

    self.policy = MADDPGPolicyNetwork(2, num_units, lr, obs_shape_n, act_shape_n[agent_index], act_type,
                                      1, self.critic, agent_index)
    self.policy_target = MADDPGPolicyNetwork(2, num_units, lr, obs_shape_n, act_shape_n[agent_index], act_type,
                                             1, self.critic, agent_index)
    self.policy_target.model.set_weights(self.policy.model.get_weights())

    self.batch_size = batch_size
    self.agent_index = agent_index
    self.decay = gamma
    self.tau = tau
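# --- Illustrative sketch (not part of the original class) ---------------------------------------
# With a categorical critic, num_atoms, min_val and max_val define a fixed support
# z_i = min_val + i * (max_val - min_val) / (num_atoms - 1), and the Bellman-updated target
# distribution is projected back onto that support (as in C51/D4PG). The NumPy helper below is an
# assumption for illustration; the actual CatDistCritic implementation may differ.
import numpy as np

def project_categorical(rewards, dones, next_probs, gamma, num_atoms, min_val, max_val):
    """Project gamma-discounted target distributions onto the fixed categorical support."""
    support = np.linspace(min_val, max_val, num_atoms)                      # (num_atoms,)
    delta_z = (max_val - min_val) / (num_atoms - 1)
    projected = np.zeros((rewards.shape[0], num_atoms))
    # Shifted support Tz = r + gamma * z, zeroed after terminal transitions and clipped to the range.
    tz = rewards[:, None] + gamma * (1.0 - dones[:, None]) * support[None, :]
    tz = np.clip(tz, min_val, max_val)
    b = (tz - min_val) / delta_z                                            # fractional atom index
    lower, upper = np.floor(b).astype(int), np.ceil(b).astype(int)
    for i in range(rewards.shape[0]):
        for j in range(num_atoms):
            l, u = lower[i, j], upper[i, j]
            if l == u:                                                      # Tz landed exactly on an atom
                projected[i, l] += next_probs[i, j]
            else:                                                           # split mass between neighbours
                projected[i, l] += next_probs[i, j] * (u - b[i, j])
                projected[i, u] += next_probs[i, j] * (b[i, j] - l)
    return projected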
def __init__(self, obs_space_n, act_space_n, agent_index, batch_size, buff_size, lr, num_layer, num_units,
             gamma, tau, prioritized_replay=False, alpha=0.6, max_step=None, initial_beta=0.6,
             prioritized_replay_eps=1e-6, entropy_coeff=0.2, use_gauss_policy=False, use_gumbel=True,
             policy_update_freq=1, _run=None, multi_step=1):
    """
    Implementation of Multi-Agent Soft Actor-Critic, with additional delayed policy updates.

    This implementation deviates slightly from the standard soft actor-critic: it does not use a value
    function and target value function, but instead uses two Q-functions, each with its own target
    network. Using the value function could also be tested. Learning the entropy temperature is not
    implemented yet, so choosing a suitable fixed entropy coefficient is important.
    todo: entropy temperature learning
    todo: gaussian policy
    note: does not use a value function, only two Q-functions
    note: ensure the Gumbel-softmax entropy is calculated correctly
    """
    self._run = _run

    assert isinstance(obs_space_n[0], Space)
    obs_shape_n = space_n_to_shape_n(obs_space_n)
    act_shape_n = space_n_to_shape_n(act_space_n)
    super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay, alpha, max_step,
                     initial_beta, prioritized_replay_eps=prioritized_replay_eps)

    act_type = type(act_space_n[0])
    # Twin Q-functions, each with its own target network initialised to the same weights.
    self.critic_1 = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
    self.critic_1_target = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
    self.critic_1_target.model.set_weights(self.critic_1.model.get_weights())

    self.critic_2 = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
    self.critic_2_target = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
    self.critic_2_target.model.set_weights(self.critic_2.model.get_weights())

    # A separate state-value function was proposed in the original SAC paper but later dropped again;
    # these networks are kept here but unused.
    self.v_network = ValueFunctionNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)  # unused
    self.v_network_target = ValueFunctionNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)  # unused
    self.v_network_target.model.set_weights(self.v_network.model.get_weights())  # unused

    self.policy = MASACPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index], act_type,
                                     1, entropy_coeff, agent_index, self.critic_1, use_gauss_policy, use_gumbel,
                                     prioritized_replay_eps)
    self.policy_target = MASACPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index],
                                            act_type, 1, entropy_coeff, agent_index, self.critic_1,
                                            use_gauss_policy, use_gumbel, prioritized_replay_eps)
    self.policy_target.model.set_weights(self.policy.model.get_weights())

    self.use_gauss_policy = use_gauss_policy
    self.use_gumbel = use_gumbel
    self.policy_update_freq = policy_update_freq
    self.batch_size = batch_size
    self.decay = gamma
    self.tau = tau
    self.entropy_coeff = entropy_coeff
    self.update_counter = 0
    self.agent_index = agent_index
    self.multi_step = multi_step
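# --- Illustrative sketch (not part of the original class) ---------------------------------------
# With two critics and a fixed entropy coefficient, the soft Bellman target is usually built from the
# minimum of the two target critics minus the entropy term, and the policy is only updated every
# `policy_update_freq` critic updates. All names below (q1_target, q2_target, next_log_pi, ...) are
# placeholders for illustration and do not refer to attributes of the original class.
import numpy as np

def soft_q_target(rewards, dones, q1_target, q2_target, next_log_pi, gamma, entropy_coeff, multi_step=1):
    """y = r + gamma**n * (1 - done) * (min(Q1', Q2') - entropy_coeff * log pi(a'|s'))."""
    min_q = np.minimum(q1_target, q2_target)
    return rewards + (gamma ** multi_step) * (1.0 - dones) * (min_q - entropy_coeff * next_log_pi)

# Delayed policy updates, as controlled by self.policy_update_freq and self.update_counter:
#     self.update_counter += 1
#     if self.update_counter % self.policy_update_freq == 0:
#         # update the policy and Polyak-average all target networks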