Example no. 1
    def __init__(self, obs_space_n, act_space_n, agent_index, batch_size, buff_size, lr, num_layer, num_units, gamma,
                 tau, prioritized_replay=False, alpha=0.6, max_step=None, initial_beta=0.6, prioritized_replay_eps=1e-6,
                 _run=None):
        """
        An object containing critic, actor and training functions for Multi-Agent DDPG.
        """
        self._run = _run

        assert isinstance(obs_space_n[0], Space)
        obs_shape_n = space_n_to_shape_n(obs_space_n)
        act_shape_n = space_n_to_shape_n(act_space_n)
        super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay, alpha, max_step,
                         initial_beta,
                         prioritized_replay_eps=prioritized_replay_eps)

        act_type = type(act_space_n[0])
        self.critic = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index)
        self.critic_target = MADDPGCriticNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type,
                                                 agent_index)
        self.critic_target.model.set_weights(self.critic.model.get_weights())

        self.policy = MADDPGPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index], act_type, 1,
                                          self.critic, agent_index)
        self.policy_target = MADDPGPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index],
                                                 act_type, 1,
                                                 self.critic, agent_index)
        self.policy_target.model.set_weights(self.policy.model.get_weights())

        self.batch_size = batch_size
        self.agent_index = agent_index
        self.decay = gamma
        self.tau = tau
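
The constructor above stores tau and gamma but does not show how the target networks are refreshed. Below is a minimal sketch (not part of the original code) of a Polyak-averaged target update using the stored tau, assuming Keras-style models with get_weights()/set_weights() as already used above; the method name update_target_networks is hypothetical.

    def update_target_networks(self, tau=None):
        # Hypothetical helper, sketched for illustration: soft-update each target model
        # towards its online model, weight tensor by weight tensor.
        tau = self.tau if tau is None else tau

        def soft_update(source_model, target_model):
            # new_target = tau * source + (1 - tau) * old_target
            new_weights = [tau * s + (1.0 - tau) * t
                           for s, t in zip(source_model.get_weights(), target_model.get_weights())]
            target_model.set_weights(new_weights)

        soft_update(self.critic.model, self.critic_target.model)
        soft_update(self.policy.model, self.policy_target.model)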
Example no. 2
    def __init__(self, obs_space_n, act_space_n, agent_index, batch_size, buff_size, lr, num_layer, num_units, gamma,
                 tau, prioritized_replay=False, alpha=0.6, max_step=None, initial_beta=0.6, prioritized_replay_eps=1e-6,
                 _run=None, num_atoms=51, min_val=-150, max_val=0):
        """
        Implementation of a Multi-Agent version of D3PG (Distributed Deep Deterministic Policy
        Gradient) with a categorical (distributional) critic.

        num_atoms, min_val and max_val define the support of the categorical value distribution:
        the number of atoms and the value range they span.
        """
        self._run = _run

        assert isinstance(obs_space_n[0], Space)
        obs_shape_n = space_n_to_shape_n(obs_space_n)
        act_shape_n = space_n_to_shape_n(act_space_n)
        super().__init__(buff_size, obs_shape_n, act_shape_n, batch_size, prioritized_replay, alpha, max_step, initial_beta,
                         prioritized_replay_eps=prioritized_replay_eps)

        act_type = type(act_space_n[0])
        self.critic = CatDistCritic(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index, num_atoms, min_val, max_val)
        self.critic_target = CatDistCritic(num_layer, num_units, lr, obs_shape_n, act_shape_n, act_type, agent_index, num_atoms, min_val, max_val)
        self.critic_target.model.set_weights(self.critic.model.get_weights())

        self.policy = MADDPGPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index], act_type, 1,
                                          self.critic, agent_index)
        self.policy_target = MADDPGPolicyNetwork(num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index], act_type, 1,
                                                 self.critic, agent_index)
        self.policy_target.model.set_weights(self.policy.model.get_weights())

        self.batch_size = batch_size
        self.agent_index = agent_index
        self.decay = gamma
        self.tau = tau
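
To clarify what num_atoms, min_val and max_val parametrize, here is a minimal stand-alone sketch (not taken from the original code) of the fixed categorical support and the C51-style projection a distributional critic such as CatDistCritic typically relies on; all names and array shapes here are illustrative assumptions.

import numpy as np

num_atoms, min_val, max_val = 51, -150.0, 0.0
atoms = np.linspace(min_val, max_val, num_atoms)   # fixed support z_0 ... z_{N-1}
delta_z = (max_val - min_val) / (num_atoms - 1)

def project_distribution(next_probs, rewards, dones, gamma):
    """Project the reward-shifted, gamma-discounted atoms back onto the fixed support.

    next_probs: (batch, num_atoms) probabilities from the target critic
    rewards, dones: (batch,) arrays
    """
    batch = next_probs.shape[0]
    # Bellman update of every atom, clipped to the representable value range
    tz = np.clip(rewards[:, None] + gamma * (1.0 - dones[:, None]) * atoms[None, :],
                 min_val, max_val)
    b = (tz - min_val) / delta_z                   # fractional atom index
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)
    projected = np.zeros((batch, num_atoms))
    for i in range(batch):
        for j in range(num_atoms):
            p, bij = next_probs[i, j], b[i, j]
            if lower[i, j] == upper[i, j]:
                projected[i, lower[i, j]] += p     # landed exactly on an atom
            else:
                projected[i, lower[i, j]] += p * (upper[i, j] - bij)
                projected[i, upper[i, j]] += p * (bij - lower[i, j])
    return projected

# The scalar Q-value used for the policy gradient is the expectation over the support:
# q = (probs * atoms).sum(axis=-1)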
Example no. 3
    def __init__(self,
                 obs_space_n,
                 act_space_n,
                 agent_index,
                 batch_size,
                 buff_size,
                 lr,
                 num_layer,
                 num_units,
                 gamma,
                 tau,
                 prioritized_replay=False,
                 alpha=0.6,
                 max_step=None,
                 initial_beta=0.6,
                 prioritized_replay_eps=1e-6,
                 entropy_coeff=0.2,
                 use_gauss_policy=False,
                 use_gumbel=True,
                 policy_update_freq=1,
                 _run=None,
                 multi_step=1):
        """
        Implementation of Multi-Agent Soft Actor-Critic with optional delayed policy updates.
        The implementation deviates slightly from standard soft actor-critic: instead of a value
        function with a target value function, it uses two Q-functions, each with its own target.
        Using the value function could also be tested.

        Learning the entropy temperature is not implemented yet, so choosing a suitable entropy
        coefficient manually is important.
        todo: entropy temperature learning
        todo: gaussian policy
        note: does not use a value function, only two Q-functions
        note: ensure the Gumbel-softmax entropy is calculated correctly
        """
        self._run = _run
        assert isinstance(obs_space_n[0], Space)
        obs_shape_n = space_n_to_shape_n(obs_space_n)
        act_shape_n = space_n_to_shape_n(act_space_n)
        super().__init__(buff_size,
                         obs_shape_n,
                         act_shape_n,
                         batch_size,
                         prioritized_replay,
                         alpha,
                         max_step,
                         initial_beta,
                         prioritized_replay_eps=prioritized_replay_eps)

        act_type = type(act_space_n[0])
        self.critic_1 = MADDPGCriticNetwork(num_layer, num_units, lr,
                                            obs_shape_n, act_shape_n, act_type,
                                            agent_index)
        self.critic_1_target = MADDPGCriticNetwork(num_layer, num_units, lr,
                                                   obs_shape_n, act_shape_n,
                                                   act_type, agent_index)
        self.critic_1_target.model.set_weights(
            self.critic_1.model.get_weights())
        self.critic_2 = MADDPGCriticNetwork(num_layer, num_units, lr,
                                            obs_shape_n, act_shape_n, act_type,
                                            agent_index)
        self.critic_2_target = MADDPGCriticNetwork(num_layer, num_units, lr,
                                                   obs_shape_n, act_shape_n,
                                                   act_type, agent_index)
        self.critic_2_target.model.set_weights(
            self.critic_2.model.get_weights())

        # the value function was used in the original SAC paper but dropped in later revisions;
        # it is kept here for experimentation and is currently unused
        self.v_network = ValueFunctionNetwork(num_layer, num_units, lr,
                                              obs_shape_n, act_shape_n,
                                              act_type, agent_index)  # unused
        self.v_network_target = ValueFunctionNetwork(num_layer, num_units, lr,
                                                     obs_shape_n, act_shape_n,
                                                     act_type,
                                                     agent_index)  # unused
        self.v_network_target.model.set_weights(
            self.v_network.model.get_weights())  # unused

        self.policy = MASACPolicyNetwork(num_layer, num_units, lr, obs_shape_n,
                                         act_shape_n[agent_index], act_type, 1,
                                         entropy_coeff, agent_index,
                                         self.critic_1, use_gauss_policy,
                                         use_gumbel, prioritized_replay_eps)
        self.policy_target = MASACPolicyNetwork(
            num_layer, num_units, lr, obs_shape_n, act_shape_n[agent_index],
            act_type, 1, entropy_coeff, agent_index, self.critic_1,
            use_gauss_policy, use_gumbel, prioritized_replay_eps)
        self.policy_target.model.set_weights(self.policy.model.get_weights())

        self.use_gauss_policy = use_gauss_policy
        self.use_gumbel = use_gumbel
        self.policy_update_freq = policy_update_freq

        self.batch_size = batch_size
        self.decay = gamma
        self.tau = tau
        self.entropy_coeff = entropy_coeff
        self.update_counter = 0
        self.agent_index = agent_index
        self.multi_step = multi_step
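
Only the constructor is shown above, so as a reading aid here is a minimal sketch (not part of the original class) of the clipped double-Q soft target that the two critics and their targets are built for; q1_next, q2_next and next_logp are assumed to be precomputed from critic_1_target, critic_2_target and policy_target, and all names are illustrative.

import numpy as np

def soft_td_target(rewards, dones, gamma, entropy_coeff, q1_next, q2_next, next_logp):
    """rewards, dones: (batch,) arrays; q1_next, q2_next: target-critic values for the
    target-policy action a' ~ pi_target(.|o'); next_logp: log pi_target(a'|o')."""
    # clipped double-Q: take the pessimistic estimate, then subtract the entropy term
    soft_q = np.minimum(q1_next, q2_next) - entropy_coeff * next_logp
    return rewards + gamma * (1.0 - dones) * soft_q

# With multi_step > 1, rewards would be an n-step return and gamma would become
# gamma ** multi_step; with policy_update_freq > 1 the policy (and the target networks)
# would typically only be updated every policy_update_freq critic updates.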