Example #1
    def __init__(self, observation_space,
                 action_space,
                 replay_buffer,
                 hidden_sizes=256,
                 critic_lr=0.001,
                 actor_lr=0.002,
                 batch_size=32,
                 gamma=0.90,
                 tau=0.01):

        self.observation_space = observation_space
        self.action_space = action_space
        self.replay_buffer = replay_buffer
        self.hidden_sizes = hidden_sizes
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

        self.critic = CriticNetwork(observation_space=observation_space, hidden_sizes=hidden_sizes).to(self.device)
        self.target_critic = CriticNetwork(observation_space=observation_space, hidden_sizes=hidden_sizes).to(self.device)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=critic_lr)
        hard_update(self.target_critic, self.critic)

        self.actor = ActorNetwork(observation_space=observation_space, action_space=action_space,
                                  hidden_sizes=hidden_sizes).to(self.device)
        self.target_actor = ActorNetwork(observation_space=observation_space, action_space=action_space,
                                         hidden_sizes=hidden_sizes).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=actor_lr)
        hard_update(self.target_actor, self.actor)

        self.loss_func = torch.nn.MSELoss()
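
Every example on this page calls hard_update (and several later ones soft_update) without showing their definitions. As a reference, here is a minimal sketch of what these helpers typically look like in MADDPG/MAAC-style codebases, assuming plain PyTorch modules; the exact implementation in each repository may differ.

import torch


def hard_update(target, source):
    """Copy every parameter from source into target (target <- source)."""
    with torch.no_grad():
        for t, s in zip(target.parameters(), source.parameters()):
            t.copy_(s)


def soft_update(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    with torch.no_grad():
        for t, s in zip(target.parameters(), source.parameters()):
            t.mul_(1.0 - tau).add_(tau * s)
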
Example #2
    def __init__(self,
                 agent_init_params,
                 sa_size,
                 gamma=0.95,
                 tau=0.01,
                 attend_tau=0.002,
                 pi_lr=0.01,
                 q_lr=0.01,
                 reward_scale=10.,
                 pol_hidden_dim=128,
                 critic_hidden_dim=128,
                 attend_heads=4,
                 **kwargs):
        """
        Inputs:
            agent_init_params (list of dict): List of dicts with parameters to
                                              initialize each agent
                num_in_pol (int): Input dimensions to policy
                num_out_pol (int): Output dimensions to policy
            sa_size (list of (int, int)): Size of state and action space for
                                          each agent
            gamma (float): Discount factor
            tau (float): Target update rate
            pi_lr (float): Learning rate for policy
            q_lr (float): Learning rate for critic
            reward_scale (float): Scaling for reward (has effect of optimal
                                  policy entropy)
            hidden_dim (int): Number of hidden dimensions for networks
        """
        self.nagents = len(sa_size)

        self.agents = [
            AttentionAgent(lr=pi_lr, hidden_dim=pol_hidden_dim, **params)
            for params in agent_init_params
        ]
        self.critic = AttentionCritic(sa_size,
                                      hidden_dim=critic_hidden_dim,
                                      attend_heads=attend_heads)
        self.target_critic = AttentionCritic(sa_size,
                                             hidden_dim=critic_hidden_dim,
                                             attend_heads=attend_heads)
        hard_update(self.target_critic, self.critic)
        self.critic_optimizer = Adam(self.critic.q_parameters(),
                                     lr=q_lr,
                                     weight_decay=1e-3)
        self.agent_init_params = agent_init_params
        self.gamma = gamma
        self.tau = tau
        self.attend_tau = attend_tau
        self.pi_lr = pi_lr
        self.q_lr = q_lr
        self.reward_scale = reward_scale
        self.pol_dev = 'cpu'  # device for policies
        self.critic_dev = 'cpu'  # device for critics
        self.trgt_pol_dev = 'cpu'  # device for target policies
        self.trgt_critic_dev = 'cpu'  # device for target critics
        self.niter = 0
Example #3
 def update_all_targets(self):
     """
     Update all target networks (called after normal updates have been
     performed for each agent)
     """
     if self.hard_update_interval is None:
         soft_update(self.target_critic, self.critic, self.tau)
         for a in self.agents:
             soft_update(a.target_policy, a.policy, self.tau)
     elif self.niter % self.hard_update_interval == 0:
         hard_update(self.target_critic, self.critic)
         for a in self.agents:
             hard_update(a.target_policy, a.policy)
Example #4
    def __init__(self, num_out_pol, hidden_dim=128,
                 lr=0.01, onehot_dim=0):
        """
        Inputs:
            num_in_pol (int): number of dimensions for policy input
            num_out_pol (int): number of dimensions for policy output
        """
        self.policy = DiscretePolicy(num_out_pol,
                                     hidden_dim=hidden_dim,
                                     onehot_dim=onehot_dim)
        self.target_policy = DiscretePolicy(num_out_pol,
                                            hidden_dim=hidden_dim,
                                            onehot_dim=onehot_dim)

        hard_update(self.target_policy, self.policy)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
Example #5
    def __init__(self,
                 num_in_pol,
                 num_out_pol,
                 hidden_dim=64,
                 lr=0.01,
                 onehot_dim=0):
        self.policy = DiscretePolicy(num_in_pol,
                                     num_out_pol,
                                     hidden_dim=hidden_dim,
                                     onehot_dim=onehot_dim)
        self.target_policy = DiscretePolicy(num_in_pol,
                                            num_out_pol,
                                            hidden_dim=hidden_dim,
                                            onehot_dim=onehot_dim)

        hard_update(self.target_policy, self.policy)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
Example #6
 def update_all_targets(self):
     """
     Update all target networks (called after normal updates have been
     performed for each agent)
     """
     if self.commonCritic:
         soft_update(self.target_critic, self.critic, self.tau)
     for a_i in range(len(self.agents)):
         a = self.agents[a_i]
         if not self.commonCritic:
             soft_update(a.target_critic, a.critic, self.tau)
         if a_i == 0:
             soft_update(a.target_policy, a.policy, self.tau)
         else:
             hard_update(a.policy, self.agents[0].policy)
             soft_update(a.target_policy, a.policy, self.tau)
     self.niter += 1
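
In this variant, every agent beyond index 0 is first hard-copied from agent 0's policy before its target is soft-updated, so all agents effectively share a single policy. A small self-contained check of that sharing step, using toy nn.Linear stand-ins and the hard_update sketch above:

import torch
import torch.nn as nn


def hard_update(target, source):
    with torch.no_grad():
        for t, s in zip(target.parameters(), source.parameters()):
            t.copy_(s)


# Toy stand-ins for two agents' policies; after the copy, agent 1 holds agent 0's weights.
policy_0 = nn.Linear(8, 2)
policy_1 = nn.Linear(8, 2)
hard_update(policy_1, policy_0)
assert all(torch.equal(p1, p0)
           for p1, p0 in zip(policy_1.parameters(), policy_0.parameters()))
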
Example #7
    def __init__(self, algo_config: List[Tuple[int, int]],
                 gamma=0.95, tau=0.01, pi_lr=0.01, q_lr=0.01,
                 reward_scale=10.,
                 pol_hidden_dim=128,
                 critic_hidden_dim=128, attend_heads=4,
                 **kwargs):
        """
        Inputs:
            algo_config (List[Tuple[int, int]]): Agent types which will exist in this environment
                Ex. [(20, 8), (20, 2)]
            gamma (float): Discount factor
            tau (float): Target update rate
            pi_lr (float): Learning rate for policy
            q_lr (float): Learning rate for critic
            reward_scale (float): Scaling for reward (has effect of optimal
                                  policy entropy)
            hidden_dim (int): Number of hidden dimensions for networks
        """

        print(algo_config)
        # One AttentionAgent per (state_dim, action_dim) pair in algo_config
        self.agents = [AttentionAgent(sdim, adim, lr=pi_lr, hidden_dim=pol_hidden_dim) for sdim, adim in algo_config]
        self.critic = AttentionCritic(algo_config, hidden_dim=critic_hidden_dim, attend_heads=attend_heads)
        self.target_critic = AttentionCritic(algo_config, hidden_dim=critic_hidden_dim, attend_heads=attend_heads)
        hard_update(self.target_critic, self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=q_lr, weight_decay=1e-3)
        self.gamma = gamma
        self.tau = tau
        self.pi_lr = pi_lr
        self.q_lr = q_lr
        self.reward_scale = reward_scale
        self.pol_dev = 'cpu'  # device for policies
        self.critic_dev = 'cpu'  # device for critics
        self.trgt_pol_dev = 'cpu'  # device for target policies
        self.trgt_critic_dev = 'cpu'  # device for target critics
        self.niter = 0

        self.init_dict = {'gamma': gamma, 'tau': tau,
                          'pi_lr': pi_lr, 'q_lr': q_lr,
                          'reward_scale': reward_scale,
                          'pol_hidden_dim': pol_hidden_dim,
                          'critic_hidden_dim': critic_hidden_dim,
                          'attend_heads': attend_heads,
                          'algo_config': algo_config}
Example #8
    def make_moa(self,
                 hidden_dim=64,
                 lr=0.01,
                 rnn_policy=False,
                 norm_in=False,
                 constrain_out=False,
                 env_obs_space=None,
                 env_act_space=None):
        """ instantiate a policy, target and optimizer for training 
            each of the other agents, assume current agent always have 
            position 0 in env obs and act spaces
        """
        self.moa_policies = {}
        self.moa_target_policies = {}
        self.moa_optimizers = {}
        self.moa_hidden_states = {}

        for i in range(1, len(env_act_space)):
            obs_space, act_space = env_obs_space[i], env_act_space[i]

            num_in_pol = obs_space.shape[0]
            if isinstance(act_space, Dict):
                # hard specify now, could generalize later
                num_out_pol = {
                    "move": self.get_shape(act_space, "move"),
                    "comm": self.get_shape(act_space, "comm")
                }
            else:
                num_out_pol = self.get_shape(act_space)

            policy_kwargs = dict(hidden_dim=hidden_dim,
                                 norm_in=norm_in,
                                 constrain_out=constrain_out,
                                 discrete_action=self.discrete_action,
                                 rnn_policy=rnn_policy)
            policy = Policy(num_in_pol, num_out_pol, **policy_kwargs)
            target_policy = Policy(num_in_pol, num_out_pol, **policy_kwargs)
            hard_update(target_policy, policy)

            # push to moa containers
            self.moa_policies[i] = policy
            self.moa_target_policies[i] = target_policy
            self.moa_optimizers[i] = Adam(policy.parameters(), lr=lr)
            self.moa_hidden_states[i] = None
Example #9
    def __init__(self,
                 agent_init_params,
                 sa_size,
                 gamma=0.95,
                 tau=0.01,
                 pi_lr=0.01,
                 q_lr=0.01,
                 reward_scale=10.,
                 pol_hidden_dim=128,
                 critic_hidden_dim=128,
                 attend_heads=4,
                 **kwargs):
        """
        Inputs:
            agent_init_params (list of dict): List of dicts with parameters to initialize each agent
                num_in_pol (int): Input dimensions to policy
                num_out_pol (int): Output dimensions to policy
            sa_size (list of (int, int)): Size of state and action space for each agent
        """
        self.nagents = len(sa_size)

        self.agents = [
            AttentionAgent(lr=pi_lr, hidden_dim=pol_hidden_dim, **params)
            for params in agent_init_params
        ]
        self.critic = AttentionCritic(sa_size,
                                      hidden_dim=critic_hidden_dim,
                                      attend_heads=attend_heads)
        self.target_critic = AttentionCritic(sa_size,
                                             hidden_dim=critic_hidden_dim,
                                             attend_heads=attend_heads)
        hard_update(self.target_critic, self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=q_lr,
                                     weight_decay=1e-3)
        self.agent_init_params = agent_init_params
        self.gamma = gamma
        self.tau = tau
        self.pi_lr = pi_lr
        self.q_lr = q_lr
        self.reward_scale = reward_scale
        self.niter = 0
Example #10
 def __init__(self,
              num_in_pol,
              num_out_pol,
              num_in_critic,
              hidden_dim=64,
              lr=0.01,
              discrete_action=True):
     """
     Inputs:
         num_in_pol (int): number of dimensions for policy input
         num_out_pol (int): number of dimensions for policy output
         num_in_critic (int): number of dimensions for critic input
     """
     self.policy = MLPNetwork(num_in_pol,
                              num_out_pol,
                              hidden_dim=hidden_dim,
                              constrain_out=True,
                              discrete_action=discrete_action)
     self.critic = MLPNetwork(num_in_critic,
                              1,
                              hidden_dim=hidden_dim,
                              constrain_out=False)
     self.target_policy = MLPNetwork(num_in_pol,
                                     num_out_pol,
                                     hidden_dim=hidden_dim,
                                     constrain_out=True,
                                     discrete_action=discrete_action)
     self.target_critic = MLPNetwork(num_in_critic,
                                     1,
                                     hidden_dim=hidden_dim,
                                     constrain_out=False)
     hard_update(self.target_policy, self.policy)
     hard_update(self.target_critic, self.critic)
     self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
     self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
     if not discrete_action:
         self.exploration = OUNoise(num_out_pol)
     else:
         self.exploration = 0.3  # epsilon for eps-greedy
     self.discrete_action = discrete_action
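
For continuous actions, the agent above relies on OUNoise for exploration; its implementation is not shown on this page. Below is a minimal Ornstein-Uhlenbeck process of the kind commonly paired with DDPG. Parameter names and defaults here are assumptions, not taken from the repository.

import numpy as np


class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck process)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = self.mu.copy()

    def reset(self):
        self.state = self.mu.copy()

    def noise(self):
        # Mean-reverting step plus Gaussian perturbation
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state
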
Example #11
    def __init__(self,
                 obs_shape,
                 action_size,
                 hidden_dim=64,
                 lr=0.01,
                 adam_eps=1e-8,
                 nonlin=F.relu,
                 n_pol_heads=1):
        self.policy = DiscretePolicy(obs_shape,
                                     action_size,
                                     hidden_dim=hidden_dim,
                                     nonlin=nonlin,
                                     n_heads=n_pol_heads)
        self.target_policy = DiscretePolicy(obs_shape,
                                            action_size,
                                            hidden_dim=hidden_dim,
                                            nonlin=nonlin,
                                            n_heads=n_pol_heads)

        hard_update(self.target_policy, self.policy)
        self.policy_optimizer = Adam(self.policy.parameters(),
                                     lr=lr,
                                     eps=adam_eps)
Example #12
    def __init__(self,
                 nagents,
                 obs_shape,
                 state_shape,
                 action_size,
                 gamma_e=0.95,
                 gamma_i=0.95,
                 tau=0.01,
                 hard_update_interval=None,
                 pi_lr=0.01,
                 q_lr=0.01,
                 phi_lr=0.1,
                 adam_eps=1e-8,
                 q_decay=1e-3,
                 phi_decay=1e-4,
                 reward_scale=10.,
                 head_reward_scale=25.,
                 pol_hidden_dim=64,
                 critic_hidden_dim=64,
                 nonlin=F.relu,
                 n_intr_rew_types=0,
                 sep_extr_head=False,
                 beta=0.5,
                 **kwargs):
        """
        Inputs:
            obs_shape (int): Dimensions of vector observations
            state_shape (int): Dimensions of vector global state
            gamma (float): Discount factor
            tau (float): Target update rate
            pi_lr (float): Learning rate for policy
            q_lr (float): Learning rate for critic
            reward_scale (float): Scaling for reward (has effect of optimal
                                  policy entropy)
            hidden_dim (int): Number of hidden dimensions for networks
        """
        self.nagents = nagents
        n_pol_heads = n_intr_rew_types + int(sep_extr_head)

        self.agents = [
            Agent(obs_shape,
                  action_size,
                  lr=pi_lr,
                  adam_eps=adam_eps,
                  hidden_dim=pol_hidden_dim,
                  nonlin=nonlin,
                  n_pol_heads=n_pol_heads) for _ in range(nagents)
        ]

        self.critic = CentralCritic(state_shape[0],
                                    action_size,
                                    nagents,
                                    hidden_dim=critic_hidden_dim,
                                    n_intr_rew_heads=n_intr_rew_types,
                                    sep_extr_head=sep_extr_head)
        self.target_critic = CentralCritic(state_shape[0],
                                           action_size,
                                           nagents,
                                           hidden_dim=critic_hidden_dim,
                                           n_intr_rew_heads=n_intr_rew_types,
                                           sep_extr_head=sep_extr_head)
        hard_update(self.target_critic, self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=q_lr,
                                     eps=adam_eps,
                                     weight_decay=q_decay)

        self.gamma_e = gamma_e
        self.gamma_i = gamma_i
        self.tau = tau
        self.hard_update_interval = hard_update_interval
        self.pi_lr = pi_lr
        self.q_lr = q_lr
        self.reward_scale = reward_scale
        self.head_reward_scale = head_reward_scale
        self.n_intr_rew_types = n_intr_rew_types
        self.sep_extr_head = sep_extr_head  # separate policy head only trained on extr rews
        self.n_pol_heads = n_pol_heads
        self.beta = beta
        self.head_selector = HeadSelector(nagents, n_pol_heads)
        self.head_selector_optimizer = SGD(self.head_selector.parameters(),
                                           lr=phi_lr,
                                           weight_decay=phi_decay)
        self.curr_pol_heads = self.sample_pol_heads()
        self.grad_norm_clip = 10
        self.niter = 0
Example #13
    def __init__(self, agent_init_params, alg_types,
                 gamma=0.95, tau=0.01, lr=0.01, hidden_dim=64,
                 discrete_action=False, stochastic=False,
                 commonCritic=False, gasil=False, dlr=0.0003,
                 lambda_disc=0.5, batch_size_disc=512, dynamic=False):
        """
        Inputs:
            agent_init_params (list of dict): List of dicts with parameters to
                                              initialize each agent
                num_in_pol (int): Input dimensions to policy
                num_out_pol (int): Output dimensions to policy
                num_in_critic (int): Input dimensions to critic
            alg_types (list of str): Learning algorithm for each agent (DDPG
                                       or MADDPG)
            gamma (float): Discount factor
            tau (float): Target update rate
            lr (float): Learning rate for policy and critic
            hidden_dim (int): Number of hidden dimensions for networks
            discrete_action (bool): Whether or not to use discrete action space
        """
        self.nagents = len(alg_types)
        self.alg_types = alg_types
        self.agents = [DDPGAgent(lr=lr, discrete_action=discrete_action,
                                 hidden_dim=hidden_dim,
                                 **params)
                       for params in agent_init_params]

        for agent in self.agents:
            # freeze target-policy parameters (a plain attribute assignment would not)
            agent.target_policy.requires_grad_(False)
        self.agent_init_params = agent_init_params
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.dlr = dlr
        self.discrete_action = discrete_action
        self.pol_dev = 'cpu'  # device for policies
        self.critic_dev = 'cpu'  # device for critics
        self.trgt_pol_dev = 'cpu'  # device for target policies
        self.trgt_critic_dev = 'cpu'  # device for target critics
        self.disc_dev = 'cpu'
        self.niter = 0
        self.stochastic = stochastic
        self.commonCritic = commonCritic
        self.gasil = gasil
        self.lambda_disc = lambda_disc
        self.batch_size_disc = batch_size_disc
        self.dynamic = dynamic
        num_in_critic = self.agent_init_params[0]['num_in_critic']
        self.cuda = torch.cuda.is_available()
        if self.commonCritic:
            # num_in_discriminator = self.agent_init_params[0]['num_in_pol'] + self.agent_init_params[0]['num_out_pol']
            # This can be changed and looked at
            self.critic = MLPNetwork(num_in_critic, 1,
                                     hidden_dim=hidden_dim,
                                     constrain_out=False)
            self.target_critic = MLPNetwork(num_in_critic, 1,
                                            hidden_dim=hidden_dim,
                                            constrain_out=False)
            hard_update(self.target_critic, self.critic)
            self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
        if self.gasil:
            self.discriminator = MLPNetwork_Disc(num_in_critic, 1,
                                                 hidden_dim=hidden_dim, norm_in=False,
                                                 constrain_out=False, discrete_action=False)
            self.discriminator_optimizer = Adam(self.discriminator.parameters(), lr=dlr)
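
Note the freeze above goes through nn.Module.requires_grad_, which flips the flag on every parameter; assigning a plain requires_grad attribute to the module object (a common slip) would leave the parameters untouched. A quick runnable comparison with a toy module:

import torch.nn as nn

target_policy = nn.Linear(10, 2)  # stand-in for an agent's target policy

target_policy.requires_grad = False  # only sets an ordinary Python attribute
print(next(target_policy.parameters()).requires_grad)  # True: nothing was frozen

target_policy.requires_grad_(False)  # acts on every parameter
print(next(target_policy.parameters()).requires_grad)  # False
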
Example #14
    def __init__(self,
                 algo_type="MADDPG",
                 act_space=None,
                 obs_space=None,
                 rnn_policy=False,
                 rnn_critic=False,
                 hidden_dim=64,
                 lr=0.01,
                 norm_in=False,
                 constrain_out=False,
                 env_obs_space=None,
                 env_act_space=None,
                 model_of_agents=False,
                 **kwargs):
        """
        Inputs:
            act_space: single agent action space (single space or Dict)
            obs_space: single agent observation space (single space Dict)
        """
        self.algo_type = algo_type
        self.act_space = act_space
        self.obs_space = obs_space

        # continuous or discrete action (only look at `move` action, assume
        # move and comm space both discrete or continuous)
        tmp = act_space.spaces["move"] if isinstance(act_space,
                                                     Dict) else act_space
        self.discrete_action = False if isinstance(tmp, Box) else True

        # Exploration noise
        if not self.discrete_action:
            # `move`, `comm` share same continuous noise source
            self.exploration = OUNoise(self.get_shape(act_space))
        else:
            self.exploration = 0.3  # epsilon for eps-greedy

        # Policy (supports multiple outputs)
        self.rnn_policy = rnn_policy
        self.policy_hidden_states = None

        num_in_pol = obs_space.shape[0]
        if isinstance(act_space, Dict):
            # hard specify now, could generalize later
            num_out_pol = {
                "move": self.get_shape(act_space, "move"),
                "comm": self.get_shape(act_space, "comm")
            }
        else:
            num_out_pol = self.get_shape(act_space)

        policy_kwargs = dict(hidden_dim=hidden_dim,
                             norm_in=norm_in,
                             constrain_out=constrain_out,
                             discrete_action=self.discrete_action,
                             rnn_policy=rnn_policy)
        self.policy = Policy(num_in_pol, num_out_pol, **policy_kwargs)
        self.target_policy = Policy(num_in_pol, num_out_pol, **policy_kwargs)
        hard_update(self.target_policy, self.policy)

        # action selector (distribution wrapper)
        if self.discrete_action:
            self.selector = DiscreteActionSelector()
        else:
            self.selector = ContinuousActionSelector()

        # Critic
        self.rnn_critic = rnn_critic
        self.critic_hidden_states = None

        if algo_type == "MADDPG":
            num_in_critic = 0
            for oobsp in env_obs_space:
                num_in_critic += oobsp.shape[0]
            for oacsp in env_act_space:
                # feed all acts to centralized critic
                num_in_critic += self.get_shape(oacsp)
        else:  # only DDPG, local critic
            num_in_critic = obs_space.shape[0] + self.get_shape(act_space)

        critic_net_fn = RecurrentNetwork if rnn_critic else MLPNetwork
        critic_kwargs = dict(hidden_dim=hidden_dim,
                             norm_in=norm_in,
                             constrain_out=constrain_out)
        self.critic = critic_net_fn(num_in_critic, 1, **critic_kwargs)
        self.target_critic = critic_net_fn(num_in_critic, 1, **critic_kwargs)
        hard_update(self.target_critic, self.critic)

        # Optimizers
        self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)

        # model of agents (approximate models for other agents)
        self.model_of_agents = model_of_agents
        if model_of_agents:
            self.make_moa(hidden_dim=hidden_dim,
                          lr=lr,
                          rnn_policy=rnn_policy,
                          norm_in=norm_in,
                          constrain_out=constrain_out,
                          env_obs_space=env_obs_space,
                          env_act_space=env_act_space)
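
The MADDPG branch above sizes the centralized critic by summing every agent's observation and action dimensions. A toy two-agent computation, assuming get_shape resolves a Box to its vector length and a Discrete space to n (an assumption about the helper, which is not shown here):

import numpy as np
from gym.spaces import Box, Discrete


def get_shape(space):
    # Assumed behaviour of the get_shape helper used above
    return space.shape[0] if isinstance(space, Box) else space.n


env_obs_space = [Box(-1.0, 1.0, shape=(8,), dtype=np.float32),
                 Box(-1.0, 1.0, shape=(8,), dtype=np.float32)]
env_act_space = [Discrete(5), Discrete(5)]

num_in_critic = (sum(o.shape[0] for o in env_obs_space)
                 + sum(get_shape(a) for a in env_act_space))
print(num_in_critic)  # 8 + 8 + 5 + 5 = 26
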
Example #15
    def __init__(self, algo_type="MADDPG", act_space=None, obs_space=None, 
                rnn_policy=False, rnn_critic=False, hidden_dim=64, lr=0.01,
                norm_in=False, constrain_out=False, thought_dim=64,
                env_obs_space=None, env_act_space=None, **kwargs):
        """
        Inputs:
            act_space: single agent action space (single space or Dict)
            obs_space: single agent observation space (single space Dict)
        """
        self.algo_type = algo_type
        self.act_space = act_space 
        self.obs_space = obs_space

        # continuous or discrete action (only look at `move` action, assume
        # move and comm space both discrete or continuous)
        tmp = act_space.spaces["move"] if isinstance(act_space, Dict) else act_space
        self.discrete_action = not isinstance(tmp, Box)

        # Exploration noise 
        if not self.discrete_action:
            # `move`, `comm` share same continuous noise source
            self.exploration = OUNoise(self.get_shape(act_space))
        else:
            self.exploration = 0.3  # epsilon for eps-greedy
        
        # Policy (supports multiple outputs)
        self.rnn_policy = rnn_policy
        self.policy_hidden_states = None 

        num_in_pol = obs_space.shape[0]
        if isinstance(act_space, Dict):
            # hard specify now, could generalize later 
            num_out_pol = {
                "move": self.get_shape(act_space, "move"), 
                "comm": self.get_shape(act_space, "comm")
            }
        else:
            num_out_pol = self.get_shape(act_space)

        # atoc policy 
        policy_kwargs = dict(
            hidden_dim=hidden_dim,
            norm_in=norm_in,
            constrain_out=constrain_out,
            discrete_action=self.discrete_action,
            rnn_policy=rnn_policy,
            thought_dim=thought_dim
        )
        self.policy = ATOCPolicy(num_in_pol, num_out_pol, **policy_kwargs)
        self.target_policy = ATOCPolicy(num_in_pol, num_out_pol, **policy_kwargs)
        hard_update(self.target_policy, self.policy)
        
        # Critic 
        self.rnn_critic = rnn_critic
        self.critic_hidden_states = None 
        
        if algo_type == "MADDPG":
            num_in_critic = 0
            for oobsp in env_obs_space:
                num_in_critic += oobsp.shape[0]
            for oacsp in env_act_space:
                # feed all acts to centralized critic
                num_in_critic += self.get_shape(oacsp)
        else:   # only DDPG, local critic 
            num_in_critic = obs_space.shape[0] + self.get_shape(act_space)

        critic_net_fn = RecurrentNetwork if rnn_critic else MLPNetwork
        critic_kwargs = dict(
            hidden_dim=hidden_dim,
            norm_in=norm_in,
            constrain_out=constrain_out
        )
        self.critic = critic_net_fn(num_in_critic, 1, **critic_kwargs)
        self.target_critic = critic_net_fn(num_in_critic, 1, **critic_kwargs)
        hard_update(self.target_critic, self.critic)

        # NOTE: atoc modules 
        # attention unit, MLP (used here) or RNN, output comm probability
        self.thought_dim = thought_dim
        self.attention_unit = nn.Sequential(
            MLPNetwork(thought_dim, 1, hidden_dim=hidden_dim, 
                norm_in=norm_in, constrain_out=False), 
            nn.Sigmoid()
        )
        
        # communication channel, bi-LSTM (used here) or graph 
        self.comm_channel = nn.LSTM(thought_dim, thought_dim, 1, 
            batch_first=False, bidirectional=True)

        # Optimizers 
        self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
        self.attention_unit_optimizer = Adam(self.attention_unit.parameters(), lr=lr)
        self.comm_channel_optimizer = Adam(self.comm_channel.parameters(), lr=lr)
Example #16
    def __init__(self,
                 algo_type="MADDPG",
                 act_space=None,
                 obs_space=None,
                 rnn_policy=False,
                 rnn_critic=False,
                 hidden_dim=64,
                 lr=0.01,
                 env_obs_space=None,
                 env_act_space=None):
        """
        Inputs:
            act_space: single agent action space (single space or Dict)
            obs_space: single agent observation space (single space Dict)
        """
        self.algo_type = algo_type
        self.act_space = act_space
        self.obs_space = obs_space

        # continuous or discrete action (only look at `move` action, assume
        # move and comm space both discrete or continuous)
        tmp = act_space.spaces["move"] if isinstance(act_space, Dict) else act_space
        discrete_action = not isinstance(tmp, Box)
        self.discrete_action = discrete_action

        # Exploration noise
        if not discrete_action:
            # `move`, `comm` share same continuous noise source
            self.exploration = OUNoise(self.get_shape(act_space))
        else:
            self.exploration = 0.3  # epsilon for eps-greedy

        # Policy (supports multiple outputs)
        self.rnn_policy = rnn_policy
        self.policy_hidden_states = None

        num_in_pol = obs_space.shape[0]
        if isinstance(act_space, Dict):
            # hard specify now, could generalize later
            num_out_pol = {
                "move": self.get_shape(act_space, "move"),
                "comm": self.get_shape(act_space, "comm")
            }
        else:
            num_out_pol = self.get_shape(act_space)

        self.policy = Policy(num_in_pol,
                             num_out_pol,
                             hidden_dim=hidden_dim,
                             constrain_out=True,
                             discrete_action=discrete_action,
                             rnn_policy=rnn_policy)
        self.target_policy = Policy(num_in_pol,
                                    num_out_pol,
                                    hidden_dim=hidden_dim,
                                    constrain_out=True,
                                    discrete_action=discrete_action,
                                    rnn_policy=rnn_policy)
        hard_update(self.target_policy, self.policy)

        # Critic
        self.rnn_critic = rnn_critic
        self.critic_hidden_states = None

        if algo_type == "MADDPG":
            num_in_critic = 0
            for oobsp in env_obs_space:
                num_in_critic += oobsp.shape[0]
            for oacsp in env_act_space:
                # feed all acts to centralized critic
                num_in_critic += self.get_shape(oacsp)
        else:  # only DDPG, local critic
            num_in_critic = obs_space.shape[0] + self.get_shape(act_space)

        critic_net_fn = RecurrentNetwork if rnn_critic else MLPNetwork
        self.critic = critic_net_fn(num_in_critic,
                                    1,
                                    hidden_dim=hidden_dim,
                                    constrain_out=False)
        self.target_critic = critic_net_fn(num_in_critic,
                                           1,
                                           hidden_dim=hidden_dim,
                                           constrain_out=False)
        hard_update(self.target_critic, self.critic)

        # Optimizers
        self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
Example #17
    def __init__(self,
                 agent_init_params,
                 sa_size,
                 gamma=0.95,
                 tau=0.01,
                 pi_lr=0.01,
                 q_lr=0.01,
                 reward_scale=10.,
                 pol_hidden_dim=128,
                 critic_hidden_dim=128,
                 attend_heads=4,
                 l1_reg=0.01,
                 **kwargs):
        """
        Inputs:
            agent_init_params (list of dict): List of dicts with parameters to
                                              initialize each agent
                num_in_pol (int): Input dimensions to policy
                num_out_pol (int): Output dimensions to policy
            sa_size (list of (int, int)): Size of state and action space for
                                          each agent
            gamma (float): Discount factor
            tau (float): Target update rate
            pi_lr (float): Learning rate for policy
            q_lr (float): Learning rate for critic
            reward_scale (float): Scaling for reward (has effect of optimal
                                  policy entropy)
            hidden_dim (int): Number of hidden dimensions for networks
        """
        self.nagents = len(sa_size)
        self.l1_reg = l1_reg

        self.agents = [
            AttentionAgent(lr=pi_lr, hidden_dim=pol_hidden_dim, **params)
            for params in agent_init_params
        ]

        widths = []
        hidden_layers = []
        running_width = 0

        for sdim, adim in sa_size:
            totdim = sdim + adim
            running_width += totdim
            widths.append(running_width)
            hidden_layers.append(2)

        # def __init__(self, sa_sizes, widths, hidden_layers, selector_width, selector_depth, hidden_dim=32, **kwargs):
        self.critic = SelectiveAttentionCritic(sa_size,
                                               widths=widths,
                                               hidden_layers=hidden_layers,
                                               selector_width=2,
                                               selector_depth=2)
        self.target_critic = SelectiveAttentionCritic(
            sa_size,
            widths=widths,
            hidden_layers=hidden_layers,
            selector_width=2,
            selector_depth=2)
        hard_update(self.target_critic, self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=q_lr,
                                     weight_decay=1e-3)
        self.agent_init_params = agent_init_params
        self.gamma = gamma
        self.tau = tau
        self.pi_lr = pi_lr
        self.q_lr = q_lr
        self.reward_scale = reward_scale
        self.pol_dev = 'cpu'  # device for policies
        self.critic_dev = 'cpu'  # device for critics
        self.trgt_pol_dev = 'cpu'  # device for target policies
        self.trgt_critic_dev = 'cpu'  # device for target critics
        self.niter = 0
    def __init__(self, algo_type="MASAC", act_space=None, obs_space=None, 
                rnn_policy=False, rnn_critic=False, hidden_dim=64, lr=0.01, 
                env_obs_space=None, env_act_space=None, **kwargs):
        """
        Inputs:
            act_space: single agent action space (single space or Dict)
            obs_space: single agent observation space (single space Dict)
        """
        self.algo_type = algo_type
        self.act_space = act_space 
        self.obs_space = obs_space

        # continuous or discrete action (only look at `move` action, assume
        # move and comm space both discrete or continuous)
        tmp = act_space.spaces["move"] if isinstance(act_space, Dict) else act_space
        self.discrete_action = not isinstance(tmp, Box)
        
        # Policy (supports multiple outputs)
        self.rnn_policy = rnn_policy
        self.policy_hidden_states = None 

        num_in_pol = obs_space.shape[0]
        if isinstance(act_space, Dict):
            # hard specify now, could generalize later 
            num_out_pol = {
                "move": self.get_shape(act_space, "move"), 
                "comm": self.get_shape(act_space, "comm")
            }
        else:
            num_out_pol = self.get_shape(act_space)

        self.policy = Policy(num_in_pol, num_out_pol,
                             hidden_dim=hidden_dim,
                             # constrain_out=True,
                             discrete_action=self.discrete_action,
                             rnn_policy=rnn_policy)

        # action selector (distribution wrapper)
        if self.discrete_action:
            self.selector = DiscreteActionSelector()
        else:
            self.selector = ContinuousActionSelector()

        # Critic 
        self.rnn_critic = rnn_critic
        self.critic_hidden_states = None 
        
        if algo_type == "MASAC":
            num_in_critic = 0
            for oobsp in env_obs_space:
                num_in_critic += oobsp.shape[0]
            for oacsp in env_act_space:
                # feed all acts to centralized critic
                num_in_critic += self.get_shape(oacsp)
        else:   # only DDPG, local critic 
            num_in_critic = obs_space.shape[0] + self.get_shape(act_space)

        critic_net_fn = RecurrentNetwork if rnn_critic else MLPNetwork
        self.critic1 = critic_net_fn(num_in_critic, 1,
                                     hidden_dim=hidden_dim,
                                     constrain_out=False)
        self.critic2 = critic_net_fn(num_in_critic, 1,
                                     hidden_dim=hidden_dim,
                                     constrain_out=False)
        self.target_critic1 = critic_net_fn(num_in_critic, 1,
                                            hidden_dim=hidden_dim,
                                            constrain_out=False)
        self.target_critic2 = critic_net_fn(num_in_critic, 1,
                                            hidden_dim=hidden_dim,
                                            constrain_out=False)
        hard_update(self.target_critic1, self.critic1)
        hard_update(self.target_critic2, self.critic2)

        # alpha
        self.log_alpha = LogAlpha(0.0)

        # Optimizers 
        self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=lr)
        self.critic2_optimizer = Adam(self.critic2.parameters(), lr=lr)
        self.alpha_optimizer = Adam((self.log_alpha,), lr=lr)
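
The SAC-style agent above keeps a learnable entropy temperature behind LogAlpha and optimizes it with its own Adam instance. A minimal sketch of that pattern, assuming LogAlpha wraps nothing more than a single trainable scalar (the class itself is not shown on this page):

import torch
from torch.optim import Adam

log_alpha = torch.zeros(1, requires_grad=True)  # log of the entropy temperature
alpha_optimizer = Adam([log_alpha], lr=1e-3)
alpha = log_alpha.exp()  # temperature used when weighting the entropy term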