Example #1
    def __init__(self, args):
        self.n_actions = args.n_actions
        self.n_agents = args.n_agents
        self.state_shape = args.state_shape
        self.obs_shape = args.obs_shape
        input_shape = self.obs_shape
        # Determine the RNN input dimension from the args.
        if args.last_action:
            input_shape += self.n_actions
        if args.reuse_network:
            input_shape += self.n_agents
        if args.alg == 'vdn':
            self.policy = VDN(args)
        elif args.alg == 'qmix':
            # self.policy = QMIX(args)
            self.policy = rnn_policy(input_shape, args)
            if args.cuda:
                self.policy.cuda()

        # elif args.alg == 'coma':
        #     self.policy = COMA(args)
        # elif args.alg == 'qtran_alt':
        #     self.policy = QtranAlt(args)
        # elif args.alg == 'qtran_base':
        #     self.policy = QtranBase(args)
        # elif args.alg == 'maven':
        #     self.policy = MAVEN(args)
        # elif args.alg == 'central_v':
        #     self.policy = CentralV(args)
        # elif args.alg == 'reinforce':
        #     self.policy = Reinforce(args)
        # else:
        #     raise Exception("No such algorithm")
        self.args = args
        print('Init Agents')
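
Note: Examples #1, #5 and #8 grow the RNN input with a one-hot previous action (when args.last_action is set) and a one-hot agent id (when args.reuse_network is set), so input_shape = obs_shape + n_actions + n_agents. The following is a minimal, self-contained sketch of how such a per-step input vector is assembled; the concrete shapes and indices are illustrative assumptions, not values from the original repository.

import numpy as np

# Illustrative shapes only; real values come from args in the examples above.
obs_shape, n_actions, n_agents = 30, 6, 3
agent_num, last_action_idx = 1, 2

obs = np.random.rand(obs_shape).astype(np.float32)

# One-hot previous action (appended when args.last_action is True).
last_action = np.zeros(n_actions, dtype=np.float32)
last_action[last_action_idx] = 1.0

# One-hot agent id (appended when args.reuse_network is True so a single
# shared network can tell the agents apart).
agent_id = np.zeros(n_agents, dtype=np.float32)
agent_id[agent_num] = 1.0

inputs = np.hstack((obs, last_action, agent_id))
assert inputs.shape[0] == obs_shape + n_actions + n_agents  # = input_shape
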
Example #2
 def __init__(self, args):
     self.n_actions = args.n_actions
     self.n_agents = args.n_agents
     self.state_shape = args.state_shape
     self.obs_shape = args.obs_shape
     if args.alg == 'vdn':
         self.policy = VDN(args)
     elif args.alg == 'qmix':
         self.policy = QMIX(args)
     elif args.alg == 'coma':
         self.policy = COMA(args)
     elif args.alg == 'qtran_alt':
         self.policy = QtranAlt(args)
     elif args.alg == 'qtran_base':
         self.policy = QtranBase(args)
     elif args.alg == 'maven':
         self.policy = MAVEN(args)
     elif args.alg == 'central_v':
         self.policy = CentralV(args)
     elif args.alg == 'reinforce':
         self.policy = Reinforce(args)
     else:
         raise Exception("No such algorithm")
     self.args = args
     print('Init Agents')
Example #3
 def __init__(self, args):
     self.n_actions = args.n_actions
     self.n_agents = args.n_agents
     self.state_shape = args.state_shape
     self.obs_shape = args.obs_shape
     if args.alg == 'vdn':
         from policy.vdn import VDN
         self.policy = VDN(args)
     elif args.alg == 'iql':
         from policy.iql import IQL
         self.policy = IQL(args)
     elif args.alg == 'qmix':
         from policy.qmix import QMIX
         self.policy = QMIX(args)
     elif args.alg == 'coma':
         from policy.coma import COMA
         self.policy = COMA(args)
     elif args.alg == 'qtran_alt':
         from policy.qtran_alt import QtranAlt
         self.policy = QtranAlt(args)
     elif args.alg == 'qtran_base':
         from policy.qtran_base import QtranBase
         self.policy = QtranBase(args)
     elif args.alg == 'maven':
         from policy.maven import MAVEN
         self.policy = MAVEN(args)
     elif args.alg == 'central_v':
         from policy.central_v import CentralV
         self.policy = CentralV(args)
     elif args.alg == 'reinforce':
         from policy.reinforce import Reinforce
         self.policy = Reinforce(args)
     else:
         raise Exception("No such algorithm")
     self.args = args
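
Note: Example #3 imports each policy lazily inside its own branch. A hypothetical alternative, sketched below, keeps the same module paths in a lookup table and resolves them with importlib; the POLICY_REGISTRY table and build_policy helper are illustrative assumptions, not part of the original code.

import importlib

# Module paths follow Example #3; the registry itself is an assumption.
POLICY_REGISTRY = {
    'vdn': ('policy.vdn', 'VDN'),
    'iql': ('policy.iql', 'IQL'),
    'qmix': ('policy.qmix', 'QMIX'),
    'coma': ('policy.coma', 'COMA'),
    'qtran_alt': ('policy.qtran_alt', 'QtranAlt'),
    'qtran_base': ('policy.qtran_base', 'QtranBase'),
    'maven': ('policy.maven', 'MAVEN'),
    'central_v': ('policy.central_v', 'CentralV'),
    'reinforce': ('policy.reinforce', 'Reinforce'),
}

def build_policy(args):
    if args.alg not in POLICY_REGISTRY:
        raise Exception("No such algorithm")
    module_name, class_name = POLICY_REGISTRY[args.alg]
    module = importlib.import_module(module_name)  # imported lazily, as in Example #3
    return getattr(module, class_name)(args)
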
Example #4
 def __init__(self, args):
     self.n_actions = args.n_actions
     self.n_agents = args.n_agents
     self.state_shape = args.state_shape
     self.obs_shape = args.obs_shape
     if args.alg == 'vdn':
         self.policy = VDN(args)
     else:
         self.policy = QMIX(args)
     self.args = args
Example #5
    def __init__(self, args):
        self.n_actions = args.n_actions
        self.n_agents = args.n_agents
        self.state_shape = args.state_shape
        self.obs_shape = args.obs_shape
        input_shape = self.obs_shape
        # Determine the RNN input dimension from the args.
        if args.last_action:
            input_shape += self.n_actions
        if args.reuse_network:
            input_shape += self.n_agents
        if args.alg == 'vdn':
            self.policy = VDN(args)
        elif args.alg == 'qmix':
            # self.policy = QMIX(args)
            self.policy = rnn_policy(input_shape, args)
            if args.cuda:
                self.policy.cuda()

        self.args = args
        print('Init Agents')
Example #6
 def __init__(self, args):
     self.n_actions = args.n_actions
     self.n_agents = args.n_agents * 2
     self.state_shape = args.state_shape
     self.obs_shape = args.obs_shape
     self.idact_shape = args.id_dim + args.n_actions
     self.search_actions = np.eye(args.n_actions)
     self.search_ids = np.zeros(self.n_agents)
     if args.alg == 'vdn':
         self.policy = VDN(args)
     elif args.alg == 'qmix':
         self.policy = QMIX(args)
     elif args.alg == 'ours':
         self.policy = OURS(args)
     elif args.alg == 'coma':
         self.policy = COMA(args)
     elif args.alg == 'qtran_alt':
         self.policy = QtranAlt(args)
     elif args.alg == 'qtran_base':
         self.policy = QtranBase(args)
     elif args.alg == 'maven':
         self.policy = MAVEN(args)
     elif args.alg == 'central_v':
         self.policy = CentralV(args)
     elif args.alg == 'reinforce':
         self.policy = Reinforce(args)
     else:
         raise Exception("No such algorithm")
     if args.use_fixed_model:
         args_goal_a = get_common_args()
         args_goal_a.load_model = True
         args_goal_a = get_mixer_args(args_goal_a)
         args_goal_a.learn = False
         args_goal_a.epsilon = 0  # 1
         args_goal_a.min_epsilon = 0
         args_goal_a.map = 'battle'
         args_goal_a.n_actions = args.n_actions
         args_goal_a.episode_limit = args.episode_limit
         args_goal_a.n_agents = args.n_agents
         args_goal_a.state_shape = args.state_shape
         args_goal_a.feature_shape = args.feature_shape
         args_goal_a.view_shape = args.view_shape
         args_goal_a.obs_shape = args.obs_shape
         args_goal_a.real_view_shape = args.real_view_shape
         args_goal_a.load_num = args.load_num
         args_goal_a.use_ja = False
         args_goal_a.mlp_hidden_dim = [512, 512]
         self.fixed_policy = VDN_F(args_goal_a)
     self.args = args
     print('Init Agents')
Example #7
 def __init__(self, args):
     self.n_actions = args.n_actions
     self.n_agents = args.n_agents
     self.state_shape = args.state_shape
     self.obs_shape = args.obs_shape
     if args.alg == 'vdn':
         self.policy = VDN(args)
     elif args.alg == 'qmix':
         self.policy = QMIX(args)
     elif args.alg == 'coma':
         self.policy = COMA(args)
     elif args.alg == 'qtran_alt':
         self.policy = QtranAlt(args)
     elif args.alg == 'qtran_base':
         self.policy = QtranBase(args)
     else:
         raise Exception("No such algorithm")
     self.args = args
Example #8
# Assumed imports for this snippet (only the numpy / torch names used below; the
# policy classes such as VDN and rnn_policy come from the surrounding repository).
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical


class Agents:
    def __init__(self, args):
        self.n_actions = args.n_actions
        self.n_agents = args.n_agents
        self.state_shape = args.state_shape
        self.obs_shape = args.obs_shape
        input_shape = self.obs_shape
        # Determine the RNN input dimension from the args.
        if args.last_action:
            input_shape += self.n_actions
        if args.reuse_network:
            input_shape += self.n_agents
        if args.alg == 'vdn':
            self.policy = VDN(args)
        elif args.alg == 'qmix':
            # self.policy = QMIX(args)
            self.policy = rnn_policy(input_shape, args)
            if args.cuda:
                self.policy.cuda()

        self.args = args
        print('Init Agents')

    def init_hidden(self, episode_num):
        # Initialize an eval_hidden (and optionally a target_hidden) for every agent in every episode.
        self.policy_eval_hidden = torch.zeros(
            (episode_num, self.n_agents, self.args.rnn_hidden_dim))
        # self.target_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim))

    def choose_action(self,
                      obs,
                      last_action,
                      agent_num,
                      avail_actions,
                      epsilon,
                      maven_z=None,
                      evaluate=False):
        inputs = obs.copy()
        avail_actions_ind = np.nonzero(avail_actions)[0]  # indices of actions that can be chosen

        # transform agent_num to onehot vector
        agent_id = np.zeros(self.n_agents)
        agent_id[agent_num] = 1.

        if self.args.last_action:
            inputs = np.hstack((inputs, last_action))
        if self.args.reuse_network:
            inputs = np.hstack((inputs, agent_id))
        hidden_state = self.policy_eval_hidden[:, agent_num, :]

        # add a batch dimension, e.g. (42,) -> (1, 42)
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        avail_actions = torch.tensor(avail_actions,
                                     dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda()
            hidden_state = hidden_state.cuda()

        # Get the Q-values / logits for all actions of this agent.
        if self.args.alg == 'maven':
            maven_z = torch.tensor(maven_z, dtype=torch.float32).unsqueeze(0)
            if self.args.cuda:
                maven_z = maven_z.cuda()
            agent_outs, self.policy.eval_hidden[:, agent_num, :] = \
                self.policy.eval_rnn(inputs, hidden_state, maven_z)
        else:
            # q_value, self.policy.eval_hidden[:, agent_num, :] = self.policy.eval_rnn(inputs, hidden_state)
            # Debugging hook: pause briefly if the hidden state contains NaNs.
            if torch.isnan(hidden_state).any():
                import time
                time.sleep(1)
            agent_outs, self.policy_eval_hidden[:, agent_num, :] = self.policy(
                inputs, hidden_state)
            # agent_outs = torch.tanh(agent_outs)  # TODO

        # Choose an action from the agent outputs.
        if self.args.alg in ('coma', 'central_v', 'reinforce'):
            action = self._choose_action_from_softmax(agent_outs.cpu(),
                                                      avail_actions, epsilon,
                                                      evaluate)
        else:  # qmix / vdn: mask out unavailable actions
            agent_outs[avail_actions == 0.0] = -float("inf")

            if evaluate:
                # Temperature-scaled softmax over the masked outputs, then take the greedy action.
                agent_outs = F.softmax(agent_outs / self.args.temp, dim=1)
                action = agent_outs.max(dim=1, keepdim=False)[1]  # greedy action index
                # action = np.random.choice(np.arange(agent_outs.shape[-1]), 1, p=agent_outs.squeeze(0).detach().cpu().numpy())  # sample an action index from the distribution
            else:
                # Debugging hook: pause briefly if the outputs contain NaNs.
                if torch.isnan(agent_outs).any():
                    import time
                    time.sleep(1)
                # Temperature-scaled softmax, then sample an action index from the distribution.
                agent_outs = F.softmax(agent_outs / self.args.temp, dim=1)
                action = np.random.choice(
                    np.arange(agent_outs.shape[-1]),
                    1,
                    p=agent_outs.squeeze(0).detach().cpu().numpy())

            # while (int(action) not in avail_actions_ind):
            #     print(f'agent_outs:{agent_outs}')
            #     print(f'{action} not in {avail_actions_ind}')
            #     # action = agent_outs.max(dim=1, keepdim=False)[1]
            #     action = np.random.choice(avail_actions_ind)

        return action

    def _choose_action_from_softmax(self,
                                    inputs,
                                    avail_actions,
                                    epsilon,
                                    evaluate=False):
        """
        :param inputs: Q-values (network outputs) for all actions
        """
        action_num = avail_actions.sum(dim=1, keepdim=True).float().repeat(
            1, avail_actions.shape[-1])  # num of avail_actions
        # First convert the actor network outputs into a probability distribution via softmax.
        prob = torch.nn.functional.softmax(inputs, dim=-1)
        # Mix in epsilon noise for exploration.
        prob = ((1 - epsilon) * prob +
                torch.ones_like(prob) * epsilon / action_num)
        prob[avail_actions == 0] = 0.0  # unavailable actions get zero probability
        """
        不能执行的动作概率为0之后,prob中的概率和不为1,这里不需要进行正则化,因为torch.distributions.Categorical
        会将其进行正则化。要注意在训练的过程中没有用到Categorical,所以训练时取执行的动作对应的概率需要再正则化。
        """

        if epsilon == 0 and evaluate:
            action = torch.argmax(prob)
        else:
            action = Categorical(prob).sample().long()
        return action

    def _get_max_episode_len(self, batch):
        terminated = batch['terminated']
        episode_num = terminated.shape[0]
        max_episode_len = 0
        for episode_idx in range(episode_num):
            for transition_idx in range(self.args.episode_limit):
                if terminated[episode_idx, transition_idx, 0] == 1:
                    if transition_idx + 1 >= max_episode_len:
                        max_episode_len = transition_idx + 1
                    break
        return max_episode_len

    def train(self,
              batch,
              train_step,
              epsilon=None):  # coma needs epsilon for training

        # Different episodes have different lengths, so get the maximum length in the batch.
        max_episode_len = self._get_max_episode_len(batch)
        for key in batch.keys():
            if key != 'z':
                batch[key] = batch[key][:, :max_episode_len]
        self.policy.learn(batch, max_episode_len, train_step, epsilon)
        if train_step > 0 and train_step % self.args.save_cycle == 0:
            self.policy.save_model(train_step)
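
Note: _choose_action_from_softmax in Example #8 mixes the softmax policy with uniform epsilon noise over the available actions, zeroes the unavailable ones, and relies on torch.distributions.Categorical to re-normalize the masked distribution. A standalone sketch of that selection step, with toy numbers that are illustrative assumptions only:

import torch
from torch.distributions import Categorical

# Toy logits for 6 actions of one agent; all values are illustrative.
logits = torch.tensor([[1.2, 0.3, -0.5, 2.0, 0.1, -1.0]])
avail_actions = torch.tensor([[1., 1., 0., 1., 1., 0.]])
epsilon = 0.1

# Number of available actions, broadcast across the action dimension.
action_num = avail_actions.sum(dim=1, keepdim=True).repeat(1, avail_actions.shape[-1])

prob = torch.nn.functional.softmax(logits, dim=-1)
prob = (1 - epsilon) * prob + torch.ones_like(prob) * epsilon / action_num
prob[avail_actions == 0] = 0.0  # unavailable actions get zero probability

# Categorical normalizes prob internally, so the masked distribution is valid.
action = Categorical(prob).sample().long()
print(int(action))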