コード例 #1
0
ファイル: central_v.py プロジェクト: rainwangphy/StarCraft
    def __init__(self, args):
        self.n_actions = args.n_actions
        self.n_agents = args.n_agents
        self.state_shape = args.state_shape
        self.obs_shape = args.obs_shape
        actor_input_shape = self.obs_shape  # actor网络输入的维度,和vdn、qmix的rnn输入维度一样,使用同一个网络结构
        critic_input_shape = self.state_shape  # critic网络输入的维度
        # 根据参数决定RNN的输入维度
        if args.last_action:
            actor_input_shape += self.n_actions
        if args.reuse_network:
            actor_input_shape += self.n_agents
        self.args = args

        # 神经网络
        # 每个agent选动作的网络,输出当前agent所有动作对应的概率,用该概率选动作的时候还需要用softmax再运算一次。
        if self.args.alg == 'central_v':
            self.eval_rnn = RNN(actor_input_shape, args)
            print('Init alg central_v')
        elif self.args.alg == 'central_v+commnet':
            self.eval_rnn = CommNet(actor_input_shape, args)
            print('Init alg central_v+commnet')
        elif self.args.alg == 'central_v+g2anet':
            print('Init alg central_v+g2anet')
            self.eval_rnn = G2ANet(actor_input_shape, args)
        else:
            raise Exception("No such algorithm")

        self.eval_critic = Critic(critic_input_shape, self.args)
        self.target_critic = Critic(critic_input_shape, self.args)

        if self.args.cuda:
            self.eval_rnn.cuda()
            self.eval_critic.cuda()
            self.target_critic.cuda()

        self.model_dir = args.model_dir + '/' + args.alg + '/' + args.map
        # 如果存在模型则加载模型
        if self.args.load_model:
            if os.path.exists(self.model_dir + '/rnn_params.pkl'):
                path_rnn = self.model_dir + '/rnn_params.pkl'
                path_critic = self.model_dir + '/critic_params.pkl'
                map_location = 'cuda:0' if self.args.cuda else 'cpu'
                self.eval_rnn.load_state_dict(
                    torch.load(path_rnn, map_location=map_location))
                self.eval_critic.load_state_dict(
                    torch.load(path_critic, map_location=map_location))
                print('Successfully load the model: {} and {}'.format(
                    path_rnn, path_critic))
            else:
                raise Exception("No model!")

        # 让target_net和eval_net的网络参数相同
        self.target_critic.load_state_dict(self.eval_critic.state_dict())

        self.rnn_parameters = list(self.eval_rnn.parameters())
        self.critic_parameters = list(self.eval_critic.parameters())

        if args.optimizer == "RMS":
            self.critic_optimizer = torch.optim.RMSprop(self.critic_parameters,
                                                        lr=args.lr_critic)
            self.rnn_optimizer = torch.optim.RMSprop(self.rnn_parameters,
                                                     lr=args.lr_actor)
        self.args = args

        # 执行过程中,要为每个agent都维护一个eval_hidden
        # 学习过程中,要为每个episode的每个agent都维护一个eval_hidden
        self.eval_hidden = None
コード例 #2
0
ファイル: central_v.py プロジェクト: rainwangphy/StarCraft
class CentralV:
    def __init__(self, args):
        self.n_actions = args.n_actions
        self.n_agents = args.n_agents
        self.state_shape = args.state_shape
        self.obs_shape = args.obs_shape
        actor_input_shape = self.obs_shape  # actor网络输入的维度,和vdn、qmix的rnn输入维度一样,使用同一个网络结构
        critic_input_shape = self.state_shape  # critic网络输入的维度
        # 根据参数决定RNN的输入维度
        if args.last_action:
            actor_input_shape += self.n_actions
        if args.reuse_network:
            actor_input_shape += self.n_agents
        self.args = args

        # 神经网络
        # 每个agent选动作的网络,输出当前agent所有动作对应的概率,用该概率选动作的时候还需要用softmax再运算一次。
        if self.args.alg == 'central_v':
            self.eval_rnn = RNN(actor_input_shape, args)
            print('Init alg central_v')
        elif self.args.alg == 'central_v+commnet':
            self.eval_rnn = CommNet(actor_input_shape, args)
            print('Init alg central_v+commnet')
        elif self.args.alg == 'central_v+g2anet':
            print('Init alg central_v+g2anet')
            self.eval_rnn = G2ANet(actor_input_shape, args)
        else:
            raise Exception("No such algorithm")

        self.eval_critic = Critic(critic_input_shape, self.args)
        self.target_critic = Critic(critic_input_shape, self.args)

        if self.args.cuda:
            self.eval_rnn.cuda()
            self.eval_critic.cuda()
            self.target_critic.cuda()

        self.model_dir = args.model_dir + '/' + args.alg + '/' + args.map
        # 如果存在模型则加载模型
        if self.args.load_model:
            if os.path.exists(self.model_dir + '/rnn_params.pkl'):
                path_rnn = self.model_dir + '/rnn_params.pkl'
                path_critic = self.model_dir + '/critic_params.pkl'
                map_location = 'cuda:0' if self.args.cuda else 'cpu'
                self.eval_rnn.load_state_dict(
                    torch.load(path_rnn, map_location=map_location))
                self.eval_critic.load_state_dict(
                    torch.load(path_critic, map_location=map_location))
                print('Successfully load the model: {} and {}'.format(
                    path_rnn, path_critic))
            else:
                raise Exception("No model!")

        # 让target_net和eval_net的网络参数相同
        self.target_critic.load_state_dict(self.eval_critic.state_dict())

        self.rnn_parameters = list(self.eval_rnn.parameters())
        self.critic_parameters = list(self.eval_critic.parameters())

        if args.optimizer == "RMS":
            self.critic_optimizer = torch.optim.RMSprop(self.critic_parameters,
                                                        lr=args.lr_critic)
            self.rnn_optimizer = torch.optim.RMSprop(self.rnn_parameters,
                                                     lr=args.lr_actor)
        self.args = args

        # 执行过程中,要为每个agent都维护一个eval_hidden
        # 学习过程中,要为每个episode的每个agent都维护一个eval_hidden
        self.eval_hidden = None

    def learn(self, batch, max_episode_len, train_step,
              epsilon):  # train_step表示是第几次学习,用来控制更新target_net网络的参数
        episode_num = batch['o'].shape[0]
        self.init_hidden(episode_num)
        for key in batch.keys():  # 把batch里的数据转化成tensor
            if key == 'u':
                batch[key] = torch.tensor(batch[key], dtype=torch.long)
            else:
                batch[key] = torch.tensor(batch[key], dtype=torch.float32)
        u, r, avail_u, terminated = batch['u'], batch['r'], batch[
            'avail_u'], batch['terminated']
        mask = (1 - batch["padded"].float()).repeat(
            1, 1, self.n_agents)  # 用来把那些填充的经验的TD-error置0,从而不让它们影响到学习
        if self.args.cuda:
            u = u.cuda()
            mask = mask.cuda()

        # 训练critic网络,并且得到每条经验的td_error, (episode_num, max_episode_len, 1)
        td_error = self._train_critic(batch, max_episode_len, train_step)
        td_error = td_error.repeat(1, 1, self.n_agents)

        # 每个agent的所有动作的概率 (episode_num, max_episode_len, n_agents,n_actions)
        action_prob = self._get_action_prob(batch, max_episode_len, epsilon)

        # 每个agent的选择的动作对应的概率 (episode_num, max_episode_len, n_agents)
        pi_taken = torch.gather(action_prob, dim=3, index=u).squeeze(3)
        pi_taken[mask ==
                 0] = 1.0  # 因为要取对数,对于那些填充的经验,所有概率都为0,取了log就是负无穷了,所以让它们变成1
        log_pi_taken = torch.log(pi_taken)

        # loss函数,(episode_num, max_episode_len, n_agents)
        loss = -((td_error.detach() * log_pi_taken) * mask).sum() / mask.sum()
        self.rnn_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.rnn_parameters,
                                       self.args.grad_norm_clip)
        self.rnn_optimizer.step()
        # print('Actor loss is', loss)

    def _get_v_values(self, batch, max_episode_len):
        v_evals, v_targets = [], []
        for transition_idx in range(max_episode_len):
            inputs, inputs_next = batch['s'][:, transition_idx], batch[
                's_next'][:, transition_idx],
            if self.args.cuda:
                inputs = inputs.cuda()
                inputs_next = inputs_next.cuda()
            # 神经网络输入的是(episode_num , state_shape)二维数据,得到的是(episode_num , 1)二维数据
            v_eval = self.eval_critic(inputs)
            v_target = self.target_critic(inputs_next)

            v_evals.append(v_eval)
            v_targets.append(v_target)
        v_evals = torch.stack(v_evals,
                              dim=1)  # (episode_num, max_episode_len, 1)
        v_targets = torch.stack(v_targets, dim=1)
        return v_evals, v_targets

    def _get_actor_inputs(self, batch, transition_idx):
        # 取出所有episode上该transition_idx的经验,u_onehot要取出所有,因为要用到上一条
        obs, u_onehot = batch['o'][:, transition_idx], batch['u_onehot'][:]
        episode_num = obs.shape[0]
        inputs = []
        inputs.append(obs)
        # 给inputs添加上一个动作、agent编号

        if self.args.last_action:
            if transition_idx == 0:  # 如果是第一条经验,就让前一个动作为0向量
                inputs.append(torch.zeros_like(u_onehot[:, transition_idx]))
            else:
                inputs.append(u_onehot[:, transition_idx - 1])
        if self.args.reuse_network:
            # 因为当前的inputs三维的数据,每一维分别代表(episode编号,agent编号,inputs维度),直接在dim_1上添加对应的向量
            # 即可,比如给agent_0后面加(1, 0, 0, 0, 0),表示5个agent中的0号。而agent_0的数据正好在第0行,那么需要加的
            # agent编号恰好就是一个单位矩阵,即对角线为1,其余为0
            inputs.append(
                torch.eye(self.args.n_agents).unsqueeze(0).expand(
                    episode_num, -1, -1))
        # 要把inputs中的三个拼起来,并且要把episode_num个episode、self.args.n_agents个agent的数据拼成40条(40,96)的数据,
        # 因为这里所有agent共享一个神经网络,每条数据中带上了自己的编号,所以还是自己的数据
        inputs = torch.cat(
            [x.reshape(episode_num * self.args.n_agents, -1) for x in inputs],
            dim=1)
        return inputs

    def _get_action_prob(self, batch, max_episode_len, epsilon):
        episode_num = batch['o'].shape[0]
        avail_actions = batch[
            'avail_u']  # coma不用target_actor,所以不需要最后一个obs的下一个可执行动作
        action_prob = []
        for transition_idx in range(max_episode_len):
            inputs = self._get_actor_inputs(
                batch, transition_idx)  # 给obs加last_action、agent_id
            if self.args.cuda:
                inputs = inputs.cuda()
                self.eval_hidden = self.eval_hidden.cuda()
            outputs, self.eval_hidden = self.eval_rnn(
                inputs, self.eval_hidden
            )  # inputs维度为(40,96),得到的q_eval维度为(40,n_actions)
            # 把q_eval维度重新变回(8, 5,n_actions)
            outputs = outputs.view(episode_num, self.n_agents, -1)
            prob = torch.nn.functional.softmax(outputs, dim=-1)
            action_prob.append(prob)
        # 得的action_prob是一个列表,列表里装着max_episode_len个数组,数组的的维度是(episode个数, n_agents,n_actions)
        # 把该列表转化成(episode个数, max_episode_len, n_agents,n_actions)的数组
        action_prob = torch.stack(action_prob, dim=1).cpu()

        action_num = avail_actions.sum(dim=-1, keepdim=True).float().repeat(
            1, 1, 1, avail_actions.shape[-1])  # 可以选择的动作的个数
        action_prob = ((1 - epsilon) * action_prob +
                       torch.ones_like(action_prob) * epsilon / action_num)
        action_prob[avail_actions == 0] = 0.0  # 不能执行的动作概率为0

        # 因为上面把不能执行的动作概率置为0,所以概率和不为1了,这里要重新正则化一下。执行过程中Categorical会自己正则化。
        action_prob = action_prob / action_prob.sum(dim=-1, keepdim=True)
        # 因为有许多经验是填充的,它们的avail_actions都填充的是0,所以该经验上所有动作的概率都为0,在正则化的时候会得到nan。
        # 因此需要再一次将该经验对应的概率置为0
        action_prob[avail_actions == 0] = 0.0
        if self.args.cuda:
            action_prob = action_prob.cuda()
        return action_prob

    def init_hidden(self, episode_num):
        # 为每个episode中的每个agent都初始化一个eval_hidden
        self.eval_hidden = torch.zeros(
            (episode_num, self.n_agents, self.args.rnn_hidden_dim))

    def _train_critic(self, batch, max_episode_len, train_step):
        r, terminated = batch['r'], batch['terminated']
        mask = (1 - batch["padded"].float()).repeat(
            1, 1, self.n_agents)  # 用来把那些填充的经验的TD-error置0,从而不让它们影响到学习
        if self.args.cuda:
            mask = mask.cuda()
            r = r.cuda()
            terminated = terminated.cuda()
        v_evals, v_next_target = self._get_v_values(batch, max_episode_len)

        targets = r + self.args.gamma * v_next_target * (1 - terminated)
        td_error = targets.detach() - v_evals
        masked_td_error = mask * td_error  # 抹掉填充的经验的td_error

        # 不能直接用mean,因为还有许多经验是没用的,所以要求和再比真实的经验数,才是真正的均值
        loss = (masked_td_error**2).sum() / mask.sum()
        # print('Critic Loss is ', loss)
        self.critic_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_parameters,
                                       self.args.grad_norm_clip)
        self.critic_optimizer.step()
        if train_step > 0 and train_step % self.args.target_update_cycle == 0:
            self.target_critic.load_state_dict(self.eval_critic.state_dict())
        return td_error

    def save_model(self, train_step):
        num = str(train_step // self.args.save_cycle)
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        torch.save(self.eval_critic.state_dict(),
                   self.model_dir + '/' + num + '_critic_params.pkl')
        torch.save(self.eval_rnn.state_dict(),
                   self.model_dir + '/' + num + '_rnn_params.pkl')