# Imports assumed for this example (UnityEnv, Agents and UOProcess are
# project-specific classes provided by the surrounding repository).
import datetime
import logging
import os
import random
from collections import deque
from pprint import pprint

import numpy as np
import torch


class Trainer:
    def __init__(self, params):

        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        agent_params = params['agent_params']

        self.__num_of_agents = self.env.observation_space.shape[0]
        state_size = self.env.observation_space.shape[1]
        action_size = self.env.action_space_size
        agent_params['num_of_agents'] = self.__num_of_agents
        agent_params['state_size'] = state_size
        agent_params['action_size'] = action_size
        self.agents = Agents(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']
        self.t_max = trainer_params['t_max']

        self.exploration_noise = UOProcess()

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        self.sigma = 0.5

        print("MADDPG agent.")
        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):

        logging.info("Training:")
        reward_window = deque(maxlen=100)

        for episode_i in range(1, num_of_episodes + 1):

            states = self.env.reset()
            self.agents.reset(self.sigma)
            scores = np.zeros(self.env.observation_space.shape[0])
            total_loss = 0

            self.sigma *= 0.99

            counter = 0
            for t in range(self.t_max):

                actions = self.agents.choose_action(states)
                next_states, rewards, dones, _ = self.env.step(actions)
                self.agents.step(states, actions, rewards, next_states, dones)
                states = next_states

                # DEBUG
                # logging.info("episode: {}, rewards: {}, counter: {}, actions: {}".
                #              format(episode_i, rewards, counter, actions))

                total_loss += self.agents.agent_loss
                scores += rewards
                counter += 1
                if any(dones):
                    break

            reward_window.append(np.max(scores))
            self.avg_rewards.append(np.mean(np.array(reward_window)))
            print(
                '\rEpisode {}\tCurrent Score: {:.4f}\tAverage Score: {:.4f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.max(scores), np.mean(reward_window),
                        total_loss, self.agents.learning_rate_actor,
                        self.agents.learning_rate_critic),
                end="")

            logging.info(
                'Episode {}\tCurrent Score: {:.4f}\tAverage Score (over episodes): {:.4f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actors): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.max(scores), np.mean(reward_window),
                        total_loss, self.agents.learning_rate_actor,
                        self.agents.learning_rate_critic))

            self.agents.learning_rate_actor *= self.learning_rate_decay
            self.agents.learning_rate_critic *= self.learning_rate_decay
            self.agents.set_learning_rate(self.agents.learning_rate_actor,
                                          self.agents.learning_rate_critic)

            if episode_i % 100 == 0:

                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))

                if avg_reward >= 1.0:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(episode_i - 100, avg_reward))
                    if not os.path.exists(self.model_path):
                        os.makedirs(self.model_path)

                    t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
                    torch.save(
                        self.agents.get_actor()[0].state_dict(),
                        self.model_path + 'checkpoint_actor1_{}.pth'.format(t))
                    torch.save(
                        self.agents.get_actor()[1].state_dict(),
                        self.model_path + 'checkpoint_actor2_{}.pth'.format(t))
                    torch.save(
                        self.agents.get_critic().state_dict(),
                        self.model_path + 'checkpoint_critic_{}.pth'.format(t))
                    np.array(self.avg_rewards).dump(
                        self.results_path + 'average_rewards_{}.dat'.format(t))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        # reward_matrix.dump(self.results_path + 'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_{}.dat'.format(t))

    def test(self,
             checkpoint_actor1_filename,
             checkpoint_actor2_filename,
             checkpoint_critic_filename,
             time_span=10):
        checkpoint_actor1_path = self.model_path + checkpoint_actor1_filename
        checkpoint_actor2_path = self.model_path + checkpoint_actor2_filename
        checkpoint_critic_path = self.model_path + checkpoint_critic_filename
        self.agents.get_actor()[0].load_state_dict(
            torch.load(checkpoint_actor1_path))
        self.agents.get_actor()[1].load_state_dict(
            torch.load(checkpoint_actor2_path))
        self.agents.get_critic().load_state_dict(
            torch.load(checkpoint_critic_path))
        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            #done = False

            while True:
                action = self.agents.choose_action(state, 'test')
                state, reward, done, _ = self.env.step(action)
                self.score += np.max(reward)
                if any(done):
                    break

            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
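
# Usage sketch (assumption, not part of the original example): a minimal
# params dictionary and training call, inferred from the keys this Trainer
# reads in __init__. The concrete paths and hyper-parameter values below are
# hypothetical; 'env_params' and 'agent_params' must additionally contain
# whatever UnityEnv and Agents expect.
if __name__ == '__main__':
    params = {
        'general_params': {'seed': 0},
        'env_params': {},
        'agent_params': {},
        'trainer_params': {
            'learning_rate_decay': 0.999,
            'results_path': './results/',
            'model_path': './models/',
            't_max': 1000,
        },
    }
    trainer = Trainer(params=params)
    trainer.train(num_of_episodes=2000)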
Example #2

# Imports assumed for this example (Agents and ReplayBuffer are
# project-specific classes from the surrounding repository).
import os
import random
from datetime import datetime, timedelta, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
class Runner:
    def __init__(self, env, args, itr, seed):
        # Set the random seed if one is given
        if seed is not None:
            self.setup_seed(seed)
        self.args = args

        # Environment
        self.env = env
        # Process (run) index
        self.pid = itr

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Naming convention:
        episode_reward   - cumulative reward of a single episode,
        episodes_reward  - cumulative rewards of several episodes,
        episodes_rewards - cumulative rewards of several episodes over several evaluations.
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0
        self.time_steps = 0

        # Where results and models are saved; the run index makes it easy to run several instances at once
        alg_dir = self.args.alg + '_' + str(self.args.epsilon_anneal_steps // 10000) + 'w' + '_' + \
                  str(self.args.target_update_period)
        self.alg_tag = '_' + self.args.optim

        if self.args.her:
            self.alg_tag += str(self.args.her)
            alg_dir += '_her=' + str(self.args.her)

        # self.save_path = self.args.result_dir + '/' + alg_dir + '/' + self.args.map + '/' + itr
        self.save_path = self.args.result_dir + '/' + self.args.map + '/' + alg_dir + '/' + itr
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.args.model_dir = args.model_dir + '/' + args.map + '/' + alg_dir + '/' + itr

        self.agents = Agents(args, itr=itr)
        print('step runner initialized')
        if self.args.her:
            print('Using HER')

    @staticmethod
    def setup_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True

    def generate_episode(self, episode_num, evaluate=False):
        # Prepare to save the evaluation replay
        if self.args.replay_dir != '' and evaluate and episode_num == 0:
            self.env.close()
        # Initialize variables; with HER the goal has to be recorded
        self.env.reset()
        done = False
        info = None
        win = False

        last_action = np.zeros((self.args.n_agents, self.args.n_actions))
        # epsilon decay
        epsilon = 0 if evaluate else self.args.epsilon
        # epsilon decay schedule
        if self.args.epsilon_anneal_scale == 'episode' or \
                (self.args.epsilon_anneal_scale == 'itr' and episode_num == 0):
            epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon

        # Buffer for the information of one episode
        episode_buffer = None
        if not evaluate:
            episode_buffer = {
                'o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                's':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'a':
                np.zeros([self.args.episode_limit, self.args.n_agents, 1]),
                'onehot_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'r':
                np.zeros([self.args.episode_limit, 1]),
                'next_o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                'next_s':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'next_avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'done':
                np.ones([self.args.episode_limit, 1]),
                'padded':
                np.ones([self.args.episode_limit, 1])
            }
        # Run one episode
        states, former_states = [], []
        obs = self.env.get_obs()
        if self.args.her:
            obs = np.concatenate((obs, self.env.goal), axis=1)
        state = self.env.get_state()
        if self.args.her:
            states.append(self.env.state)
            former_states.append(self.env.former_states)
        avail_actions = []
        self.agents.policy.init_hidden(1)
        for agent_id in range(self.args.n_agents):
            avail_action = self.env.get_avail_agent_actions(agent_id)
            avail_actions.append(avail_action)

        episode_reward = 0
        for step in range(self.args.episode_limit):
            if done:
                break
            else:
                actions, onehot_actions = [], []
                for agent_id in range(self.args.n_agents):
                    # avail_action = self.env.get_avail_agent_actions(agent_id)
                    action, _ = self.agents.choose_action(
                        obs[agent_id], last_action[agent_id], agent_id,
                        avail_actions[agent_id], epsilon, evaluate)
                    # One-hot encoding of the chosen action
                    onehot_action = np.zeros(self.args.n_actions)
                    onehot_action[action] = 1
                    onehot_actions.append(onehot_action)
                    # Add to the joint action
                    actions.append(action)
                    # avail_actions.append(avail_action)
                    # Remember this action as the last action
                    last_action[agent_id] = onehot_action
                # Execute the joint action in the environment
                reward, done, info = self.env.step(actions)
                # Count the time step
                if not evaluate:
                    self.time_steps += 1
                # Get the information after the transition
                if not done:
                    next_obs = self.env.get_obs()
                    if self.args.her:
                        next_obs = np.concatenate((next_obs, self.env.goal),
                                                  axis=1)
                    next_state = self.env.get_state()
                    if self.args.her:
                        states.append(self.env.state)
                        former_states.append(self.env.former_states)
                else:
                    next_obs = obs
                    next_state = state
                # Collect the next available actions
                next_avail_actions = []
                for agent_id in range(self.args.n_agents):
                    avail_action = self.env.get_avail_agent_actions(agent_id)
                    next_avail_actions.append(avail_action)
                # Store the experience
                if not evaluate:
                    episode_buffer['o'][step] = obs
                    episode_buffer['s'][step] = state
                    episode_buffer['a'][step] = np.reshape(
                        actions, [self.args.n_agents, 1])
                    episode_buffer['onehot_a'][step] = onehot_actions
                    episode_buffer['avail_a'][step] = avail_actions
                    episode_buffer['r'][step] = [reward]
                    episode_buffer['next_o'][step] = next_obs
                    episode_buffer['next_s'][step] = next_state
                    episode_buffer['next_avail_a'][step] = next_avail_actions
                    episode_buffer['done'][step] = [done]
                    episode_buffer['padded'][step] = [0.]

                # Update variables
                episode_reward += reward
                obs = next_obs
                state = next_state
                avail_actions = next_avail_actions
                if self.args.epsilon_anneal_scale == 'step':
                    epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon

        # During training, keep the new epsilon
        if not evaluate:
            self.args.epsilon = epsilon
        # Get the battle result
        if info is not None and 'battle_won' in info:
            win = True if done and info['battle_won'] else False
        if evaluate and episode_num == self.args.evaluate_num - 1 and self.args.replay_dir != '':
            self.env.save_replay()
            self.env.close()
        if not evaluate and self.args.her:
            return episode_buffer, states, former_states
        return episode_buffer, episode_reward, win

    def run(self):
        train_steps = 0
        early_stop = 10
        num_eval = 0
        self.max_win_rate = 0
        self.time_steps = 0
        last_test_step = 0
        begin_time = None
        begin_step = None

        # for itr in range(self.args.n_itr):
        while self.time_steps < self.args.max_steps:
            if begin_step is None:
                begin_time = datetime.utcnow().astimezone(
                    timezone(timedelta(hours=8)))
                begin_step = self.time_steps
            # Collect data for n_episodes episodes
            if self.args.her:
                episode_batch, states, former_states = self.generate_episode(0)
                self.her_k(episode_batch, states, former_states)
            else:
                episode_batch, _, _ = self.generate_episode(0)
            for key in episode_batch.keys():
                episode_batch[key] = np.array([episode_batch[key]])
            for e in range(1, self.args.n_episodes):
                if self.args.her:
                    # Collect the extra episode into `episode` (do not
                    # overwrite episode_batch) and augment it with HER.
                    episode, states, former_states = self.generate_episode(e)
                    self.her_k(episode, states, former_states)
                else:
                    episode, _, _ = self.generate_episode(e)

                for key in episode_batch.keys():
                    episode[key] = np.array([episode[key]])
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)

            # Add to the replay buffer
            self.replay_buffer.store(episode_batch)
            # Train  TODO 12.5
            if self.replay_buffer.size < self.args.batch_size * self.args.bs_rate:
                print('replay buffer is not yet batch_size * {} large!'.format(
                    self.args.bs_rate))
                begin_time = None
                begin_step = None
                continue
            for _ in range(self.args.train_steps):
                batch = self.replay_buffer.sample(self.args.batch_size)
                self.agents.train(batch, train_steps)
                train_steps += 1
            # Periodic evaluation
            # if itr % self.args.evaluation_period == 0:
            if (self.time_steps -
                    last_test_step) / self.args.evaluation_steps_period >= 1.0:
                num_eval += 1
                last_test_step = self.time_steps
                print(
                    f'Process {self.pid}: {self.time_steps} steps / {self.args.max_steps} steps'
                )
                # print('power: {}'.format(self.agents.policy.power))
                win_rate, episodes_reward = self.evaluate()
                # Save the evaluation results
                self.evaluate_itr.append(self.time_steps)
                self.win_rates.append(win_rate)
                self.episodes_rewards.append(episodes_reward)
                # Save an extra copy of well-performing models
                if win_rate > self.max_win_rate:
                    self.max_win_rate = win_rate
                    self.agents.policy.save_model(str(win_rate))
                # Do not save every time, to reduce the time spent on I/O
                if num_eval % 50 == 0:
                    self.save_results()
                    self.plot()
                    # Every 50 evaluations, estimate the elapsed and remaining time
                    now = datetime.utcnow().astimezone(
                        timezone(timedelta(hours=8)))
                    elapsed_time = now - begin_time
                    expected_remain_time = (elapsed_time / (self.time_steps - begin_step)) * \
                                           (self.args.max_steps - self.time_steps)
                    expected_end_time = now + expected_remain_time
                    print("预计还需: {}".format(str(expected_remain_time)))
                    print("预计结束时间为: {}".format(
                        expected_end_time.strftime("%Y-%m-%d_%H-%M-%S")))
        # Finally, save everything
        self.save_results()
        self.plot()
        self.env.close()

    def evaluate(self):
        """
        Return the average win rate and the cumulative reward of each test
        episode, which makes it easy to draw shaded-error plots.
        :return:
        """
        win_number = 0
        episodes_reward = []
        for itr in range(self.args.evaluate_num):
            if self.args.didactic:
                episode_reward, win = self.get_eval_qtot()
            else:
                _, episode_reward, win = self.generate_episode(itr,
                                                               evaluate=True)
            episodes_reward.append(episode_reward)
            if win:
                win_number += 1
        return win_number / self.args.evaluate_num, episodes_reward

    def save_results(self):
        """
        Save the data so that results of several algorithms can later be compared in one figure.
        :return:
        """
        # Delete any previously saved .npy files
        for filename in os.listdir(self.save_path):
            if filename.endswith('.npy'):
                os.remove(self.save_path + '/' + filename)
        np.save(self.save_path + '/evaluate_itr.npy', self.evaluate_itr)
        if self.args.didactic and self.args.power is None and 'strapped' in self.args.alg:
            np.save(self.save_path + '/train_steps.npy',
                    self.agents.policy.train_steps)
            np.save(self.save_path + '/differences.npy',
                    self.agents.policy.differences)
        else:
            np.save(self.save_path + '/win_rates.npy', self.win_rates)
        np.save(self.save_path + '/episodes_rewards.npy',
                self.episodes_rewards)

    def plot(self):
        """
        Plot the results periodically.
        :return:
        """
        fig = plt.figure()
        ax1 = fig.add_subplot(211)
        if self.args.didactic and self.args.power is None and 'strapped' in self.args.alg:
            win_x = np.array(self.agents.policy.train_steps)[:,
                                                             None] / 1000000.
            win_y = np.array(self.agents.policy.differences)[:, None]
            plot_win = pd.DataFrame(np.concatenate((win_x, win_y), axis=1),
                                    columns=['T (mil)', self.args.which_diff])
            sns.lineplot(x="T (mil)",
                         y=self.args.which_diff,
                         data=plot_win,
                         ax=ax1)
        else:
            win_x = np.array(self.evaluate_itr)[:, None] / 1000000.
            win_y = np.array(self.win_rates)[:, None]
            plot_win = pd.DataFrame(np.concatenate((win_x, win_y), axis=1),
                                    columns=['T (mil)', 'Test Win'])
            sns.lineplot(x="T (mil)", y="Test Win", data=plot_win, ax=ax1)

        ax2 = fig.add_subplot(212)
        reward_x = np.repeat(self.evaluate_itr,
                             self.args.evaluate_num)[:, None] / 1000000.
        reward_y = np.array(self.episodes_rewards).flatten()[:, None]
        plot_reward = pd.DataFrame(np.concatenate((reward_x, reward_y),
                                                  axis=1),
                                   columns=['T (mil)', 'Median Test Returns'])
        sns.lineplot(x="T (mil)",
                     y="Median Test Returns",
                     data=plot_reward,
                     ax=ax2,
                     ci='sd',
                     estimator=np.median)
        plt.tight_layout()
        # Format the timestamp like 2016-03-20-11_45_39
        # tag = self.args.alg + '-' + time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
        tag = self.args.alg + '_' + str(self.args.target_update_period)
        # if 'averaged' in self.args.alg:
        tag += (self.alg_tag + '_' + datetime.utcnow().astimezone(
            timezone(timedelta(hours=8))).strftime("%Y-%m-%d_%H-%M-%S"))
        # Delete any previously saved figures
        for filename in os.listdir(self.save_path):
            if filename.endswith('.png'):
                os.remove(self.save_path + '/' + filename)
        fig.savefig(self.save_path + "/%s.png" % tag)
        plt.close()

    def get_eval_qtot(self):
        """
        Compute the evaluation Q_tot.
        """
        self.env.reset()

        all_last_action = np.zeros((self.args.n_agents, self.args.n_actions))

        # Start an episode
        all_obs = self.env.get_obs()
        state = self.env.get_state()
        avail_actions = []
        self.agents.policy.init_hidden(1)
        eval_qs = []
        actions = []
        one_hot_actions = []
        hidden_evals = None
        for agent_idx in range(self.args.n_agents):
            obs = all_obs[agent_idx]
            last_action = all_last_action[agent_idx]
            avail_action = self.env.get_avail_agent_actions(agent_idx)
            avail_actions.append(avail_action)

            onehot_agent_idx = np.zeros(self.args.n_agents)
            onehot_agent_idx[agent_idx] = 1.
            if self.args.last_action:
                # Stack horizontally
                obs = np.hstack((obs, last_action))
            if self.args.reuse_network:
                obs = np.hstack((obs, onehot_agent_idx))
            hidden_state = self.agents.policy.eval_hidden[:, agent_idx, :]
            # Add a batch dimension
            obs = torch.Tensor(obs).unsqueeze(0)
            # Move to GPU if enabled
            if self.args.cuda:
                obs = obs.cuda()
                hidden_state = hidden_state.cuda()
            # Get Q(s, a)
            qsa, hidden_eval = self.agents.policy.eval_rnn(obs, hidden_state)
            qsa[avail_action == 0.0] = -float("inf")

            eval_qs.append(torch.max(qsa))

            action = torch.argmax(qsa)
            actions.append(action)

            onehot_action = np.zeros(self.args.n_actions)
            onehot_action[action] = 1
            one_hot_actions.append(onehot_action)
            if hidden_evals is None:
                hidden_evals = hidden_eval
            else:
                hidden_evals = torch.cat([hidden_evals, hidden_eval], dim=0)

        s = torch.Tensor(state)
        eval_qs = torch.Tensor(eval_qs).unsqueeze(0)
        actions = torch.Tensor(actions).unsqueeze(0)
        one_hot_actions = torch.Tensor(one_hot_actions).unsqueeze(0)
        hidden_evals = hidden_evals.unsqueeze(0)
        # Move to GPU if enabled
        if self.args.cuda:
            s = s.cuda()
            eval_qs = eval_qs.cuda()
            actions = actions.cuda()
            one_hot_actions = one_hot_actions.cuda()
            hidden_evals = hidden_evals.cuda()
        # Compute Q_tot
        eval_q_total = None
        if self.args.alg == 'qatten':
            eval_q_total, _, _ = self.agents.policy.eval_mix_net(
                eval_qs, s, actions)
        elif self.args.alg == 'qmix' \
                or 'wqmix' in self.args.alg \
                or 'strapped' in self.args.alg:
            eval_q_total = self.agents.policy.eval_mix_net(eval_qs, s)
        elif 'dmaq' in self.args.alg:
            if self.args.alg == "dmaq_qatten":
                ans_chosen, _, _ = self.agents.policy.mixer(eval_qs,
                                                            s,
                                                            is_v=True)
                ans_adv, _, _ = self.agents.policy.mixer(
                    eval_qs,
                    s,
                    actions=one_hot_actions,
                    max_q_i=eval_qs,
                    is_v=False)
                eval_q_total = ans_chosen + ans_adv
            else:
                ans_chosen = self.agents.policy.mixer(eval_qs, s, is_v=True)
                ans_adv = self.agents.policy.mixer(eval_qs,
                                                   s,
                                                   actions=one_hot_actions,
                                                   max_q_i=eval_qs,
                                                   is_v=False)
                eval_q_total = ans_chosen + ans_adv
        elif self.args.alg == 'qtran_base':
            one_hot_actions = one_hot_actions.unsqueeze(0)
            hidden_evals = hidden_evals.unsqueeze(0)
            eval_q_total = self.agents.policy.eval_joint_q(
                s, hidden_evals, one_hot_actions)

        eval_q_total = eval_q_total.squeeze().item()
        return eval_q_total, 0

    def her_k(self, episode, states, former_states):
        for _ in range(self.args.her):
            episode_buffer = {
                'o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                's':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'a':
                np.zeros([self.args.episode_limit, self.args.n_agents, 1]),
                'onehot_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'r':
                np.zeros([self.args.episode_limit, 1]),
                'next_o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                'next_s':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'next_avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'done':
                np.ones([self.args.episode_limit, 1]),
                'padded':
                np.ones([self.args.episode_limit, 1])
            }
            # Regenerate the goal, order and related information
            self.env.reset()
            # Rebuild the whole episode with the newly generated goals
            for i in range(len(episode)):
                reward = self.env.get_reward(states[i], former_states[i])
                done = episode['done'][i]
                if reward >= 0:
                    reward = 0
                    done = True
                episode_buffer['o'][i] = episode['o'][i]
                episode_buffer['o'][i, :, -2:] = np.array(self.env.goal)[:]
                episode_buffer['s'][i] = episode['s'][i]
                episode_buffer['a'][i] = episode['a'][i]
                episode_buffer['onehot_a'][i] = episode['onehot_a'][i]
                episode_buffer['avail_a'][i] = episode['avail_a'][i]
                episode_buffer['r'][i] = [reward]
                episode_buffer['next_o'][i] = episode['next_o'][i]
                episode_buffer['next_o'][i, :,
                                         -2:] = np.array(self.env.goal)[:]
                episode_buffer['next_s'][i] = episode['next_s'][i]
                episode_buffer['next_avail_a'][i] = episode['next_avail_a'][i]
                episode_buffer['done'][i] = [done]
                episode_buffer['padded'][i] = [0.]
                if done:
                    break
            for key in episode_buffer.keys():
                episode_buffer[key] = np.array([episode_buffer[key]])
            self.replay_buffer.store(episode_buffer)
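
# Usage sketch (assumption, not part of the original example): how this Runner
# is typically driven, based only on its constructor signature and run() above.
# `get_common_args` (referenced in Example #3) and `make_env` are hypothetical
# stand-ins for the project's argument parser and environment factory; `itr`
# is concatenated into paths, so it is passed as a string.
def run_single_instance(itr='1'):
    args = get_common_args()  # hypothetical: project argument parser
    env = make_env(args)      # hypothetical: environment factory (SMAC-style API)
    runner = Runner(env, args, itr=itr, seed=getattr(args, 'seed', None))
    runner.run()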
Example #3

# Imports assumed for this example (Agents and ReplayBuffer are
# project-specific classes from the surrounding repository).
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
class Runner:
    def __init__(self, env, args, itr):
        # Arguments
        # self.args = get_common_args()
        self.args = args

        # Environment
        self.env = env
        # Process (run) index
        self.pid = itr

        self.agents = Agents(args, itr=itr)
        # Without network reuse there would be several agents; with parameter sharing during training it is a single network
        # if not self.args.reuse_network:
        #     self.agents = []
        #     for i in range(self.args.n_agents):
        #         self.agents.append(Agents(self.args, i))

        # self.rollout = RollOut(self.agents, self.args)

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Naming convention:
        episode_reward   - cumulative reward of a single episode,
        episodes_reward  - cumulative rewards of several episodes,
        episodes_rewards - cumulative rewards of several episodes over several evaluations.
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0

        # Where results and models are saved; the run index makes it easy to run several instances at once
        self.save_path = self.args.result_dir + '/' + self.args.alg + '/' + self.args.map + '/' + str(
            itr)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        print('runner initialized')

    def generate_episode(self, episode_num, evaluate=False):
        # Prepare to save the evaluation replay
        if self.args.replay_dir != '' and evaluate and episode_num == 0:
            self.env.close()
        # Initialize variables
        self.env.reset()
        done = False
        info = None
        win = False

        last_action = np.zeros((self.args.n_agents, self.args.n_actions))
        # epsilon decay
        epsilon = 0 if evaluate else self.args.epsilon
        # epsilon decay schedule
        if self.args.epsilon_anneal_scale == 'episode' or \
                (self.args.epsilon_anneal_scale == 'itr' and episode_num == 0):
            epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon

        # Buffer for the information of one episode
        episode_buffer = None
        if not evaluate:
            episode_buffer = {
                'o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                's':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'a':
                np.zeros([self.args.episode_limit, self.args.n_agents, 1]),
                'onehot_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'r':
                np.zeros([self.args.episode_limit, 1]),
                'next_o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                'next_s':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'next_avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'done':
                np.ones([self.args.episode_limit, 1]),
                'padded':
                np.ones([self.args.episode_limit, 1])
            }
        # Run one episode
        obs = self.env.get_obs()
        state = self.env.get_state()
        avail_actions = []
        self.agents.policy.init_hidden(1)
        for agent_id in range(self.args.n_agents):
            avail_action = self.env.get_avail_agent_actions(agent_id)
            avail_actions.append(avail_action)

        episode_reward = 0
        for step in range(self.args.episode_limit):
            if done:
                break
            else:
                actions, onehot_actions = [], []
                for agent_id in range(self.args.n_agents):
                    # avail_action = self.env.get_avail_agent_actions(agent_id)
                    action = self.agents.choose_action(obs[agent_id],
                                                       last_action[agent_id],
                                                       agent_id,
                                                       avail_actions[agent_id],
                                                       epsilon, evaluate)
                    # One-hot encoding of the chosen action
                    onehot_action = np.zeros(self.args.n_actions)
                    onehot_action[action] = 1
                    onehot_actions.append(onehot_action)
                    # Add to the joint action
                    actions.append(action)
                    # avail_actions.append(avail_action)
                    # Remember this action as the last action
                    last_action[agent_id] = onehot_action
                # Execute the joint action in the environment
                reward, done, info = self.env.step(actions)
                # Get the information after the transition
                if not done:
                    next_obs = self.env.get_obs()
                    next_state = self.env.get_state()
                else:
                    next_obs = obs
                    next_state = state
                # Collect the next available actions
                next_avail_actions = []
                for agent_id in range(self.args.n_agents):
                    avail_action = self.env.get_avail_agent_actions(agent_id)
                    next_avail_actions.append(avail_action)
                # Store the experience
                if not evaluate:
                    episode_buffer['o'][step] = obs
                    episode_buffer['s'][step] = state
                    episode_buffer['a'][step] = np.reshape(
                        actions, [self.args.n_agents, 1])
                    episode_buffer['onehot_a'][step] = onehot_actions
                    episode_buffer['avail_a'][step] = avail_actions
                    episode_buffer['r'][step] = [reward]
                    episode_buffer['next_o'][step] = next_obs
                    episode_buffer['next_s'][step] = next_state
                    episode_buffer['next_avail_a'][step] = next_avail_actions
                    episode_buffer['done'][step] = [done]
                    episode_buffer['padded'][step] = [0.]

                # Update variables
                episode_reward += reward
                obs = next_obs
                state = next_state
                avail_actions = next_avail_actions
                if self.args.epsilon_anneal_scale == 'step':
                    epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon

        # During training, keep the new epsilon
        if not evaluate:
            self.args.epsilon = epsilon
        # Get the battle result
        if info is not None and 'battle_won' in info:
            win = True if done and info['battle_won'] else False
        if evaluate and episode_num == self.args.evaluate_num - 1 and self.args.replay_dir != '':
            self.env.save_replay()
            self.env.close()
        return episode_buffer, episode_reward, win

    def run(self):
        train_steps = 0
        early_stop = 10
        num_eval = 0
        self.max_win_rate = 0

        for itr in range(self.args.n_itr):
            # Collect data for n_episodes episodes
            episode_batch, _, _ = self.generate_episode(0)
            for key in episode_batch.keys():
                episode_batch[key] = np.array([episode_batch[key]])
            for e in range(1, self.args.n_episodes):
                episode, _, _ = self.generate_episode(e)
                for key in episode_batch.keys():
                    episode[key] = np.array([episode[key]])
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)

            # Add to the replay buffer
            self.replay_buffer.store(episode_batch)
            # Train
            if self.replay_buffer.size < self.args.batch_size * 12.5:
                # print('replay buffer is not yet larger than batch size')
                continue
            for _ in range(self.args.train_steps):
                batch = self.replay_buffer.sample(self.args.batch_size)
                self.agents.train(batch, train_steps)
                # if self.args.reuse_network:
                #     self.agents.train(batch, train_steps)
                # else:
                #     for i in range(self.args.n_agents):
                #         self.agents[i].train(batch, train_steps)
                train_steps += 1
            # Periodic evaluation
            if itr % self.args.evaluation_period == 0:
                num_eval += 1
                print(f'Process {self.pid}: {itr} / {self.args.n_itr}')
                win_rate, episodes_reward = self.evaluate()
                # Save the evaluation results
                self.evaluate_itr.append(itr)
                self.win_rates.append(win_rate)
                self.episodes_rewards.append(episodes_reward)
                # Save an extra copy of well-performing models
                if win_rate > self.max_win_rate:
                    self.max_win_rate = win_rate
                    self.agents.policy.save_model(str(win_rate))
                # Do not save every time, to reduce the time spent on I/O
                if num_eval % 50 == 0:
                    self.save_results()
                    self.plot()
        # Finally, save everything
        self.save_results()
        self.plot()
        self.env.close()

    def evaluate(self):
        """
        得到平均胜率和每次测试的累加奖赏,方便画误差阴影图
        :return:
        """
        win_number = 0
        episodes_reward = []
        for itr in range(self.args.evaluate_num):
            _, episode_reward, win = self.generate_episode(itr, evaluate=True)
            episodes_reward.append(episode_reward)
            if win:
                win_number += 1
        return win_number / self.args.evaluate_num, episodes_reward

    def save_results(self):
        """
        Save the data so that results of several algorithms can later be compared in one figure.
        :return:
        """
        # Delete any previously saved .npy files
        for filename in os.listdir(self.save_path):
            if filename.endswith('.npy'):
                os.remove(self.save_path + '/' + filename)
        np.save(self.save_path + '/evaluate_itr.npy', self.evaluate_itr)
        np.save(self.save_path + '/win_rates.npy', self.win_rates)
        np.save(self.save_path + '/episodes_rewards.npy',
                self.episodes_rewards)

    def plot(self):
        """
        Plot the results periodically.
        :return:
        """
        fig = plt.figure()
        ax1 = fig.add_subplot(211)
        win_x = np.array(self.evaluate_itr)[:, None]
        win_y = np.array(self.win_rates)[:, None]
        plot_win = pd.DataFrame(np.concatenate((win_x, win_y), axis=1),
                                columns=['evaluate_itr', 'win_rates'])
        sns.lineplot(x="evaluate_itr", y="win_rates", data=plot_win, ax=ax1)

        ax2 = fig.add_subplot(212)
        reward_x = np.repeat(self.evaluate_itr, self.args.evaluate_num)[:,
                                                                        None]
        reward_y = np.array(self.episodes_rewards).flatten()[:, None]
        plot_reward = pd.DataFrame(
            np.concatenate((reward_x, reward_y), axis=1),
            columns=['evaluate_itr', 'episodes_rewards'])
        sns.lineplot(x="evaluate_itr",
                     y="episodes_rewards",
                     data=plot_reward,
                     ax=ax2,
                     ci=68,
                     estimator=np.median)

        # Format the timestamp like 2016-03-20-11_45_39
        tag = self.args.alg + '-' + time.strftime("%Y-%m-%d_%H-%M-%S",
                                                  time.localtime())
        # Delete any previously saved figures
        for filename in os.listdir(self.save_path):
            if filename.endswith('.png'):
                os.remove(self.save_path + '/' + filename)
        fig.savefig(self.save_path + "/%s.png" % tag)
        plt.close()
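
# Sketch (assumption, not part of the original example): reloading the .npy
# arrays written by save_results() above to compare several runs in one
# figure, which is the stated purpose of saving them. The run directories
# passed in are hypothetical.
def compare_runs(run_dirs, labels):
    import matplotlib.pyplot as plt
    import numpy as np

    fig, ax = plt.subplots()
    for run_dir, label in zip(run_dirs, labels):
        evaluate_itr = np.load(run_dir + '/evaluate_itr.npy')
        win_rates = np.load(run_dir + '/win_rates.npy')
        ax.plot(evaluate_itr, win_rates, label=label)
    ax.set_xlabel('evaluate_itr')
    ax.set_ylabel('win_rates')
    ax.legend()
    return fig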