Example #1
0
class Runner:
    """Train a multi-agent algorithm and periodically evaluate and plot it.

    Depending on ``args.alg`` the runner builds communication-based or plain
    agents. A replay buffer is only created for algorithms that are not
    on-policy; COMA, Central-V and REINFORCE train directly on the freshly
    collected episodes.
    """

    # Substrings of ``args.alg`` that select communication-based agents.
    _COMM_ALGS = ('commnet', 'g2anet')
    # Substrings of ``args.alg`` that mark on-policy algorithms (no replay buffer).
    _ON_POLICY_ALGS = ('coma', 'central_v', 'reinforce')

    def __init__(self, env, args):
        """Build agents, rollout workers, the optional buffer and the output dir.

        Args:
            env: Training environment handed to the rollout worker.
            args: Configuration namespace; this method reads ``alg``,
                ``result_dir`` and ``map``.
        """
        self.env = env

        # Separate environment used by `evaluate_sparse`. An earlier revision
        # built a StarCraft2Env with reward_sparse=True here (win = 1,
        # defeat = -1, every other step = 0).
        # NOTE(review): whether MeetEnv follows the same reward scheme is not
        # visible from this file - confirm.
        self.env_evaluate = MeetEnv()

        if self._alg_matches(args.alg, self._COMM_ALGS):  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
        if not self._alg_matches(args.alg, self._ON_POLICY_ALGS):
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Directory that receives the plots and the saved result arrays.
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.save_path, exist_ok=True)

    @staticmethod
    def _alg_matches(alg, names):
        """Return True if any of ``names`` occurs as a substring of ``alg``."""
        return any(name in alg for name in names)

    def run(self, num):
        """Run the full training loop for run number ``num``.

        Every ``args.evaluate_cycle`` epochs the policy is evaluated and the
        plot / result arrays are refreshed on disk; a final refresh is done
        after the last epoch.

        Args:
            num: Run index, used in log messages and output file names.
        """
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        win_rates = []
        episode_rewards = []
        train_steps = 0
        on_policy = self._alg_matches(self.args.alg, self._ON_POLICY_ALGS)
        try:
            for epoch in range(self.args.n_epoch):
                print('Run {}, train epoch {}'.format(num, epoch))
                if epoch % self.args.evaluate_cycle == 0:
                    win_rate, episode_reward = self.evaluate()
                    win_rates.append(win_rate)
                    episode_rewards.append(episode_reward)
                    self._plot_and_save(num, win_rates, episode_rewards)

                episode_batch = self._collect_episode_batch()
                if on_policy:
                    self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                    train_steps += 1
                else:
                    self.buffer.store_episode(episode_batch)
                    for _ in range(self.args.train_steps):
                        mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                        self.agents.train(mini_batch, train_steps)
                        train_steps += 1

            self._plot_and_save(num, win_rates, episode_rewards)
        finally:
            # Release the figure opened above; otherwise every call to run()
            # leaves one more open pyplot figure behind.
            plt.close()

    def _collect_episode_batch(self):
        """Roll out ``args.n_episodes`` episodes and stack them along axis 0."""
        episodes = []
        for episode_idx in range(self.args.n_episodes):
            episode, _ = self.rolloutWorker.generate_episode(episode_idx)
            episodes.append(episode)
        # Each entry of an episode is expected to be a
        # (1, episode_len, n_agents, dim) array (per the original comment), so
        # concatenating on axis 0 yields one batch entry per episode.
        return {
            key: np.concatenate([episode[key] for episode in episodes], axis=0)
            for key in episodes[0]
        }

    def _plot_and_save(self, num, win_rates, episode_rewards):
        """Redraw both curves on the current figure and persist plot and arrays."""
        x_label = 'epoch*{}'.format(self.args.evaluate_cycle)

        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel(x_label)
        plt.ylabel('win_rate')

        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel(x_label)
        plt.ylabel('episode_rewards')

        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)

    def evaluate(self):
        """Evaluate the current policy with the training rollout worker.

        An episode counts as a win when its reward exceeds ``args.threshold``.

        Returns:
            Tuple ``(win_rate, mean_episode_reward)`` averaged over
            ``args.evaluate_epoch`` episodes.
        """
        win_number = 0
        episode_rewards = 0
        for _ in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(evaluate=True)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def evaluate_sparse(self):
        """Evaluate on the dedicated evaluation environment, then close it.

        An episode counts as a win when its reward is positive. The evaluation
        environment is closed afterwards, so this is a one-shot call.

        Returns:
            Fraction of the ``args.evaluate_epoch`` episodes that were won.
        """
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(evaluate=True)
            won = episode_reward > 0
            print('Epoch {}: {}'.format(epoch, 'win' if won else 'defeat'))
            if won:
                win_number += 1
        self.env_evaluate.close()
        return win_number / self.args.evaluate_epoch
Example #2
0
        plot_episode_requested_agents += episode_episode_requested_agents
        plot_episode_count_requested_agent += episode_episode_count_requested_agent
        plot_episode_rewards.append(episode_reward)
        episodes.append(episode)

    episode_batch = episodes[0]
    episodes.pop(0)
    for episode in episodes:
        for key in episode_batch.keys():
            episode_batch[key] = np.concatenate(
                (episode_batch[key], episode[key]), axis=0)

    buffer.store_episode(episode_batch)
    for train_step in range(args.train_steps):
        mini_batch = buffer.sample(min(buffer.current_size, args.batch_size))
        agents.train(mini_batch, train_steps)
        train_steps += 1

    figure, axes = plt.subplots(nrows=2, ncols=2)

    # plt.rcParams["figure.figsize"] = (50, 50)
    plt.rcParams['lines.linewidth'] = 4

    index1 = ["Action 0", "Action 1", "Action 2"]
    axes[0, 0].bar(x=index1, height=plot_count_per_actions)
    axes[0, 0].set_title('Cumulative count over action space')

    # index2 = ["1 Agents", "2 Agents", "3 Agents", "4 Agents"]
    index2 = [f'{i+1} Agents' for i in range(N_AGENTS)]
    axes[0, 1].bar(x=index2, height=plot_episode_count_requested_agent)
    axes[0, 1].set_title('Number of valid agents over episode')
Example #3
0
class Runner:
    """Training driver that logs running rewards and pickles the reward history.

    Supports plain and communication-based agents, optional prioritized
    experience replay (``args.use_per``) and alternative episode generators
    (``args.use_ja`` / ``args.use_v1``).
    """

    def __init__(self, env, args):
        """Build agents, rollout worker, optional replay buffer and output paths.

        Args:
            env: Environment handed to the rollout worker.
            args: Configuration namespace; this method reads ``alg``,
                ``learn``, ``use_per``, ``result_dir``, ``map``, ``env_name``,
                ``n_agents``, ``map_size`` and ``name_time``.
        """
        self.env = env

        if 'commnet' in args.alg or 'g2anet' in args.alg:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)

        # COMA, Central-V and REINFORCE are on-policy and need no replay buffer.
        on_policy = any(
            name in args.alg for name in ('coma', 'central_v', 'reinforce'))
        if args.learn and not on_policy:
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory and file that receive the pickled reward history.
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.save_path, exist_ok=True)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(
            args.n_agents) + '_' + str(args.map_size) + '_' + args.name_time

    def run(self, num):
        """Run ``args.n_epoch`` epochs of episode collection and training.

        Args:
            num: Run index; kept for interface compatibility, not used here.
        """
        train_steps = 0
        episode_rewards = 0
        fixed_rewards = 0
        st = time.time()
        plot_rewards = []

        # Pick the episode generator once; the choice only depends on args.
        if self.args.use_ja:
            if self.args.use_v1:
                generate = self.rolloutWorker.generate_episode_ja_v2
            else:
                generate = self.rolloutWorker.generate_episode_ja_v3
        else:
            generate = self.rolloutWorker.generate_episode

        on_policy = any(
            name in self.args.alg
            for name in ('coma', 'central_v', 'reinforce'))

        for epoch in range(self.args.n_epoch):
            episodes = []
            # Collect args.n_episodes episodes.
            for episode_idx in range(self.args.n_episodes):
                episode, episode_reward, rate, fixed_reward = generate(
                    episode_idx)
                episodes.append(episode)
                episode_rewards += episode_reward
                fixed_rewards += fixed_reward
                plot_rewards.append(episode_reward)
                # NOTE(review): this report runs once per episode (not once
                # per epoch) and averages by evaluate_cycle, which matches the
                # number of accumulated episodes only when n_episodes == 1 -
                # confirm intent before changing.
                if epoch % self.args.evaluate_cycle == 0:
                    t = time.time() - st
                    st = time.time()
                    epr = round(episode_rewards / self.args.evaluate_cycle, 2)
                    fr = round(fixed_rewards / self.args.evaluate_cycle, 2)
                    print('train epoch {}, reward {}, time {}, rate {}'.format(
                        epoch, [epr, fr], t, rate))
                    episode_rewards = 0
                    fixed_rewards = 0
                    with open(self.file_name, 'wb') as fp:
                        pickle.dump(plot_rewards, fp)

            # Each entry of an episode is expected to be a
            # (1, episode_len, n_agents, dim) array (per the original comment);
            # stack all episodes along axis 0 with one concatenate per key.
            episode_batch = {
                key: np.concatenate([ep[key] for ep in episodes], axis=0)
                for key in episodes[0]
            }

            if on_policy:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            elif not self.args.load_model:
                # NOTE(review): the buffer only exists when args.learn was set
                # at construction time - presumably load_model and learn are
                # mutually exclusive; confirm.
                self.buffer.store_episode(episode_batch)
                for _ in range(self.args.train_steps):
                    batch_size = min(self.buffer.current_size,
                                     self.args.batch_size)
                    if self.args.use_per:
                        mini_batch, idxs = self.buffer.sample(batch_size)
                        dq = self.agents.train(mini_batch, train_steps)
                        # Feed the training result back as new priorities.
                        self.buffer.update_priorities(idxs, dq)
                    else:
                        mini_batch = self.buffer.sample(batch_size)
                        self.agents.train(mini_batch, train_steps)
                    train_steps += 1

    def evaluate(self):
        """Evaluate the current policy over ``args.evaluate_epoch`` episodes.

        NOTE(review): this unpacks three values from ``generate_episode``
        while ``run`` unpacks four - confirm the worker's return shape in
        evaluation mode.

        Returns:
            Tuple ``(win_rate, mean_episode_reward)``.
        """
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch
Example #4
0
class Runner:
    """Step-budgeted training driver with periodic evaluation and plotting.

    Training continues until ``args.n_steps`` environment steps have been
    collected; the policy is evaluated roughly every ``args.evaluate_cycle``
    steps and once more after training ends.
    """

    def __init__(self, env, args):
        """Create agents, the rollout worker, the optional buffer and output dir."""
        self.env = env

        alg = args.alg
        uses_communication = 'commnet' in alg or 'g2anet' in alg
        if uses_communication:
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)

        # COMA, Central-V and REINFORCE are on-policy: no replay buffer needed.
        is_on_policy = 'coma' in alg or 'central_v' in alg or 'reinforce' in alg
        if not args.evaluate and not is_on_policy:
            self.buffer = ReplayBuffer(args)

        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Output directory for the plot and the saved result arrays.
        self.save_path = '/'.join((self.args.result_dir, args.alg, args.map))
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def _evaluate_and_record(self, num, announce=False):
        """Evaluate once, append the results to the histories and replot."""
        win_rate, episode_reward = self.evaluate()
        if announce:
            print('win_rate is ', win_rate)
        self.win_rates.append(win_rate)
        self.episode_rewards.append(episode_reward)
        self.plt(num)

    def run(self, num):
        """Train until the step budget is used up; ``num`` tags the outputs."""
        time_steps = 0
        train_steps = 0
        evaluate_steps = -1
        while time_steps < self.args.n_steps:
            print('Run {}, time_steps {}'.format(num, time_steps))
            # Evaluate each time another evaluate_cycle boundary is crossed.
            if time_steps // self.args.evaluate_cycle > evaluate_steps:
                self._evaluate_and_record(num)
                evaluate_steps += 1

            # Gather args.n_episodes fresh episodes.
            collected = []
            for episode_idx in range(self.args.n_episodes):
                rollout = self.rolloutWorker.generate_episode(episode_idx)
                episode, _, _, steps = rollout
                collected.append(episode)
                time_steps += steps

            # Each entry is expected to be a (1, episode_len, n_agents, dim)
            # array (per the original comment); grow the first episode's
            # entries in place by appending the others along axis 0.
            episode_batch = collected[0]
            for extra in collected[1:]:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], extra[key]), axis=0)

            alg = self.args.alg
            if 'coma' in alg or 'central_v' in alg or 'reinforce' in alg:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
                continue

            self.buffer.store_episode(episode_batch)
            for _ in range(self.args.train_steps):
                sample_size = min(self.buffer.current_size,
                                  self.args.batch_size)
                mini_batch = self.buffer.sample(sample_size)
                self.agents.train(mini_batch, train_steps)
                train_steps += 1

        self._evaluate_and_record(num, announce=True)

    def evaluate(self):
        """Return ``(win_rate, mean_reward)`` over ``args.evaluate_epoch`` episodes."""
        wins = 0
        reward_sum = 0
        for round_idx in range(self.args.evaluate_epoch):
            outcome = self.rolloutWorker.generate_episode(round_idx,
                                                          evaluate=True)
            _, reward, won, _ = outcome
            reward_sum += reward
            if won:
                wins += 1
        win_rate = wins / self.args.evaluate_epoch
        mean_reward = reward_sum / self.args.evaluate_epoch
        return win_rate, mean_reward

    def plt(self, num):
        """Plot both histories into a new figure and save plot and arrays."""
        x_label = 'step*{}'.format(self.args.evaluate_cycle)
        out_prefix = self.save_path + '/'

        plt.figure()
        plt.ylim([0, 105])
        plt.cla()

        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel(x_label)
        plt.ylabel('win_rates')

        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel(x_label)
        plt.ylabel('episode_rewards')

        plt.savefig(out_prefix + 'plt_{}.png'.format(num), format='png')
        np.save(out_prefix + 'win_rates_{}'.format(num), self.win_rates)
        np.save(out_prefix + 'episode_rewards_{}'.format(num),
                self.episode_rewards)
        plt.close()
class Runner:
    """Curriculum-driven training driver.

    Environments are drawn from ``curriculum``; the policy is periodically
    evaluated on ``target_env`` (and on any ``eval_envs``) and the evaluation
    reward is fed back to the curriculum. ``writer`` (and optionally
    ``eval_envs`` / ``debug``) are expected to be assigned by the caller after
    construction.
    """

    def __init__(self, curriculum, args, target_env):
        """Build agents and the rollout worker and prepare the output directory.

        Args:
            curriculum: Object providing ``get()`` (raises ``IndexError`` when
                exhausted) and ``update(...)``.
            args: Configuration namespace; this method reads ``alg``,
                ``evaluate`` and ``save_path``.
            target_env: Environment used for the periodic evaluation.
        """
        self.target_env = target_env
        self.curriculum = curriculum

        # The worker starts without an environment; run()/evaluate() assign one.
        if 'commnet' in args.alg or 'g2anet' in args.alg:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(None, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(None, self.agents, args)
        if not args.evaluate and not self._is_on_policy(args.alg):
            # Replay buffers live on the curriculum environments (see run()).
            self.buffer = None
        self.args = args
        self.win_rates = []
        self.eval_episode_rewards = []

        # Directory that receives the plot and the saved result arrays.
        self.save_path = args.save_path
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.save_path, exist_ok=True)

        self.train_rewards = []
        self.ratios = []
        self.historical_params = {}
        self.switch = True  # we will be switching to some task
        self.patience = 20
        self.writer: SummaryWriter = None  # must be set before run()/evaluate()
        self.eval_envs = None  # optional iterable of extra evaluation envs
        self.debug = False

    @staticmethod
    def _is_on_policy(alg):
        """Return True for COMA, Central-V and REINFORCE variants."""
        return any(name in alg for name in ('coma', 'central_v', 'reinforce'))

    def run(self):
        """Train until the curriculum is exhausted.

        When ``curriculum.get()`` raises ``IndexError`` the model is saved,
        the plot is refreshed and the loop ends.
        """
        time_steps, train_steps, evaluate_steps = 0, 0, -1
        on_policy = self._is_on_policy(self.args.alg)
        while True:
            if time_steps // self.args.evaluate_cycle > evaluate_steps:
                win_rate, eval_episode_reward = self.evaluate(
                    time_steps, self.target_env)
                self.win_rates.append(win_rate)
                self.eval_episode_rewards.append(eval_episode_reward)
                self.plt()
                evaluate_steps += 1

                performance = int(eval_episode_reward)
                self.curriculum.update(performance, self.agents, time_steps,
                                       train_steps)

                # Evaluate in the other environments too; eval_envs defaults
                # to None, which is treated as "no extra environments".
                for env in self.eval_envs or ():
                    self.evaluate(time_steps, env)

            try:
                env = self.curriculum.get()
            except IndexError:  # curriculum exhausted: done
                # Use the running update counter. The previous code passed the
                # inner loop variable `train_step`, which is unbound when no
                # training step has run yet.
                self.agents.policy.save_model(train_steps)
                self.plt()
                break
            buffer = env.buffer
            self.rolloutWorker.env = env
            logging.info("Restoring map {}".format(
                self.rolloutWorker.env.map_name))

            episodes = []
            # Collect args.n_episodes episodes.
            for episode_idx in range(self.args.n_episodes):
                episode, train_episode_reward, _, steps = self.rolloutWorker.generate_episode(
                    episode_idx)
                self.train_rewards.append(train_episode_reward)
                episodes.append(episode)
                time_steps += steps

            logging.info('Time_steps {}, train_episode_reward {}'.format(
                time_steps, train_episode_reward))
            # Each entry of an episode is expected to be a
            # (1, episode_len, n_agents, dim) array (per the original comment);
            # stack all episodes along axis 0.
            episode_batch = {
                key: np.concatenate([ep[key] for ep in episodes], axis=0)
                for key in episodes[0]
            }
            if on_policy:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                buffer.store_episode(episode_batch)
                for _ in range(self.args.train_steps):
                    mini_batch = buffer.sample(
                        min(buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1

            # Only the reward of the last collected episode is logged.
            self.writer.add_scalar('Reward/train/',
                                   train_episode_reward,
                                   global_step=time_steps)
            self.writer.add_scalar(f'Reward/train/{env.map_name}',
                                   train_episode_reward,
                                   global_step=time_steps)
            if self.debug:
                self._log_param_norms('eval_rnn', self.agents.policy.eval_rnn,
                                      env.map_name, time_steps)
                self._log_param_norms('eval_qmix_net',
                                      self.agents.policy.eval_qmix_net,
                                      env.map_name, time_steps)

    def _log_param_norms(self, prefix, module, map_name, time_steps):
        """Log weight and gradient norms of ``module``, globally and per map."""
        for name, param in module.named_parameters():
            weight_norm = param.norm()
            self.writer.add_scalar(f'{prefix}/{name}/norm',
                                   weight_norm,
                                   global_step=time_steps)
            self.writer.add_scalar(f'{prefix}/{name}/norm/{map_name}',
                                   weight_norm,
                                   global_step=time_steps)
            # Parameters that have not received a gradient yet have grad=None.
            if param.grad is None:
                continue
            grad_norm = param.grad.norm()
            self.writer.add_scalar(f'{prefix}/grad/{name}/norm',
                                   grad_norm,
                                   global_step=time_steps)
            self.writer.add_scalar(f'{prefix}/grad/{name}/norm/{map_name}',
                                   grad_norm,
                                   global_step=time_steps)

    def evaluate(self, time_steps, env):
        """Evaluate the current policy in ``env``.

        Note that this re-points the rollout worker at ``env``.

        Args:
            time_steps: Current global step, used as the logging offset.
            env: Environment to evaluate in; must expose ``map_name``.

        Returns:
            Tuple ``(win_rate, mean_episode_reward)`` over
            ``args.evaluate_epoch`` episodes.
        """
        win_number = 0
        episode_rewards = 0
        self.rolloutWorker.env = env
        logging.info("Evaluating in map {}".format(
            self.rolloutWorker.env.map_name))
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag, _ = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            logging.info('Eval_epoch {}, eval_episode_reward {}'.format(
                epoch, episode_reward))
            episode_rewards += episode_reward
            self.writer.add_scalar(
                f'Reward/eval/{self.rolloutWorker.env.map_name}',
                episode_reward, time_steps + epoch)
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self):
        """Plot win rate, evaluation reward and mean training reward; save all."""
        x_label = 'step*{}'.format(self.args.evaluate_cycle)

        plt.figure().set_size_inches(10, 15)
        plt.ylim([0, 105])
        plt.cla()
        plt.subplot(3, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel(x_label)
        plt.ylabel('win_rates')

        plt.subplot(3, 1, 2)
        plt.plot(range(len(self.eval_episode_rewards)),
                 self.eval_episode_rewards)
        plt.xlabel(x_label)
        plt.ylabel('eval_episode_rewards')

        plt.subplot(3, 1, 3)
        # Bucket the training rewards into as many chunks as there are
        # evaluations so that all three curves share the same x axis.
        train_rewards = np.array_split(self.train_rewards,
                                       len(self.eval_episode_rewards))
        mean_train_rewards = [np.mean(t) for t in train_rewards]
        plt.plot(range(len(mean_train_rewards)), mean_train_rewards)
        plt.xlabel(x_label)
        plt.ylabel('train_episode_rewards')

        plt.tight_layout()
        plt.savefig(self.save_path + '/plt.png', format='png')
        np.save(self.save_path + '/win_rates', self.win_rates)
        np.save(self.save_path + '/eval_rewards', self.eval_episode_rewards)
        np.save(self.save_path + '/train_rewards', self.train_rewards)
        plt.close()
Example #6
0
class Runner:
    """Epoch-based training driver that plots exploration epsilon and reward.

    Note that ``win_rates`` is used to record the rollout worker's epsilon at
    each evaluation (see ``run``); the saved ``win_rates_*`` array and the
    first subplot therefore show epsilon, not a win rate.
    """

    def __init__(self, env, args):
        """Build agents, rollout worker, optional replay buffer and output dir.

        Args:
            env: Environment handed to the rollout worker.
            args: Configuration namespace; this method reads ``alg``,
                ``learn``, ``result_dir`` and ``map``.
        """
        self.env = env

        if 'commnet' in args.alg or 'g2anet' in args.alg:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)

        # COMA, Central-V and REINFORCE are on-policy and need no replay buffer.
        on_policy = any(
            name in args.alg for name in ('coma', 'central_v', 'reinforce'))
        if args.learn and not on_policy:
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory that receives the plot and the saved result arrays.
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.save_path, exist_ok=True)

        # Figure reused by plt(); created lazily on first use.
        self.fig = None

    def run(self, num):
        """Run ``args.n_epoch`` training epochs; ``num`` tags the output files."""
        global EPOCH
        train_steps = 0
        self.env.reset_callback = reset_callback  # TODO
        on_policy = any(
            name in self.args.alg
            for name in ('coma', 'central_v', 'reinforce'))
        for epoch in range(self.args.n_epoch):
            # Publish the current epoch through the module-level EPOCH.
            EPOCH = epoch
            if epoch % self.args.evaluate_cycle == 0:
                _, episode_reward = self.evaluate()
                # Deliberately records epsilon instead of the win rate.
                self.win_rates.append(self.rolloutWorker.epsilon)
                self.episode_rewards.append(episode_reward)
                self.plt(num)

            episodes = []
            # Collect args.n_episodes episodes.
            for episode_idx in range(self.args.n_episodes):
                episode, _, _ = self.rolloutWorker.generate_episode(
                    episode_idx)
                episodes.append(episode)
            # Each entry of an episode is expected to be a
            # (1, episode_len, n_agents, dim) array (per the original comment);
            # stack all episodes along axis 0.
            episode_batch = {
                key: np.concatenate([ep[key] for ep in episodes], axis=0)
                for key in episodes[0]
            }
            if on_policy:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for _ in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(
                        min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        self.plt(num)

    def evaluate(self):
        """Evaluate the current policy over ``args.evaluate_epoch`` episodes.

        Returns:
            Tuple ``(win_rate, mean_episode_reward)``.
        """
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self, num):
        """Redraw both curves on the runner's figure and save plot and arrays."""
        if self.fig is None:
            self.fig = plt.figure()
        else:
            # Re-activate our own figure: the pyplot calls below act on the
            # *current* figure, which may have been changed elsewhere.
            plt.figure(self.fig.number)

        x_label = 'epoch*{}'.format(self.args.evaluate_cycle)

        plt.axis([0, self.args.n_epoch, 0, 100])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel(x_label)
        plt.ylabel('epsilon')

        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel(x_label)
        plt.ylabel('episode_rewards')
        plt.tight_layout()

        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num),
                self.episode_rewards)
        # Wipe the figure so the next call starts from an empty canvas.
        plt.clf()
Example #7
0
def _next_run_name(model_dir):
    """Return the next unused ``run<N>`` directory name under ``model_dir``."""
    if not model_dir.exists():
        return 'run1'
    # Only folders of the exact form run<digits> count; anything else (e.g.
    # "run_old") is ignored instead of crashing the int() conversion.
    run_nums = [
        int(folder.name[3:]) for folder in model_dir.iterdir()
        if folder.name.startswith('run') and folder.name[3:].isdecimal()
    ]
    return 'run%i' % (max(run_nums, default=0) + 1)


def runner(env, args):
    """Train agents on ``env`` and store models, logs and a reward plot.

    Outputs go to ``./models/<env_id>/<algo>/run<N>/`` where ``N`` is one
    more than the highest existing run number.

    Args:
        env: Environment (or vectorized wrapper) to roll out in; closed at
            the end of training.
        args: Configuration namespace; reads ``env_id``, ``algo``, ``seed``,
            ``use_cuda``, ``n_training_threads``, ``n_episodes``,
            ``n_rollout_threads``, ``n_rollouts``, ``display``,
            ``training_steps``, ``batch_size`` and ``save_cycle``.
    """
    model_dir = Path('./models') / args.env_id / args.algo
    run_dir = model_dir / _next_run_name(model_dir)
    log_dir = run_dir / 'logs'
    results_dir = run_dir / 'results'

    os.makedirs(str(log_dir))
    os.makedirs(str(results_dir))
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if not args.use_cuda:
        torch.set_num_threads(args.n_training_threads)

    agents = Agents(args)
    rolloutWorker = RolloutWorker(env, agents, args)
    buffer = ReplayBuffer(args)

    # VDN and QMIX train from the buffer alone; the other algorithms
    # (COMA, LIIR, MAAC per the original comment) also receive epsilon.
    needs_epsilon = not ('vdn' in args.algo or 'qmix' in args.algo)

    train_step = 0
    mean_episode_rewards = []

    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes))
        if args.display:
            for env_show in env.envs:
                env_show.render('human')

        # Use the rollout worker to interact with the environment
        # (args.n_rollouts episodes per iteration).
        episodes, rews, mean_rews = [], [], []
        for episode_idx in range(args.n_rollouts):
            episode, ep_rew, mean_ep_rew = rolloutWorker.generate_episode(
                episode_idx)
            episodes.append(episode)
            rews.append(ep_rew)
            mean_rews.append(mean_ep_rew)
        # Stack all collected episodes along axis 0, one concatenate per key.
        episodes_batch = {
            key: np.concatenate([ep[key] for ep in episodes], axis=0)
            for key in episodes[0]
        }
        buffer.push(episodes_batch)

        for _ in range(args.training_steps):
            mini_batch = buffer.sample(
                min(buffer.current_size, args.batch_size))
            if needs_epsilon:
                agents.train(mini_batch, train_step, rolloutWorker.epsilon)
            else:
                agents.train(mini_batch, train_step)
            train_step += 1

        rews = np.mean(rews)
        mean_rews = np.mean(mean_rews)
        mean_episode_rewards.append(mean_rews)
        logger.add_scalar('mean_episode_rewards', mean_rews, ep_i)
        print("Episode {} : Total reward {} , Mean reward {}".format(
            ep_i + 1, rews, mean_rews))

        # Checkpoint whenever this iteration crossed a save_cycle boundary.
        if ep_i % args.save_cycle < args.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            agents.save(
                str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            agents.save(str(run_dir / 'model.pt'))

    agents.save(str(run_dir / 'model.pt'))
    env.close()
    # NOTE(review): export_scalars_to_json is not offered by every
    # SummaryWriter implementation - confirm against the one imported here.
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()

    index = list(range(1, len(mean_episode_rewards) + 1))
    plt.plot(index, mean_episode_rewards)
    plt.ylabel("Mean Episode Rewards")
    plt.savefig(str(results_dir) + '/mean_episode_rewards.jpg')
    plt.close()
Example #8
0
class Runner:
    """Drive the collect / train / evaluate / plot loop for a set of agents.

    Episodes are generated with ``RolloutWorker`` on ``env``, stored in a
    ``ReplayBuffer`` and used to train ``Agents``. A second worker bound to a
    sparse-reward ``StarCraft2Env`` is kept for ``evaluate_sparse``.

    Args:
        env: Environment used for collecting training episodes.
        args: Configuration object. This class reads ``epsilon``, ``n_epoch``,
            ``n_episodes``, ``train_steps``, ``batch_size``, ``evaluate_epoch``,
            ``threshold``, ``result_dir`` and the ``StarCraft2Env`` settings
            (``map``, ``step_mul``, ``difficulty``, ``game_version``, ``seed``,
            ``replay_dir``).
    """

    # Amount subtracted from epsilon at the start of each epoch while it is
    # still above MIN_EPSILON.
    EPSILON_DECAY = 0.0001125
    MIN_EPSILON = 0.05
    # Training only starts once the buffer holds more than this many entries.
    MIN_BUFFER_SIZE = 100
    # Plots and .npy dumps are refreshed on epochs divisible by this value.
    PLOT_CYCLE = 100

    def __init__(self, env, args):
        self.env = env
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.buffer = ReplayBuffer(args)
        self.args = args
        self.epsilon = args.epsilon

        # Used to evaluate the algorithm on a sparse-reward environment:
        # a win is 1, a defeat is -1, and every other ordinary step is 0.
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                            args)

    @staticmethod
    def _merge_episodes(episodes):
        """Concatenate a list of episode dicts key by key along axis 0.

        According to the original author's note, every entry of an episode is
        a 4-D array shaped (1, episode_len, n_agents, feature_dim), so the
        result stacks the episodes along the leading axis.
        """
        # One concatenate per key instead of repeated pairwise concatenation,
        # which would recopy the growing batch for every extra episode.
        return {
            key: np.concatenate([episode[key] for episode in episodes], axis=0)
            for key in episodes[0].keys()
        }

    def _plot_and_save(self, win_rates, episode_rewards):
        """Redraw both curves and write the figure and raw series to result_dir."""
        # clf() wipes the whole figure. cla() would only clear the current
        # axes, leaving earlier curves stacked on the other subplot.
        plt.clf()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch')
        plt.ylabel('win_rate')

        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch')
        plt.ylabel('episode_rewards')

        plt.savefig(self.args.result_dir + '/plt.png', format='png')
        np.save(self.args.result_dir + '/win_rates', win_rates)
        np.save(self.args.result_dir + '/episode_rewards', episode_rewards)

    def run(self):
        """Train for ``args.n_epoch`` epochs, evaluating after each training round.

        Each epoch decays epsilon, collects ``args.n_episodes`` episodes, stores
        them in the buffer and, once the buffer is large enough, runs
        ``args.train_steps`` updates followed by one call to ``evaluate``.
        Results are plotted and saved periodically and once more at the end.
        """
        plt.figure()
        win_rates = []
        episode_rewards = []
        train_steps = 0
        for epoch in tqdm(range(self.args.n_epoch)):
            if self.epsilon > self.MIN_EPSILON:
                self.epsilon = self.epsilon - self.EPSILON_DECAY

            # Collect self.args.n_episodes episodes with the current epsilon.
            episodes = [
                self.rolloutWorker.generate_episode(self.epsilon)[0]
                for _ in range(self.args.n_episodes)
            ]
            self.buffer.store_episode(self._merge_episodes(episodes))

            if self.buffer.current_size > self.MIN_BUFFER_SIZE:
                for _ in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(self.args.batch_size)
                    # train_steps is the running count of updates across epochs.
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
                win_rate, episode_reward = self.evaluate()
                win_rates.append(win_rate)
                episode_rewards.append(episode_reward)
                # Visualization.
                if epoch % self.PLOT_CYCLE == 0:
                    self._plot_and_save(win_rates, episode_rewards)

        self._plot_and_save(win_rates, episode_rewards)

    def evaluate(self):
        """Run ``args.evaluate_epoch`` greedy episodes with the training worker.

        An episode counts as a win when its reward exceeds ``args.threshold``.

        Returns:
            Tuple ``(win_rate, mean_episode_reward)`` over the evaluated episodes.
        """
        n_eval = self.args.evaluate_epoch
        win_number = 0
        total_reward = 0
        for _ in range(n_eval):
            # Epsilon 0 is passed so no exploration noise is requested.
            _, episode_reward = self.rolloutWorker.generate_episode(0)
            total_reward += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / n_eval, total_reward / n_eval

    def evaluate_sparse(self):
        """Run ``args.evaluate_epoch`` greedy episodes on the sparse-reward env.

        An episode counts as a win when its reward is positive; the outcome of
        every episode is printed.

        Returns:
            Fraction of evaluated episodes that were wins.
        """
        n_eval = self.args.evaluate_epoch
        win_number = 0
        for epoch in range(n_eval):
            _, episode_reward = self.evaluateWorker.generate_episode(0)
            won = episode_reward > 0
            print('Epoch {}: {}'.format(epoch, 'win' if won else 'defeat'))
            if won:
                win_number += 1
        return win_number / n_eval