Esempio n. 1
0
    def __init__(self, env, args):
        """Set up agents, rollout worker, optional replay buffer and output paths.

        Args:
            env: Environment object passed on to the rollout worker.
            args: Run configuration. Attributes read here: ``alg``, ``learn``,
                ``use_per``, ``result_dir``, ``map``, ``env_name``,
                ``n_agents``, ``map_size`` and ``name_time``.
        """
        self.env = env

        algorithm = args.alg
        # Algorithms whose name contains 'commnet' or 'g2anet' use the
        # communication-aware agent and worker classes.
        if 'commnet' in algorithm or 'g2anet' in algorithm:
            self.agents = CommAgents(args)
            worker_cls = CommRolloutWorker
        else:
            self.agents = Agents(args)
            worker_cls = RolloutWorker
        self.rolloutWorker = worker_cls(env, self.agents, args)

        # coma, central_v and reinforce are on-policy, so no replay buffer is
        # created for them (nor when learning is switched off).
        on_policy = any(
            tag in algorithm for tag in ('coma', 'central_v', 'reinforce'))
        if args.learn and not on_policy:
            buffer_cls = (PrioritizedReplayBuffer
                          if args.use_per else ReplayBuffer)
            self.buffer = buffer_cls(args)

        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory used to save the plots and pickles.
        self.save_path = '/'.join((self.args.result_dir, args.map, ''))
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        name_parts = (str(args.env_name), str(args.n_agents),
                      str(args.map_size), args.name_time)
        self.file_name = self.save_path + '_'.join(name_parts)
Esempio n. 2
0
    def __init__(self, env, args):
        """Create agents, learner, rollout worker, replay buffer and result dir.

        Args:
            env: Environment handed to the rollout worker.
            args: Run configuration. ``alg``, ``learn``, ``result_dir`` and
                ``map`` are read here, together with the buffer/training
                hyper-parameters that are encoded in the result directory
                name.
        """
        self.env = env
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        uses_communication = any(
            tag in args.alg for tag in ('commnet', 'g2anet'))
        if uses_communication:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            # The QMIX_PG learner is only created on this branch.
            self.qmix_pg_learner = QMIX_PG(self.agents, args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)

        # coma, central_v and reinforce are on-policy: no replay buffer.
        on_policy = any(
            tag in args.alg for tag in ('coma', 'central_v', 'reinforce'))
        if args.learn and not on_policy:
            self.actor_critic_buffer = ReplayBuffer(args, args.buffer_size)

        # Directory used to save plots and pickles; the hyper-parameters of
        # the run are encoded in one of its path components.
        tag_fields = (
            'clamp2-5',
            'rewardscale10',
            args.buffer_size,
            args.actor_buffer_size,
            args.critic_buffer_size,
            args.actor_train_steps,
            args.critic_train_steps,
            args.actor_update_delay,
            args.critic_lr,
            args.n_epoch,
            args.temp,
        )
        run_tag = '_'.join(str(field) for field in tag_fields)
        self.save_path = (self.args.result_dir + '/linear_mix/mcsac/'
                          + run_tag + '/' + args.map)

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Esempio n. 3
0
    def __init__(self, curriculum, args, target_env):
        """Initialise agents, rollout worker and bookkeeping for curriculum runs.

        Args:
            curriculum: Stored as ``self.curriculum``; not used further here.
            args: Run configuration. ``alg``, ``evaluate`` and ``save_path``
                are read here; the object is also forwarded to the agents and
                the rollout worker.
            target_env: Stored as ``self.target_env``; not used further here.
        """
        self.target_env = target_env
        self.curriculum = curriculum
        self.args = args

        algorithm = args.alg
        # The rollout worker is created without an environment (None).
        if 'commnet' in algorithm or 'g2anet' in algorithm:
            # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(None, self.agents, args)
        else:
            # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(None, self.agents, args)

        # coma, central_v and reinforce are on-policy. For every other
        # algorithm in training mode the buffer slot is initialised to None.
        on_policy = any(
            tag in algorithm for tag in ('coma', 'central_v', 'reinforce'))
        if not (args.evaluate or on_policy):
            self.buffer = None

        # Result histories.
        self.win_rates = []
        self.eval_episode_rewards = []
        self.train_rewards = []
        self.ratios = []
        self.historical_params = {}

        # Directory used to save plots and pickles.
        self.save_path = args.save_path
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.switch = True  # we will be switching to some task
        self.patience = 20
        self.writer: SummaryWriter = None
        self.eval_envs = None
        self.debug = False
Esempio n. 4
0
 def __init__(self, args, env):
     """Create the agent, HER sampler, replay buffer and rollout worker.

     Args:
         args: Run configuration; ``noise_eps``, ``epsilon``,
             ``replay_strategy`` and ``replay_k`` are read here.
         env: Environment; its ``compute_reward`` is given to the sampler.
     """
     self.args = args
     self.env = env
     self.noise = args.noise_eps
     self.epsilon = args.epsilon

     self.agent = Agent(args)
     sampler = HerSampler(args.replay_strategy, args.replay_k,
                          env.compute_reward)
     self.her_module = sampler
     # The buffer draws its samples through the HER sampler.
     self.buffer = Buffer(args, sampler.sample_her_transitions)
     self.worker = RolloutWorker(self.env, self.agent, args)
Esempio n. 5
0
class Runner:
    """Runs the collect / learn / evaluate loop and plots the success rate."""

    def __init__(self, args, env):
        """Create the agent, HER sampler, replay buffer and rollout worker.

        Args:
            args: Run configuration; ``noise_eps``, ``epsilon``,
                ``replay_strategy`` and ``replay_k`` are read here.
            env: Environment; its ``compute_reward`` is given to the sampler.
        """
        self.args = args
        self.env = env
        self.noise = args.noise_eps
        self.epsilon = args.epsilon

        self.agent = Agent(args)
        sampler = HerSampler(args.replay_strategy, args.replay_k,
                             env.compute_reward)
        self.her_module = sampler
        self.buffer = Buffer(args, sampler.sample_her_transitions)
        self.worker = RolloutWorker(self.env, self.agent, args)

    def run(self):
        """Train for ``args.n_epochs`` epochs and save the success-rate curve.

        Every epoch collects ``args.n_cycles`` episodes, performs
        ``args.n_batches`` learning steps and evaluates once. The curve is
        written to ``<args.save_dir>/<args.env_name>/plt.png``.
        """
        history = []
        for _epoch in tqdm(range(self.args.n_epochs)):
            for _cycle in range(self.args.n_cycles):
                rollout = self.worker.generate_episode(self.noise,
                                                       self.epsilon)
                # Turn the 2-D data of the episode into 3-D arrays.
                batch = convert_episode_to_batch_major(rollout)
                self.buffer.store_episode(batch)

                # The "next" arrays are the originals shifted by one step
                # along the second axis.
                next_obs = batch['o'][:, 1:]
                next_goal = batch['ag'][:, 1:]
                batch['o_next'] = next_obs
                batch['ag_next'] = next_goal
                samples = self.her_module.sample_her_transitions(
                    batch, self.args.episode_limit)

                # update the normalizer
                self.agent.update_normalizer(samples)

            for _ in range(self.args.n_batches):
                self.agent.learn(self.buffer.sample(self.args.batch_size))

            # Render the evaluation once the previous success rate was
            # above 0.5.
            if history and history[-1] > 0.5:
                eval_kwargs = {'render': True}
            else:
                eval_kwargs = {}
            history.append(self.worker.evaluate(**eval_kwargs))

        target = '{}/{}/plt.png'.format(self.args.save_dir,
                                        self.args.env_name)
        plt.figure()
        plt.plot(range(self.args.n_epochs), history)
        plt.xlabel('epoch')
        plt.ylabel('success_rate')
        plt.savefig(target, format='png')
Esempio n. 6
0
    def __init__(self, env, args):
        """Create agents, rollout worker, optional replay buffer and result dir.

        Args:
            env: Environment handed to the rollout worker.
            args: Run configuration; ``alg``, ``learn``, ``result_dir`` and
                ``map`` are read here.
        """
        self.env = env
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)

        # coma, central_v and reinforce are on-policy: no replay buffer.
        on_policy = any(
            tag in args.alg for tag in ('coma', 'central_v', 'reinforce'))
        if args.learn and not on_policy:
            self.buffer = ReplayBuffer(args)

        # Directory used to save plots and pickles.
        self.save_path = '/'.join((self.args.result_dir, args.alg, args.map))
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Esempio n. 7
0
    def __init__(self, env, args):
        """Create agents, replay buffer and training/evaluation workers.

        Args:
            env: Training environment handed to the rollout worker.
            args: Run configuration; ``epsilon`` and the StarCraft2Env
                settings (``map``, ``step_mul``, ``difficulty``,
                ``game_version``, ``seed``, ``replay_dir``) are read here.
        """
        self.env = env
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.buffer = ReplayBuffer(args)
        self.args = args
        self.epsilon = args.epsilon

        # Second environment used to judge the algorithm with sparse rewards:
        # a win is 1, a defeat is -1 and every other ordinary step is 0.
        sparse_env_options = dict(
            map_name=args.map,
            step_mul=args.step_mul,
            difficulty=args.difficulty,
            game_version=args.game_version,
            seed=args.seed,
            replay_dir=args.replay_dir,
            reward_sparse=True,
            reward_scale=False,
        )
        self.env_evaluate = StarCraft2Env(**sparse_env_options)
        # The evaluation worker shares the agents with the training worker.
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                            args)
Esempio n. 8
0
    def __init__(self, env, args):
        """Create agents, learner, rollout worker, replay buffers and result dir.

        Args:
            env: Environment handed to the rollout worker.
            args: Run configuration. ``alg``, ``learn``, ``result_dir`` and
                ``map`` are read here, together with the hyper-parameters
                that are encoded in the result directory name.
        """
        self.env = env
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        self.agents = Agents(args)
        self.qmix_pg_learner = QMIX_PG(self.agents, args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)

        # coma, central_v and reinforce are on-policy: no replay buffers.
        on_policy = any(
            tag in args.alg for tag in ('coma', 'central_v', 'reinforce'))
        if args.learn and not on_policy:
            # Critic and actor get separate buffers with their own sizes.
            self.critic_buffer = ReplayBuffer(args, args.critic_buffer_size)
            self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)

        # The hyper-parameters of the run are encoded in the directory name.
        tag_fields = (
            'clamp2-5',
            args.loss_coeff_entropy,
            args.buffer_size,
            args.actor_buffer_size,
            args.critic_buffer_size,
            args.actor_train_steps,
            args.critic_train_steps,
            args.actor_update_delay,
            args.critic_lr,
        )
        run_tag = '_'.join(str(field) for field in tag_fields)
        self.save_path = (self.args.result_dir
                          + '/linear_mix/qmix_ac_total_cf/'
                          + run_tag + '/' + args.map)

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Esempio n. 9
0
    def __init__(self, env, args):
        """Create agents, training/evaluation workers, replay buffer and result dir.

        Args:
            env: Training environment handed to the rollout worker.
            args: Run configuration; ``alg``, ``result_dir`` and ``map`` are
                read here.
        """
        self.env = env

        # Environment for the evaluation worker. According to the original
        # note it is meant for sparse-reward evaluation (win = 1,
        # defeat = -1, every other ordinary step = 0). A StarCraft2Env built
        # with reward_sparse=True / reward_scale=False used to be created
        # here; that code is disabled and MeetEnv is used instead.
        self.env_evaluate = MeetEnv()

        algorithm = args.alg
        if 'commnet' in algorithm or 'g2anet' in algorithm:
            # communication agent
            self.agents = CommAgents(args)
            worker_cls = CommRolloutWorker
        else:
            # no communication agent
            self.agents = Agents(args)
            worker_cls = RolloutWorker
        self.rolloutWorker = worker_cls(env, self.agents, args)
        self.evaluateWorker = worker_cls(self.env_evaluate, self.agents, args)

        # coma, central_v and reinforce are on-policy: no replay buffer.
        on_policy = any(
            tag in algorithm for tag in ('coma', 'central_v', 'reinforce'))
        if not on_policy:
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Directory used to save plots and pickles.
        self.save_path = '/'.join((self.args.result_dir, args.alg, args.map))
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Esempio n. 10
0
    def __init__(self, env, args):
        """Create agents, training/evaluation workers, replay buffer and result dir.

        Args:
            env: Training environment handed to the rollout worker.
            args: Run configuration; ``alg``, ``result_dir``, ``map`` and the
                StarCraft2Env settings (``step_mul``, ``difficulty``,
                ``game_version``, ``seed``, ``replay_dir``) are read here.
        """
        self.env = env

        # Second environment used to judge the algorithm with sparse rewards:
        # a win is 1, a defeat is -1 and every other ordinary step is 0.
        sparse_env_options = dict(
            map_name=args.map,
            step_mul=args.step_mul,
            difficulty=args.difficulty,
            game_version=args.game_version,
            seed=args.seed,
            replay_dir=args.replay_dir,
            reward_sparse=True,
            reward_scale=False,
        )
        self.env_evaluate = StarCraft2Env(**sparse_env_options)

        # 'commnet_coma' uses the CommNet agent/worker pair; every other
        # algorithm uses the plain one.
        if args.alg == 'commnet_coma':
            self.agents = CommNetAgents(args)
            worker_cls = CommNetRolloutWorker
        else:
            self.agents = Agents(args)
            worker_cls = RolloutWorker
        self.rolloutWorker = worker_cls(env, self.agents, args)
        self.evaluateWorker = worker_cls(self.env_evaluate, self.agents, args)

        # The two coma variants do not get a replay buffer.
        if args.alg not in ('coma', 'commnet_coma'):
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Directory used to save plots and pickles.
        self.save_path = '/'.join((self.args.result_dir, args.alg, args.map))
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Esempio n. 11
0
    def __init__(self, env, args):
        """Create agents, rollout worker, optional replay buffer and result dir.

        Args:
            env: Environment handed to the rollout worker.
            args: Run configuration; ``alg``, ``evaluate``, ``result_dir`` and
                ``map`` are read here.
        """
        self.env = env

        algorithm = args.alg
        if 'commnet' in algorithm or 'g2anet' in algorithm:
            # communication agent
            self.agents = CommAgents(args)
            worker_cls = CommRolloutWorker
        else:
            # no communication agent
            self.agents = Agents(args)
            worker_cls = RolloutWorker
        self.rolloutWorker = worker_cls(env, self.agents, args)

        # coma, central_v and reinforce are on-policy; they get no replay
        # buffer, and neither does an evaluation-only run.
        on_policy = any(
            tag in algorithm for tag in ('coma', 'central_v', 'reinforce'))
        if not (args.evaluate or on_policy):
            self.buffer = ReplayBuffer(args)

        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Directory used to save plots and pickles.
        self.save_path = '/'.join((self.args.result_dir, args.alg, args.map))
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Esempio n. 12
0
class Runner:
    """Train the agents for a fixed number of epochs with periodic evaluation."""

    # Substrings of ``args.alg`` that mark an on-policy algorithm.
    _ON_POLICY_TAGS = ('coma', 'central_v', 'reinforce')

    def __init__(self, env, args):
        """Create agents, training/evaluation workers, replay buffer and result dir.

        Args:
            env: Training environment handed to the rollout worker.
            args: Run configuration; ``alg``, ``result_dir`` and ``map`` are
                read here, the object is also forwarded to the collaborators.
        """
        self.env = env
        self.args = args

        # Environment used by evaluate_sparse(). According to the original
        # note it is meant for sparse-reward evaluation (win = 1,
        # defeat = -1, every other ordinary step = 0). A StarCraft2Env built
        # with reward_sparse=True used to be created here; that code is
        # disabled and MeetEnv is used instead.
        self.env_evaluate = MeetEnv()

        if 'commnet' in args.alg or 'g2anet' in args.alg:
            # communication agent
            self.agents = CommAgents(args)
            worker_cls = CommRolloutWorker
        else:
            # no communication agent
            self.agents = Agents(args)
            worker_cls = RolloutWorker
        self.rolloutWorker = worker_cls(env, self.agents, args)
        self.evaluateWorker = worker_cls(self.env_evaluate, self.agents, args)

        # On-policy algorithms train on fresh episodes and need no buffer.
        if not self._is_on_policy():
            self.buffer = ReplayBuffer(args)

        # Directory used to save plots and arrays. exist_ok avoids the
        # check-then-create race when several runs start at once.
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        os.makedirs(self.save_path, exist_ok=True)

    def _is_on_policy(self):
        """Return True when ``args.alg`` names one of the on-policy algorithms."""
        return any(tag in self.args.alg for tag in self._ON_POLICY_TAGS)

    def _collect_episode_batch(self):
        """Generate ``args.n_episodes`` episodes and concatenate them per key.

        Returns:
            A dict with the keys of the first episode; every value is the
            concatenation along axis 0 of that key over all episodes.
        """
        episodes = []
        for episode_idx in range(self.args.n_episodes):
            episode, _ = self.rolloutWorker.generate_episode(episode_idx)
            episodes.append(episode)
        # According to the original note every entry of an episode is a 4-D
        # array of shape (1, episode_len, n_agents, feature_dim).
        return {
            key: np.concatenate([episode[key] for episode in episodes], axis=0)
            for key in episodes[0]
        }

    def _save_results(self, figure, num, win_rates, episode_rewards):
        """Redraw both curves on ``figure`` and save the plot and raw arrays.

        The figure is cleared first so that repeated calls do not stack new
        lines on top of the ones drawn by earlier calls.
        """
        figure.clf()
        x_label = 'epoch*{}'.format(self.args.evaluate_cycle)

        top = figure.add_subplot(2, 1, 1)
        top.plot(range(len(win_rates)), win_rates)
        top.set_xlabel(x_label)
        top.set_ylabel('win_rate')

        bottom = figure.add_subplot(2, 1, 2)
        bottom.plot(range(len(episode_rewards)), episode_rewards)
        bottom.set_xlabel(x_label)
        bottom.set_ylabel('episode_rewards')

        figure.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)

    def run(self, num):
        """Train for ``args.n_epoch`` epochs, evaluating every ``evaluate_cycle``.

        Args:
            num: Index of this run; used in the progress output and in the
                names of the saved plot and arrays.
        """
        # Dedicated figure, so redrawing does not depend on pyplot's notion
        # of the "current" figure or axes.
        figure = plt.figure()
        win_rates = []
        episode_rewards = []
        train_steps = 0
        on_policy = self._is_on_policy()

        for epoch in range(self.args.n_epoch):
            print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:
                win_rate, episode_reward = self.evaluate()
                win_rates.append(win_rate)
                episode_rewards.append(episode_reward)
                self._save_results(figure, num, win_rates, episode_rewards)

            episode_batch = self._collect_episode_batch()
            if on_policy:
                # On-policy: train directly on the episodes just collected.
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for _ in range(self.args.train_steps):
                    # Never request more samples than the buffer holds.
                    mini_batch = self.buffer.sample(
                        min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1

        self._save_results(figure, num, win_rates, episode_rewards)

    def evaluate(self):
        """Run ``args.evaluate_epoch`` evaluation episodes with the rollout worker.

        An episode counts as a win when its reward exceeds ``args.threshold``.

        Returns:
            Tuple ``(win_rate, mean_episode_reward)``.
        """
        win_number = 0
        episode_rewards = 0
        for _ in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(evaluate=True)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return (win_number / self.args.evaluate_epoch,
                episode_rewards / self.args.evaluate_epoch)

    def evaluate_sparse(self):
        """Evaluate with the evaluation worker; a positive reward is a win.

        The evaluation environment is closed afterwards.

        Returns:
            Fraction of the ``args.evaluate_epoch`` episodes that were won.
        """
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(evaluate=True)
            won = episode_reward > 0
            print('Epoch {}: {}'.format(epoch, 'win' if won else 'defeat'))
            if won:
                win_number += 1
        self.env_evaluate.close()
        return win_number / self.args.evaluate_epoch
Esempio n. 13
0
class Runner:
    """Train the agents for a fixed number of epochs with periodic evaluation."""

    # Substrings of ``args.alg`` that mark an on-policy algorithm.
    _ON_POLICY_TAGS = ('coma', 'central_v', 'reinforce')

    def __init__(self, env, args):
        """Create agents, rollout worker, optional replay buffer and result dir.

        Args:
            env: Environment handed to the rollout worker.
            args: Run configuration; ``alg``, ``learn``, ``result_dir`` and
                ``map`` are read here.
        """
        self.env = env
        self.args = args

        if 'commnet' in args.alg or 'g2anet' in args.alg:
            # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:
            # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)

        # On-policy algorithms (and runs with learning disabled) get no
        # replay buffer.
        if args.learn and not self._is_on_policy():
            self.buffer = ReplayBuffer(args)

        self.win_rates = []
        self.episode_rewards = []

        # Directory used to save plots and arrays. exist_ok avoids the
        # check-then-create race when several runs start at once.
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        os.makedirs(self.save_path, exist_ok=True)

        # Figure reused by plt(); created lazily on the first call.
        self.fig = None

    def _is_on_policy(self):
        """Return True when ``args.alg`` names one of the on-policy algorithms."""
        return any(tag in self.args.alg for tag in self._ON_POLICY_TAGS)

    def run(self, num):
        """Train for ``args.n_epoch`` epochs, evaluating every ``evaluate_cycle``.

        Args:
            num: Index of this run; used in the names of the saved files.
        """
        global EPOCH
        train_steps = 0
        self.env.reset_callback = reset_callback  # TODO
        on_policy = self._is_on_policy()

        for epoch in range(self.args.n_epoch):
            # Publish the current epoch through the module-level EPOCH.
            EPOCH = epoch
            if epoch % self.args.evaluate_cycle == 0:
                _win_rate, episode_reward = self.evaluate()
                # NOTE: the exploration epsilon, not the win rate, is what
                # gets recorded in ``win_rates`` (and plotted as 'epsilon').
                self.win_rates.append(self.rolloutWorker.epsilon)
                self.episode_rewards.append(episode_reward)
                self.plt(num)

            # Collect args.n_episodes episodes.
            episodes = []
            for episode_idx in range(self.args.n_episodes):
                episode, _, _ = self.rolloutWorker.generate_episode(
                    episode_idx)
                episodes.append(episode)
            # According to the original note every entry of an episode is a
            # 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate all episodes along the first axis.
            episode_batch = {
                key: np.concatenate(
                    [episode[key] for episode in episodes], axis=0)
                for key in episodes[0]
            }

            if on_policy:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for _ in range(self.args.train_steps):
                    # Never request more samples than the buffer holds.
                    mini_batch = self.buffer.sample(
                        min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        self.plt(num)

    def evaluate(self):
        """Run ``args.evaluate_epoch`` evaluation episodes.

        Returns:
            Tuple ``(win_rate, mean_episode_reward)``; an episode is a win
            when the worker reports a truthy win tag.
        """
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return (win_number / self.args.evaluate_epoch,
                episode_rewards / self.args.evaluate_epoch)

    def plt(self, num):
        """Redraw both curves on the runner's figure and save plot and arrays.

        All drawing goes through the figure object itself. The original first
        called ``plt.axis``/``plt.cla`` on a throwaway full-figure axes whose
        limits were reset immediately, and relied on this figure being
        pyplot's current one.

        Args:
            num: Index of the run; used in the names of the saved files.
        """
        if self.fig is None:
            self.fig = plt.figure()
        figure = self.fig
        figure.clf()
        x_label = 'epoch*{}'.format(self.args.evaluate_cycle)

        top = figure.add_subplot(2, 1, 1)
        top.plot(range(len(self.win_rates)), self.win_rates)
        top.set_xlabel(x_label)
        top.set_ylabel('epsilon')

        bottom = figure.add_subplot(2, 1, 2)
        bottom.plot(range(len(self.episode_rewards)), self.episode_rewards)
        bottom.set_xlabel(x_label)
        bottom.set_ylabel('episode_rewards')
        figure.tight_layout()

        figure.savefig(self.save_path + '/plt_{}.png'.format(num),
                       format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num),
                self.episode_rewards)
        # Drop the drawn artists; the figure itself is kept for the next call.
        figure.clf()
Esempio n. 14
0
class Runner:
    """Train the agents epoch by epoch and log/pickle the episode rewards."""

    # Substrings of ``args.alg`` that mark an on-policy algorithm.
    _ON_POLICY_TAGS = ('coma', 'central_v', 'reinforce')

    def __init__(self, env, args):
        """Create agents, rollout worker, optional replay buffer and output paths.

        Args:
            env: Environment handed to the rollout worker.
            args: Run configuration. Attributes read here: ``alg``, ``learn``,
                ``use_per``, ``result_dir``, ``map``, ``env_name``,
                ``n_agents``, ``map_size`` and ``name_time``.
        """
        self.env = env
        self.args = args

        if 'commnet' in args.alg or 'g2anet' in args.alg:
            # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:
            # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)

        # On-policy algorithms (and runs with learning disabled) get no
        # replay buffer; otherwise ``use_per`` selects the prioritized one.
        if args.learn and not self._is_on_policy():
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)

        self.win_rates = []
        self.episode_rewards = []

        # Directory used to save plots and pickles. exist_ok avoids the
        # check-then-create race when several runs start at once.
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        os.makedirs(self.save_path, exist_ok=True)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(
            args.n_agents) + '_' + str(args.map_size) + '_' + args.name_time

    def _is_on_policy(self):
        """Return True when ``args.alg`` names one of the on-policy algorithms."""
        return any(tag in self.args.alg for tag in self._ON_POLICY_TAGS)

    def _episode_generator(self):
        """Pick the rollout-worker method selected by ``use_ja`` / ``use_v1``."""
        if not self.args.use_ja:
            return self.rolloutWorker.generate_episode
        # NOTE: ``use_v1`` selects the ``_ja_v2`` generator, as in the
        # original code.
        if self.args.use_v1:
            return self.rolloutWorker.generate_episode_ja_v2
        return self.rolloutWorker.generate_episode_ja_v3

    def run(self, num):
        """Train for ``args.n_epoch`` epochs, logging and pickling the rewards.

        During every epoch that is a multiple of ``args.evaluate_cycle`` the
        mean rewards since the previous report are printed and all episode
        rewards collected so far are pickled to ``self.file_name``.

        Args:
            num: Index of this run; not used by the current implementation.
        """
        train_steps = 0
        episode_rewards = 0
        fixed_rewards = 0
        episodes_since_report = 0
        st = time.time()
        plot_rewards = []
        on_policy = self._is_on_policy()

        for epoch in range(self.args.n_epoch):
            # Collect args.n_episodes episodes.
            episodes = []
            for episode_idx in range(self.args.n_episodes):
                generate = self._episode_generator()
                episode, episode_reward, rate, fixed_reward = generate(
                    episode_idx)
                episodes.append(episode)
                episode_rewards += episode_reward
                fixed_rewards += fixed_reward
                episodes_since_report += 1
                plot_rewards.append(episode_reward)

                if epoch % self.args.evaluate_cycle == 0:
                    now = time.time()
                    elapsed, st = now - st, now
                    # Average over the episodes actually accumulated since
                    # the last report. Dividing by evaluate_cycle, as before,
                    # is only right when exactly that many episodes were
                    # summed (not at epoch 0, nor with n_episodes != 1).
                    epr = round(episode_rewards / episodes_since_report, 2)
                    fr = round(fixed_rewards / episodes_since_report, 2)
                    print('train epoch {}, reward {}, time {}, rate {}'.format(
                        epoch, [epr, fr], elapsed, rate))
                    episode_rewards = 0
                    fixed_rewards = 0
                    episodes_since_report = 0
                    with open(self.file_name, 'wb') as fp:
                        pickle.dump(plot_rewards, fp)

            # According to the original note every entry of an episode is a
            # 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate all episodes along the first axis.
            episode_batch = {
                key: np.concatenate(
                    [episode[key] for episode in episodes], axis=0)
                for key in episodes[0]
            }

            if on_policy:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            elif not self.args.load_model:
                # Off-policy training is skipped when a model was loaded.
                self.buffer.store_episode(episode_batch)
                for _ in range(self.args.train_steps):
                    # Never request more samples than the buffer holds.
                    batch_size = min(self.buffer.current_size,
                                     self.args.batch_size)
                    if self.args.use_per:
                        # The prioritized buffer also returns the sampled
                        # indices, so the value returned by train() can be
                        # fed back as new priorities.
                        mini_batch, idxs = self.buffer.sample(batch_size)
                        dq = self.agents.train(mini_batch, train_steps)
                        self.buffer.update_priorities(idxs, dq)
                    else:
                        mini_batch = self.buffer.sample(batch_size)
                        self.agents.train(mini_batch, train_steps)
                    train_steps += 1

    def evaluate(self):
        """Run ``args.evaluate_epoch`` evaluation episodes.

        NOTE(review): run() unpacks four values from ``generate_episode``
        while this method unpacks three; confirm what the worker returns in
        evaluate mode before relying on this method.

        Returns:
            Tuple ``(win_rate, mean_episode_reward)``; an episode is a win
            when the worker reports a truthy win tag.
        """
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return (win_number / self.args.evaluate_epoch,
                episode_rewards / self.args.evaluate_epoch)
Esempio n. 15
0
class Runner:
    """Train the agents for a budget of environment steps with periodic evaluation."""

    # Substrings of ``args.alg`` that mark an on-policy algorithm.
    _ON_POLICY_TAGS = ('coma', 'central_v', 'reinforce')

    def __init__(self, env, args):
        """Create agents, rollout worker, optional replay buffer and result dir.

        Args:
            env: Environment handed to the rollout worker.
            args: Run configuration; ``alg``, ``evaluate``, ``result_dir`` and
                ``map`` are read here.
        """
        self.env = env
        self.args = args

        if 'commnet' in args.alg or 'g2anet' in args.alg:
            # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:
            # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)

        # On-policy algorithms and evaluation-only runs get no replay buffer.
        if not args.evaluate and not self._is_on_policy():
            self.buffer = ReplayBuffer(args)

        self.win_rates = []
        self.episode_rewards = []

        # Directory used to save plots and arrays. exist_ok avoids the
        # check-then-create race when several runs start at once.
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        os.makedirs(self.save_path, exist_ok=True)

    def _is_on_policy(self):
        """Return True when ``args.alg`` names one of the on-policy algorithms."""
        return any(tag in self.args.alg for tag in self._ON_POLICY_TAGS)

    def _record_evaluation(self, num):
        """Evaluate once, append the results to the histories and save them.

        Returns:
            The win rate of this evaluation.
        """
        win_rate, episode_reward = self.evaluate()
        self.win_rates.append(win_rate)
        self.episode_rewards.append(episode_reward)
        self.plt(num)
        return win_rate

    def run(self, num):
        """Train until ``args.n_steps`` environment steps have been collected.

        An evaluation is triggered every time the step counter enters a new
        ``args.evaluate_cycle`` interval, plus once more after training.

        Args:
            num: Index of this run; used in the progress output and in the
                names of the saved files.
        """
        time_steps, train_steps, evaluate_steps = 0, 0, -1
        on_policy = self._is_on_policy()

        while time_steps < self.args.n_steps:
            print('Run {}, time_steps {}'.format(num, time_steps))
            if time_steps // self.args.evaluate_cycle > evaluate_steps:
                self._record_evaluation(num)
                evaluate_steps += 1

            # Collect args.n_episodes episodes and count their steps.
            episodes = []
            for episode_idx in range(self.args.n_episodes):
                episode, _, _, steps = self.rolloutWorker.generate_episode(
                    episode_idx)
                episodes.append(episode)
                time_steps += steps
            # According to the original note every entry of an episode is a
            # 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate all episodes along the first axis.
            episode_batch = {
                key: np.concatenate(
                    [episode[key] for episode in episodes], axis=0)
                for key in episodes[0]
            }

            if on_policy:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for _ in range(self.args.train_steps):
                    # Never request more samples than the buffer holds.
                    mini_batch = self.buffer.sample(
                        min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1

        # Final evaluation after the step budget is used up.
        win_rate = self._record_evaluation(num)
        print('win_rate is ', win_rate)

    def evaluate(self):
        """Run ``args.evaluate_epoch`` evaluation episodes.

        Returns:
            Tuple ``(win_rate, mean_episode_reward)``; an episode is a win
            when the worker reports a truthy win tag.
        """
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag, _ = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return (win_number / self.args.evaluate_epoch,
                episode_rewards / self.args.evaluate_epoch)

    def plt(self, num):
        """Plot both histories on a fresh figure and save plot and arrays.

        The original began with ``plt.ylim``/``plt.cla`` on an implicit
        full-figure axes; the limits were reset immediately and the extra
        axes could remain behind the two subplots, so it is no longer
        created.

        Args:
            num: Index of the run; used in the names of the saved files.
        """
        figure = plt.figure()
        x_label = 'step*{}'.format(self.args.evaluate_cycle)

        top = figure.add_subplot(2, 1, 1)
        top.plot(range(len(self.win_rates)), self.win_rates)
        top.set_xlabel(x_label)
        top.set_ylabel('win_rates')

        bottom = figure.add_subplot(2, 1, 2)
        bottom.plot(range(len(self.episode_rewards)), self.episode_rewards)
        bottom.set_xlabel(x_label)
        bottom.set_ylabel('episode_rewards')

        figure.savefig(self.save_path + '/plt_{}.png'.format(num),
                       format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num),
                self.episode_rewards)
        # Close exactly the figure created above.
        plt.close(figure)
Esempio n. 16
0
class Runner:
    """Drive training of a ``QMIX_PG`` learner from rollout-worker episodes.

    Evaluation results are accumulated in ``win_rates`` and
    ``episode_rewards`` and written, together with a plot, under
    ``save_path``.
    """

    # Substrings of ``args.alg`` that select the on-policy training branch.
    _ON_POLICY_ALGS = ('coma', 'central_v', 'reinforce')

    def __init__(self, env, args):
        """Create agents, learner, rollout worker, buffers and output folder.

        Args:
            env: Environment handed to the rollout worker.
            args: Configuration namespace; attributes read here include
                ``alg``, ``learn``, the buffer sizes, ``result_dir`` and
                ``map``.
        """
        self.env = env
        self.args = args

        self.agents = Agents(args)
        self.qmix_pg_learner = QMIX_PG(self.agents, args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        # Replay buffers are only created when learning with an algorithm
        # that is not one of the three on-policy ones.
        if args.learn and not self._is_on_policy():
            self.critic_buffer = ReplayBuffer(args, args.critic_buffer_size)
            self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
        self.win_rates = []
        self.episode_rewards = []

        # The result folder name encodes the hyper-parameters of this run.
        run_tag = (
            f'clamp2-5_{args.loss_coeff_entropy}_'
            f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_'
            f'{args.actor_train_steps}_{args.critic_train_steps}_'
            f'{args.actor_update_delay}_{args.critic_lr}'
        )
        self.save_path = (self.args.result_dir + '/linear_mix/' +
                          'qmix_ac_total_cf' + '/' + run_tag + '/' + args.map)

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.save_path, exist_ok=True)

    def _is_on_policy(self):
        """Return True when ``args.alg`` contains coma, central_v or reinforce."""
        return any(name in self.args.alg for name in self._ON_POLICY_ALGS)

    @staticmethod
    def _merge_episodes(episodes):
        """Concatenate the per-key arrays of all episodes along axis 0.

        A single episode is returned as is; otherwise a new dict is built
        with one ``np.concatenate`` call per key.
        """
        first = episodes[0]
        if len(episodes) == 1:
            return first
        return {
            key: np.concatenate([episode[key] for episode in episodes], axis=0)
            for key in first
        }

    def run(self, num):
        """Train for ``args.n_epoch`` epochs, evaluating periodically.

        Args:
            num: Run identifier, used in the progress output and in the names
                of the files written by :meth:`plt`.
        """
        train_steps = 0
        epsilon = self.args.epsilon  # initial epsilon
        on_policy = self._is_on_policy()
        for epoch in range(self.args.n_epoch):
            print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:
                win_rate, episode_reward = self.evaluate()
                self.win_rates.append(win_rate)
                self.episode_rewards.append(episode_reward)
                self.plt(num)

            if (self.args.epsilon_anneal_scale == 'epoch'
                    and epsilon > self.args.min_epsilon):
                epsilon -= self.args.anneal_epsilon

            # Collect args.n_episodes episodes with the current epsilon.
            episodes = []
            for episode_idx in range(self.args.n_episodes):
                episode, _, _ = self.rolloutWorker.generate_episode(
                    episode_idx, evaluate=False, epsilon=epsilon)
                episodes.append(episode)
            # Join all collected episodes into one batch along axis 0.
            episode_batch = self._merge_episodes(episodes)

            if on_policy:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
                continue

            self.critic_buffer.store_episode(episode_batch)
            self.actor_buffer.store_episode(episode_batch)
            for _ in range(self.args.critic_train_steps):
                mini_batch = self.critic_buffer.sample(
                    min(self.critic_buffer.current_size,
                        self.args.critic_batch_size))
                self.qmix_pg_learner.train_critic(
                    mini_batch, self.args.episode_limit, train_steps)
                train_steps += 1
            # The actor is only updated every args.actor_update_delay epochs.
            if epoch % self.args.actor_update_delay == 0:
                for _ in range(self.args.actor_train_steps):
                    mini_batch = self.actor_buffer.sample(
                        min(self.actor_buffer.current_size,
                            self.args.actor_batch_size))
                    self.qmix_pg_learner.train_actor(mini_batch,
                                                     self.args.episode_limit)
        self.plt(num)

    def evaluate(self):
        """Play ``args.evaluate_epoch`` episodes with ``epsilon=0``.

        Returns:
            tuple: ``(win_rate, mean_episode_reward)``, both divided by
            ``args.evaluate_epoch``.
        """
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(
                epoch, evaluate=True, epsilon=0)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return (win_number / self.args.evaluate_epoch,
                episode_rewards / self.args.evaluate_epoch)

    def plt(self, num):
        """Plot win rates and episode rewards and save plot and raw arrays.

        Args:
            num: Run identifier embedded in the names of the saved files.
        """
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.ylabel('win_rate')

        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel('episodes*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')

        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num),
                self.episode_rewards)
        # This method is called once per evaluation; without closing, every
        # call would leave another figure open.
        plt.close()
Esempio n. 17
0
class Runner:
    """Training driver using a replay buffer and periodic plotting.

    A second ``StarCraft2Env`` with ``reward_sparse=True`` is created for
    :meth:`evaluate_sparse`.
    """

    # Amount subtracted from epsilon each epoch while it is above MIN_EPSILON.
    EPSILON_DECAY = 0.0001125
    MIN_EPSILON = 0.05
    # Training only happens once the buffer's current_size exceeds this value.
    MIN_BUFFER_SIZE = 100
    # Plot and arrays are refreshed on epochs divisible by this value.
    PLOT_INTERVAL = 100

    def __init__(self, env, args):
        """Create agents, rollout workers, replay buffer and evaluation env.

        Args:
            env: Training environment handed to the rollout worker.
            args: Configuration namespace (``epsilon``, ``map``, ``step_mul``,
                ``difficulty``, ``game_version``, ``seed``, ``replay_dir``
                are read here).
        """
        self.env = env
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.buffer = ReplayBuffer(args)
        self.args = args
        self.epsilon = args.epsilon

        # Used to judge the algorithm on a sparse-reward environment:
        # a win is 1, a defeat is -1 and every other step is 0.
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                            args)

    @staticmethod
    def _merge_episodes(episodes):
        """Concatenate the per-key arrays of all episodes along axis 0.

        A single episode is returned as is; otherwise a new dict is built
        with one ``np.concatenate`` call per key.
        """
        first = episodes[0]
        if len(episodes) == 1:
            return first
        return {
            key: np.concatenate([episode[key] for episode in episodes], axis=0)
            for key in first
        }

    def _save_plots(self, win_rates, episode_rewards):
        """Redraw both curves on the current figure and save plot and arrays.

        Files are written to ``args.result_dir`` as ``plt.png``,
        ``win_rates`` and ``episode_rewards``.
        """
        # Clear the whole figure: plt.cla() would only reset the axes that is
        # current, so the other subplot would keep lines from earlier calls.
        plt.clf()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch')
        plt.ylabel('win_rate')

        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch')
        plt.ylabel('episode_rewards')

        plt.savefig(self.args.result_dir + '/plt.png', format='png')
        np.save(self.args.result_dir + '/win_rates', win_rates)
        np.save(self.args.result_dir + '/episode_rewards', episode_rewards)

    def run(self):
        """Train for ``args.n_epoch`` epochs and save the learning curves."""
        plt.figure()
        win_rates = []
        episode_rewards = []
        train_steps = 0
        for epoch in tqdm(range(self.args.n_epoch)):
            if self.epsilon > self.MIN_EPSILON:
                self.epsilon -= self.EPSILON_DECAY
            # Collect args.n_episodes episodes with the current epsilon.
            episodes = []
            for _episode_idx in range(self.args.n_episodes):
                episode, _ = self.rolloutWorker.generate_episode(self.epsilon)
                episodes.append(episode)
            # Join all collected episodes into one batch along axis 0.
            self.buffer.store_episode(self._merge_episodes(episodes))
            if self.buffer.current_size <= self.MIN_BUFFER_SIZE:
                continue

            for _train_step in range(self.args.train_steps):
                mini_batch = self.buffer.sample(self.args.batch_size)
                self.agents.train(mini_batch, train_steps)
                train_steps += 1
            win_rate, episode_reward = self.evaluate()
            win_rates.append(win_rate)
            episode_rewards.append(episode_reward)
            # Visualisation
            if epoch % self.PLOT_INTERVAL == 0:
                self._save_plots(win_rates, episode_rewards)

        self._save_plots(win_rates, episode_rewards)
        # Release the figure opened at the top of this method.
        plt.close()

    def evaluate(self):
        """Play ``args.evaluate_epoch`` episodes with epsilon 0.

        An episode counts as a win when its reward exceeds
        ``args.threshold``.

        Returns:
            tuple: ``(win_rate, mean_episode_reward)``.
        """
        win_number = 0
        episode_rewards = 0
        for _epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(0)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return (win_number / self.args.evaluate_epoch,
                episode_rewards / self.args.evaluate_epoch)

    def evaluate_sparse(self):
        """Play ``args.evaluate_epoch`` episodes with ``evaluateWorker``.

        An episode counts as a win when its reward is positive; the result of
        every episode is printed.

        Returns:
            float: Fraction of won episodes.
        """
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(0)
            won = episode_reward > 0
            print('Epoch {}: {}'.format(epoch, 'win' if won else 'defeat'))
            if won:
                win_number += 1
        return win_number / self.args.evaluate_epoch
Esempio n. 18
0
class Runner:
    """Curriculum-driven training loop.

    Training environments are pulled from ``curriculum`` until it raises
    ``IndexError``; evaluation runs on ``target_env`` (and on every entry of
    ``eval_envs``).  ``writer`` and ``eval_envs`` start as ``None`` and are
    expected to be assigned by the caller after construction.
    """

    # Substrings of ``args.alg`` that select the on-policy training branch.
    _ON_POLICY_ALGS = ('coma', 'central_v', 'reinforce')

    def __init__(self, curriculum, args, target_env):
        """Create agents and rollout worker and prepare the output folder.

        Args:
            curriculum: Object providing ``get()`` and ``update(...)``.
            args: Configuration namespace (``alg``, ``evaluate`` and
                ``save_path`` are read here).
            target_env: Environment used for the main evaluation.
        """
        self.target_env = target_env
        self.curriculum = curriculum
        self.args = args

        if 'commnet' in args.alg or 'g2anet' in args.alg:
            # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(None, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(None, self.agents, args)
        if not args.evaluate and not self._is_on_policy():
            # run() takes the replay buffer from the current curriculum env.
            self.buffer = None
        self.win_rates = []
        self.eval_episode_rewards = []

        # Folder used for saving the plot and the result arrays.
        self.save_path = args.save_path
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.save_path, exist_ok=True)

        self.train_rewards = []
        self.ratios = []
        self.historical_params = {}
        self.switch = True  # we will be switching to some task
        self.patience = 20
        self.writer: SummaryWriter = None
        self.eval_envs = None
        self.debug = False

    def _is_on_policy(self):
        """Return True when ``args.alg`` contains coma, central_v or reinforce."""
        return any(name in self.args.alg for name in self._ON_POLICY_ALGS)

    def _log_scalar(self, tag, value, step):
        """Forward a scalar to ``self.writer``; do nothing if no writer is set."""
        if self.writer is not None:
            self.writer.add_scalar(tag, value, global_step=step)

    def _log_param_norms(self, prefix, module, map_name, time_steps):
        """Log parameter and gradient norms of ``module`` under ``prefix``.

        Every value is logged twice: under a global tag and under a tag
        suffixed with ``map_name``.
        """
        for name, param in module.named_parameters():
            self._log_scalar(f'{prefix}/{name}/norm', param.norm(),
                             time_steps)
            self._log_scalar(f'{prefix}/grad/{name}/norm', param.grad.norm(),
                             time_steps)
            self._log_scalar(f'{prefix}/{name}/norm/{map_name}', param.norm(),
                             time_steps)
            self._log_scalar(f'{prefix}/grad/{name}/norm/{map_name}',
                             param.grad.norm(), time_steps)

    @staticmethod
    def _merge_episodes(episodes):
        """Concatenate the per-key arrays of all episodes along axis 0.

        A single episode is returned as is; otherwise a new dict is built
        with one ``np.concatenate`` call per key.
        """
        first = episodes[0]
        if len(episodes) == 1:
            return first
        return {
            key: np.concatenate([episode[key] for episode in episodes], axis=0)
            for key in first
        }

    def run(self):
        """Train until the curriculum is exhausted.

        When ``curriculum.get()`` raises ``IndexError`` the model is saved
        with the current global train-step counter, a final plot is written
        and the loop ends.
        """
        time_steps, train_steps, evaluate_steps = 0, 0, -1
        on_policy = self._is_on_policy()
        while True:
            if time_steps // self.args.evaluate_cycle > evaluate_steps:
                win_rate, eval_episode_reward = self.evaluate(
                    time_steps, self.target_env)
                self.win_rates.append(win_rate)
                self.eval_episode_rewards.append(eval_episode_reward)
                self.plt()
                evaluate_steps += 1

                performance = int(eval_episode_reward)
                self.curriculum.update(performance, self.agents, time_steps,
                                       train_steps)

                # eval in other envs (none when eval_envs was never assigned)
                for eval_env in self.eval_envs or ():
                    self.evaluate(time_steps, eval_env)

            # Only curriculum.get() signals "done"; keeping the try body this
            # small stops an unrelated IndexError from ending training.
            try:
                env = self.curriculum.get()
            except IndexError:  # done
                # Save with the global counter; the inner loop variable
                # ``train_step`` may not even be bound at this point.
                self.agents.policy.save_model(train_steps)
                self.plt()
                break
            buffer = env.buffer
            self.rolloutWorker.env = env
            logging.info("Restoring map %s", self.rolloutWorker.env.map_name)

            # Collect args.n_episodes episodes.
            episodes = []
            for episode_idx in range(self.args.n_episodes):
                episode, train_episode_reward, _, steps = \
                    self.rolloutWorker.generate_episode(episode_idx)
                self.train_rewards.append(train_episode_reward)
                episodes.append(episode)
                time_steps += steps

            logging.info('Time_steps %s, train_episode_reward %s', time_steps,
                         train_episode_reward)
            # Join all collected episodes into one batch along axis 0.
            episode_batch = self._merge_episodes(episodes)
            if on_policy:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                buffer.store_episode(episode_batch)
                for _ in range(self.args.train_steps):
                    mini_batch = buffer.sample(
                        min(buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1

            # Reward of the last collected episode, globally and per map.
            self._log_scalar(f'Reward/train/', train_episode_reward,
                             time_steps)
            self._log_scalar(f'Reward/train/{env.map_name}',
                             train_episode_reward, time_steps)
            if self.debug:
                policy = self.agents.policy
                self._log_param_norms('eval_rnn', policy.eval_rnn,
                                      env.map_name, time_steps)
                self._log_param_norms('eval_qmix_net', policy.eval_qmix_net,
                                      env.map_name, time_steps)

    def evaluate(self, time_steps, env):
        """Play ``args.evaluate_epoch`` evaluation episodes in ``env``.

        The rollout worker's ``env`` attribute is switched to ``env``.

        Args:
            time_steps: Current global time step; ``time_steps + epoch`` is
                the step under which each episode reward is logged.
            env: Environment to evaluate in.

        Returns:
            tuple: ``(win_rate, mean_episode_reward)``.
        """
        win_number = 0
        episode_rewards = 0
        self.rolloutWorker.env = env
        logging.info("Evaluating in map %s", self.rolloutWorker.env.map_name)
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag, _ = \
                self.rolloutWorker.generate_episode(epoch, evaluate=True)
            logging.info('Eval_epoch %s, eval_episode_reward %s', epoch,
                         episode_reward)
            episode_rewards += episode_reward
            self._log_scalar(f'Reward/eval/{self.rolloutWorker.env.map_name}',
                             episode_reward, time_steps + epoch)
            if win_tag:
                win_number += 1
        return (win_number / self.args.evaluate_epoch,
                episode_rewards / self.args.evaluate_epoch)

    def plt(self):
        """Plot win rates, evaluation rewards and averaged training rewards.

        The training rewards are split into as many chunks as there are
        evaluation results and the mean of every chunk is plotted.  The plot
        and the raw arrays are saved under ``save_path``.
        """
        x_label = 'step*{}'.format(self.args.evaluate_cycle)

        plt.figure().set_size_inches(10, 15)
        plt.ylim([0, 105])
        plt.cla()
        plt.subplot(3, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel(x_label)
        plt.ylabel('win_rates')

        plt.subplot(3, 1, 2)
        plt.plot(range(len(self.eval_episode_rewards)),
                 self.eval_episode_rewards)
        plt.xlabel(x_label)
        plt.ylabel('eval_episode_rewards')

        plt.subplot(3, 1, 3)
        chunks = np.array_split(self.train_rewards,
                                len(self.eval_episode_rewards))
        # An empty chunk (e.g. before any training episode was collected) is
        # plotted as NaN without calling np.mean on an empty array.
        mean_train_rewards = [
            np.mean(chunk) if chunk.size else np.nan for chunk in chunks
        ]
        plt.plot(range(len(mean_train_rewards)), mean_train_rewards)
        plt.xlabel(x_label)
        plt.ylabel('train_episode_rewards')

        plt.tight_layout()
        plt.savefig(self.save_path + '/plt.png', format='png')
        np.save(self.save_path + '/win_rates', self.win_rates)
        np.save(self.save_path + '/eval_rewards', self.eval_episode_rewards)
        np.save(self.save_path + '/train_rewards', self.train_rewards)
        plt.close()