def __init__(self, env, args):
    self.env = env
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        if args.use_per:
            self.buffer = PrioritizedReplayBuffer(args)
        else:
            self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # used to save the plt and pkl files
    self.save_path = self.args.result_dir + '/' + args.map + '/'
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    self.file_name = self.save_path + str(args.env_name) + '_' + str(args.n_agents) + '_' + \
        str(args.map_size) + '_' + args.name_time
def __init__(self, env, args):
    self.env = env
    self.args = args
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.qmix_pg_learner = QMIX_PG(self.agents, args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        self.actor_critic_buffer = ReplayBuffer(args, args.buffer_size)
        # self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # used to save the plt and pkl files
    tmp = f'clamp2-5_rewardscale10_' + \
          f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \
          f'{args.actor_update_delay}_{args.critic_lr}_{args.n_epoch}_{args.temp}'  # f'clamp2-5_' + rewardscale10_
    self.save_path = self.args.result_dir + '/linear_mix/' + 'mcsac' + '/' + tmp + '/' + args.map  # _gradclip0.5
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, curriculum, args, target_env):
    self.target_env = target_env
    self.curriculum = curriculum
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(None, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(None, self.agents, args)
    if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        self.buffer = None
    self.args = args
    self.win_rates = []
    self.eval_episode_rewards = []
    # used to save the plt and pkl files
    self.save_path = args.save_path
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
    self.train_rewards = []
    self.ratios = []
    self.historical_params = {}
    self.switch = True  # we will be switching to some task
    self.patience = 20
    self.writer: SummaryWriter = None
    self.eval_envs = None
    self.debug = False
def __init__(self, args, env):
    self.noise = args.noise_eps
    self.epsilon = args.epsilon
    self.env = env
    self.agent = Agent(args)
    self.her_module = HerSampler(args.replay_strategy, args.replay_k, env.compute_reward)
    self.buffer = Buffer(args, self.her_module.sample_her_transitions)
    self.worker = RolloutWorker(self.env, self.agent, args)
    self.args = args
class Runner:
    def __init__(self, args, env):
        self.noise = args.noise_eps
        self.epsilon = args.epsilon
        self.env = env
        self.agent = Agent(args)
        self.her_module = HerSampler(args.replay_strategy, args.replay_k, env.compute_reward)
        self.buffer = Buffer(args, self.her_module.sample_her_transitions)
        self.worker = RolloutWorker(self.env, self.agent, args)
        self.args = args

    def run(self):
        success_rates = []
        for epoch in tqdm(range(self.args.n_epochs)):
            for episode_idx in range(self.args.n_cycles):
                episode = self.worker.generate_episode(self.noise, self.epsilon)
                episode_batch = convert_episode_to_batch_major(episode)  # turn the 2-D data in the episode into 3-D
                self.buffer.store_episode(episode_batch)
                episode_batch['o_next'], episode_batch['ag_next'] = \
                    episode_batch['o'][:, 1:], episode_batch['ag'][:, 1:]
                transitions = self.her_module.sample_her_transitions(episode_batch, self.args.episode_limit)
                # update the normalizer
                self.agent.update_normalizer(transitions)
                for _ in range(self.args.n_batches):
                    transitions = self.buffer.sample(self.args.batch_size)
                    self.agent.learn(transitions)
                # self.noise = max(0, self.noise - 0.001)
                # self.epsilon = max(0.05, self.noise - 0.001)
            if len(success_rates) > 0 and success_rates[-1] > 0.5:
                success_rate = self.worker.evaluate(render=True)
            else:
                success_rate = self.worker.evaluate()
            success_rates.append(success_rate)
        save_path = self.args.save_dir + '/' + self.args.env_name
        plt.figure()
        plt.plot(range(self.args.n_epochs), success_rates)
        plt.xlabel('epoch')
        plt.ylabel('success_rate')
        plt.savefig(save_path + '/plt.png', format='png')
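# --- Illustration (not part of the snippet above) ---------------------------
# A minimal, self-contained numpy sketch of the 'o_next'/'ag_next' construction
# used in run() above: the "next" observation / achieved goal is the same array
# shifted by one timestep along axis 1. The shapes and dimension names below are
# illustrative assumptions, not taken from the Buffer/HerSampler implementation.
import numpy as np

n_episodes, episode_limit, obs_dim, goal_dim = 1, 5, 4, 3
episode_batch = {
    'o': np.random.randn(n_episodes, episode_limit + 1, obs_dim),    # T+1 observations
    'ag': np.random.randn(n_episodes, episode_limit + 1, goal_dim),  # T+1 achieved goals
}
# drop the first timestep to obtain the "next" quantities (T entries each)
episode_batch['o_next'] = episode_batch['o'][:, 1:]
episode_batch['ag_next'] = episode_batch['ag'][:, 1:]
assert episode_batch['o_next'].shape == (n_episodes, episode_limit, obs_dim)
assert episode_batch['ag_next'].shape == (n_episodes, episode_limit, goal_dim)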
def __init__(self, env, args):
    self.env = env
    self.agents = Agents(args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # used to save the plt and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args):
    self.env = env
    self.agents = Agents(args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    self.buffer = ReplayBuffer(args)
    self.args = args
    self.epsilon = args.epsilon
    # evaluate the algorithm on a sparse-reward environment: 1 for a win, -1 for a defeat, 0 for every other step
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)
    self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
def __init__(self, env, args):
    self.env = env
    self.args = args
    self.agents = Agents(args)
    self.qmix_pg_learner = QMIX_PG(self.agents, args)
    self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        self.critic_buffer = ReplayBuffer(args, args.critic_buffer_size)
        self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    tmp = f'clamp2-5_' + f'{args.loss_coeff_entropy}_' + \
          f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \
          f'{args.actor_update_delay}_{args.critic_lr}'  # f'clamp2-5_' + anneal_epsilon
    self.save_path = self.args.result_dir + '/linear_mix/' + 'qmix_ac_total_cf' + '/' + tmp + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args):
    self.env = env
    # evaluate the algorithm on a sparse-reward environment: 1 for a win, -1 for a defeat, 0 for every other step
    '''
    self.env_evaluate = StarCraft2Env(map_name=args.map, step_mul=args.step_mul, difficulty=args.difficulty,
                                      game_version=args.game_version, seed=args.seed, replay_dir=args.replay_dir,
                                      reward_sparse=True, reward_scale=False)
    '''
    self.env_evaluate = MeetEnv()
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
    if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:
        # these 3 algorithms are on-policy
        self.buffer = ReplayBuffer(args)
    self.args = args
    # used to save the plt and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args):
    self.env = env
    # evaluate the algorithm on a sparse-reward environment: 1 for a win, -1 for a defeat, 0 for every other step
    self.env_evaluate = StarCraft2Env(map_name=args.map,
                                      step_mul=args.step_mul,
                                      difficulty=args.difficulty,
                                      game_version=args.game_version,
                                      seed=args.seed,
                                      replay_dir=args.replay_dir,
                                      reward_sparse=True,
                                      reward_scale=False)
    if args.alg == 'commnet_coma':
        self.agents = CommNetAgents(args)
        self.rolloutWorker = CommNetRolloutWorker(env, self.agents, args)
        self.evaluateWorker = CommNetRolloutWorker(self.env_evaluate, self.agents, args)
    else:
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
    if args.alg != 'coma' and args.alg != 'commnet_coma':
        self.buffer = ReplayBuffer(args)
    self.args = args
    # used to save the plt and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
def __init__(self, env, args):
    self.env = env
    if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
        self.agents = CommAgents(args)
        self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
    else:  # no communication agent
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
    if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
            and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
        self.buffer = ReplayBuffer(args)
    self.args = args
    self.win_rates = []
    self.episode_rewards = []
    # used to save the plt and pkl files
    self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
    if not os.path.exists(self.save_path):
        os.makedirs(self.save_path)
class Runner:
    def __init__(self, env, args):
        self.env = env
        # evaluate the algorithm on a sparse-reward environment: 1 for a win, -1 for a defeat, 0 for every other step
        '''
        self.env_evaluate = StarCraft2Env(map_name=args.map, step_mul=args.step_mul, difficulty=args.difficulty,
                                          game_version=args.game_version, seed=args.seed, replay_dir=args.replay_dir,
                                          reward_sparse=True, reward_scale=False)
        '''
        self.env_evaluate = MeetEnv()
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
        if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:
            # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        # used to save the plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        win_rates = []
        episode_rewards = []
        train_steps = 0
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                win_rates.append(win_rate)
                episode_rewards.append(episode_reward)
                plt.cla()
                plt.subplot(2, 1, 1)
                plt.plot(range(len(win_rates)), win_rates)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('win_rate')
                plt.subplot(2, 1, 2)
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('episode_rewards')
                plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
                np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
                np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _ = self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                # print(_)
            # each key of an episode holds a 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate the data of all episodes along the first axis
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rate')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(evaluate=True)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def evaluate_sparse(self):
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(evaluate=True)
            result = 'win' if episode_reward > 0 else 'defeat'
            print('Epoch {}: {}'.format(epoch, result))
            if episode_reward > 0:
                win_number += 1
        self.env_evaluate.close()
        return win_number / self.args.evaluate_epoch
class Runner:
    def __init__(self, env, args):
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []
        # used to save the plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.fig = None

    def run(self, num):
        global EPOCH
        train_steps = 0
        # print('Run {} start'.format(num))
        self.env.reset_callback = reset_callback  # TODO
        for epoch in range(self.args.n_epoch):
            EPOCH = epoch
            # print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:
                # print('Run {}, train epoch {}, evaluating'.format(num, epoch))
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                # note: this list stores the current epsilon (plotted as 'epsilon' in plt below), not the win rate
                self.win_rates.append(self.rolloutWorker.epsilon)
                self.episode_rewards.append(episode_reward)
                self.plt(num)
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _, _ = self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                # print(_)
            # each key of an episode holds a 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate the data of all episodes along the first axis
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self, num):
        if self.fig is None:
            self.fig = plt.figure()
        fig = self.fig
        plt.axis([0, self.args.n_epoch, 0, 100])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('epsilon')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.tight_layout()
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), self.episode_rewards)
        plt.clf()
class Runner:
    def __init__(self, env, args):
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []
        # used to save the plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(args.n_agents) + '_' + \
            str(args.map_size) + '_' + args.name_time

    def run(self, num):
        train_steps = 0
        episode_rewards = 0
        fixed_rewards = 0
        st = time.time()
        plot_rewards = []
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            # print('Run {}, train epoch {}'.format(num, epoch))
            # if epoch % self.args.evaluate_cycle == 0:
            #     win_rate, episode_reward = self.evaluate()
            #     # print('win_rate is ', win_rate)
            #     self.win_rates.append(win_rate)
            #     self.episode_rewards.append(episode_reward)
            #     print(episode_reward)
            #     # self.plt(num)
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                if self.args.use_ja:
                    if self.args.use_v1:
                        episode, episode_reward, rate, fixed_reward = \
                            self.rolloutWorker.generate_episode_ja_v2(episode_idx)
                    else:
                        episode, episode_reward, rate, fixed_reward = \
                            self.rolloutWorker.generate_episode_ja_v3(episode_idx)
                else:
                    episode, episode_reward, rate, fixed_reward = \
                        self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                episode_rewards += episode_reward
                fixed_rewards += fixed_reward
                plot_rewards.append(episode_reward)
            if epoch % self.args.evaluate_cycle == 0:
                t = time.time() - st
                st = time.time()
                epr = round(episode_rewards / self.args.evaluate_cycle, 2)
                fr = round(fixed_rewards / self.args.evaluate_cycle, 2)
                print('train epoch {}, reward {}, time {}, rate {}'.format(epoch, [epr, fr], t, rate))
                # wandb.log({"reward": epr, "test_reward": epr})
                episode_rewards = 0
                fixed_rewards = 0
                with open(self.file_name, 'wb') as fp:
                    pickle.dump(plot_rewards, fp)
            # each key of an episode holds a 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate the data of all episodes along the first axis
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            elif not self.args.load_model:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    # mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    # # print(mini_batch['terminated'])
                    # # print(train_steps)
                    # dq = self.agents.train(mini_batch, train_steps)
                    if self.args.use_per:
                        mini_batch, idxs = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                        self.buffer.update_priorities(idxs, dq)
                    else:
                        mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        # self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch
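# --- Illustration (not part of the snippet above) ---------------------------
# The PER branch above assumes a buffer whose sample() returns (mini_batch, idxs)
# and which exposes update_priorities(idxs, td_errors). Below is a minimal
# proportional-priority sketch of that interface; the project's real
# PrioritizedReplayBuffer stores whole episodes and is more involved, so treat
# this only as an interface demo with assumed internals.
import numpy as np

class TinyPrioritizedBuffer:
    def __init__(self, capacity, alpha=0.6, eps=1e-6):
        self.capacity, self.alpha, self.eps = capacity, alpha, eps
        self.items, self.priorities = [], []

    def store(self, item):
        # new items get the current maximum priority so they are sampled at least once
        p = max(self.priorities) if self.priorities else 1.0
        if len(self.items) >= self.capacity:
            self.items.pop(0)
            self.priorities.pop(0)
        self.items.append(item)
        self.priorities.append(p)

    def sample(self, batch_size):
        # sample indices with probability proportional to priority^alpha
        probs = np.asarray(self.priorities) ** self.alpha
        probs /= probs.sum()
        idxs = np.random.choice(len(self.items), size=batch_size, p=probs)
        return [self.items[i] for i in idxs], idxs

    def update_priorities(self, idxs, td_errors):
        # larger absolute TD error -> higher priority on the next sample() call
        for i, err in zip(idxs, np.abs(td_errors)):
            self.priorities[i] = float(err) + self.eps

# usage mirroring the training loop above ('dq' standing in for per-sample TD errors)
buf = TinyPrioritizedBuffer(capacity=100)
for t in range(10):
    buf.store({'step': t})
mini_batch, idxs = buf.sample(4)
buf.update_priorities(idxs, np.random.rand(4))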
class Runner:
    def __init__(self, env, args):
        self.env = env
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []
        # used to save the plt and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        time_steps, train_steps, evaluate_steps = 0, 0, -1
        while time_steps < self.args.n_steps:
            print('Run {}, time_steps {}'.format(num, time_steps))
            if time_steps // self.args.evaluate_cycle > evaluate_steps:
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                self.win_rates.append(win_rate)
                self.episode_rewards.append(episode_reward)
                self.plt(num)
                evaluate_steps += 1
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _, _, steps = self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                time_steps += steps
                # print(_)
            # each key of an episode holds a 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate the data of all episodes along the first axis
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        win_rate, episode_reward = self.evaluate()
        print('win_rate is ', win_rate)
        self.win_rates.append(win_rate)
        self.episode_rewards.append(episode_reward)
        self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag, _ = self.rolloutWorker.generate_episode(epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self, num):
        plt.figure()
        plt.ylim([0, 105])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rates')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), self.episode_rewards)
        plt.close()
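# --- Illustration (not part of the snippets above) --------------------------
# A hypothetical entry point showing how a Runner like the one above is usually
# wired up: build the SMAC environment, copy its dimensions into args, then run.
# parse_args() and the attribute values below are assumptions for this sketch
# (only a representative subset of the fields Agents/ReplayBuffer actually need);
# StarCraft2Env, get_env_info() and close() are real SMAC calls.
from types import SimpleNamespace
from smac.env import StarCraft2Env

def parse_args():
    # stand-in for the project's argument parser (hypothetical defaults)
    return SimpleNamespace(map='3m', alg='qmix', evaluate=False, result_dir='./result',
                           n_steps=2_000_000, n_episodes=1, train_steps=1, batch_size=32,
                           evaluate_cycle=5000, evaluate_epoch=32)

if __name__ == '__main__':
    args = parse_args()
    env = StarCraft2Env(map_name=args.map)
    env_info = env.get_env_info()
    # copy the environment dimensions the networks and buffer are built from
    args.n_actions = env_info['n_actions']
    args.n_agents = env_info['n_agents']
    args.state_shape = env_info['state_shape']
    args.obs_shape = env_info['obs_shape']
    args.episode_limit = env_info['episode_limit']
    runner = Runner(env, args)
    runner.run(num=0)
    env.close()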
class Runner:
    def __init__(self, env, args):
        self.env = env
        self.args = args
        self.agents = Agents(args)
        self.qmix_pg_learner = QMIX_PG(self.agents, args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
            self.critic_buffer = ReplayBuffer(args, args.critic_buffer_size)
            self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []
        tmp = f'clamp2-5_' + f'{args.loss_coeff_entropy}_' + \
              f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \
              f'{args.actor_update_delay}_{args.critic_lr}'  # f'clamp2-5_' + anneal_epsilon
        self.save_path = self.args.result_dir + '/linear_mix/' + 'qmix_ac_total_cf' + '/' + tmp + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        train_steps = 0
        epsilon = self.args.epsilon  # initial epsilon
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:  # 100
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                self.win_rates.append(win_rate)
                self.episode_rewards.append(episode_reward)
                self.plt(num)
            episodes = []
            if self.args.epsilon_anneal_scale == 'epoch':
                epsilon = epsilon - self.args.anneal_epsilon if epsilon > self.args.min_epsilon else epsilon
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):  # 1
                episode, _, _ = self.rolloutWorker.generate_episode(episode_idx, evaluate=False, epsilon=epsilon)
                episodes.append(episode)
                # print(_)
            # each key of an episode holds a 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate the data of all episodes along the first axis
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.critic_buffer.store_episode(episode_batch)
                self.actor_buffer.store_episode(episode_batch)
                # if epoch % 16 == 0:  # 2
                for train_step in range(self.args.critic_train_steps):  # 1  # 16
                    mini_batch = self.critic_buffer.sample(
                        min(self.critic_buffer.current_size, self.args.critic_batch_size))  # 32 episodes  # 16
                    self.qmix_pg_learner.train_critic(mini_batch, self.args.episode_limit, train_steps)
                    train_steps += 1
                if epoch % self.args.actor_update_delay == 0:  # 2
                    for train_step in range(self.args.actor_train_steps):  # 1  # 16
                        mini_batch = self.actor_buffer.sample(
                            min(self.actor_buffer.current_size, self.args.actor_batch_size))  # 16 episodes  # 16
                        self.qmix_pg_learner.train_actor(mini_batch, self.args.episode_limit)
        self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(epoch, evaluate=True, epsilon=0)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self, num):
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.ylabel('win_rate')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(self.episode_rewards)), self.episode_rewards)
        plt.xlabel('episodes*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')
        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), self.win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), self.episode_rewards)
class Runner:
    def __init__(self, env, args):
        self.env = env
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.buffer = ReplayBuffer(args)
        self.args = args
        self.epsilon = args.epsilon
        # evaluate the algorithm on a sparse-reward environment: 1 for a win, -1 for a defeat, 0 for every other step
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)

    def run(self):
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        win_rates = []
        episode_rewards = []
        train_steps = 0
        for epoch in tqdm(range(self.args.n_epoch)):
            # print('Train epoch {} start'.format(epoch))
            self.epsilon = self.epsilon - 0.0001125 if self.epsilon > 0.05 else self.epsilon
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _ = self.rolloutWorker.generate_episode(self.epsilon)
                episodes.append(episode)
            # each key of an episode holds a 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate the data of all episodes along the first axis
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            self.buffer.store_episode(episode_batch)
            if self.buffer.current_size > 100:
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(self.args.batch_size)
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
            win_rate, episode_reward = self.evaluate()
            # print('win_rate is ', win_rate)
            win_rates.append(win_rate)
            episode_rewards.append(episode_reward)
            # visualization
            if epoch % 100 == 0:
                plt.cla()
                plt.subplot(2, 1, 1)
                plt.plot(range(len(win_rates)), win_rates)
                plt.xlabel('epoch')
                plt.ylabel('win_rate')
                plt.subplot(2, 1, 2)
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('epoch')
                plt.ylabel('episode_rewards')
                plt.savefig(self.args.result_dir + '/plt.png', format='png')
                np.save(self.args.result_dir + '/win_rates', win_rates)
                np.save(self.args.result_dir + '/episode_rewards', episode_rewards)
        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch')
        plt.ylabel('win_rate')
        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch')
        plt.ylabel('episode_rewards')
        plt.savefig(self.args.result_dir + '/plt.png', format='png')
        np.save(self.args.result_dir + '/win_rates', win_rates)
        np.save(self.args.result_dir + '/episode_rewards', episode_rewards)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(0)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def evaluate_sparse(self):
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(0)
            result = 'win' if episode_reward > 0 else 'defeat'
            print('Epoch {}: {}'.format(epoch, result))
            if episode_reward > 0:
                win_number += 1
        return win_number / self.args.evaluate_epoch
class Runner:
    def __init__(self, curriculum, args, target_env):
        self.target_env = target_env
        self.curriculum = curriculum
        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(None, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(None, self.agents, args)
        if not args.evaluate and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 \
                and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = None
        self.args = args
        self.win_rates = []
        self.eval_episode_rewards = []
        # used to save the plt and pkl files
        self.save_path = args.save_path
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.train_rewards = []
        self.ratios = []
        self.historical_params = {}
        self.switch = True  # we will be switching to some task
        self.patience = 20
        self.writer: SummaryWriter = None
        self.eval_envs = None
        self.debug = False

    def run(self):
        time_steps, train_steps, evaluate_steps = 0, 0, -1
        while True:
            if time_steps // self.args.evaluate_cycle > evaluate_steps:
                win_rate, eval_episode_reward = self.evaluate(time_steps, self.target_env)
                self.win_rates.append(win_rate)
                self.eval_episode_rewards.append(eval_episode_reward)
                self.plt()
                evaluate_steps += 1
                performance = int(eval_episode_reward)
                self.curriculum.update(performance, self.agents, time_steps, train_steps)
                # eval in other envs
                for env in self.eval_envs:
                    self.evaluate(time_steps, env)
            try:
                env = self.curriculum.get()
                buffer = env.buffer
                self.rolloutWorker.env = env
                logging.info("Restoring map {}".format(self.rolloutWorker.env.map_name))
            except IndexError:  # done
                self.agents.policy.save_model(train_steps)
                self.plt()
                break
            episodes = []
            # collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, train_episode_reward, _, steps = self.rolloutWorker.generate_episode(episode_idx)
                self.train_rewards.append(train_episode_reward)
                episodes.append(episode)
                time_steps += steps
                logging.info('Time_steps {}, train_episode_reward {}'.format(time_steps, train_episode_reward))
                # print(_)
            # each key of an episode holds a 4-D array of shape (1, episode_len, n_agents, feature_dim);
            # concatenate the data of all episodes along the first axis
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = buffer.sample(min(buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1
            self.writer.add_scalar(f'Reward/train/', train_episode_reward, global_step=time_steps)
            self.writer.add_scalar(f'Reward/train/{env.map_name}', train_episode_reward, global_step=time_steps)
            if self.debug:
                for n, p in self.agents.policy.eval_rnn.named_parameters():
                    self.writer.add_scalar(f'eval_rnn/{n}/norm', p.norm(), global_step=time_steps)
                    self.writer.add_scalar(f'eval_rnn/grad/{n}/norm', p.grad.norm(), global_step=time_steps)
                    self.writer.add_scalar(f'eval_rnn/{n}/norm/{env.map_name}', p.norm(), global_step=time_steps)
                    self.writer.add_scalar(f'eval_rnn/grad/{n}/norm/{env.map_name}', p.grad.norm(), global_step=time_steps)
                for n, p in self.agents.policy.eval_qmix_net.named_parameters():
                    self.writer.add_scalar(f'eval_qmix_net/{n}/norm', p.norm(), global_step=time_steps)
                    self.writer.add_scalar(f'eval_qmix_net/grad/{n}/norm', p.grad.norm(), global_step=time_steps)
                    self.writer.add_scalar(f'eval_qmix_net/{n}/norm/{env.map_name}', p.norm(), global_step=time_steps)
                    self.writer.add_scalar(f'eval_qmix_net/grad/{n}/norm/{env.map_name}', p.grad.norm(), global_step=time_steps)

    def evaluate(self, time_steps, env):
        win_number = 0
        episode_rewards = 0
        self.rolloutWorker.env = env
        logging.info("Evaluating in map {}".format(self.rolloutWorker.env.map_name))
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag, _ = self.rolloutWorker.generate_episode(epoch, evaluate=True)
            logging.info('Eval_epoch {}, eval_episode_reward {}'.format(epoch, episode_reward))
            episode_rewards += episode_reward
            self.writer.add_scalar(f'Reward/eval/{self.rolloutWorker.env.map_name}', episode_reward, time_steps + epoch)
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def plt(self):
        plt.figure().set_size_inches(10, 15)
        plt.ylim([0, 105])
        plt.cla()
        plt.subplot(3, 1, 1)
        plt.plot(range(len(self.win_rates)), self.win_rates)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rates')
        plt.subplot(3, 1, 2)
        plt.plot(range(len(self.eval_episode_rewards)), self.eval_episode_rewards)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('eval_episode_rewards')
        plt.subplot(3, 1, 3)
        train_rewards = np.array_split(self.train_rewards, len(self.eval_episode_rewards))
        mean_train_rewards = [np.mean(t) for t in train_rewards]
        plt.plot(range(len(mean_train_rewards)), mean_train_rewards)
        plt.xlabel('step*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('train_episode_rewards')
        plt.tight_layout()
        plt.savefig(self.save_path + '/plt.png', format='png')
        np.save(self.save_path + '/win_rates', self.win_rates)
        np.save(self.save_path + '/eval_rewards', self.eval_episode_rewards)
        np.save(self.save_path + '/train_rewards', self.train_rewards)
        plt.close()
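# --- Illustration (not part of the snippets above) --------------------------
# Every run() method above merges per-episode dictionaries whose values have
# shape (1, episode_len, n_agents, feature_dim) by concatenating along axis 0.
# A tiny self-contained numpy sketch of that pattern; the shapes and keys here
# are made up for illustration.
import numpy as np

def make_episode(episode_len=3, n_agents=2, obs_dim=4):
    return {'o': np.random.randn(1, episode_len, n_agents, obs_dim),
            'r': np.random.randn(1, episode_len, 1)}

episodes = [make_episode() for _ in range(4)]
episode_batch = episodes[0]
episodes.pop(0)
for episode in episodes:
    for key in episode_batch.keys():
        episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)

assert episode_batch['o'].shape[0] == 4  # one row per collected episode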