def test_buffer(self):
    data = {
        AgentKey(0, '0-1'): AgentReplayFrame([2, 1, 2, 2, 3], [0, 1, 0], 3, False, [3, 1, 1, 2, 3]),
        AgentKey(0, '0-2'): AgentReplayFrame([1, 1, 3, 2, 1], [0, 1, 0], 4, False, [2, 1, 1, 2, 2]),
        AgentKey(1, '0-1'): AgentReplayFrame([2, 0, 3, 1, 2], [0, 1], 5, False, [3, 0, 1, 3, 4]),
    }
    max_steps = 4
    buffer = ReplayBuffer(max_steps)
    for i in range(5):
        buffer.push(data)
        self.assertEqual(buffer.length(), min(i + 1, max_steps))
    sample: List[Dict[AgentKey, AgentReplayFrame]] = buffer.sample(2, norm_rews=False)
    for s in sample:
        for k, v in s.items():
            self.assertEqual(v.reward, data[k].reward)
    # with normalization, identical stored rewards normalize to zero
    sample = buffer.sample(2, norm_rews=True)
    for s in sample:
        for k, v in s.items():
            self.assertEqual(v.reward, 0)
    avg_rewards = buffer.get_average_rewards(3)
    for k, v in avg_rewards.items():
        self.assertEqual(v, data[k].reward)
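# NOTE (added sketch): the buffer interface exercised by the test above is not
# defined in this file. A minimal stand-in with the assumed API (push, length,
# sample with optional reward normalization, get_average_rewards); it also
# assumes AgentReplayFrame is a NamedTuple (hence _replace). Illustration only,
# not the project's actual implementation.
import random
import statistics
from collections import deque


class SketchReplayBuffer:
    def __init__(self, max_steps):
        self.frames = deque(maxlen=max_steps)  # oldest entries evicted first

    def push(self, frame_dict):
        self.frames.append(frame_dict)

    def length(self):
        return len(self.frames)

    def sample(self, n, norm_rews=False):
        batch = random.sample(list(self.frames), n)
        if not norm_rews:
            return batch
        # normalize rewards per agent over the whole buffer; when every stored
        # reward is identical (as in the test) this yields exactly zero
        normed = []
        for frame_dict in batch:
            new_dict = {}
            for key, frame in frame_dict.items():
                rews = [fd[key].reward for fd in self.frames]
                mean = statistics.mean(rews)
                std = statistics.pstdev(rews) or 1.0  # avoid division by zero
                new_dict[key] = frame._replace(reward=(frame.reward - mean) / std)
            normed.append(new_dict)
        return normed

    def get_average_rewards(self, n):
        # average reward per agent over the n most recent frames
        recent = list(self.frames)[-n:]
        return {k: sum(fd[k].reward for fd in recent) / len(recent)
                for k in recent[0]}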
def run(config):
    """
    :param config: training configuration namespace
    """
    # model_dir = Path('./models') / config.env_id / config.model_name
    env = make_env(config.env_id)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if all([hasattr(a, 'adversary') for a in env.agents]):
        agent_types = ['adversary' if a.adversary else 'agent'
                       for a in env.agents]
    else:
        agent_types = ['agent' for _ in env.agents]
    maddpg = MADDPG.init_from_env(env, agent_types,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.num_agent)
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        observations = env.reset()
        for et_i in range(config.episode_length):
            torch_observations = [torch.from_numpy(observations[i]).float()
                                  for i in range(maddpg.num_agent)]
            torch_agent_actions = maddpg.step(torch_observations)
            agent_actions = [action.data.numpy() for action in torch_agent_actions]
            next_observations, rewards, dones, infos = env.step(agent_actions)
            replay_buffer.push_data(observations, agent_actions, rewards,
                                    next_observations, dones)
            observations = next_observations
            if replay_buffer.get_size() >= config.batch_size:
                for a_i in range(maddpg.num_agent):
                    sample = replay_buffer.sample(config.batch_size)
                    maddpg.update(sample, agent_i=a_i)
                maddpg.update_all_agent()
        print("Episode rewards")
        print(replay_buffer.get_episode_rewards(config.episode_length))
    env.close()
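# NOTE (added sketch): run() above consumes a flat config namespace. One
# assumed way to build it with argparse; the field names come from the code
# above, while the defaults here are purely illustrative.
import argparse


def parse_config():
    p = argparse.ArgumentParser()
    p.add_argument('--env_id', default='simple_spread')
    p.add_argument('--seed', type=int, default=1)
    p.add_argument('--buffer_length', type=int, default=int(1e6))
    p.add_argument('--n_episodes', type=int, default=25000)
    p.add_argument('--episode_length', type=int, default=25)
    p.add_argument('--batch_size', type=int, default=1024)
    p.add_argument('--agent_alg', default='MADDPG')
    p.add_argument('--adversary_alg', default='MADDPG')
    p.add_argument('--tau', type=float, default=0.01)
    p.add_argument('--lr', type=float, default=0.01)
    p.add_argument('--hidden_dim', type=int, default=64)
    return p.parse_args()

# usage: run(parse_config())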
def run(config):
    model_dir = Path('./results') / config.env_id
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    fig_dir = run_dir / 'figures'
    os.makedirs(str(fig_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    assert config.n_rollout_threads == 1, \
        "For this simple test we assume a single environment"
    env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed)
    controller = Controller.init_from_env(env=env, config=config)
    obs_shape, n_actions = controller.obs_shape, controller.n_actions
    buffer = ReplayBuffer(controller.n_agents, obs_shape, n_actions,
                          config.episode_limit, config.buffer_size)
    rolloutworker = RolloutWorker(env, controller, config)

    train_step = 0
    mean_episode_rewards = []
    for ep_i in range(config.n_episodes):
        episode, ep_rew, mean_ep_rew = rolloutworker.generate_episode()
        buffer.push(episode)
        for step in range(config.n_train_steps):
            mini_batch = buffer.sample(min(len(buffer), config.batch_size))
            controller.update(mini_batch, train_step)
            train_step += 1
        # ep_rew = buffer.get_average_rewards(config.episode_limit * config.n_rollout_threads)
        mean_episode_rewards.append(mean_ep_rew)
        print("Episode {}: Total reward {}, Mean reward {}"
              .format(ep_i + 1, ep_rew, mean_ep_rew))
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            controller.save(str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            controller.save(str(run_dir / 'model.pt'))
    controller.save(str(run_dir / 'model.pt'))
    env.close()

    index = list(range(1, len(mean_episode_rewards) + 1))
    plt.plot(index, mean_episode_rewards)
    plt.ylabel("Mean Episode Reward")
    plt.savefig(str(fig_dir) + '/mean_episode_reward.jpg')
    # plt.show()
    plt.close()
def run(config):
    # Make a directory to store the results
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    # initialize tensorboard summary writer
    logger = SummaryWriter(str(log_dir))
    # use the provided seed
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    # IDK how helpful this is
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    # rnn and history_steps are assumed to be module-level settings
    if not rnn:
        # TODO: this might break; the code might not be modular (yet).
        # The code is known to work with RNN.
        replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                     [obsp.shape[0] for obsp in env.observation_space],
                                     [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                      for acsp in env.action_space])
    else:
        # replay buffer obs space size is increased to hold the history
        rnn_replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                         [obsp.shape[0] * history_steps
                                          for obsp in env.observation_space],
                                         [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                          for acsp in env.action_space])
        # This buffer only stores the global rewards; it is not used for updates
        g_storage_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                        [obsp.shape[0] * history_steps
                                         for obsp in env.observation_space],
                                        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                         for acsp in env.action_space])
    t = 0
    #####################################################################
    #                          START EPISODES                           #
    #####################################################################
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        # List of observations for each of the agents.
        # E.g., for simple_spread the shape is (1, 3, 18).
        obs = env.reset()
        # For the RNN history buffer. I know this is not modular.
        obs_tminus_0 = copy(obs)
        obs_tminus_1 = copy(obs)
        obs_tminus_2 = copy(obs)
        obs_tminus_3 = copy(obs)
        obs_tminus_4 = copy(obs)
        obs_tminus_5 = copy(obs)
        # # for 3 time-steps (18*3 = 54)
        # obs_history = np.empty([1, 3, 54])
        # next_obs_history = np.empty([1, 3, 54])
        # for 6 time-steps (18*6 = 108)
        obs_history = np.empty([1, 3, 108])
        next_obs_history = np.empty([1, 3, 108])
        maddpg.prep_rollouts(device='cpu')
        # Exploration percentage remaining.
        # IDK if this is a standard way of doing it, however.
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        #################################################################
        #                       START TIME-STEPS                        #
        #################################################################
        for et_i in range(config.episode_length):
            # Populate the current history
            for a in range(3):  # env.nagents
                obs_history[0][a][:] = np.concatenate(
                    (obs_tminus_0[0][a][:], obs_tminus_1[0][a][:],
                     obs_tminus_2[0][a][:], obs_tminus_3[0][a][:],
                     obs_tminus_4[0][a][:], obs_tminus_5[0][a][:]))
            # Now obs_history holds a 6-step history for each agent
            if not rnn:
                # TODO: this might break. Code works with RNN; !RNN not tested.
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                      requires_grad=False)
                             for i in range(maddpg.nagents)]
                # get actions (from the learning algorithm) as torch Variables;
                # for simple_spread this is discrete[5]
                torch_agent_actions = maddpg.step(torch_obs, explore=True)
            else:
                # rearrange histories to be per agent, and convert to torch Variable
                rnn_torch_obs = [Variable(torch.Tensor(np.vstack(obs_history[:, i])),
                                          requires_grad=False)
                                 for i in range(maddpg.nagents)]
                # TODO: for RNN, actions should condition on history (DONE)
                torch_agent_actions = maddpg.step(rnn_torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # print(torch_agent_actions[0].data)
            # rearrange actions to be per environment;
            # for a single thread it won't really matter
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            ###################### WHICH REWARD TO USE ######################
            # The rewards now contain global as well as difference rewards;
            # keep the global for logging and the difference for updates.
            use_diff_reward = False  # TODO: this selects the reward type used
            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = np.array([d_rewards])
            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = np.array([g_rewards])
            # replace "rewards" with the reward that you want to use
            if use_diff_reward:
                rewards = d_rewards
            else:
                rewards = g_rewards
            # Create the history for the next state.
            # history is [t, t-1, t-2, ...]; the [0] index selects the single thread.
            for a in range(3):  # env.nagents
                next_obs_history[0][a][:] = np.concatenate(
                    (next_obs[0][a][:], obs_tminus_0[0][a][:],
                     obs_tminus_1[0][a][:], obs_tminus_2[0][a][:],
                     obs_tminus_3[0][a][:], obs_tminus_4[0][a][:]))
            # Now next_obs_history holds the 6-step history for the next state.
            # For the RNN, the replay buffer stores e.g. states = [obs_t-2, obs_t-1, obs_t]
            if not rnn:
                replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
                obs = next_obs
            else:
                # buffer used for updates
                rnn_replay_buffer.push(obs_history, agent_actions, rewards,
                                       next_obs_history, dones)
                # push global rewards into the storage buffer
                g_storage_buffer.push(obs_history, agent_actions, g_rewards,
                                      next_obs_history, dones)
            # Update histories
            obs_tminus_5 = copy(obs_tminus_4)
            obs_tminus_4 = copy(obs_tminus_3)
            obs_tminus_3 = copy(obs_tminus_2)
            obs_tminus_2 = copy(obs_tminus_1)
            obs_tminus_1 = copy(obs_tminus_0)
            obs_tminus_0 = copy(next_obs)
            t += config.n_rollout_threads
            # NOTE: this update block samples from rnn_replay_buffer,
            # so it assumes the RNN path
            if (len(rnn_replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = rnn_replay_buffer.sample(config.batch_size,
                                                          to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # For plotting, use the global reward achieved while training
        # on difference rewards
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    print()
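# NOTE (added sketch): the obs_tminus_0 .. obs_tminus_5 chain above maintains a
# fixed six-step observation history by hand. An assumed, behaviour-equivalent
# refactor keeps the history in a deque, so the per-step shifting disappears.
from collections import deque
from copy import copy

import numpy as np


def init_history(obs, steps=6):
    # all slots start at the reset observation, matching the code above
    return deque([copy(obs) for _ in range(steps)], maxlen=steps)


def history_vector(history, thread, agent):
    # newest-first concatenation: the same layout written into obs_history
    return np.concatenate([past[thread][agent] for past in history])

# per step, instead of the six copy() assignments:
#     history.appendleft(copy(next_obs))  # the oldest entry is dropped automatically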
# Exploration by perturbing the action logits with noise, then re-one-hotting
# the argmax.
noisy_agent_actions = []
for i in range(len(agent_actions)):
    noise = np.random.rand(agent_actions[i].shape[0],
                           agent_actions[i].shape[1])
    tmp = agent_actions[i] * 0
    # np.argmax flattens its input, so writing to row 0 below is only
    # correct with a single rollout thread
    tmp_action = np.argmax(agent_actions[i] + noise * 5)
    tmp[0][tmp_action] = 1.0
    noisy_agent_actions.append(tmp)

# push observations into the buffer
replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
obs = next_obs
t += n_rollout_threads
if (len(replay_buffer) >= batch_size and
        (t % steps_per_update) < n_rollout_threads):
    maddpg.prep_training(device='cpu')  # change the device here if using a GPU
    for u_i in range(n_rollout_threads):
        for a_i in range(maddpg.nagents):
            sample = replay_buffer.sample(batch_size, to_gpu=USE_CUDA)
            maddpg.update(sample, a_i, logger=logger)
        maddpg.update_all_targets()
    maddpg.prep_rollouts(device='cpu')

# Logging
ep_rews = replay_buffer.get_average_rewards(episode_length * n_rollout_threads)
for a_i, a_ep_rew in enumerate(ep_rews):
    logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
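# NOTE (added sketch): because np.argmax above flattens its input and the
# one-hot write targets row 0, the snippet only handles one rollout thread.
# An assumed per-row variant that also works with several threads:
import numpy as np


def noisy_one_hot(action_probs, noise_scale=5.0, rng=np.random):
    """Return one-hot actions after perturbing the logits with uniform noise."""
    noise = rng.rand(*action_probs.shape)
    picks = np.argmax(action_probs + noise * noise_scale, axis=1)  # one pick per row
    out = np.zeros_like(action_probs)
    out[np.arange(action_probs.shape[0]), picks] = 1.0
    return out

# noisy_agent_actions = [noisy_one_hot(a) for a in agent_actions]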
class DDPG:
    def __init__(self, state_dim, action_dim, max_action, args):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action
        self._init_parameters(args)
        self._init_nets(args)
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.state_dim,
                                          self.action_dim)

    def _init_parameters(self, args):
        self.actor_lr = args.actor_lr
        self.critic_lr = args.critic_lr
        self.discount = args.discount
        self.tau = args.tau
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size

    def _init_nets(self, args):
        self.actor = Actor(self.state_dim, self.action_dim, self.max_action, args)
        self.actor_t = Actor(self.state_dim, self.action_dim, self.max_action, args)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic = Critic(self.state_dim, self.action_dim, args)
        self.critic_t = Critic(self.state_dim, self.action_dim, args)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        self.loss = nn.MSELoss()
        hard_update(self.actor_t, self.actor)
        hard_update(self.critic_t, self.critic)

    def train(self):
        states, n_states, actions, rewards, dones = self.replay_buffer.sample(
            self.batch_size)
        # Compute Q target
        next_q = self.critic_t(n_states, self.actor_t(n_states))
        q_target = (rewards + self.discount * (1 - dones.float()) * next_q).detach()
        # Compute Q prediction
        q_predict = self.critic(states, actions)
        # Critic update
        critic_loss = self.loss(q_predict, q_target)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()
        # Actor update
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        actor_grad = self.actor.get_grads()
        self.actor_optim.step()
        # Soft-update the target networks
        soft_update(self.actor_t, self.actor, self.tau)
        soft_update(self.critic_t, self.critic, self.tau)
        return actor_grad
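# NOTE (added sketch): hard_update and soft_update are used above but not
# defined here. These are the standard target-network updates (full copy and
# Polyak averaging); the (target, source[, tau]) signatures are assumed from
# the call sites above.
import torch


def hard_update(target, source):
    # copy source parameters into target verbatim
    with torch.no_grad():
        for t_p, s_p in zip(target.parameters(), source.parameters()):
            t_p.copy_(s_p)


def soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source
    with torch.no_grad():
        for t_p, s_p in zip(target.parameters(), source.parameters()):
            t_p.mul_(1.0 - tau).add_(tau * s_p)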
def run(config):
    scores_window = deque(maxlen=100)
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,              # number of players
        'board_width': 11,          # map width
        'board_height': 11,         # map height
        'n_cell_type': 5,           # number of cell types
        'materials': 4,             # number of material depots
        'cars': 2,                  # number of cars
        'planes': 0,                # number of planes
        'barriers': 12,             # number of fixed obstacles
        'max_step': 500,            # maximum number of steps
        'game_name': name,          # game name
        'K': 5,                     # depot materials are refreshed every K games
        'map_path': 'env/map.txt',  # file holding the initial map
        'cell_range': 6,            # value range per cell dimension (tuple; a single int is promoted to a tuple)
        'ob_board_width': None,     # per-agent observed grid width (tuple); None means same as the real grid
        'ob_board_height': None,    # per-agent observed grid height (tuple); None means same as the real grid
        'ob_cell_range': None,      # per-agent observed value range per cell dimension (2-D tuple); None means same as the real grid
    }
    env = make_parallel_env_transport(config.env_id, conf,
                                      config.n_rollout_threads, config.seed,
                                      config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        score = 0
        # print("Episodes %i-%i of %i" % (ep_i + 1,
        #                                 ep_i + 1 + config.n_rollout_threads,
        #                                 config.n_episodes))
        obs = env.reset()  # TODO: TO CHECK
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        for et_i in range(config.episode_length):
            # env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])),  # stack the rows vertically
                         requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            # actions = actions.astype(int)
            # build the joint action: the first two players take a fixed action...
            joint_action = []
            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    # idx = np.random.randint(11)
                    each[3] = 1
                    player.append(each)
                joint_action.append(player)
            # ...and the two learned actions are appended after them
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])
            next_obs, rewards, dones, infos = env.step(joint_action)
            agents_action = actions[0]
            replay_buffer.push(obs, agents_action, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
            score += rewards[0][0]
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
        scores_window.append(score)
        reward_epi = np.mean(scores_window)
        reward_epi_var = np.var(scores_window)
        logger.add_scalar('results/completion_window_mean', reward_epi, ep_i)
        logger.add_scalar('results/completion_window_var', reward_epi_var, ep_i)
        print('\rEpisode {}\tAverage Reward: {:.3f}\tVar Reward: {:.3f}'
              .format(ep_i, reward_epi, reward_epi_var))
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  noisy_sharing=True,
                                  noisy_SNR=config.noisy_SNR,
                                  game_id=config.env_id,
                                  est_ac=config.est_action)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    t = 0
    print('#########################################################################')
    print('Adversary using:', config.adversary_alg,
          'Good agent using:', config.agent_alg, '\n')
    print('Noisy SNR is:', config.noisy_SNR)
    print('#########################################################################')
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        if ep_i % 5000 == 0:
            maddpg.lr *= 0.5
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            print("Episodes %i-%i of %i, rewards are:\n" %
                  (ep_i + 1, ep_i + 1 + config.n_rollout_threads,
                   config.n_episodes))
            for a_i, a_ep_rew in enumerate(ep_rews):
                print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
        # *** perform validation every validate_every_n_eps episodes, i.e. run
        # run_n_eps_in_validation episodes without exploration ***
        if ep_i % config.validate_every_n_eps == config.validate_every_n_eps - 1:
            # assumes only one env is running
            episodes_stats = []
            info_for_one_env_among_timesteps = []
            print('*' * 10, 'Validation BEGINS', '*' * 10)
            for valid_et_i in range(config.run_n_eps_in_validation):
                obs = env.reset()
                maddpg.prep_rollouts(device='cpu')
                explr_pct_remaining = max(
                    0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
                maddpg.scale_noise(config.final_noise_scale +
                                   (config.init_noise_scale - config.final_noise_scale) *
                                   explr_pct_remaining)
                maddpg.reset_noise()
                curr_episode_stats = []
                for et_i in range(config.episode_length):
                    # rearrange observations to be per agent, and convert to torch Variable
                    torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                          requires_grad=False)
                                 for i in range(maddpg.nagents)]
                    # get actions as torch Variables
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    # convert actions to numpy arrays
                    agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
                    # rearrange actions to be per environment
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(config.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    info_for_one_env_among_timesteps.append(infos[0]['n'])
                    curr_episode_stats.append(infos[0]['n'])
                    obs = next_obs
                episodes_stats.append(curr_episode_stats)
            print('Summary statistics:')
            if config.env_id == 'simple_tag':
                # avg_collisions = sum(map(sum, info_for_one_env_among_timesteps)) / config.run_n_eps_in_validation
                episodes_stats = np.array(episodes_stats)
                # validation logging
                with open(f'{config.model_name}.log', 'a') as valid_logfile:
                    valid_logwriter = csv.writer(valid_logfile, delimiter=' ')
                    valid_logwriter.writerow(
                        np.sum(episodes_stats, axis=(1, 2)).tolist())
                avg_collisions = np.sum(episodes_stats) / episodes_stats.shape[0]
                print(f'Avg of collisions: {avg_collisions}')
            elif config.env_id == 'simple_speaker_listener':
                for i, stat in enumerate(info_for_one_env_among_timesteps):
                    print(f'ep {i}: {stat}')
            else:
                raise NotImplementedError
            print('*' * 10, 'Validation ENDS', '*' * 10)
        # *** END of VALIDATION ***
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    torch.set_num_threads(1)
    env_descr = 'map%i_%iagents_task%i' % (config.map_ind, config.num_agents,
                                           config.task_config)
    model_dir = Path('./models') / config.env_type / env_descr / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config, run_num)
    if config.nonlinearity == 'relu':
        nonlin = torch.nn.functional.relu
    elif config.nonlinearity == 'leaky_relu':
        nonlin = torch.nn.functional.leaky_relu
    if config.intrinsic_reward == 0:
        n_intr_rew_types = 0
        sep_extr_head = True
    else:
        n_intr_rew_types = len(config.explr_types)
        sep_extr_head = False
    n_rew_heads = n_intr_rew_types + int(sep_extr_head)
    model = SAC.init_from_env(env,
                              nagents=config.num_agents,
                              tau=config.tau,
                              hard_update_interval=config.hard_update,
                              pi_lr=config.pi_lr,
                              q_lr=config.q_lr,
                              phi_lr=config.phi_lr,
                              adam_eps=config.adam_eps,
                              q_decay=config.q_decay,
                              phi_decay=config.phi_decay,
                              gamma_e=config.gamma_e,
                              gamma_i=config.gamma_i,
                              pol_hidden_dim=config.pol_hidden_dim,
                              critic_hidden_dim=config.critic_hidden_dim,
                              nonlin=nonlin,
                              reward_scale=config.reward_scale,
                              head_reward_scale=config.head_reward_scale,
                              beta=config.beta,
                              n_intr_rew_types=n_intr_rew_types,
                              sep_extr_head=sep_extr_head)
    replay_buffer = ReplayBuffer(config.buffer_length, model.nagents,
                                 env.state_space, env.observation_space,
                                 env.action_space)
    intr_rew_rms = [[RunningMeanStd() for i in range(config.num_agents)]
                    for j in range(n_intr_rew_types)]
    eps_this_turn = 0  # episodes so far this turn
    active_envs = np.ones(config.n_rollout_threads)  # binary indicator of whether each env is active
    env_times = np.zeros(config.n_rollout_threads, dtype=int)
    env_ep_extr_rews = np.zeros(config.n_rollout_threads)
    env_extr_rets = np.zeros(config.n_rollout_threads)
    env_ep_intr_rews = [[np.zeros(config.n_rollout_threads)
                         for i in range(config.num_agents)]
                        for j in range(n_intr_rew_types)]
    recent_ep_extr_rews = deque(maxlen=100)
    recent_ep_intr_rews = [[deque(maxlen=100) for i in range(config.num_agents)]
                           for j in range(n_intr_rew_types)]
    recent_ep_lens = deque(maxlen=100)
    recent_found_treasures = [deque(maxlen=100) for i in range(config.num_agents)]
    meta_turn_rets = []
    extr_ret_rms = [RunningMeanStd() for i in range(n_rew_heads)]
    t = 0
    steps_since_update = 0
    state, obs = env.reset()
    while t < config.train_time:
        model.prep_rollouts(device='cuda' if config.gpu_rollout else 'cpu')
        # convert to torch tensors
        torch_obs = apply_to_all_elements(
            obs, lambda x: torch.tensor(x, dtype=torch.float32,
                                        device='cuda' if config.gpu_rollout else 'cpu'))
        # get actions as torch tensors
        torch_agent_actions = model.step(torch_obs, explore=True)
        # convert actions to numpy arrays
        agent_actions = apply_to_all_elements(torch_agent_actions,
                                              lambda x: x.cpu().data.numpy())
        # rearrange actions to be per environment
        actions = [[ac[i] for ac in agent_actions]
                   for i in range(int(active_envs.sum()))]
        try:
            with timeout(seconds=1):
                next_state, next_obs, rewards, dones, infos = env.step(
                    actions, env_mask=active_envs)
        # either the environment got stuck or vizdoom crashed
        # (vizdoom is unstable with multi-agent scenarios)
        except (TimeoutError, ViZDoomErrorException,
                ViZDoomIsNotRunningException,
                ViZDoomUnexpectedExitException) as e:
            print("Environments are broken...")
            env.close(force=True)
            print("Closed environments, starting new...")
            env = make_parallel_env(config, run_num)
            state, obs = env.reset()
            env_ep_extr_rews[active_envs.astype(bool)] = 0.0
            env_extr_rets[active_envs.astype(bool)] = 0.0
            for i in range(n_intr_rew_types):
                for j in range(config.num_agents):
                    env_ep_intr_rews[i][j][active_envs.astype(bool)] = 0.0
            env_times = np.zeros(config.n_rollout_threads, dtype=int)
            state = apply_to_all_elements(state,
                                          lambda x: x[active_envs.astype(bool)])
            obs = apply_to_all_elements(obs,
                                        lambda x: x[active_envs.astype(bool)])
            continue
        steps_since_update += int(active_envs.sum())
        if config.intrinsic_reward == 1:
            # if using state-visit counts, store the state indices;
            # shape = (n_envs, n_agents, n_inds)
            state_inds = np.array([i['visit_count_lookup'] for i in infos],
                                  dtype=int)
            state_inds_t = state_inds.transpose(1, 0, 2)
            novelties = get_count_based_novelties(env, state_inds_t, device='cpu')
            intr_rews = get_intrinsic_rewards(novelties, config, intr_rew_rms,
                                              update_irrms=True,
                                              active_envs=active_envs,
                                              device='cpu')
            intr_rews = apply_to_all_elements(intr_rews,
                                              lambda x: x.numpy().flatten())
        else:
            intr_rews = None
            state_inds = None
            state_inds_t = None
        replay_buffer.push(state, obs, agent_actions, rewards, next_state,
                           next_obs, dones, state_inds=state_inds)
        env_ep_extr_rews[active_envs.astype(bool)] += np.array(rewards)
        env_extr_rets[active_envs.astype(bool)] += (
            np.array(rewards) *
            config.gamma_e**(env_times[active_envs.astype(bool)]))
        env_times += active_envs.astype(int)
        if intr_rews is not None:
            for i in range(n_intr_rew_types):
                for j in range(config.num_agents):
                    env_ep_intr_rews[i][j][active_envs.astype(bool)] += intr_rews[i][j]
        over_time = env_times >= config.max_episode_length
        full_dones = np.zeros(config.n_rollout_threads)
        for i, env_i in enumerate(np.where(active_envs)[0]):
            full_dones[env_i] = dones[i]
        need_reset = np.logical_or(full_dones, over_time)
        # create masks ONLY for active envs
        active_over_time = env_times[active_envs.astype(bool)] >= config.max_episode_length
        active_need_reset = np.logical_or(dones, active_over_time)
        if any(need_reset):
            try:
                with timeout(seconds=1):
                    # reset any environments that are past the max number of
                    # time steps or done
                    state, obs = env.reset(need_reset=need_reset)
            # either the environment got stuck or vizdoom crashed
            except (TimeoutError, ViZDoomErrorException,
                    ViZDoomIsNotRunningException,
                    ViZDoomUnexpectedExitException) as e:
                print("Environments are broken...")
                env.close(force=True)
                print("Closed environments, starting new...")
                env = make_parallel_env(config, run_num)
                state, obs = env.reset()
                # other envs that were force reset
                # (the rest is taken care of in subsequent code)
                other_reset = np.logical_not(need_reset)
                env_ep_extr_rews[other_reset.astype(bool)] = 0.0
                env_extr_rets[other_reset.astype(bool)] = 0.0
                for i in range(n_intr_rew_types):
                    for j in range(config.num_agents):
                        env_ep_intr_rews[i][j][other_reset.astype(bool)] = 0.0
                env_times = np.zeros(config.n_rollout_threads, dtype=int)
        else:
            state, obs = next_state, next_obs
        for env_i in np.where(need_reset)[0]:
            recent_ep_extr_rews.append(env_ep_extr_rews[env_i])
            meta_turn_rets.append(env_extr_rets[env_i])
            if intr_rews is not None:
                for j in range(n_intr_rew_types):
                    for k in range(config.num_agents):
                        # record intrinsic rewards per step (so we don't confuse
                        # shorter episodes with less intrinsic rewards)
                        recent_ep_intr_rews[j][k].append(
                            env_ep_intr_rews[j][k][env_i] / env_times[env_i])
                        env_ep_intr_rews[j][k][env_i] = 0
            recent_ep_lens.append(env_times[env_i])
            env_times[env_i] = 0
            env_ep_extr_rews[env_i] = 0
            env_extr_rets[env_i] = 0
            eps_this_turn += 1
            if eps_this_turn + active_envs.sum() - 1 >= config.metapol_episodes:
                active_envs[env_i] = 0
        for i in np.where(active_need_reset)[0]:
            for j in range(config.num_agents):
                # len(infos) = number of active envs
                recent_found_treasures[j].append(infos[i]['n_found_treasures'][j])
        if eps_this_turn >= config.metapol_episodes:
            if not config.uniform_heads and n_rew_heads > 1:
                meta_turn_rets = np.array(meta_turn_rets)
                if all(errms.count < 1 for errms in extr_ret_rms):
                    for errms in extr_ret_rms:
                        errms.mean = meta_turn_rets.mean()
                extr_ret_rms[model.curr_pol_heads[0]].update(meta_turn_rets)
                for i in range(config.metapol_updates):
                    model.update_heads_onpol(meta_turn_rets, extr_ret_rms,
                                             logger=logger)
                pol_heads = model.sample_pol_heads(uniform=config.uniform_heads)
                model.set_pol_heads(pol_heads)
            eps_this_turn = 0
            meta_turn_rets = []
            active_envs = np.ones(config.n_rollout_threads)
        if any(need_reset):
            # reset returns state and obs for all envs,
            # so make sure we're only looking at the active ones
            state = apply_to_all_elements(state,
                                          lambda x: x[active_envs.astype(bool)])
            obs = apply_to_all_elements(obs,
                                        lambda x: x[active_envs.astype(bool)])
        if (len(replay_buffer) >= max(config.batch_size,
                                      config.steps_before_update) and
                steps_since_update >= config.steps_per_update):
            steps_since_update = 0
            print('Updating at time step %i' % t)
            model.prep_training(device='cuda' if config.use_gpu else 'cpu')
            for u_i in range(config.num_updates):
                sample = replay_buffer.sample(
                    config.batch_size, to_gpu=config.use_gpu,
                    state_inds=(config.intrinsic_reward == 1))
                if config.intrinsic_reward == 0:  # no intrinsic reward
                    intr_rews = None
                    state_inds = None
                else:
                    sample, state_inds = sample
                    novelties = get_count_based_novelties(
                        env, state_inds,
                        device='cuda' if config.use_gpu else 'cpu')
                    intr_rews = get_intrinsic_rewards(
                        novelties, config, intr_rew_rms, update_irrms=False,
                        device='cuda' if config.use_gpu else 'cpu')
                model.update_critic(sample, logger=logger, intr_rews=intr_rews)
                model.update_policies(sample, logger=logger)
                model.update_all_targets()
        if len(recent_ep_extr_rews) > 10:
            logger.add_scalar('episode_rewards/extrinsic/mean',
                              np.mean(recent_ep_extr_rews), t)
            logger.add_scalar('episode_lengths/mean',
                              np.mean(recent_ep_lens), t)
            if config.intrinsic_reward == 1:
                for i in range(n_intr_rew_types):
                    for j in range(config.num_agents):
                        logger.add_scalar(
                            'episode_rewards/intrinsic%i_agent%i/mean' % (i, j),
                            np.mean(recent_ep_intr_rews[i][j]), t)
            for i in range(config.num_agents):
                logger.add_scalar('agent%i/n_found_treasures' % i,
                                  np.mean(recent_found_treasures[i]), t)
            logger.add_scalar('total_n_found_treasures',
                              sum(np.array(recent_found_treasures[i])
                                  for i in range(config.num_agents)).mean(), t)
        if t % config.save_interval < config.n_rollout_threads:
            model.prep_training(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_%isteps.pt' % (t + 1)))
            model.save(run_dir / 'model.pt')
        t += active_envs.sum()
    model.prep_training(device='cpu')
    model.save(run_dir / 'model.pt')
    logger.close()
    env.close(force=(config.env_type == 'vizdoom'))
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    ##################### INITIALIZE FROM SAVED? ###########################
    # init_from_saved and model_path are assumed to be module-level settings
    if init_from_saved:
        if model_path is not None:
            maddpg = MADDPG.init_from_save(model_path)
            print("Initialized from saved model")
    # --------------------------------------------------------------------- #
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)
    # used for learning (updates)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    # this buffer only stores the global rewards; it is not used for updates
    g_storage_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions, maddpg)
            # Reward shaping using D++ / D. The rewards now contain global as
            # well as shaped rewards; keep the global rewards for logging and
            # use the shaped rewards for updates.
            use_dpp = True  # choose which reward to use
            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = np.array([d_rewards])
            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = np.array([g_rewards])
            if use_dpp:
                rewards = d_rewards
            else:
                rewards = g_rewards
            # ----------------------------------------------------------- #
            # buffer used for updates
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            # push global rewards into g_storage_buffer for plotting
            g_storage_buffer.push(obs, agent_actions, g_rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        # take the global reward out of g_storage_buffer
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    model_dir = Path('./models') / config.env_name / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    os.system("cp shape.txt {}".format(run_dir))
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    # number of threads used for training
    if not config.use_cuda:
        torch.set_num_threads(config.n_training_threads)
    # processes for parallel environment sampling
    env = make_parallel_env(config.num_agents, config.n_rollout_threads,
                            run_num, config.shape_file)
    maddpg = MADDPG.init_from_env(env=env,
                                  agent_alg=config.agent_alg,
                                  cripple_alg=config.cripple_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim,
                                  discrete_action=config.discrete_action)
    # maddpg = MADDPG.init_from_save(model_dir / 'run1' / 'model.pt')
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    t = 0
    a_loss = []
    c_loss = []
    rewss = []
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')  # show for the first time
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        # if config.display:
        #     for env_show in env.envs:
        #         env_show.render('human', close=False)
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            # actions = [np.array([i.tolist().index(1.0) for i in action]) for action in actions_one_hot]
            for i in actions:
                for j in i:
                    j[1] *= np.pi  # rescale the second action dimension by pi
            next_obs, rewards, dones, infos = env.step(actions)
            # if config.display:
            #     for env_show in env.envs:
            #         env_show.render('human', close=False)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_cuda:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=config.use_cuda,
                                                      norm_rews=True)
                        maddpg.update(sample, a_i, logger=logger,
                                      actor_loss_list=a_loss,
                                      critic_loss_list=c_loss)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        rewss.append(ep_rews)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            # print('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            maddpg.save(str(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))))
            maddpg.save(str(run_dir / 'model.pt'))
    maddpg.save(str(run_dir / 'model.pt'))
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num)
    model = make_model(env, config)
    replay_buffer = ReplayBuffer(
        config.buffer_length, model.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    recent_reliable_obs = [[None for i in range(model.nagents)]
                           for e in range(config.n_rollout_threads)]
    print("Start training agents...")
    t = 0
    steps, avg_ep_rew = 0, 0
    t_start = time.time()
    each_rws = []
    large_rws = []
    small_rws = []
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        model.prep_rollouts(device='cpu')
        obs, validity = env.reset()
        obs = get_reliable_obs(model, obs, recent_reliable_obs, validity)
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(model.nagents)]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos, next_validity = env.step(actions)
            next_obs = get_reliable_obs(model, next_obs, recent_reliable_obs,
                                        next_validity)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            validity = next_validity
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if config.use_gpu:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config.num_updates):
                    sample = replay_buffer.sample(config.batch_size,
                                                  to_gpu=config.use_gpu)
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        steps += 1
        for a_i, a_ep_rew in enumerate(ep_rews):
            avg_ep_rew += a_ep_rew
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
        each_rws.append(ep_rews[0])
        small_rws.append(sum(ep_rews))
        large_rws.append(sum(ep_rews) * config.episode_length)
        logger.add_scalar('large_rewards', large_rws[-1], ep_i)
        logger.add_scalar('small_rewards', small_rws[-1], ep_i)
        if ep_i > 1 and (ep_i + 1) % config.save_interval < config.n_rollout_threads:
            print("Episodes %i of %i" % (ep_i + config.n_rollout_threads,
                                         config.n_episodes), end=' ')
            print('mean_episode_rewards: %f, time: %f' %
                  (avg_ep_rew / steps * config.episode_length,
                   round(time.time() - t_start, 3)))
            t_start = time.time()
            steps, avg_ep_rew = 0, 0
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            if (ep_i + 1) % (config.save_interval * 5) < config.n_rollout_threads:
                model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')
            large_rew_file_name = log_dir / (config.model_name + '_large_rewards.pkl')
            with open(large_rew_file_name, 'wb') as fp:
                pickle.dump(large_rws, fp)
            small_rew_file_name = log_dir / (config.model_name + '_small_rewards.pkl')
            with open(small_rew_file_name, 'wb') as fp:
                pickle.dump(small_rws, fp)
            each_rew_file_name = log_dir / (config.model_name + '_each_rewards.pkl')
            with open(each_rew_file_name, 'wb') as fp:
                pickle.dump(each_rws, fp)
    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
    print("Agent training complete!\n")
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    # model_path = (Path('./models') / config.env_id / config.model_name /
    #               ('run%i' % config.run_num))
    # model_path = model_path / 'model.pt'
    # maddpg = MADDPG.init_runner_from_save(model_path)
    maddpg = MADDPG.init_from_env_with_delay(env,
                                             agent_alg=config.agent_alg,
                                             adversary_alg=config.adversary_alg,
                                             tau=config.tau,
                                             lr=config.lr,
                                             hidden_dim=config.hidden_dim,
                                             delay_step=1)
    delay_step = 1
    # the observation space is widened to hold the delayed 2-D actions
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] + delay_step * 2 for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        # queue of the last delay_step actions, seeded with zero actions
        zero_agent_actions = [np.array([0.0, 0.0]) for _ in range(maddpg.nagents)]
        last_agent_actions = [zero_agent_actions for _ in range(delay_step)]
        # augment each observation with the delayed actions
        # (note: with delay_step > 1, only the last queued action would
        # survive this append, since agent_obs is re-used each iteration)
        for a_i, agent_obs in enumerate(obs[0]):
            for d in range(len(last_agent_actions)):
                obs[0][a_i] = np.append(agent_obs, last_agent_actions[d][a_i])
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment; with a delay, the action
            # actually executed is the oldest one in the queue
            if delay_step == 0:
                actions = [[ac[i] for ac in agent_actions]
                           for i in range(config.n_rollout_threads)]
            else:
                agent_actions_tmp = [[ac[i] for ac in agent_actions]
                                     for i in range(config.n_rollout_threads)][0]
                actions = last_agent_actions[0]
                last_agent_actions = last_agent_actions[1:]
                last_agent_actions.append(agent_actions_tmp)
                actions = [actions]
            next_obs, rewards, dones, infos = env.step(actions)
            for a_i, agent_obs in enumerate(next_obs[0]):
                for d in range(len(last_agent_actions)):
                    if a_i == 2:
                        next_obs[0][a_i] = np.append(agent_obs,
                                                     4 * last_agent_actions[d][a_i])
                    else:
                        next_obs[0][a_i] = np.append(agent_obs,
                                                     3 * last_agent_actions[d][a_i])
            # scale the stored actions to match the observation augmentation above
            agent_actions[0] = agent_actions[0] * 3
            agent_actions[1] = agent_actions[1] * 3
            agent_actions[2] = agent_actions[2] * 4
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents - 1):  # do not update the runner
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    # maddpg.update_all_targets()
                    maddpg.update_adversaries()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            # logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            logger.add_scalars('agent%i/mean_episode_rewards' % a_i,
                               {'reward': a_ep_rew}, ep_i)
        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')
    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config): model_dir = Path('./models') / config.env_id / config.model_name if not model_dir.exists(): curr_run = 'run1' else: exst_run_nums = [ int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir() if str(folder.name).startswith('run') ] if len(exst_run_nums) == 0: curr_run = 'run1' else: curr_run = 'run%i' % (max(exst_run_nums) + 1) run_dir = model_dir / curr_run log_dir = run_dir / 'logs' os.makedirs(log_dir) logger = SummaryWriter(str(log_dir)) torch.manual_seed(config.seed) np.random.seed(config.seed) if not USE_CUDA: torch.set_num_threads(config.n_training_threads) env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed, config.discrete_action) if config.load_adv == True: model_path = (Path('./models') / config.env_id / config.model_name / ('run%i' % config.run_num)) model_path = model_path / 'model.pt' maddpg = MADDPG.init_from_env_with_runner_delay_unaware( env, agent_alg=config.agent_alg, adversary_alg=config.adversary_alg, tau=config.tau, lr=config.lr, hidden_dim=config.hidden_dim, file_name=model_path) else: maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg, adversary_alg=config.adversary_alg, tau=config.tau, lr=config.lr, hidden_dim=config.hidden_dim) replay_buffer = ReplayBuffer( config.buffer_length, maddpg.nagents, [obsp.shape[0] for obsp in env.observation_space], [ acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space ]) t = 0 delay_step = config.delay_step for ep_i in range(0, config.n_episodes, config.n_rollout_threads): print( "Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes)) obs = env.reset() maddpg.prep_rollouts(device='gpu') explr_pct_remaining = max( 0, config.n_exploration_eps - ep_i) / config.n_exploration_eps maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining) maddpg.reset_noise() if config.env_id == 'simple_speaker_listener': zero_agent_actions = [ np.array([[0, 0, 0]]), np.array([[0, 0, 0, 0, 0]]) ] elif config.env_id == 'simple_spread': zero_agent_actions = [ np.array([[0.0, 0.0, 0.0, 0.0, 0.0]]) for _ in range(maddpg.nagents) ] elif config.env_id == 'simple_tag': zero_agent_actions = [ np.array([0.0, 0.0]) for _ in range(maddpg.nagents) ] last_agent_actions = [zero_agent_actions for _ in range(delay_step)] for et_i in range(config.episode_length): torch_obs = [ Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False) for i in range(maddpg.nagents) ] # get actions as torch Variables torch_agent_actions = maddpg.step(torch_obs, explore=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] if config.load_adv: if delay_step == 0: actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] else: agent_actions_tmp = [[ ac[i] for ac in agent_actions ] for i in range(config.n_rollout_threads)][0][:] actions = last_agent_actions[0] actions.append(agent_actions_tmp[-1]) last_agent_actions = last_agent_actions[1:] last_agent_actions.append(agent_actions_tmp[:2]) actions = [actions] next_obs, rewards, dones, infos = env.step( copy.deepcopy(actions)) else: if delay_step == 0: actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] else: actions = [[ac[i] for ac in last_agent_actions[0]] for i in range(config.n_rollout_threads)] last_agent_actions.pop(0) last_agent_actions.append(agent_actions) next_obs, rewards, dones, infos = env.step( copy.deepcopy(actions)) 
print('1', obs, agent_actions, rewards, next_obs, dones) replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) obs = next_obs t += config.n_rollout_threads if (len(replay_buffer) >= config.batch_size and (t % config.steps_per_update) < config.n_rollout_threads): if USE_CUDA: maddpg.prep_training(device='gpu') else: maddpg.prep_training(device='cpu') for u_i in range(config.n_rollout_threads): if config.load_adv: for a_i in range(maddpg.nagents - 1): #do not update the runner sample = replay_buffer.sample(config.batch_size, to_gpu=USE_CUDA) maddpg.update(sample, a_i, logger=logger) # maddpg.update_all_targets() maddpg.update_adversaries() else: for a_i in range( maddpg.nagents): #do not update the runner sample = replay_buffer.sample(config.batch_size, to_gpu=USE_CUDA) maddpg.update(sample, a_i, logger=logger) maddpg.update_all_targets() maddpg.prep_rollouts(device='gpu') ep_rews = replay_buffer.get_average_rewards(config.episode_length * config.n_rollout_threads) for a_i, a_ep_rew in enumerate(ep_rews): logger.add_scalars('agent%i/mean_episode_rewards' % a_i, {'reward': a_ep_rew}, ep_i) if ep_i % config.save_interval < config.n_rollout_threads: os.makedirs(run_dir / 'incremental', exist_ok=True) maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) maddpg.save(run_dir / 'model.pt') maddpg.save(run_dir / 'model.pt') env.close() logger.export_scalars_to_json(str(log_dir / 'summary.json')) logger.close()
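When a pre-trained runner is loaded, the branch above deliberately updates only the first nagents - 1 agents. The general pattern of freezing a subset of agents during a MADDPG update round, as a hedged sketch (the helper name and frozen_ids parameter are ours):

def update_trainable_agents(algo, buffer, batch_size, frozen_ids=(), to_gpu=False, logger=None):
    """Run one MADDPG update round, skipping frozen (pre-trained) agents."""
    for a_i in range(algo.nagents):
        if a_i in frozen_ids:
            continue  # e.g. the fixed runner policy
        sample = buffer.sample(batch_size, to_gpu=to_gpu)
        algo.update(sample, a_i, logger=logger)
    # Sync target networks; the code above instead calls update_adversaries()
    # so the frozen runner's targets are left untouched.
    algo.update_all_targets()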
def run(config): device = torch.device('cuda' if USE_CUDA else 'cpu') print('Using device:', device) if device.type == 'cuda': print(torch.cuda.get_device_name(0)) print('Memory Usage:') print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB') print('Reserved: ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB') # memory_cached is deprecated in favour of memory_reserved model_dir = Path('./models') / config.env_id / config.model_name if not model_dir.exists(): curr_run = 'run1' else: exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir() if str(folder.name).startswith('run')] if len(exst_run_nums) == 0: curr_run = 'run1' else: curr_run = 'run%i' % (max(exst_run_nums) + 1) run_dir = model_dir / curr_run log_dir = run_dir / 'logs' os.makedirs(log_dir) print(str(log_dir)) logger = SummaryWriter(str(log_dir)) #logger = None f = open(run_dir / "hyperparameters.txt", "w+") f.write(str(config)) f.close() torch.manual_seed(config.seed) np.random.seed(config.seed) if not USE_CUDA: torch.set_num_threads(config.n_training_threads) env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed, config.discrete_action, config.benchmark) maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg, adversary_alg=config.adversary_alg, tau=config.tau, lr=config.lr, hidden_dim=config.hidden_dim, stochastic=config.stochastic, commonCritic=config.commonCritic, gasil=config.gasil, dlr=config.dlr, lambda_disc=config.lambda_disc, batch_size_disc=config.batch_size_disc, dynamic=config.dynamic) replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents, [obsp.shape[0] for obsp in env.observation_space], [acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space]) expert_replay_buffer = PriorityReplayBuffer(config.expert_buffer_length, config.episode_length, maddpg.nagents, [obsp.shape[0] for obsp in env.observation_space], [acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space]) t = 0 agent_info = [[[] for i in range(config.n_rollout_threads)]] reward_info = [] total_returns = [] eval_trajectories = [] expert_average_returns = [] trajectories = [] durations = [] start_time = time.time() expert_trajectories = [] evaluation_rewards = [] for ep_i in range(0, config.n_episodes, config.n_rollout_threads): print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes)) if ep_i%100 == 0: mins = (time.time() - start_time)/60 durations.append(mins) print(mins, "minutes") start_time = time.time() obs = env.reset() # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor maddpg.prep_rollouts(device='cpu') explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining) maddpg.reset_noise() current_episode = [[] for i in range(config.n_rollout_threads)] current_trajectory = [[] for i in range(config.n_rollout_threads)] current_entities = [] total_dense = None if config.store_traj: cur_state_ent = env.getStateEntities() for i in range(config.n_rollout_threads): current_entities.append(cur_state_ent[i]) cur_state = env.getState() for i in range(config.n_rollout_threads): current_trajectory[i].append(cur_state[i]) for et_i in range(config.episode_length): # rearrange observations to be per agent, and convert to torch Variable torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False) for i in range(maddpg.nagents)] # get actions as torch
Variables torch_agent_actions = maddpg.step(torch_obs, explore=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] # rearrange actions to be per environment actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] next_obs, rewards, dones, infos = env.step(actions) if config.store_traj: cur_state = env.getState() for i in range(config.n_rollout_threads): current_trajectory[i].append(cur_state[i]) for i in range(config.n_rollout_threads): current_episode[i].append([obs[i], actions[i]]) if config.benchmark: #Fix this for i, info in enumerate(infos): agent_info[-1][i].append(info['n']) if et_i == 0: total_dense = rewards else: total_dense = total_dense + rewards replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) obs = next_obs t += config.n_rollout_threads if (len(replay_buffer) >= config.batch_size and (t % config.steps_per_update) < config.n_rollout_threads and ((expert_replay_buffer.num_traj*config.episode_length >= config.batch_size_disc) == (maddpg.gasil))): if USE_CUDA: maddpg.prep_training(device='gpu') else: maddpg.prep_training(device='cpu') if maddpg.gasil: for update_i in range(config.num_disc_updates): sample_normal = replay_buffer.sample(config.batch_size,to_gpu=USE_CUDA, norm_rews = False) sample_expert = expert_replay_buffer.sample(config.batch_size_disc, to_gpu=USE_CUDA) maddpg.gasil_disc_update(sample_normal, sample_expert, 0, logger=logger, num_disc_permutations = config.num_disc_permutations) for update_i in range(config.num_AC_updates): sample_normal = replay_buffer.sample(config.batch_size,to_gpu=USE_CUDA, norm_rews = False) maddpg.gasil_AC_update(sample_normal, 0, episode_num = ep_i, logger=logger, num_AC_permutations = config.num_AC_permutations) else: for update_i in range(config.num_AC_updates): sample_normal = replay_buffer.sample(config.batch_size,to_gpu=USE_CUDA, norm_rews = False) maddpg.update(sample_normal, 0, logger=logger, num_AC_permutations = config.num_AC_permutations) maddpg.update_all_targets() maddpg.prep_rollouts(device='cpu') total_returns.append(total_dense) if maddpg.gasil: expert_replay_buffer.push(current_episode, total_dense, config.n_rollout_threads, current_entities, current_trajectory, config.store_traj) expert_average_returns.append(expert_replay_buffer.get_average_return()) if config.store_traj: for i in range(config.n_rollout_threads): trajectories.append([current_entities[i], current_trajectory[i]]) ep_rews = replay_buffer.get_average_rewards( config.episode_length * config.n_rollout_threads) for a_i, a_ep_rew in enumerate(ep_rews): logger.add_scalars('agent%i/rew' % a_i, {'mean_episode_rewards': a_ep_rew}, ep_i) logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i) #save mean episode rewards #save benchmarking data agent_info.append([[] for i in range(config.n_rollout_threads)]) reward_info.append(ep_rews) if ep_i % config.save_interval < config.n_rollout_threads: os.makedirs(run_dir / 'incremental', exist_ok=True) maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) maddpg.save(run_dir / 'model.pt') #save the trajectories in the expert replay buffer trajec = expert_replay_buffer.get_trajectories() if config.store_traj: expert_trajectories.append(trajec) if ep_i % config.eval_interval < config.n_rollout_threads: current_eval = [] current_trajectories = [] for ep_i_eval in range(0, config.n_eval_episodes, config.n_rollout_threads): obs = env.reset() total_eval = None maddpg.prep_rollouts(device='cpu') if 
config.store_traj: current_trajectory = [[] for i in range(config.n_rollout_threads)] current_entities = [] cur_state_ent = env.getStateEntities() for i in range(config.n_rollout_threads): current_entities.append(cur_state_ent[i]) cur_state = env.getState() for i in range(config.n_rollout_threads): current_trajectory[i].append(cur_state[i]) for et_i in range(config.episode_length): torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False) for i in range(maddpg.nagents)] torch_agent_actions = maddpg.step(torch_obs, explore=False) agent_actions = [ac.data.numpy() for ac in torch_agent_actions] actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] next_obs, rewards, dones, infos = env.step(actions) if config.store_traj: cur_state = env.getState() for i in range(config.n_rollout_threads): current_trajectory[i].append(cur_state[i]) if et_i == 0: total_eval = rewards else: total_eval = total_eval + rewards obs = next_obs current_eval.append(total_eval) if config.store_traj: for i in range(config.n_rollout_threads): current_trajectories.append([current_entities[i], current_trajectory[i]]) if config.store_traj: eval_trajectories.append(current_trajectories) evaluation_rewards.append(current_eval)
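The expert_replay_buffer above keeps the highest-return episodes so the discriminator can treat them as expert demonstrations (the GASIL idea). A minimal, self-contained sketch of such a priority buffer using heapq; this is our reconstruction of the concept, not the repository's PriorityReplayBuffer:

import heapq
import itertools

class TopKEpisodeBuffer:
    """Keeps the K highest-return episodes (a min-heap keyed on episode return)."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.heap = []                    # entries: (return, tiebreak, episode)
        self.counter = itertools.count()  # tiebreak so episodes are never compared

    def push(self, episode, episode_return):
        entry = (episode_return, next(self.counter), episode)
        if len(self.heap) < self.capacity:
            heapq.heappush(self.heap, entry)
        elif episode_return > self.heap[0][0]:
            heapq.heapreplace(self.heap, entry)  # evict the current worst episode

    def get_average_return(self):
        return sum(r for r, _, _ in self.heap) / max(len(self.heap), 1)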
def run(config): model_dir = Path('./models') / config.model_name if not model_dir.exists(): curr_run = 'run1' else: exst_run_nums = [ int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir() if str(folder.name).startswith('run') ] if len(exst_run_nums) == 0: curr_run = 'run1' else: curr_run = 'run%i' % (max(exst_run_nums) + 1) run_dir = model_dir / curr_run log_dir = run_dir / 'logs' os.makedirs(log_dir) logger = SummaryWriter(str(log_dir)) torch.manual_seed(config.seed) np.random.seed(config.seed) env = gym.make("intersection-multiagent-v0") maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg, adversary_alg=config.adversary_alg, tau=config.tau, lr=config.lr, hidden_dim=config.hidden_dim) replay_buffer = ReplayBuffer( config.buffer_length, maddpg.nagents, [obsp.shape[0] for obsp in env.observation_space], [ acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space ]) t = 0 delay_step = config.delay_step for ep_i in range(0, config.n_episodes, config.n_rollout_threads): print( "Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes)) obs = env.reset() # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor maddpg.prep_rollouts(device='gpu') explr_pct_remaining = max( 0, config.n_exploration_eps - ep_i) / config.n_exploration_eps maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining) maddpg.reset_noise() agent_obs = [] for i in range(4): agent_obs.append( np.array([ obs[i % 4], obs[(i + 1) % 4], obs[(i + 2) % 4], obs[(i + 3) % 4] ]).flatten()) obs = np.array([agent_obs]) zero_agent_actions = [1, 1, 1, 1] last_agent_actions = [zero_agent_actions for _ in range(delay_step)] for et_i in range(config.episode_length): # rearrange observations to be per agent, and convert to torch Variable torch_obs = [ torch.FloatTensor(np.vstack(obs[:, i])) for i in range(maddpg.nagents) ] # get actions as torch Variables # print(obs) torch_agent_actions = maddpg.step(torch_obs, explore=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] # print(agent_actions) # rearrange actions to be per environment if delay_step == 0: actions = [np.argmax(agent_actions[i][0]) for i in range(4)] else: future_actions = [ np.argmax(agent_actions[i][0]) for i in range(4) ] actions = last_agent_actions[0] last_agent_actions = last_agent_actions[1:] last_agent_actions.append(future_actions) next_obs, rewards, dones, infos = env.step(actions) # print(rewards) replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) if dones[0][0]: break obs = next_obs t += config.n_rollout_threads if (len(replay_buffer) >= config.batch_size and (t % config.steps_per_update) < config.n_rollout_threads): if USE_CUDA: maddpg.prep_training(device='gpu') else: maddpg.prep_training(device='cpu') for u_i in range(config.n_rollout_threads): for a_i in range( maddpg.nagents): #do not update the runner sample = replay_buffer.sample(config.batch_size, to_gpu=USE_CUDA) maddpg.update(sample, a_i, logger=logger) maddpg.update_all_targets() maddpg.prep_rollouts(device='gpu') ep_rews = replay_buffer.get_average_rewards(config.episode_length * config.n_rollout_threads) for a_i, a_ep_rew in enumerate(ep_rews): # logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i) logger.add_scalars('agent%i/mean_episode_rewards' % a_i, {'reward': a_ep_rew}, ep_i) if ep_i % config.save_interval < config.n_rollout_threads: 
os.makedirs(run_dir / 'incremental', exist_ok=True) maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) maddpg.save(run_dir / 'model.pt') maddpg.save(run_dir / 'model.pt') env.close() logger.export_scalars_to_json(str(log_dir / 'summary.json')) logger.close()
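Almost every run() in this collection re-implements the same 'runN' directory scan. Factored into one helper (the function name is ours) it reads:

from pathlib import Path

def next_run_dir(model_dir: Path) -> Path:
    """Return model_dir/runN where N is one past the highest existing run."""
    if not model_dir.exists():
        return model_dir / 'run1'
    nums = [int(p.name.split('run')[1]) for p in model_dir.iterdir()
            if p.name.startswith('run')]
    return model_dir / ('run%i' % (max(nums) + 1 if nums else 1))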
class Agent(): def __init__(self, num_agents, state_size, action_size, opts): self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.opts = opts # Actor Network self.actor_local = ActorNet(state_size, action_size, fc1_units=opts.a_fc1, fc2_units=opts.a_fc2).to(opts.device) self.actor_target = ActorNet(state_size, action_size, fc1_units=opts.a_fc1, fc2_units=opts.a_fc2).to(opts.device) self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), lr=opts.actor_lr) # Critic Network self.critic_local = CriticNet(state_size, action_size, fc1_units=opts.c_fc1, fc2_units=opts.c_fc2).to(opts.device) self.critic_target = CriticNet(state_size, action_size, fc1_units=opts.c_fc1, fc2_units=opts.c_fc2).to(opts.device) self.critic_optimizer = torch.optim.Adam( self.critic_local.parameters(), lr=opts.critic_lr, weight_decay=opts.critic_weight_decay) # Noise process self.noise = OUNoise((num_agents, action_size), opts.random_seed) self.step_idx = 0 # Replay memory self.memory = ReplayBuffer(action_size, opts.buffer_size, opts.batch_size, opts.random_seed, opts.device) def step(self, state, action, reward, next_state, done): for i in range(self.num_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) self.step_idx += 1 is_learn_iteration = (self.step_idx % self.opts.learn_every) == 0 is_update_iteration = (self.step_idx % self.opts.update_every) == 0 if len(self.memory) > self.opts.batch_size: if is_learn_iteration: experiences = self.memory.sample() self.learn(experiences, self.opts.gamma) if is_update_iteration: soft_update(self.critic_local, self.critic_target, self.opts.tau) soft_update(self.actor_local, self.actor_target, self.opts.tau) def act(self, state): state = torch.from_numpy(state).float().to(self.opts.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() action += self.noise.sample() return np.clip(action, self.opts.minimum_action_value, self.opts.maximum_action_value) def save(self): torch.save(self.critic_local.state_dict(), self.opts.output_data_path + "critic_local.pth") torch.save(self.critic_target.state_dict(), self.opts.output_data_path + "critic_target.pth") torch.save(self.actor_local.state_dict(), self.opts.output_data_path + "actor_local.pth") torch.save(self.actor_target.state_dict(), self.opts.output_data_path + "actor_target.pth") def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences states = tensor(states, self.opts.device) actions = tensor(actions, self.opts.device) rewards = tensor(rewards, self.opts.device) next_states = tensor(next_states, self.opts.device) mask = tensor(1 - dones, self.opts.device) # Update critic actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * mask) # Compute & minimize critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Update actor actions_pred = self.actor_local(states) # Compute & minimize critic loss actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step()
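The Agent above relies on soft_update and OUNoise defined elsewhere in the codebase. For reference, standard implementations of both; this is a sketch, and the repository's versions may differ in detail:

import copy
import numpy as np

def soft_update(local_model, target_model, tau):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for t_param, l_param in zip(target_model.parameters(), local_model.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta, self.sigma = theta, sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        self.state = copy.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state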
def run(halite_env: BaseEnv, load_latest: bool=False): config = halite_env.config model_path, run_num, run_dir, log_dir = run_setup(config.model_name, get_latest_model=load_latest) os.makedirs(log_dir) logger = SummaryWriter(str(log_dir)) torch.manual_seed(run_num) np.random.seed(run_num) # Build MAAC model if model_path is None: model = AttentionSAC(halite_env.agent_type_topologies, tau=config.tau, pi_lr=config.pi_lr, q_lr=config.q_lr, gamma=config.gamma, pol_hidden_dim=config.pol_hidden_dim, critic_hidden_dim=config.critic_hidden_dim, attend_heads=config.attend_heads, reward_scale=config.reward_scale) else: model = AttentionSAC.init_from_save(model_path, load_critic=True) # Build replay buffer replay_buffer = ReplayBuffer(config.buffer_length) prev_time = time.perf_counter() t = 0 for ep_i in range(0, config.n_episodes, config.n_rollout_threads): curr_time = time.perf_counter() print("Episodes %i-%i of %i (%is)" % (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes, (curr_time - prev_time))) model.prep_rollouts(device='cpu') game_reward = halite_env.simulate(lambda o: model.step(o, explore=True), replay_buffer) t += config.n_rollout_threads if (replay_buffer.length() >= config.batch_size and (t % config.games_per_update) < config.n_rollout_threads): print("Training") if config.use_gpu: model.prep_training(device='gpu') else: model.prep_training(device='cpu') for u_i in range(config.num_updates): sample: List[Dict[AgentKey, AgentReplayFrame]] = replay_buffer.sample(config.batch_size) # print("Original sample size", len(sample)) # print("Preprocessing to batch structure") sample: Dict[AgentKey, BatchedAgentReplayFrame] = preprocess_to_batch(sample, to_gpu=config.use_gpu) # print("Filtered sample size", len(sample)) # if len(sample) < 5: # print("Sample size keys:", sample.keys()) # print("Updating model critic") model.update_critic(sample, logger=logger) # print("Updating model policies") model.update_policies(sample, logger=logger) model.update_all_targets() model.prep_rollouts(device='cpu') ep_rews = replay_buffer.get_average_rewards(config.episode_length * config.n_rollout_threads) for k, v in ep_rews.items(): logger.add_scalar('agent%s/mean_episode_rewards' % str(k), v, ep_i) logger.add_scalar("global_env_rewards", game_reward, ep_i) if ep_i % config.save_interval < config.n_rollout_threads: print("Saving") model.prep_rollouts(device='cpu') os.makedirs(run_dir / 'incremental', exist_ok=True) model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) model.save(run_dir / 'model.pt') print("run_dir", run_dir) prev_time = curr_time model.save(run_dir / 'model.pt') logger.export_scalars_to_json(str(log_dir / 'summary.json')) logger.close()
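Several of the replay buffers in this collection expose a norm_rews flag when sampling. The conventional meaning, and a plausible reading of this codebase, is per-batch reward standardization; a sketch under that assumption:

import numpy as np

def normalize_rewards(rewards, eps=1e-8):
    """Standardize a batch of rewards to zero mean / unit variance.
    Note: if every reward in the batch is identical, the result is all zeros."""
    rewards = np.asarray(rewards, dtype=np.float64)
    return (rewards - rewards.mean()) / (rewards.std() + eps)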
def train(config, dir_manager=None, logger=None, pbar="default_pbar"): # A few safety checks check_training_args(config) # Creates a directory manager that encapsulates our directory-tree structure if dir_manager is None: dir_manager = DirectoryManager(agent_alg=config.agent_alg, env_name=config.env_name, desc=config.desc, seed=config.seed) dir_manager.create_directories() # Creates logger and prints config if logger is None: logger = create_logger('MASTER', config.log_level, dir_manager.seed_dir / 'logger.out') logger.debug(config_to_str(config)) # Creates a progress-bar if type(pbar) is str: if pbar == "default_pbar": pbar = tqdm() if pbar is not None: pbar.n = 0 pbar.desc += f'{dir_manager.storage_dir.name}/{dir_manager.experiment_dir.name}/{dir_manager.seed_dir.name}' pbar.total = config.n_episodes # Encapsulates in a dict all user-defined params that concern the world (scenario.make_world()) world_params = {} world_params['use_dense_rewards'] = config.use_dense_rewards if config.env_name == 'chase': if config.n_preys is not None: world_params['n_preys'] = config.n_preys if config.n_preds is not None: world_params['n_preds'] = config.n_preds if config.prey_variance is not None: world_params['prey_variance'] = config.prey_variance if config.individual_reward is not None: world_params['individual_reward'] = config.individual_reward elif config.env_name == 'gather': if config.n_agents is not None: world_params['n_agents'] = config.n_agents elif config.env_name == 'intersection': if config.n_agents is not None: world_params['n_agents'] = config.n_agents elif config.env_name == 'bounce': world_params['episode_length'] = config.episode_length if config.line_length is not None: world_params['line_length'] = config.line_length elif config.env_name == 'compromise': if config.line_length is not None: world_params['line_length'] = config.line_length if config.show_all_landmarks is not None: world_params['show_all_landmarks'] = config.show_all_landmarks elif config.env_name == 'imitation': if config.staged is not None: world_params['staged'] = config.staged if config.set_trap is not None: world_params['set_trap'] = config.set_trap elif config.env_name == 'intersection': if config.by_stander is not None: world_params['by_stander'] = config.by_stander elif config.env_name == 'spread': if config.n_agents is not None: world_params['n_agents'] = config.n_agents if config.shuffle_landmarks is not None: world_params['shuffle_landmarks'] = config.shuffle_landmarks if config.color_objects is not None: world_params['color_objects'] = config.color_objects if config.small_agents is not None: world_params['small_agents'] = config.small_agents save_dict_to_json(world_params, str(dir_manager.seed_dir / 'world_params.json')) # Encapsulates in a dict all user-defined params that concern the environment (multiagent.environment.MultiAgentEnv) env_params = {} env_params['env_name'] = config.env_name env_params['use_max_speed'] = config.use_max_speed save_dict_to_json(env_params, str(dir_manager.seed_dir / 'env_params.json')) # Sets the random seeds (for reproducibility) set_seeds(config.seed) # Initializes environments env = make_parallel_env(config.env_name, config.n_rollout_threads, config.seed, use_discrete_action=config.use_discrete_action, use_max_speed=config.use_max_speed, world_params=world_params) if not config.use_cuda: torch.set_num_threads(config.n_training_threads) # Initialize the algo algorithm = init_from_config(env, config, logger) replay_buffer = ReplayBuffer(max_steps=config.buffer_length, 
num_agents=algorithm.nagents, obs_dims=[obsp.shape[0] for obsp in env.observation_space], ac_dims=[acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space]) # Creates recorders and stores basic info regarding agent types os.makedirs(dir_manager.recorders_dir, exist_ok=True) train_recorder = algorithm.create_train_recorder() train_recorder.tape['agent_colors'] = env.agent_colors # Saves initial model best_eval_reward = 0. best_model = "model_ep0_best.pt" current_model = "model_ep0.pt" algorithm.save(dir_manager.seed_dir / current_model) algorithm.save(dir_manager.seed_dir / best_model) # Initializes step and episode counters step_i = 0 ep_steps = np.zeros(shape=(config.n_rollout_threads,), dtype=np.int) ep_dones = 0 ep_recorders = [EpisodeRecorder(stuff_to_record=['reward']) for _ in range(config.n_rollout_threads)] obs = env.reset() algorithm.set_exploration(begin_decay_proportion=config.begin_exploration_decay, n_episodes=config.n_episodes, end_decay_proportion=config.end_exploration_decay, initial_scale=config.init_noise_scale, final_scale=config.final_noise_scale, current_episode=ep_dones) # EPISODES LOOP while ep_dones < config.n_episodes: start_time = time.time() # ENVIRONMENT STEP # convert observations to torch Variable torch_obs = [Variable(torch.Tensor(obs[:, i]), requires_grad=False) for i in range(algorithm.nagents)] # get actions as torch Variables torch_agent_actions = algorithm.select_action(torch_obs, is_exploring=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] # rearrange actions to be per environment actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] # makes one step in the environment next_obs, rewards, dones, infos = env.step(actions) # put transitions in the memory buffer replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) # saves relevant info in episode recorders for i in range(config.n_rollout_threads): ep_recorders[i].add_step(obs[i], actions[i], rewards[i], next_obs[i]) # ending step obs = next_obs step_i += config.n_rollout_threads ep_steps += 1 # LEARNING STEP if (len(replay_buffer) >= config.batch_size * config.warmup) \ and (step_i % config.steps_per_update) < config.n_rollout_threads: # Prepares models to training if config.use_cuda: algorithm.prep_training(device='gpu') else: algorithm.prep_training(device='cpu') # Performs one algorithm update sample = replay_buffer.sample(config.batch_size, to_gpu=config.use_cuda, normalize_rewards=False) algorithm.update(sample, train_recorder) # Update target networks algorithm.update_all_targets() # Prepares models to go back in rollout phase algorithm.prep_rollouts(device='cpu') # EPISODE ENDINGS episodes_over = dones | (ep_steps >= config.episode_length) if any(episodes_over): if pbar is not None: pbar.update(sum(episodes_over)) for env_i, is_over in enumerate(episodes_over): if is_over: ep_dones += 1 ep_steps[env_i] = 0 # Reset environments obs[env_i] = env.reset(env_i=env_i) # Summarizes episode metrics train_recorder.append('total_reward', ep_recorders[env_i].get_total_reward()) # Reinitialise episode recorder ep_recorders[env_i] = EpisodeRecorder(stuff_to_record=['reward']) # Printing if one third of training is completed if (ep_dones - 1) % (config.n_episodes // 3) == 0 and ep_dones != config.n_episodes: step_time = time.time() - start_time logger.info(f"Episode {ep_dones}/{config.n_episodes}, " f"speed={round_to_two(float(config.n_rollout_threads) / step_time)}steps/s") # Sets exploration 
noise algorithm.set_exploration(begin_decay_proportion=config.begin_exploration_decay, n_episodes=config.n_episodes, end_decay_proportion=config.end_exploration_decay, initial_scale=config.init_noise_scale, final_scale=config.final_noise_scale, current_episode=ep_dones) # BOOK-KEEPING if ep_dones % config.episodes_per_save < config.n_rollout_threads: # Model checkpoints if config.save_incrementals: os.makedirs(dir_manager.incrementals_dir, exist_ok=True) algorithm.save(dir_manager.incrementals_dir / ('model_ep%i.pt' % (ep_dones + 1))) os.remove(dir_manager.seed_dir / current_model) current_model = f"model_ep{ep_dones}.pt" algorithm.save(dir_manager.seed_dir / current_model) logger.debug('Saving model checkpoint') # Current model evaluation (run episodes without exploration) if config.n_evaluation_episodes > 0: logger.debug(f'Evaluating model for {config.n_evaluation_episodes} episodes') set_seeds(config.evaluation_seed) # fixed seed for evaluation env.seed(config.evaluation_seed) eval_config = get_evaluation_args(overwritten_args="") eval_config.storage_name = dir_manager.storage_dir.name eval_config.experiment_num = int(dir_manager.experiment_dir.stem.strip('experiment')) eval_config.seed_num = int(dir_manager.seed_dir.stem.strip('seed')) eval_config.render = False eval_config.n_episodes = config.n_evaluation_episodes eval_config.last_model = True eval_config.episode_length = config.episode_length eval_reward = np.vstack(evaluate(eval_config)) train_recorder.append('eval_episodes', ep_dones) train_recorder.append('eval_total_reward', eval_reward) if eval_reward.mean() > best_eval_reward: logger.debug("New best model") os.remove(dir_manager.seed_dir / best_model) best_model = f"model_ep{ep_dones}_best.pt" algorithm.save(dir_manager.seed_dir / best_model) best_eval_reward = eval_reward.mean() set_seeds(config.seed + ep_dones) env.seed(config.seed + ep_dones) # Graphs checkpoints logger.debug('Saving recorder checkpoints and graphs') train_recorder.save(dir_manager.recorders_dir / 'train_recorder.pkl') # Saving graphs if len(train_recorder.tape['actor_loss']) > 0: algorithm.save_training_graphs(train_recorder=train_recorder, save_dir=dir_manager.seed_dir) # Saves model one last time and close the environment os.remove(dir_manager.seed_dir / current_model) current_model = f"model_ep{ep_dones}.pt" algorithm.save(dir_manager.seed_dir / current_model) env.close()
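Every MADDPG-style script here anneals exploration noise with the same two lines. As a standalone function the schedule is plain linear interpolation, held at final_scale once ep_i passes n_exploration_eps:

def noise_scale(ep_i, n_exploration_eps, init_scale, final_scale):
    """Linear decay from init_scale to final_scale over n_exploration_eps episodes."""
    pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return final_scale + (init_scale - final_scale) * pct_remaining

# e.g. noise_scale(0, 25000, 0.3, 0.0) -> 0.3; noise_scale(25000, 25000, 0.3, 0.0) -> 0.0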
def run(config): numWolves = 4 numSheep = 1 numBlocks = 2 numAgents = numWolves + numSheep numEntities = numAgents + numBlocks wolvesID = list(range(numWolves)) sheepsID = list(range(numWolves, numAgents)) blocksID = list(range(numAgents, numEntities)) wolfSize = 0.075 sheepSize = 0.05 blockSize = 0.2 entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [ blockSize ] * numBlocks sheepMaxSpeed = 1.3 wolfMaxSpeed = 1.0 blockMaxSpeed = None entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [ sheepMaxSpeed ] * numSheep + [blockMaxSpeed] * numBlocks entitiesMovableList = [True] * numAgents + [False] * numBlocks massList = [1.0] * numEntities collisionReward = 10 isCollision = IsCollision(getPosFromAgentState) punishForOutOfBound = PunishForOutOfBound() rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound, collisionPunishment=collisionReward) individualRewardWolf = 0 rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision, collisionReward, individualRewardWolf) reshapeAction = ReshapeAction() costActionRatio = 0 getActionCost = GetActionCost(costActionRatio, reshapeAction, individualCost=True) getWolvesAction = lambda action: [action[wolfID] for wolfID in wolvesID] rewardWolfWithActionCost = lambda state, action, nextState: np.array( rewardWolf(state, action, nextState)) - np.array( getActionCost(getWolvesAction(action))) rewardFunc = lambda state, action, nextState: \ list(rewardWolfWithActionCost(state, action, nextState)) + list(rewardSheep(state, action, nextState)) reset = ResetMultiAgentChasing(numAgents, numBlocks) observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState) observe = lambda state: [ observeOneAgent(agentID)(state) for agentID in range(numAgents) ] reshapeAction = ReshapeAction() getCollisionForce = GetCollisionForce() applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList) applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList, getCollisionForce, getPosFromAgentState) integrateState = IntegrateState(numEntities, entitiesMovableList, massList, entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState) transit = TransitMultiAgentChasing(numEntities, reshapeAction, applyActionForce, applyEnvironForce, integrateState) isTerminal = lambda state: [False] * numAgents initObsForParams = observe(reset()) envObservationSpace = [ initObsForParams[obsID].shape for obsID in range(len(initObsForParams)) ] worldDim = 2 envActionSpace = [ spaces.Discrete(worldDim * 2 + 1) for agentID in range(numAgents) ] model_dir = os.path.join(dirName, 'models', config.env_id, config.model_name) model = AttentionSAC.init_from_env( envActionSpace, envObservationSpace, tau=config.tau, pi_lr=config.pi_lr, q_lr=config.q_lr, gamma=config.gamma, pol_hidden_dim=config.pol_hidden_dim, #128 critic_hidden_dim=config.critic_hidden_dim, #128 attend_heads=config.attend_heads, #4 reward_scale=config.reward_scale) replay_buffer = ReplayBuffer(config.buffer_length, model.nagents, [ obsp[0] if isinstance(obsp, tuple) else obsp.shape[0] for obsp in envObservationSpace ], [ acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in envActionSpace ]) t = 0 for ep_i in range(0, config.n_episodes, config.n_rollout_threads): #12 print( "Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes)) state = reset() model.prep_rollouts(device='cpu') for et_i in 
range(config.episode_length): obs = observe(state) obs = np.array([obs]) # rearrange observations to be per agent, and convert to torch Variable torch_obs = [ Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False) for i in range(model.nagents) ] # get actions as torch Variables torch_agent_actions = model.step(torch_obs, explore=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] # rearrange actions to be per environment actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] action = actions[0] nextState = transit(state, action) next_obs = np.array([observe(nextState)]) rewards = np.array([rewardFunc(state, action, nextState)]) dones = np.array([isTerminal(nextState)]) replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) state = nextState t += config.n_rollout_threads if (len(replay_buffer) >= config.batch_size and (t % config.steps_per_update) < config.n_rollout_threads ): # 100 steps across rollouts -> 4 updates model.prep_training(device='cpu') for u_i in range(config.num_updates): #4 sample = replay_buffer.sample(config.batch_size) model.update_critic(sample) model.update_policies(sample) model.update_all_targets() model.prep_rollouts(device='cpu') if ep_i % config.save_interval < config.n_rollout_threads: model.prep_rollouts(device='cpu') pathIncremental = os.path.join(model_dir, 'incremental') if not os.path.exists(pathIncremental): os.makedirs(pathIncremental) model.save( os.path.join(pathIncremental, ('model_ep%i.pt' % (ep_i + 1)))) model.save(os.path.join(model_dir, 'model.pt'))
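Unlike the other scripts, this one drives the environment through pure functions (reset, transit, observe, rewardFunc, isTerminal) rather than a gym object. A small adapter (entirely ours) showing how such pieces compose back into the step() interface the rest of the collection expects:

class FunctionalEnv:
    """Adapts (reset, transit, observe, reward_fn, is_terminal) to a gym-like API."""
    def __init__(self, reset_fn, transit_fn, observe_fn, reward_fn, terminal_fn):
        self.reset_fn, self.transit = reset_fn, transit_fn
        self.observe, self.reward_fn, self.terminal_fn = observe_fn, reward_fn, terminal_fn
        self.state = None

    def reset(self):
        self.state = self.reset_fn()
        return self.observe(self.state)

    def step(self, action):
        next_state = self.transit(self.state, action)
        obs = self.observe(next_state)
        rew = self.reward_fn(self.state, action, next_state)
        done = self.terminal_fn(next_state)
        self.state = next_state
        return obs, rew, done, {}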
def run(config): model_dir = Path('./models') / config.env_id / config.model_name # if not model_dir.exists(): # run_num = 1 # else: # exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in # model_dir.iterdir() if # str(folder.name).startswith('run')] # if len(exst_run_nums) == 0: # run_num = 1 # else: # run_num = max(exst_run_nums) + 1 run_num = 1 curr_run = 'run%i' % run_num run_dir = model_dir / curr_run log_dir = run_dir / 'logs' os.makedirs(log_dir,exist_ok=True) logger = SummaryWriter(str(log_dir)) torch.manual_seed(run_num) np.random.seed(run_num) env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num) model = AttentionSAC.init_from_env(env, tau=config.tau, pi_lr=config.pi_lr, q_lr=config.q_lr, gamma=config.gamma, pol_hidden_dim=config.pol_hidden_dim, critic_hidden_dim=config.critic_hidden_dim, attend_heads=config.attend_heads, reward_scale=config.reward_scale) replay_buffer = ReplayBuffer(config.buffer_length, model.nagents, [obsp.shape[0] for obsp in env.observation_space], [acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space]) t = 0 for ep_i in range(0, config.n_episodes, config.n_rollout_threads): print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes)) obs = env.reset() model.prep_rollouts(device='cpu') for et_i in range(config.episode_length): # rearrange observations to be per agent, and convert to torch Variable torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False) for i in range(model.nagents)] # get actions as torch Variables torch_agent_actions = model.step(torch_obs, explore=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] # rearrange actions to be per environment actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] next_obs, rewards, dones, infos = env.step(actions) replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) obs = next_obs t += config.n_rollout_threads if (len(replay_buffer) >= config.batch_size and (t % config.steps_per_update) < config.n_rollout_threads): if config.use_gpu: model.prep_training(device='gpu') else: model.prep_training(device='cpu') for u_i in range(config.num_updates): sample = replay_buffer.sample(config.batch_size, to_gpu=config.use_gpu) model.update_critic(sample, logger=logger) model.update_policies(sample, logger=logger) model.update_all_targets() model.prep_rollouts(device='cpu') ep_rews = replay_buffer.get_average_rewards( config.episode_length * config.n_rollout_threads) for a_i, a_ep_rew in enumerate(ep_rews): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew * config.episode_length, ep_i) if ep_i % config.save_interval < config.n_rollout_threads: model.prep_rollouts(device='cpu') os.makedirs(run_dir / 'incremental', exist_ok=True) model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) model.save(run_dir / 'model.pt') model.save(run_dir / 'model.pt') env.close() logger.export_scalars_to_json(str(log_dir / 'summary.json')) logger.close()
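The condition (t % steps_per_update) < n_rollout_threads recurs in every training loop above. Because t advances by n_rollout_threads per iteration, a plain t % steps_per_update == 0 would miss the boundary whenever the window is not a multiple of the thread count; the inequality fires exactly once per window when it is a multiple, and approximately once per window otherwise. Made explicit:

def should_update(t, steps_per_update, n_rollout_threads):
    """True roughly once per steps_per_update env steps when t advances in
    increments of n_rollout_threads (t is the total env steps so far)."""
    return (t % steps_per_update) < n_rollout_threads

# e.g. n_rollout_threads=4, steps_per_update=100:
# t = 4, 8, ..., 96, 100, 104, ... -> fires at t=100 (100 % 100 = 0 < 4), then t=200, ...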
class SAC: # TODO: # scale action # load save def __init__( self, env, learning_rate: float = 3e-4, tau: float = 0.005, buffer_size: int = 1e6, alpha: Union[float, str] = 'auto', net_arch: List = [256, 256], batch_size: int = 256, num_q_nets: int = 2, m_sample: int = None, # None == SAC, 2 == REDQ learning_starts: int = 100, gradient_updates: int = 1, gamma: float = 0.99, mbpo: bool = False, dynamics_rollout_len: int = 1, rollout_dynamics_starts: int = 5000, real_ratio: float = 0.05, project_name: str = 'sac', experiment_name: Optional[str] = None, log: bool = True, wandb: bool = True, device: Union[th.device, str] = 'auto'): self.env = env self.observation_dim = self.env.observation_space.shape[0] self.action_dim = self.env.action_space.shape[0] self.learning_rate = learning_rate self.tau = tau self.gamma = gamma self.buffer_size = buffer_size self.num_q_nets = num_q_nets self.m_sample = m_sample self.net_arch = net_arch self.learning_starts = learning_starts self.batch_size = batch_size self.gradient_updates = gradient_updates self.device = th.device('cuda' if th.cuda.is_available() else 'cpu' ) if device == 'auto' else device self.replay_buffer = ReplayBuffer(self.observation_dim, self.action_dim, max_size=buffer_size) self.q_nets = [ SoftQNetwork(self.observation_dim + self.action_dim, net_arch=net_arch).to(self.device) for _ in range(num_q_nets) ] self.target_q_nets = [ SoftQNetwork(self.observation_dim + self.action_dim, net_arch=net_arch).to(self.device) for _ in range(num_q_nets) ] for q_net, target_q_net in zip(self.q_nets, self.target_q_nets): target_q_net.load_state_dict(q_net.state_dict()) for param in target_q_net.parameters(): param.requires_grad = False self.policy = Policy(self.observation_dim, self.action_dim, self.env.action_space, net_arch=net_arch).to(self.device) self.target_entropy = -th.prod(th.Tensor( self.env.action_space.shape)).item() if alpha == 'auto': self.log_alpha = th.zeros(1, requires_grad=True, device=self.device) self.alpha = self.log_alpha.exp().item() self.alpha_optim = optim.Adam([self.log_alpha], lr=self.learning_rate) else: self.alpha_optim = None self.alpha = alpha q_net_params = [] for q_net in self.q_nets: q_net_params += list(q_net.parameters()) self.q_optim = optim.Adam(q_net_params, lr=self.learning_rate) self.policy_optim = optim.Adam(list(self.policy.parameters()), lr=self.learning_rate) self.mbpo = mbpo if self.mbpo: self.dynamics = ProbabilisticEnsemble( input_dim=self.observation_dim + self.action_dim, output_dim=self.observation_dim + 1, device=self.device) self.dynamics_buffer = ReplayBuffer(self.observation_dim, self.action_dim, max_size=400000) self.dynamics_rollout_len = dynamics_rollout_len self.rollout_dynamics_starts = rollout_dynamics_starts self.real_ratio = real_ratio self.experiment_name = experiment_name if experiment_name is not None else f"sac_{int(time.time())}" self.log = log if self.log: self.writer = SummaryWriter(f"runs/{self.experiment_name}") if wandb: import wandb wandb.init(project=project_name, sync_tensorboard=True, config=self.get_config(), name=self.experiment_name, monitor_gym=True, save_code=True) self.writer = SummaryWriter(f"/tmp/{self.experiment_name}") def get_config(self): return { 'env_id': self.env.unwrapped.spec.id, 'learning_rate': self.learning_rate, 'num_q_nets': self.num_q_nets, 'batch_size': self.batch_size, 'tau': self.tau, 'gamma': self.gamma, 'net_arch': self.net_arch, 'gradient_updates': self.gradient_updates, 'm_sample': self.m_sample, 'buffer_size': self.buffer_size, 'learning_starts': 
self.learning_starts, 'mbpo': self.mbpo, 'dynamics_rollout_len': self.dynamics_rollout_len } def save(self, save_replay_buffer=True): save_dir = 'weights/' if not os.path.isdir(save_dir): os.makedirs(save_dir) saved_params = { 'policy_state_dict': self.policy.state_dict(), 'policy_optimizer_state_dict': self.policy_optim.state_dict(), 'log_alpha': self.log_alpha, 'alpha_optimizer_state_dict': self.alpha_optim.state_dict() } for i, (q_net, target_q_net) in enumerate(zip(self.q_nets, self.target_q_nets)): saved_params['q_net_' + str(i) + '_state_dict'] = q_net.state_dict() saved_params['target_q_net_' + str(i) + '_state_dict'] = target_q_net.state_dict() saved_params['q_nets_optimizer_state_dict'] = self.q_optim.state_dict() if save_replay_buffer: saved_params['replay_buffer'] = self.replay_buffer th.save(saved_params, save_dir + "/" + self.experiment_name + '.tar') def load(self, path, load_replay_buffer=True): params = th.load(path) self.policy.load_state_dict(params['policy_state_dict']) self.policy_optim.load_state_dict( params['policy_optimizer_state_dict']) self.log_alpha = params['log_alpha'] self.alpha_optim.load_state_dict(params['alpha_optimizer_state_dict']) for i, (q_net, target_q_net) in enumerate(zip(self.q_nets, self.target_q_nets)): q_net.load_state_dict(params['q_net_' + str(i) + '_state_dict']) target_q_net.load_state_dict(params['target_q_net_' + str(i) + '_state_dict']) self.q_optim.load_state_dict(params['q_nets_optimizer_state_dict']) if load_replay_buffer and 'replay_buffer' in params: self.replay_buffer = params['replay_buffer'] def sample_batch_experiences(self): if not self.mbpo or self.num_timesteps < self.rollout_dynamics_starts: return self.replay_buffer.sample(self.batch_size, to_tensor=True, device=self.device) else: num_real_samples = int(self.batch_size * 0.05) # 5% of real world data s_obs, s_actions, s_rewards, s_next_obs, s_dones = self.replay_buffer.sample( num_real_samples, to_tensor=True, device=self.device) m_obs, m_actions, m_rewards, m_next_obs, m_dones = self.dynamics_buffer.sample( self.batch_size - num_real_samples, to_tensor=True, device=self.device) experience_tuples = (th.cat([s_obs, m_obs], dim=0), th.cat([s_actions, m_actions], dim=0), th.cat([s_rewards, m_rewards], dim=0), th.cat([s_next_obs, m_next_obs], dim=0), th.cat([s_dones, m_dones], dim=0)) return experience_tuples def rollout_dynamics(self): # MBPO Planning with th.no_grad(): for _ in range( 4 ): # 4 samples of 25000 instead of 1 of 100000 to not allocate all gpu memory obs = self.replay_buffer.sample_obs(25000, to_tensor=True, device=self.device) fake_env = FakeEnv(self.dynamics, self.env.unwrapped.spec.id) for plan_step in range(self.dynamics_rollout_len): actions = self.policy(obs, deterministic=False) next_obs_pred, r_pred, dones, info = fake_env.step( obs, actions) obs, actions = obs.detach().cpu().numpy(), actions.detach( ).cpu().numpy() for i in range(len(obs)): self.dynamics_buffer.add(obs[i], actions[i], r_pred[i], next_obs_pred[i], dones[i]) nonterm_mask = ~dones.squeeze(-1) if nonterm_mask.sum() == 0: break obs = next_obs_pred[nonterm_mask] @property def dynamics_train_freq(self): if self.num_timesteps < 100000: return 250 else: return 1000 def train(self): for _ in range(self.gradient_updates): s_obs, s_actions, s_rewards, s_next_obs, s_dones = self.sample_batch_experiences( ) with th.no_grad(): next_actions, log_probs = self.policy.action_log_prob( s_next_obs) q_input = th.cat([s_next_obs, next_actions], dim=1) if self.m_sample is not None: # REDQ sampling q_targets = 
th.cat([q_target(q_input) for q_target in np.random.choice(self.target_q_nets, self.m_sample, replace=False)], dim=1) else: q_targets = th.cat([q_target(q_input) for q_target in self.target_q_nets], dim=1) target_q, _ = th.min(q_targets, dim=1, keepdim=True) target_q -= self.alpha * log_probs.reshape(-1, 1) target_q = s_rewards + (1 - s_dones) * self.gamma * target_q sa = th.cat([s_obs, s_actions], dim=1) q_values = [q_net(sa) for q_net in self.q_nets] critic_loss = (1 / self.num_q_nets) * sum([F.mse_loss(q_value, target_q) for q_value in q_values]) self.q_optim.zero_grad() critic_loss.backward() self.q_optim.step() # Polyak update for q_net, target_q_net in zip(self.q_nets, self.target_q_nets): for param, target_param in zip(q_net.parameters(), target_q_net.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) # Policy update actions, log_pi = self.policy.action_log_prob(s_obs) sa = th.cat([s_obs, actions], dim=1) q_values_pi = th.cat([q_net(sa) for q_net in self.q_nets], dim=1) if self.m_sample is not None: min_q_value_pi = th.mean(q_values_pi, dim=1, keepdim=True) else: min_q_value_pi, _ = th.min(q_values_pi, dim=1, keepdim=True) policy_loss = (self.alpha * log_pi - min_q_value_pi).mean() self.policy_optim.zero_grad() policy_loss.backward() self.policy_optim.step() # Automatic temperature learning if self.alpha_optim is not None: alpha_loss = (-self.log_alpha * (log_pi.detach() + self.target_entropy)).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp().item() # Log losses if self.log and self.num_timesteps % 100 == 0: self.writer.add_scalar("losses/critic_loss", critic_loss.item(), self.num_timesteps) self.writer.add_scalar("losses/policy_loss", policy_loss.item(), self.num_timesteps) self.writer.add_scalar("losses/alpha", self.alpha, self.num_timesteps) if self.alpha_optim is not None: self.writer.add_scalar("losses/alpha_loss", alpha_loss.item(), self.num_timesteps) def learn(self, total_timesteps): episode_reward = 0.0 num_episodes = 0 obs, done = self.env.reset(), False self.num_timesteps = 0 for step in range(1, total_timesteps + 1): self.num_timesteps += 1 if step < self.learning_starts: action = self.env.action_space.sample() else: with th.no_grad(): action = self.policy(th.tensor(obs).float().to(self.device)).detach().cpu().numpy() next_obs, reward, done, info = self.env.step(action) terminal = done if 'TimeLimit.truncated' not in info else not info['TimeLimit.truncated'] self.replay_buffer.add(obs, action, reward, next_obs, terminal) if step >= self.learning_starts: if self.mbpo: if self.num_timesteps % self.dynamics_train_freq == 0: m_obs, m_actions, m_rewards, m_next_obs, m_dones = self.replay_buffer.get_all_data() X = np.hstack((m_obs, m_actions)) Y = np.hstack((m_rewards, m_next_obs - m_obs)) mean_holdout_loss = self.dynamics.train_ensemble(X, Y) self.writer.add_scalar("dynamics/mean_holdout_loss", mean_holdout_loss, self.num_timesteps) if self.num_timesteps >= self.rollout_dynamics_starts and self.num_timesteps % 250 == 0: self.rollout_dynamics() self.train() episode_reward += reward if done: obs, done = self.env.reset(), False num_episodes += 1 if num_episodes % 10 == 0: print(f"Episode: {num_episodes} Step: {step}, Ep. Reward: {episode_reward}") if self.log: self.writer.add_scalar("metrics/episode_reward", episode_reward, self.num_timesteps) episode_reward = 0.0 else: obs = next_obs if self.log: self.writer.close() self.env.close()
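Two details of this SAC implementation are worth isolating. First, sample_batch_experiences mixes real and model-generated transitions per the MBPO recipe, but hardcodes the 5% real fraction instead of using self.real_ratio, which looks like an oversight. A parameterized sketch (the helper name is ours):

import torch as th

def mixed_batch(real_buffer, model_buffer, batch_size, real_ratio, device):
    """Sample a batch that is real_ratio real transitions, rest model rollouts."""
    n_real = int(batch_size * real_ratio)
    real = real_buffer.sample(n_real, to_tensor=True, device=device)
    model = model_buffer.sample(batch_size - n_real, to_tensor=True, device=device)
    # Each sample is an (obs, act, rew, next_obs, done) tuple of tensors.
    return tuple(th.cat([r, m], dim=0) for r, m in zip(real, model))

Second, learn() converts done into terminal so that time-limit truncations do not zero out the bootstrap target r + gamma * (1 - done) * Q. The same logic as a standalone helper (old gym API, where truncation is signalled through info):

def true_terminal(done, info):
    """Distinguish real termination from time-limit truncation.
    Only real terminals should stop bootstrapping in the TD target."""
    if done and info.get('TimeLimit.truncated', False):
        return False  # episode was cut short; the state was not terminal
    return done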
def run(config): model_dir = Path('./models') / config.env_id / config.model_name if not model_dir.exists(): run_num = 1 else: exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir() if str(folder.name).startswith('run')] if len(exst_run_nums) == 0: run_num = 1 else: run_num = max(exst_run_nums) + 1 curr_run = 'run%i' % run_num run_dir = model_dir / curr_run log_dir = run_dir / 'logs' os.makedirs(log_dir) logger = SummaryWriter(str(log_dir)) torch.manual_seed(run_num) np.random.seed(run_num) env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num) # model = AttentionSAC.init_from_env(env, # tau=config.tau, # pi_lr=config.pi_lr, # q_lr=config.q_lr, # gamma=config.gamma, # pol_hidden_dim=config.pol_hidden_dim, # critic_hidden_dim=config.critic_hidden_dim, # attend_heads=config.attend_heads, # reward_scale=config.reward_scale) # Model used to test with adversarial agent # model= AttentionSAC.init_from_save ("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run140\\model.pt") # print("Model instantiated") # Model used to test without adversarial agent model= AttentionSAC.init_from_save ("C:\\Users\\HP\\Desktop\\NTU\\FYP\\FYP Code\\MAAC\\Output\\run148\\model.pt") print("Model instantiated") replay_buffer = ReplayBuffer(config.buffer_length, model.nagents, [obsp.shape[0] for obsp in env.observation_space], [acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space]) t = 0 row_list = [] for ep_i in range(0, config.n_episodes, config.n_rollout_threads): print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes)) obs = env.reset() model.prep_rollouts(device='cpu') for et_i in range(config.episode_length): # rearrange observations to be per agent, and convert to torch Variable torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False) for i in range(model.nagents)] # get actions as torch Variables torch_agent_actions = model.step(torch_obs, explore=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] # rearrange actions to be per environment actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] next_obs, rewards, dones, infos = env.step(actions) # print (rewards) # print (dones[0]) # env.render('human') replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) obs = next_obs t += config.n_rollout_threads if (len(replay_buffer) >= config.batch_size and (t % config.steps_per_update) < config.n_rollout_threads): if config.use_gpu: model.prep_training(device='gpu') else: model.prep_training(device='cpu') for u_i in range(config.num_updates): sample = replay_buffer.sample(config.batch_size, to_gpu=config.use_gpu) #print(sample) model.update_critic(sample, logger=logger) model.update_policies(sample, logger=logger) model.update_all_targets() model.prep_rollouts(device='cpu') if (dones[0][0]): print("Breakin the epsiodeeeee at timestep", et_i) break et_i += 1 row_list.append((ep_i+1,et_i)) ep_rews = replay_buffer.get_average_rewards( et_i * config.n_rollout_threads) for a_i, a_ep_rew in enumerate(ep_rews): logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew * et_i, ep_i) if ep_i % config.save_interval < config.n_rollout_threads: model.prep_rollouts(device='cpu') os.makedirs(run_dir / 'incremental', exist_ok=True) model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) model.save(run_dir / 'model.pt') with open('Timesteps_vs_Episodes.csv', 'w', newline='') 
as file: writer = csv.writer(file) writer.writerow(["Ep No", "Number of Timesteps"]) for row in row_list: writer.writerow(row) model.save(run_dir / 'model.pt') env.close() logger.export_scalars_to_json(str(log_dir / 'summary.json')) logger.close()
def run(config): model_dir = Path('./models') / config.env_id / config.model_name if not model_dir.exists(): curr_run = 'run1' else: exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir() if str(folder.name).startswith('run')] if len(exst_run_nums) == 0: curr_run = 'run1' else: curr_run = 'run%i' % (max(exst_run_nums) + 1) run_dir = model_dir / curr_run log_dir = run_dir / 'logs' os.makedirs(log_dir) #logger = SummaryWriter(str(log_dir)) torch.manual_seed(config.seed) np.random.seed(config.seed) if not USE_CUDA: torch.set_num_threads(config.n_training_threads) env = make_parallel_env(config.env_id, config.n_rollout_threads, config.seed, config.discrete_action) maddpg = MADDPG.init_from_env(env, agent_alg=config.agent_alg, adversary_alg=config.adversary_alg, tau=config.tau, lr=config.lr, hidden_dim=config.hidden_dim) replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents, [obsp.shape[0] for obsp in env.observation_space], [acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space]) t = 0 episode_average_rewards = [] hundred_episode_average_rewards = [] for ep_i in range(0, config.n_episodes, config.n_rollout_threads): if (ep_i % 100 == 0 and ep_i > 0): hundred_episode_average_rewards.append(np.mean(episode_average_rewards)) print('Rewards till', ep_i, '=', hundred_episode_average_rewards[-1]) print('Agent Actions=', torch_agent_actions) episode_average_rewards = [] ''' print("Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes)) ''' obs = env.reset() rewards_for_this_episode = [] # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per agent so not tensor maddpg.prep_rollouts(device='cpu') explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining) maddpg.reset_noise() for et_i in range(config.episode_length): # rearrange observations to be per agent, and convert to torch Variable torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False) for i in range(maddpg.nagents)] # get actions as torch Variables torch_agent_actions = maddpg.step(torch_obs, explore=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] # rearrange actions to be per environment actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] next_obs, rewards, dones, infos = env.step(actions) rewards_for_this_episode.append(np.mean(rewards)) replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) obs = next_obs t += config.n_rollout_threads if (len(replay_buffer) >= config.batch_size and (t % config.steps_per_update) < config.n_rollout_threads): if USE_CUDA: maddpg.prep_training(device='gpu') else: maddpg.prep_training(device='cpu') for u_i in range(config.n_rollout_threads): for a_i in range(maddpg.nagents): sample = replay_buffer.sample(config.batch_size, to_gpu=USE_CUDA) maddpg.update(sample, a_i)#, logger=logger) maddpg.update_all_targets() maddpg.prep_rollouts(device='cpu') if ep_i>10000: print('Goal Color=',torch_obs[0])
print('Communication=', agent_actions[0]) env.render() time.sleep(0.01) if ep_i > 100000: import ipdb ipdb.set_trace() ep_rews = replay_buffer.get_average_rewards( config.episode_length * config.n_rollout_threads) episode_average_rewards.append(np.sum(rewards_for_this_episode)) #for a_i, a_ep_rew in enumerate(ep_rews): #logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i) if ep_i % config.save_interval < config.n_rollout_threads: os.makedirs(run_dir / 'incremental', exist_ok=True) maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) maddpg.save(run_dir / 'model.pt') plt.plot(100*np.array(range(1, config.n_episodes//100)), hundred_episode_average_rewards) plt.xlabel('Episode Number') plt.ylabel('Average Reward for 100 episodes') plt.title('Speaker Discrete and Mover Continuous') plt.savefig('plot.png') maddpg.save(run_dir / 'model.pt') env.close()
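The block above accumulates per-episode rewards and plots one point per 100 episodes. A tidier equivalent using numpy reshaping, assuming the episode count is (or is truncated to) a multiple of the block size:

import numpy as np
import matplotlib.pyplot as plt

def plot_blocked_rewards(episode_rewards, block=100, path='plot.png'):
    """Average rewards in blocks of `block` episodes and save the curve."""
    r = np.asarray(episode_rewards)
    r = r[:len(r) // block * block].reshape(-1, block).mean(axis=1)
    plt.plot(np.arange(1, len(r) + 1) * block, r)
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward for %i episodes' % block)
    plt.savefig(path)
    plt.close()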
def run(config): model_dir = Path('./models') / config.env_id / config.model_name if not model_dir.exists(): run_num = 1 else: exst_run_nums = [ int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir() if str(folder.name).startswith('run') ] if len(exst_run_nums) == 0: run_num = 1 else: run_num = max(exst_run_nums) + 1 curr_run = 'run%i' % run_num run_dir = model_dir / curr_run torch.manual_seed(run_num) np.random.seed(run_num) env = make_parallel_env(config.env_id, config.n_rollout_threads, run_num) envActionSpace = env.action_space envObservationSpace = env.observation_space model = AttentionSAC.init_from_env( envActionSpace, envObservationSpace, tau=config.tau, pi_lr=config.pi_lr, q_lr=config.q_lr, gamma=config.gamma, pol_hidden_dim=config.pol_hidden_dim, #128 critic_hidden_dim=config.critic_hidden_dim, #128 attend_heads=config.attend_heads, #4 reward_scale=config.reward_scale) replay_buffer = ReplayBuffer( config.buffer_length, model.nagents, [obsp.shape[0] for obsp in env.observation_space], [ acsp.shape[0] if isinstance(acsp, Box) else acsp.n for acsp in env.action_space ]) t = 0 for ep_i in range(0, config.n_episodes, config.n_rollout_threads): #12 print( "Episodes %i-%i of %i" % (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes)) obs = env.reset() model.prep_rollouts(device='cpu') for et_i in range(config.episode_length): #25 # rearrange observations to be per agent, and convert to torch Variable torch_obs = [ Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False) for i in range(model.nagents) ] # get actions as torch Variables torch_agent_actions = model.step(torch_obs, explore=True) # convert actions to numpy arrays agent_actions = [ac.data.numpy() for ac in torch_agent_actions] # rearrange actions to be per environment actions = [[ac[i] for ac in agent_actions] for i in range(config.n_rollout_threads)] next_obs, rewards, dones, infos = env.step(actions) replay_buffer.push(obs, agent_actions, rewards, next_obs, dones) obs = next_obs t += config.n_rollout_threads if (len(replay_buffer) >= config.batch_size and (t % config.steps_per_update) < config.n_rollout_threads ): # 100 steps across rollouts -> 4 updates model.prep_training(device='cpu') for u_i in range(config.num_updates): #4 sample = replay_buffer.sample(config.batch_size) model.update_critic(sample) model.update_policies(sample) model.update_all_targets() model.prep_rollouts(device='cpu') ep_rews = replay_buffer.get_average_rewards(config.episode_length * config.n_rollout_threads) if ep_i % config.save_interval < config.n_rollout_threads: model.prep_rollouts(device='cpu') os.makedirs(run_dir / 'incremental', exist_ok=True) model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1))) model.save(run_dir / 'model.pt') model.save(run_dir / 'model.pt') env.close()
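The ReplayBuffer constructors throughout these scripts derive per-agent dimensions from the gym spaces with the same inline expression. Factored out, with the Box/Discrete distinction made explicit:

from gym.spaces import Box

def space_dims(observation_space, action_space):
    """Per-agent observation sizes, and action sizes (vector length for Box,
    number of discrete choices otherwise)."""
    obs_dims = [obsp.shape[0] for obsp in observation_space]
    ac_dims = [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
               for acsp in action_space]
    return obs_dims, ac_dims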

def run(args, **args_dict):
    reward_flag, pos_flag = None, None
    save_data = {'reward': -1000., 'pos': 0.}
    # The original run-directory bookkeeping was commented out; checkpoints are
    # written under model_path below. NOTE: './models' is an assumed default,
    # since the original left model_path undefined.
    model_path = './models'

    th.manual_seed(args.seed)
    np.random.seed(args.seed)
    use_gpu = args.use_cuda and th.cuda.is_available()
    if use_gpu:
        FloatTensor = th.cuda.FloatTensor
    else:
        # th.set_num_threads(args.n_training_threads)
        FloatTensor = th.FloatTensor
    env = make_parallel_env(**args_dict)
    maddpg = MADDPG.init_from_env(env, args)
    replay_buffer = ReplayBuffer(
        args.capacity, args.n_agents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    t = 0
    for ep_i in range(0, args.n_episodes, args.n_rollout_threads):
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='gpu' if use_gpu else 'cpu')
        explr_pct_remaining = max(
            0, args.n_exploration_eps - ep_i) / args.n_exploration_eps
        scale_noise_i = args.final_noise_scale + (
            args.init_noise_scale - args.final_noise_scale) * explr_pct_remaining
        maddpg.scale_noise(scale_noise_i)
        maddpg.reset_noise()
        print("Episodes %i-%i of %i, replay: %.2f, explore: %.2f" %
              (ep_i + 1, ep_i + 1 + args.n_rollout_threads, args.n_episodes,
               float(len(replay_buffer)) / replay_buffer.max_steps,
               scale_noise_i))
        for et_i in range(args.max_steps):
            # rearrange observations to be per agent, and convert to torch tensors
            torch_obs = [
                th.from_numpy(np.vstack(obs[:, i])).type(FloatTensor)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch tensors
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [
                ac.detach().cpu().numpy() for ac in torch_agent_actions
            ]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(args.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += args.n_rollout_threads
            if (len(replay_buffer) >= args.batch_size
                    and (t % args.steps_per_update) < args.n_rollout_threads):
                maddpg.prep_training(device='gpu' if use_gpu else 'cpu')
                for a_i in range(maddpg.nagents):
                    sample = replay_buffer.sample(args.batch_size,
                                                  to_gpu=use_gpu,
                                                  norm_rews=args.norm_rews)
                    _, _, _ = maddpg.update(sample, a_i)
                maddpg.update_all_targets()
                maddpg.prep_rollouts(device='gpu' if use_gpu else 'cpu')

        if ep_i % args.test_interval < args.n_rollout_threads:
            obs = env.reset()
            maddpg.prep_rollouts(device='gpu' if use_gpu else 'cpu')
            with th.no_grad():
                pos_total = 0.
                finish_ep = np.zeros(args.n_rollout_threads)
                r_total = np.zeros((args.n_rollout_threads, args.n_agents))
                record_r = np.zeros(args.n_agents)
                for eval_i in range(args.max_steps):
                    torch_obs = [
                        FloatTensor(np.vstack(obs[:, i]))
                        for i in range(maddpg.nagents)
                    ]
                    torch_agent_actions = maddpg.step(torch_obs, explore=False)
                    agent_actions = [
                        ac.detach().cpu().numpy() for ac in torch_agent_actions
                    ]
                    actions = [[ac[i] for ac in agent_actions]
                               for i in range(args.n_rollout_threads)]
                    next_obs, rewards, dones, infos = env.step(actions)
                    r_total += rewards
                    obs = next_obs
                    for d_i in range(dones.shape[0]):
                        if dones[d_i] or (eval_i == args.max_steps - 1
                                          and finish_ep[d_i] == 0.):
                            pos_total += infos[d_i]['pos']
                            record_r += r_total[d_i]
                            r_total[d_i] = 0.  # reset the whole row (was a hard-coded 2-agent pair)
                            finish_ep[d_i] += 1
                record_r /= finish_ep.sum()
                pos_total /= finish_ep.sum()

            new_path = model_path + '/' + str(ep_i) + '.pt'
            has_saved = False
            if record_r.sum() > save_data['reward']:
                save_data['reward'] = record_r.sum()
                if save_data['reward'] > 0 and pos_total > 10.:
                    maddpg.save(new_path)
                    has_saved = True  # record the save so the branch below does not repeat it
            if pos_total > save_data['pos']:
                save_data['pos'] = pos_total
                if record_r.sum() > 0 and pos_total > 10. and not has_saved:
                    maddpg.save(new_path)
                    has_saved = True
            if pos_total > 17.0:
                maddpg.save(new_path)

            if reward_flag is None:
                reward_flag = vis.line(
                    X=np.arange(ep_i, ep_i + 1),
                    Y=np.array([np.append(record_r, record_r.sum())]),
                    opts=dict(ylabel='Test Reward',
                              xlabel='Episode',
                              title='Reward',
                              legend=['Agent-%d' % i
                                      for i in range(args.n_agents)] + ['Total']))
            else:
                vis.line(X=np.array([np.array(ep_i).repeat(args.n_agents + 1)]),
                         Y=np.array([np.append(record_r, record_r.sum())]),
                         win=reward_flag,
                         update='append')
            if pos_flag is None:
                pos_flag = vis.line(X=np.arange(ep_i, ep_i + 1),
                                    Y=np.array([pos_total]),
                                    opts=dict(ylabel='Length',
                                              xlabel='Episode',
                                              title='How far?',
                                              legend=['position']))
            else:
                vis.line(X=np.array([ep_i]),
                         Y=np.array([pos_total]),
                         win=pos_flag,
                         update='append')
    env.close()
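
# The exploration schedule above anneals the action noise linearly from
# init_noise_scale down to final_noise_scale over the first n_exploration_eps
# episodes, then holds it constant. The same schedule in isolation; the default
# values here are illustrative assumptions, not the script's configuration:
def noise_scale(ep_i, n_exploration_eps=25000,
                init_noise_scale=0.3, final_noise_scale=0.0):
    """Linearly decayed exploration-noise scale for episode ep_i."""
    pct_remaining = max(0, n_exploration_eps - ep_i) / n_exploration_eps
    return final_noise_scale + (init_noise_scale - final_noise_scale) * pct_remaining

# noise_scale(0) == init_noise_scale; noise_scale(n) == final_noise_scale
# for any n >= n_exploration_eps.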

def train(env):
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,              # Minimal duration
        max_duration=50               # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(speed_profiles),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(
                      malfunction_parameters),
                  obs_builder_object=tree_observation,
                  random_seed=seed)

    rewards = []
    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and
        # the number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)
        state_size = n_features_per_node * n_nodes

    action_size = 5

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    #     DEVICE = 'gpu'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20   # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []
    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)

            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            agent_actions = []

            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            for i in range(n_agents):
                # actions come back one-hot: the index of the nonzero entry is
                # the discrete action
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []
            next_agent_obs = np.array([None] * env.get_num_agents())

            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        next_agent_obs[agent] = normalize_observation(
                            next_obs[agent],  # was obs[agent]: normalize the *next* observation
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                # augment before storing (the original augmented after the
                # append, discarding the shaped reward, and indexed with the
                # stale loop variable `agent` instead of `i`)
                all_rewards[i] += augment_reward(agent_obs[i])
                rewards.append(all_rewards[i])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))

            # guard added: only update once the buffer holds a full batch
            if steps % num_steps == 0 and len(replay_buffer) >= batch_size:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
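
# The action-selection loop in train() above recovers the discrete action from
# a one-hot action vector by scanning for the first nonzero entry. For a
# one-hot vector that is exactly argmax; a compact equivalent for reference:
import numpy as np

def one_hot_to_index(dist):
    """Map a one-hot action vector to its discrete action index."""
    return int(np.argmax(dist))

# one_hot_to_index(np.array([0., 0., 1., 0., 0.])) == 2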

def run(config):
    model_dir = Path('./models') / config["env_id"] / config["model_name"]
    if not model_dir.exists():
        run_num = 1
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            run_num = 1
        else:
            run_num = max(exst_run_nums) + 1
    curr_run = 'run%i' % run_num
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(run_num)
    np.random.seed(run_num)
    env = make_parallel_env(config["n_rollout_threads"], run_num)
    model = AttentionSAC.init_from_env(
        env,
        tau=config["tau"],
        pi_lr=config["pi_lr"],
        q_lr=config["q_lr"],
        gamma=config["gamma"],
        pol_hidden_dim=config["pol_hidden_dim"],
        critic_hidden_dim=config["critic_hidden_dim"],
        attend_heads=config["attend_heads"],
        reward_scale=config["reward_scale"])
    # (** EDITED **) Set up the replay buffer by iterating over the per-agent
    # observation and action shapes (hard-coded here for Google Football:
    # 115-dim observations, 19 discrete actions)
    replay_buffer = ReplayBuffer(config["buffer_length"], model.nagents,
                                 [115 for _ in range(model.nagents)],
                                 [19 for _ in range(model.nagents)])
    t = 0
    for ep_i in range(0, config["n_episodes"], config["n_rollout_threads"]):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config["n_rollout_threads"],
               config["n_episodes"]))
        obs = env.reset()
        model.prep_rollouts(device='cpu')
        for et_i in range(config["episode_length"]):
            print("episode : {} | step : {}".format(ep_i, et_i), end='\r')
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False)
                for i in range(model.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions = model.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config["n_rollout_threads"])]
            # Reform the actions list to fit the Football env: Google Football
            # expects integer actions, not one-hot encoded action lists
            actions_list = [[np.argmax(b) for b in a] for a in actions]
            # Step
            next_obs, rewards, dones, infos = env.step(actions_list)
            # Prevention of divergence: without this small offset, training
            # diverges (NaN losses)
            rewards = rewards - 0.000001
            # Reform the done flags so each agent gets a copy, matching the
            # layout the replay buffer expects
            dones = (np.array([dones for _ in range(model.nagents)])).T
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config["n_rollout_threads"]
            if (len(replay_buffer) >= config["batch_size"]
                    and (t % config["steps_per_update"]) < config["n_rollout_threads"]):
                if config["use_gpu"]:
                    model.prep_training(device='gpu')
                else:
                    model.prep_training(device='cpu')
                for u_i in range(config["num_updates"]):
                    sample = replay_buffer.sample(config["batch_size"],
                                                  to_gpu=config["use_gpu"])
                    model.update_critic(sample, logger=logger)
                    model.update_policies(sample, logger=logger)
                    model.update_all_targets()
                model.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config["episode_length"] * config["n_rollout_threads"])
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew * config["episode_length"], ep_i)
        if ep_i % config["save_interval"] < config["n_rollout_threads"]:
            model.prep_rollouts(device='cpu')
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            model.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            model.save(run_dir / 'model.pt')

    model.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
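
# The done handling above copies the single per-environment done flag to every
# agent, giving the buffer one flag per (env, agent) pair. A shape check under
# the assumption of 2 rollout threads and 3 agents:
import numpy as np

dones = np.array([False, True])                    # (n_rollout_threads,)
per_agent = np.array([dones for _ in range(3)]).T  # (n_rollout_threads, n_agents)
assert per_agent.shape == (2, 3)
assert per_agent[1].all() and not per_agent[0].any()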

def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    maddpg = MADDPG.init_from_env(env,
                                  agent_alg=config.agent_alg,
                                  adversary_alg=config.adversary_alg,
                                  tau=config.tau,
                                  lr=config.lr,
                                  hidden_dim=config.hidden_dim)
    replay_buffer = ReplayBuffer(config.buffer_length, maddpg.nagents,
                                 [obsp.shape[0] for obsp in env.observation_space],
                                 [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
                                  for acsp in env.action_space])
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
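
# The run-directory scheme above (run1, run2, ...) recurs across these scripts.
# A minimal standalone version of the same logic, extracted for clarity; the
# helper name is hypothetical, not part of the original code:
from pathlib import Path

def next_run_dir(model_dir: Path) -> Path:
    """Return model_dir/run<N+1>, where N is the highest existing run number."""
    if not model_dir.exists():
        return model_dir / 'run1'
    nums = [int(str(f.name).split('run')[1])
            for f in model_dir.iterdir() if str(f.name).startswith('run')]
    return model_dir / ('run%i' % (max(nums) + 1 if nums else 1))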

def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)
    if isinstance(env.action_space[0], Box):
        discr_act = False
        get_shape = lambda x: x.shape[0]
    else:  # Discrete
        discr_act = True
        get_shape = lambda x: x.n
    num_out_pol = get_shape(env.action_space[0])
    agent_init_params = {
        'num_in_pol': env.observation_space[0].shape[0],
        'num_out_pol': num_out_pol,
        'num_vars': len(env.agent_types)
    }
    maddpg = MADDPG(agent_init_params,
                    nagents=len(env.agent_types),
                    tau=config.tau,
                    lr=config.lr,
                    hidden_dim=config.hidden_dim,
                    discrete_action=discr_act)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space],
        config.hidden_dim * (maddpg.nagents - 1))
    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" %
              (ep_i + 1, ep_i + 1 + config.n_rollout_threads, config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()
        # one (h, c) pair per directed agent pair, per rollout thread
        rnn_hidden = (torch.zeros(
            1, config.n_rollout_threads * maddpg.nagents * (maddpg.nagents - 1),
            config.hidden_dim),
                      torch.zeros(
            1, config.n_rollout_threads * maddpg.nagents * (maddpg.nagents - 1),
            config.hidden_dim))
        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [
                Variable(torch.Tensor(np.vstack(obs[:, i])), requires_grad=False)
                for i in range(maddpg.nagents)
            ]
            # get actions as torch Variables
            torch_agent_actions, new_rnn_hidden = maddpg.step(torch_obs,
                                                              rnn_hidden,
                                                              explore=True)
            hid_to_store = (rnn_hidden[0].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents, -1),
                            rnn_hidden[1].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents, -1))
            next_hid_to_store = (new_rnn_hidden[0].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents, -1),
                                 new_rnn_hidden[1].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents, -1))
            # convert actions to numpy arrays, moving each tensor to the CPU
            # first (the original called .cpu() on the list itself, which fails)
            agent_actions = [ac.data.cpu().numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, hid_to_store, agent_actions, rewards,
                               next_obs, next_hid_to_store, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size
                    and (t % config.steps_per_update) < config.n_rollout_threads):
                sample = replay_buffer.sample(config.batch_size, to_gpu=USE_CUDA)
                maddpg.update(sample, ep_i)
                maddpg.update_all_targets()
            # carry the recurrent state forward to the next step
            rnn_hidden = new_rnn_hidden
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i, a_ep_rew, ep_i)
            print("Episode %i, reward for %i is " % (ep_i + 1, a_i), a_ep_rew)

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()