def run(config):
    """Train MADDPG agents on ``config.env_id`` with parallel rollout envs.

    Creates a fresh ``runN`` directory under ``./models/<env_id>/<model_name>``,
    logs per-agent mean episode rewards to TensorBoard, and saves the final
    model to ``<run_dir>/model.pt``.

    Args:
        config: argparse-style namespace carrying env/training hyperparameters
            (seed, n_episodes, n_rollout_threads, buffer_length, batch_size,
            noise schedule, etc.).
    """
    # --- pick the next free runN directory for this env/model combination ---
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    # --- reproducibility / CPU thread limits ---
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)

    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    # Continuous (Box) vs. discrete action spaces need different output sizes.
    if isinstance(env.action_space[0], Box):
        discr_act = False
        get_shape = lambda x: x.shape[0]
    else:  # Discrete
        discr_act = True
        get_shape = lambda x: x.n
    num_out_pol = get_shape(env.action_space[0])

    agent_init_params = {'num_in_pol': env.observation_space[0].shape[0],
                         'num_out_pol': num_out_pol,
                         'num_vars': len(env.agent_types)}
    maddpg = MADDPG(agent_init_params,
                    nagents=len(env.agent_types),
                    tau=config.tau,
                    lr=config.lr,
                    hidden_dim=config.hidden_dim,
                    discrete_action=discr_act)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space],
        config.hidden_dim * (maddpg.nagents - 1))

    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        # FIX: the upper bound previously printed ep_i + 1 + n_rollout_threads,
        # overstating the last episode of this batch by one.
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs), nobs differs per
        # agent so not tensor

        # Linearly anneal exploration noise over n_exploration_eps episodes.
        explr_pct_remaining = max(
            0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale -
                            config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()

        # One (h, c) LSTM state per (thread, agent, other-agent) triple;
        # reset at every episode boundary.
        rnn_hidden = (torch.zeros(1, config.n_rollout_threads *
                                  (maddpg.nagents) * (maddpg.nagents - 1),
                                  config.hidden_dim),
                      torch.zeros(1, config.n_rollout_threads *
                                  (maddpg.nagents) * (maddpg.nagents - 1),
                                  config.hidden_dim))

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch
            # Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions, new_rnn_hidden = maddpg.step(
                torch_obs, rnn_hidden, explore=True)

            # Snapshot pre- and post-step hidden states per (thread, agent)
            # so the buffer can replay recurrent state alongside transitions.
            hid_to_store = (rnn_hidden[0].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents, -1),
                            rnn_hidden[1].detach().contiguous().view(
                config.n_rollout_threads, maddpg.nagents, -1))
            next_hid_to_store = (
                new_rnn_hidden[0].detach().contiguous().view(
                    config.n_rollout_threads, maddpg.nagents, -1),
                new_rnn_hidden[1].detach().contiguous().view(
                    config.n_rollout_threads, maddpg.nagents, -1))

            # FIX: .cpu() was called on the container returned by
            # maddpg.step (fails if it is a Python list, and .numpy() fails
            # on CUDA tensors); convert each action tensor individually.
            agent_actions = [ac.data.cpu().numpy()
                             for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]

            next_obs, rewards, dones, infos = env.step(actions)
            replay_buffer.push(obs, hid_to_store, agent_actions, rewards,
                               next_obs, next_hid_to_store, dones)
            obs = next_obs
            t += config.n_rollout_threads

            # Update once every steps_per_update env steps (the modulo window
            # accounts for t advancing n_rollout_threads at a time).
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                sample = replay_buffer.sample(config.batch_size,
                                              to_gpu=USE_CUDA)
                maddpg.update(sample, ep_i)
                maddpg.update_all_targets()
            rnn_hidden = new_rnn_hidden

        # Mean reward over the transitions just pushed for this batch.
        ep_rews = replay_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew, ep_i)
            print("Episode %i, reward for %i is " % (ep_i + 1, a_i), a_ep_rew)

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()
def run(config):
    """Roll out a trained MADDPG model for visualization.

    Loads ``model.pt`` (or an incremental checkpoint) from the requested run
    directory, renders episodes at ``config.fps``, and optionally saves each
    episode as a GIF under ``<checkpoint_dir>/gifs``.

    Args:
        config: argparse-style namespace with env_id, model_name, run_num,
            incremental, save_gifs, fps, n_episodes, episode_length, etc.
    """
    # --- resolve checkpoint path ---
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        # NOTE(review): for incremental checkpoints this lands in
        # .../incremental/gifs rather than the run root — confirm intended.
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    env = make_env(config.env_id, discrete_action=False)

    # Continuous (Box) vs. discrete action spaces need different output sizes.
    if isinstance(env.action_space[0], Box):
        discr_act = False
        get_shape = lambda x: x.shape[0]
    else:  # Discrete
        discr_act = True
        get_shape = lambda x: x.n
    num_out_pol = get_shape(env.action_space[0])

    agent_init_params = {'num_in_pol': env.observation_space[0].shape[0],
                         'num_out_pol': num_out_pol,
                         'num_vars': 3}
    maddpg = MADDPG(agent_init_params, nagents=3,
                    hidden_dim=config.hidden_dim,
                    discrete_action=discr_act)
    save_dict = torch.load(model_path)
    maddpg.agents.load_params(save_dict['agent_params'])

    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')

        # One (h, c) LSTM state per (thread, agent, other-agent) triple;
        # presumably n_rollout_threads == 1 during evaluation — TODO confirm.
        rnn_hidden = (torch.zeros(1, config.n_rollout_threads *
                                  (maddpg.nagents) * (maddpg.nagents - 1),
                                  config.hidden_dim),
                      torch.zeros(1, config.n_rollout_threads *
                                  (maddpg.nagents) * (maddpg.nagents - 1),
                                  config.hidden_dim))

        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch
            # Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables (no exploration noise at eval)
            torch_actions, new_rnn_hidden = maddpg.step(torch_obs, rnn_hidden,
                                                        explore=False)
            # FIX: .cpu() was called on the container returned by
            # maddpg.step (fails if it is a Python list, and .numpy() fails
            # on CUDA tensors); convert each action tensor individually.
            actions = [ac.data.cpu().numpy().flatten()
                       for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])

            # Throttle rendering so playback approximates config.fps.
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
            rnn_hidden = new_rnn_hidden

        if config.save_gifs:
            # Find the first unused <gif_num>_<ep_i>.gif name so reruns never
            # overwrite earlier recordings.
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()