def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(str(model_path))
    env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array', close=False)[0])
        env.render('human', close=False)
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array', close=False)[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human', close=False)
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()

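# Hypothetical command-line entry point (an illustrative sketch, not part of
# the original file): the run(config) variants in this collection generally
# assume a config namespace with attributes like those referenced above. The
# flag names below mirror those attributes; defaults are assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("env_id", help="name of environment")
    parser.add_argument("model_name", help="name of model")
    parser.add_argument("run_num", type=int, help="run number to load")
    parser.add_argument("--save_gifs", action="store_true",
                        help="save a gif of each episode next to the model")
    parser.add_argument("--incremental", default=None, type=int,
                        help="load the checkpoint from this episode instead "
                             "of the final policy")
    parser.add_argument("--n_episodes", default=10, type=int)
    parser.add_argument("--episode_length", default=25, type=int)
    parser.add_argument("--fps", default=30, type=int)
    config = parser.parse_args()
    run(config)
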
def run(config): """ main entry func """ config = setup_evaluation(args) logger = ExperimentLogger(config.save_dir, log_std_out=True, use_tensorboard=False) # load agent from checkpoint if config.checkpoint > 0: model_path = "checkpoints/model_ep{}.ckpt".format(config.checkpoint) else: model_path = "model.ckpt" model_path = os.path.join(config.restore, model_path) maddpg = MADDPG.init_from_save(model_path) if config.copy_checkpoint: maddpg.save(config.save_dir + "/model.ckpt") # make env runner env_func = ENV_MAP[config.env] env = env_func(config.scenario, benchmark=False, show_visual_range=config.show_visual_range, **config.env_config) # evaluate rollouts = maddpg_rollouts(maddpg, env, config.n_episodes, config.episode_length, logger=logger, render=True, save_gifs=True, fps=20) # save rollouts if save_dir is not None: with open(os.path.join(save_dir, "eval_rollouts.pkl"), "w") as f: pickle.dump(rollouts, f) if config.save_gifs: if config.save_gifs_num < 0: gif_num = config.n_episodes else: gif_num = min(config.save_gifs_num, config.n_episodes) imageio.mimsave(os.path.join(save_dir, "eval_frames.gif"), rollouts["frames"][:gif_num], duration=ifi) env.close()
def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = (model_path.parent / 'gifs' if not config.mixed_policies
                    else model_path.parent / 'gifs_mixed')
        gif_path.mkdir(exist_ok=True)

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    if config.mixed_policies:
        maddpg = MADDPG.init_from_directory(
            Path('./models') / config.env_id / config.model_name)
    else:
        maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, benchmark=True,
                   discrete_action=maddpg.discrete_action)
    env.world.seed(config.seed)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    all_infos = np.empty(
        (config.n_episodes, config.episode_length, maddpg.nagents, 10))
    all_positions = np.zeros(
        (config.n_episodes, config.episode_length, maddpg.nagents, 2))

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         if not obs[i].ndim == 4 else
                         Variable(torch.Tensor(obs[i]), requires_grad=False)
                         for i in range(maddpg.nagents)]
            all_positions[ep_i, t_i] = env.get_positions()
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
                # frames.append(env.world.viewers[0].render(return_rgb_array=True))  # uncomment if local views visible
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')
            if len(np.array(infos['n']).shape) < 4:
                all_infos[ep_i, t_i, :, :len(infos['n'][-1])] = np.array(infos['n'])
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()

    if config.save_stats:
        stats_path = (model_path.parent / 'stats' if not config.mixed_policies
                      else model_path.parent / 'stats_mixed')
        stats_path.mkdir(exist_ok=True)
        save(f'{stats_path}/all_infos.npy', all_infos)
        save(f'{stats_path}/all_positions.npy', all_positions)

def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'
    print("\n" + str(model_path) + "\n\n\n")

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    #####################################################################
    #                          START EPISODES                           #
    #####################################################################
    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()

        # For RNN history buffer
        obs_tminus_0 = copy(obs)
        obs_tminus_1 = copy(obs)
        obs_tminus_2 = copy(obs)
        obs_tminus_3 = copy(obs)
        obs_tminus_4 = copy(obs)
        obs_tminus_5 = copy(obs)
        # TODO: obs_history shape differs from main.py, so parameterize it based on "obs".
        # It differs because main.py can run multiple threads, so it has an extra dimension.
        obs_history = np.empty([3, 108])
        next_obs_history = np.empty([3, 108])

        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render('human')

        #################################################################
        #                        START TIME-STEPS                       #
        #################################################################
        for t_i in range(config.episode_length):
            # Populate current history for RNN
            for a in range(3):  # env.nagents
                # obs_history[a][:] = np.concatenate((obs_tminus_0[a][:], obs_tminus_1[a][:], obs_tminus_2[a][:]))
                obs_history[a][:] = np.concatenate(
                    (obs_tminus_0[a][:], obs_tminus_1[a][:],
                     obs_tminus_2[a][:], obs_tminus_3[a][:],
                     obs_tminus_4[a][:], obs_tminus_5[a][:]))
            # Now obs_history holds 6 timesteps of history for each agent

            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            rnn_torch_obs = [Variable(torch.Tensor(obs_history[i]).view(1, -1),
                                      requires_grad=False)
                             for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(rnn_torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            next_obs, rewards, dones, infos = env.step(actions)

            # Update histories
            obs_tminus_5 = copy(obs_tminus_4)
            obs_tminus_4 = copy(obs_tminus_3)
            obs_tminus_3 = copy(obs_tminus_2)
            obs_tminus_2 = copy(obs_tminus_1)
            obs_tminus_1 = copy(obs_tminus_0)
            obs_tminus_0 = copy(next_obs)
            # --------------------------------------#

            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render('human')

        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()

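# Hypothetical refactor of the history bookkeeping above (an illustrative
# sketch, not the author's code): a deque with maxlen=6 can replace the six
# obs_tminus_* variables. Assumes 3 agents with 18-dim observations, since
# obs_history is (3, 108) and 6 * 18 = 108.
from collections import deque
from copy import copy
import numpy as np

def make_history(obs, length=6):
    # start the buffer with `length` copies of the reset observation,
    # matching the obs_tminus_* initialization above
    return deque([copy(obs) for _ in range(length)], maxlen=length)

def history_vector(history, agent_idx):
    # concatenate newest-to-oldest, matching the order used above
    return np.concatenate([h[agent_idx] for h in history])

# usage inside the episode loop:
#   history = make_history(obs)                  # after env.reset()
#   obs_history[a][:] = history_vector(history, a)
#   history.appendleft(copy(next_obs))           # after env.step()
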
def run(config):
    original_model_path = (Path('./models') / config.env_id /
                           config.model_name / ('run%i' % config.run_num))

    if config.save_gifs:
        gif_path = original_model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    # Checkpoint numbers in the incremental folder used for stat runs
    rrange = [1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001]
    stat_run_all_models = []
    for r in rrange:
        print("Model :" + str(r))
        model_path = original_model_path / 'incremental' / ('model_ep%i.pt' % r)
        maddpg = MADDPG.init_from_save(model_path)
        env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
        maddpg.prep_rollouts(device='cpu')
        ifi = 1 / config.fps  # inter-frame interval

        stat_return_list = []
        for ep_i in range(config.n_episodes):
            print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
            obs = env.reset()
            if config.save_gifs:
                frames = []
                frames.append(env.render('rgb_array')[0])
            # env.render('human')
            episode_reward = 0
            for t_i in range(config.episode_length):
                calc_start = time.time()
                # rearrange observations to be per agent, and convert to torch Variable
                torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                      requires_grad=False)
                             for i in range(maddpg.nagents)]
                # get actions as torch Variables
                torch_actions = maddpg.step(torch_obs, explore=False)
                # convert actions to numpy arrays
                actions = [ac.data.numpy().flatten() for ac in torch_actions]
                obs, rewards, dones, infos = env.step(actions)
                # accumulate the global reward
                episode_reward += rewards[0][0]
                if config.save_gifs:
                    frames.append(env.render('rgb_array')[0])
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                # env.render('human')
            if config.save_gifs:
                gif_num = 0
                while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                    gif_num += 1
                imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                                frames, duration=ifi)
            # end of episode (one stat run)
            stat_return_list.append(episode_reward / config.episode_length)
        # end of model
        stat_run_all_models.append(stat_return_list)
        env.close()

    with open(str(original_model_path) + "/stat_runs", "wb") as pickling_on:
        pkl.dump(stat_run_all_models, pickling_on)

def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    ##################### INITIALIZE FROM SAVED? ###########################
    # init_from_saved and model_path are assumed to be set elsewhere
    # (e.g. as module-level flags)
    if init_from_saved and model_path is not None:
        maddpg = MADDPG.init_from_save(model_path)
        print("Initialized from saved model")
    # -------------------------------------------------------------------- #
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim)

    # used for learning (updates)
    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])
    # This buffer only stores the global rewards for logging, not for updating the policies
    g_storage_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box) else acsp.n
         for acsp in env.action_space])

    t = 0
    for ep_i in range(0, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / \
            config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions, maddpg)

            # Reward shaping using D++ / D: the rewards now contain global as
            # well as shaped rewards. Keep the global rewards for logging and
            # use the shaped rewards for updates.
            # (Indexing rewards[0] assumes a single rollout thread.)
            use_dpp = True

            # DIFFERENCE REWARDS
            d_rewards = []
            for n in range(maddpg.nagents):
                d_rewards.append([rewards[0][n][1]])
            d_rewards = np.array([d_rewards])

            # GLOBAL REWARDS
            g_rewards = []
            for n in range(maddpg.nagents):
                g_rewards.append([rewards[0][n][0]])
            g_rewards = np.array([g_rewards])

            if use_dpp:
                rewards = d_rewards
            else:
                rewards = g_rewards
            # ----------------------------------------------------------- #

            # Buffer used for updates
            replay_buffer.push(obs, agent_actions, rewards, next_obs, dones)
            # Push global rewards into g_storage_buffer for plotting
            g_storage_buffer.push(obs, agent_actions, g_rewards, next_obs, dones)

            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                else:
                    maddpg.prep_training(device='cpu')
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')

        # Take the global reward out of g_storage_buffer
        ep_rews = g_storage_buffer.get_average_rewards(
            config.episode_length * config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                              a_ep_rew, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()

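# Hypothetical helper (a sketch, not part of the original) distilling the
# reward-splitting logic above: the env returns a (global, difference) reward
# pair per agent; the global reward is kept for logging while the use_dpp
# flag selects which one to train on. Assumes a single rollout thread, as
# the loop above does.
import numpy as np

def split_rewards(rewards, nagents, use_dpp=True):
    g_rewards = np.array([[[rewards[0][n][0]] for n in range(nagents)]])
    d_rewards = np.array([[[rewards[0][n][1]] for n in range(nagents)]])
    train_rewards = d_rewards if use_dpp else g_rewards
    return train_rewards, g_rewards
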
def run(config):
    original_model_path = (Path('./models') / config.env_id /
                           config.model_name / ('run%i' % config.run_num))

    ###########################################################################
    #                            FORCE MODEL PATH                             #
    ###########################################################################
    # Checkpoint numbers in the incremental folder; do statistical runs for each
    rrange = [1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001]

    ###################### SAVING STAT RUNS FOR EACH MODEL ###################
    stat_run_all_models = []
    for r in rrange:
        model_path = original_model_path / 'incremental' / ('model_ep%i.pt' % r)

        if config.save_gifs:
            gif_path = model_path.parent / 'gifs'
            gif_path.mkdir(exist_ok=True)

        maddpg = MADDPG.init_from_save(model_path)
        env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
        maddpg.prep_rollouts(device='cpu')
        ifi = 1 / config.fps  # inter-frame interval

        #####################################################################
        #                          START EPISODES                           #
        #####################################################################
        stat_return_list = []
        for ep_i in range(config.n_episodes):  # number of stat runs
            print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
            obs = env.reset()

            # For RNN history buffer
            obs_tminus_0 = copy(obs)
            obs_tminus_1 = copy(obs)
            obs_tminus_2 = copy(obs)
            obs_tminus_3 = copy(obs)
            obs_tminus_4 = copy(obs)
            obs_tminus_5 = copy(obs)
            # TODO: obs_history shape differs from main.py, so parameterize it based on "obs".
            # It differs because main.py can run multiple threads, so it has an extra dimension.
            obs_history = np.empty([3, 108])
            next_obs_history = np.empty([3, 108])

            if config.save_gifs:
                frames = []
                frames.append(env.render('rgb_array')[0])
            # env.render('human')

            #################################################################
            #                        START TIME-STEPS                       #
            #################################################################
            episode_reward = 0
            for t_i in range(config.episode_length):
                # Populate current history for RNN
                for a in range(3):  # env.nagents
                    # obs_history[a][:] = np.concatenate((obs_tminus_0[a][:], obs_tminus_1[a][:], obs_tminus_2[a][:]))
                    obs_history[a][:] = np.concatenate(
                        (obs_tminus_0[a][:], obs_tminus_1[a][:],
                         obs_tminus_2[a][:], obs_tminus_3[a][:],
                         obs_tminus_4[a][:], obs_tminus_5[a][:]))
                # Now obs_history holds 6 timesteps of history for each agent

                calc_start = time.time()
                # rearrange observations to be per agent, and convert to torch Variable
                rnn_torch_obs = [Variable(torch.Tensor(obs_history[i]).view(1, -1),
                                          requires_grad=False)
                                 for i in range(maddpg.nagents)]
                # get actions as torch Variables
                torch_actions = maddpg.step(rnn_torch_obs, explore=False)
                # convert actions to numpy arrays
                actions = [ac.data.numpy().flatten() for ac in torch_actions]
                next_obs, rewards, dones, infos = env.step(actions)
                # accumulate the global reward
                episode_reward += rewards[0][0]

                # Update histories
                obs_tminus_5 = copy(obs_tminus_4)
                obs_tminus_4 = copy(obs_tminus_3)
                obs_tminus_3 = copy(obs_tminus_2)
                obs_tminus_2 = copy(obs_tminus_1)
                obs_tminus_1 = copy(obs_tminus_0)
                obs_tminus_0 = copy(next_obs)
                # --------------------------------------#

                if config.save_gifs:
                    frames.append(env.render('rgb_array')[0])
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                # env.render('human')

            # end of an episode
            if config.save_gifs:
                gif_num = 0
                while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                    gif_num += 1
                imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                                frames, duration=ifi)
            # end of episodes (one stat run)
            stat_return_list.append(episode_reward / config.episode_length)
        # end of model
        stat_run_all_models.append(stat_return_list)
        env.close()

    with open(str(original_model_path) + "/stat_runs", "wb") as pickling_on:
        pkl.dump(stat_run_all_models, pickling_on)

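# Hypothetical companion snippet (an assumption, not from the original):
# load the stat_runs pickle written by the two stat-run variants above and
# summarize the mean return per checkpoint. The file holds one list per
# checkpoint in rrange, each containing per-episode mean returns.
import pickle as pkl
import numpy as np

rrange = [1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001]
with open("stat_runs", "rb") as f:  # path as written by the code above
    stat_run_all_models = pkl.load(f)
for r, returns in zip(rrange, stat_run_all_models):
    print("model_ep%i: mean %.3f, std %.3f"
          % (r, np.mean(returns), np.std(returns)))
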
see_runs = [0]
wait = 0.05
ep_len = 50
for cur_run in see_runs:
    for cur_model in range(4):  # index into models_to_compare
        config = Arglist()
        config.load_args(base_path / models_to_compare[cur_model] /
                         ("run" + str(cur_run)))
        env = make_parallel_env(config)
        model_path = (base_path / models_to_compare[cur_model] /
                      ("run" + str(cur_run)) / "model.pt")
        print(model_path)
        # add comm to action space:
        maddpg = MADDPG.init_from_save(model_path)
        # show some examples:
        obs = env.reset()
        # env.env._render("human", True)
        maddpg.prep_rollouts(device='cpu')
        # eval_model(maddpg, env, ep_len=100, num_steps=500, rollout_threads=1, display=True)
        for step in range(ep_len):
            env.env._render("human", False)
            time.sleep(wait)
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]

with open(config_file, 'r') as f:
    args_dict = yaml.safe_load(f)
    args_dict['n_agents'] = args_dict['n_pursuers']
    # config = yaml.load(f, Loader=loader)
    args = SN(**args_dict)

if args.seed is not False:
    th.manual_seed(args.seed)
env = Env(**args_dict)
if args.seed is not False:
    env.seed(args.seed)

for m in pool_list:
    maddpg = MADDPG.init_from_save(result_folder + '/model/' + m + '.pt',
                                   with_cpu=True)
    maddpg.prep_rollouts(device='cpu')
    with th.no_grad():
        total_reward = 0.
        test_time = 5
        for it in range(test_time):
            l = []
            obs = env.reset()
            l.append(env.render(gui=True))
            obs = np.stack(obs, axis=0)
            obs = th.from_numpy(obs).float()
            print('----------')
            reward_it = 0.
            for t in range(args.test_max_steps):
                obs = obs.type(th.FloatTensor)
                actions = maddpg.step(obs, explore=False)

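# Hypothetical example of the YAML file consumed above (illustrative values;
# only keys actually referenced in this snippet are shown):
#
#   n_pursuers: 3
#   seed: 42
#   test_max_steps: 200
#
# yaml.safe_load turns this into a dict, and SN(**args_dict)
# (types.SimpleNamespace) exposes the keys as attributes, e.g. args.seed.
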
def run(config):
    model_dir = Path('./models') / config.env_id / config.model_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in
                         model_dir.iterdir() if
                         str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    os.makedirs(log_dir)
    logger = SummaryWriter(str(log_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if not USE_CUDA:
        torch.set_num_threads(config.n_training_threads)
    env = make_parallel_env(config.env_id, config.n_rollout_threads,
                            config.seed, config.discrete_action)

    if config.run_num:
        model_path = model_dir / f'run{config.run_num}'
        maddpg = MADDPG.init_from_save(model_path / 'model.pt')
        models_dir = model_path / 'incremental'
        ext_mods = [int(str(folder.name).split('model_ep')[1][:-3]) for folder
                    in models_dir.iterdir() if
                    str(folder.name).startswith('model_ep') and
                    str(folder.name).endswith('.pt')]
        ep_st = np.sort(ext_mods)[-1]
    else:
        maddpg = MADDPG.init_from_env(env,
                                      agent_alg=config.agent_alg,
                                      adversary_alg=config.adversary_alg,
                                      tau=config.tau,
                                      lr=config.lr,
                                      hidden_dim=config.hidden_dim,
                                      recurrent=config.recurrent,
                                      convolutional=config.convolutional)
        ep_st = 0

    replay_buffer = ReplayBuffer(
        config.buffer_length, maddpg.nagents,
        [obsp.shape[0] for obsp in env.observation_space]
        if not config.convolutional
        else [obsp.shape for obsp in env.observation_space],
        [acsp.shape[0] if isinstance(acsp, Box)
         else acsp.n if isinstance(acsp, Discrete)
         else sum(acsp.high - acsp.low + 1)
         for acsp in env.action_space])

    t = 0
    intrinsic_modules = create_intrinsic_motivators(config, maddpg.agents, env)
    for ep_i in range(ep_st, config.n_episodes, config.n_rollout_threads):
        print("Episodes %i-%i of %i" % (ep_i + 1,
                                        ep_i + 1 + config.n_rollout_threads,
                                        config.n_episodes))
        obs = env.reset()
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        [im.prep_rollouts(device='cpu') for im in intrinsic_modules]

        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / \
            config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale +
                           (config.init_noise_scale - config.final_noise_scale) *
                           explr_pct_remaining)
        maddpg.reset_noise()

        for et_i in range(config.episode_length):
            start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=True)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions]
                       for i in range(config.n_rollout_threads)]
            next_obs, rewards, dones, infos = env.step(actions)
            emps = np.sum(np.asarray(
                [im.compute(rewards, next_obs) for im in intrinsic_modules]),
                axis=0)
            replay_buffer.push(obs, agent_actions, rewards, emps, next_obs, dones)
            obs = next_obs
            t += config.n_rollout_threads
            if (len(replay_buffer) >= config.batch_size and
                    (t % config.steps_per_update) < config.n_rollout_threads):
                if USE_CUDA:
                    maddpg.prep_training(device='gpu')
                    [im.prep_training(device='gpu') for im in intrinsic_modules]
                else:
                    maddpg.prep_training(device='cpu')
                    [im.prep_training(device='cpu') for im in intrinsic_modules]
                for u_i in range(config.n_rollout_threads):
                    for a_i in range(maddpg.nagents):
                        sample = replay_buffer.sample(config.batch_size,
                                                      to_gpu=USE_CUDA)
                        maddpg.update(sample, a_i, logger=logger)
                        [im.update(sample, logger=logger)
                         for im in intrinsic_modules]
                    maddpg.update_all_targets()
                maddpg.prep_rollouts(device='cpu')
                [im.prep_rollouts(device='cpu') for im in intrinsic_modules]
            print(f'computation time = {time.time() - start:.3f}s '
                  f'buffer length = {len(replay_buffer)}')

        ep_rews = replay_buffer.get_average_rewards(config.episode_length *
                                                    config.n_rollout_threads)
        for a_i, a_ep_rew in enumerate(ep_rews):
            logger.add_scalars('agent%i/mean_episode_rewards' % a_i,
                               {'rew_loss': a_ep_rew}, ep_i)

        if ep_i % config.save_interval < config.n_rollout_threads:
            os.makedirs(run_dir / 'incremental', exist_ok=True)
            maddpg.save(run_dir / 'incremental' / ('model_ep%i.pt' % (ep_i + 1)))
            maddpg.save(run_dir / 'model.pt')

    maddpg.save(run_dir / 'model.pt')
    env.close()
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()

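# Hypothetical stub (an assumption about the interface, not the original
# implementation) showing what create_intrinsic_motivators() is expected to
# return above: each module exposes prep_rollouts/prep_training, a
# compute(rewards, next_obs) that yields per-agent intrinsic rewards summed
# into `emps`, and an update(sample) hook called alongside maddpg.update().
import numpy as np

class NullIntrinsicModule:
    def prep_rollouts(self, device='cpu'):
        pass

    def prep_training(self, device='gpu'):
        pass

    def compute(self, rewards, next_obs):
        # no intrinsic motivation: contribute zero reward per agent
        return np.zeros_like(np.asarray(rewards, dtype=float))

    def update(self, sample, logger=None):
        pass
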
def run(config):
    model_path = (Path('../models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    gif_path = (model_path.parent / 'stats' if not config.mixed_policies
                else model_path.parent / 'stats_mixed')
    gif_path.mkdir(exist_ok=True)

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    if config.mixed_policies:
        maddpg = MADDPG.init_from_directory(
            Path('../models') / config.env_id / config.model_name)
    else:
        maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, benchmark=True,
                   discrete_action=maddpg.discrete_action)
    env.seed(config.seed)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    all_infos = np.empty(
        (config.n_episodes, config.episode_length, maddpg.nagents, 10))
    n_movable_agents = sum([1 if a.movable else 0 for a in env.agents])
    n_speaking_agents = sum([0 if a.silent else 1 for a in env.agents])
    all_positions = np.zeros((config.n_episodes, config.episode_length,
                              n_movable_agents, env.world.dim_p))
    all_communications = np.zeros((config.n_episodes, config.episode_length,
                                   n_speaking_agents, env.world.dim_c))
    all_actions = np.zeros((config.n_episodes, config.episode_length,
                            len(env.agents), env.world.dim_c))
    obs_space = sum([obsp.shape[0] for obsp in env.observation_space])
    all_obs = np.zeros((config.n_episodes, config.episode_length, obs_space))

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        # env.agents[1].state.p_pos = np.array([0., 0.])
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         if not obs[i].ndim == 4 else
                         Variable(torch.Tensor(obs[i]), requires_grad=False)
                         for i in range(maddpg.nagents)]
            all_positions[ep_i, t_i] = env.get_positions()
            all_communications[ep_i, t_i] = env.get_communications()
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            # actions[0] = np.array([0., 0., 0., 0., 0.], dtype=np.float32)
            # actions[0][ep_i] = 1.
            obs, rewards, dones, infos = env.step(actions)
            all_actions[ep_i, t_i, :, :] = actions
            all_obs[ep_i, t_i, :] = np.concatenate(np.asarray(obs))
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            if len(np.array(infos['n']).shape) < 4:
                all_infos[ep_i, t_i, :, :len(infos['n'][-1])] = np.array(infos['n'])

    env.close()

    if config.save_stats:
        stats_path = (model_path.parent / 'stats' if not config.mixed_policies
                      else model_path.parent / 'stats_mixed')
        stats_path.mkdir(exist_ok=True)
        save(f'{stats_path}/all_infos.npy', all_infos)
        save(f'{stats_path}/all_positions.npy', all_positions)
        save(f'{stats_path}/all_communications.npy', all_communications)
        save(f'{stats_path}/all_actions.npy', all_actions)
        save(f'{stats_path}/all_observations.npy', all_obs)

def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    shape_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num) / config.shape_file)
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(model_path)
    # env = make_env(config.env_id, discrete_action=maddpg.discrete_action)
    env = HeavyObjectEnv(num_agents=config.num_agents, shape_file=shape_path)
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames = []
            frames.append(env.render('rgb_array')[0])
        env.render()
        for t_i in range(config.episode_length):
            calc_start = time.time()
            if t_i == 15:
                env.change_centroid(0.3, 0.3)
                print("change centroid!")
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert one-hot actions to discrete action indices
            actions_one_hot = [ac.data.numpy().flatten() for ac in torch_actions]
            actions = np.array([i.tolist().index(1.0) for i in actions_one_hot])
            print(t_i)
            obs, rewards, dones, infos = env.step(actions)
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if elapsed < ifi:
                time.sleep(ifi - elapsed)
            env.render()
        if config.save_gifs:
            gif_num = 0
            while (gif_path / ('%i_%i.gif' % (gif_num, ep_i))).exists():
                gif_num += 1
            imageio.mimsave(str(gif_path / ('%i_%i.gif' % (gif_num, ep_i))),
                            frames, duration=ifi)

    env.close()

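# Hypothetical hardening of the one-hot decoding in the variant above (an
# illustrative sketch): list.index(1.0) raises ValueError if the action
# vector is not an exact one-hot (e.g. float rounding); np.argmax gives the
# same index for true one-hots and is robust otherwise.
import numpy as np

def decode_one_hot(actions_one_hot):
    return np.array([int(np.argmax(a)) for a in actions_one_hot])
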
def run(config):
    # transport configuration
    name = 'Materials Transport'
    conf = {
        'n_player': 2,            # number of players
        'board_width': 11,        # map width
        'board_height': 11,       # map height
        'n_cell_type': 5,         # number of cell types
        'materials': 4,           # number of material depots
        'cars': 2,                # number of cars
        'planes': 0,              # number of planes
        'barriers': 12,           # number of fixed obstacles
        'max_step': 50,           # maximum number of steps
        'game_name': name,        # game name
        'K': 5,                   # update depot material counts every K rounds
        'map_path': 'env/map.txt',  # initial map file
        'cell_range': 6,          # value range of each dimension in a cell (tuple; a single int is converted to a tuple)
        'ob_board_width': None,   # grid width observed by each agent (tuple); None means same as the actual grid
        'ob_board_height': None,  # grid height observed by each agent (tuple); None means same as the actual grid
        'ob_cell_range': None,    # per-cell value range observed by each agent (2-D tuple); None means same as the actual grid
    }
    env = make_parallel_env_transport(config.env_id, conf, config.seed,
                                      config.discrete_action)

    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'
    maddpg = MADDPG.init_from_save(model_path)
    maddpg.prep_rollouts(device='cpu')

    t = 0
    reward_epi = np.zeros(config.n_episodes)
    for ep_i in range(0, config.n_episodes):
        obs = env.reset()
        # TODO: TO CHECK
        '''
        # obs.shape = (n_rollout_threads, nagent)(nobs); nobs differs per agent, so not a tensor
        maddpg.prep_rollouts(device='cpu')
        explr_pct_remaining = max(0, config.n_exploration_eps - ep_i) / config.n_exploration_eps
        maddpg.scale_noise(config.final_noise_scale + (config.init_noise_scale - config.final_noise_scale) * explr_pct_remaining)
        maddpg.reset_noise()
        '''
        reward_eti = 0

        pygame.init()
        screen = pygame.display.set_mode((440, 440))
        # pygame.display.set_caption(g.game_name)
        clock = pygame.time.Clock()

        for et_i in range(config.episode_length):
            # env.render()
            # rearrange observations to be per agent, and convert to torch Variable
            # print('step', et_i)
            torch_obs = [Variable(torch.Tensor(np.vstack(obs[:, i])),  # stack the matrices vertically
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_agent_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]
            # rearrange actions to be per environment
            actions = [[ac[i] for ac in agent_actions] for i in range(1)]
            print(actions)
            ############################################
            # add
            # actions = actions.astype(int)
            ############################################
            # add: the first two actions are random one-hot placeholders
            joint_action = []
            for i in range(2):
                player = []
                for j in range(1):
                    each = [0] * 11
                    idx = np.random.randint(11)
                    each[idx] = 1
                    player.append(each)
                joint_action.append(player)
            for m in range(2):
                joint_action.append([actions[0][m].astype(int).tolist()])
            next_obs, rewards, dones, infos = env.step(joint_action)
            obs = next_obs
            reward_eti += rewards[0][0]

            pygame.surfarray.blit_array(screen, env.render().transpose(1, 0, 2))
            pygame.display.flip()
            clock.tick(1)
            fname = "./image/" + str(et_i) + ".png"  # save image
            pygame.image.save(screen, fname)

def run(config):
    model_path = (Path('./models') / config.env_id / config.model_name /
                  ('run%i' % config.run_num))
    # model_path = config.path
    if config.incremental is not None:
        model_path = model_path / 'incremental' / ('model_ep%i.pt' %
                                                   config.incremental)
    else:
        model_path = model_path / 'model.pt'

    if config.save_gifs:
        gif_path = model_path.parent / 'gifs'
        gif_path.mkdir(exist_ok=True)

    maddpg = MADDPG.init_from_save(model_path)
    env = make_env(config.env_id, config.benchmark,
                   discrete_action=maddpg.discrete_action)
    print(type(env))
    maddpg.prep_rollouts(device='cpu')
    ifi = 1 / config.fps  # inter-frame interval

    if config.save_gifs:
        frames = []
    agent_info = [[[]]]
    reward_info = []
    trajectories = []

    for ep_i in range(config.n_episodes):
        print("Episode %i of %i" % (ep_i + 1, config.n_episodes))
        obs = env.reset()
        if config.save_gifs:
            frames.append(env.render('rgb_array')[0])
        env.render('human')
        episode_rewards = np.zeros((config.episode_length, maddpg.nagents))
        current_trajectory = []
        current_entities = []
        if config.store_traj:
            cur_state_ent = env.getStateEntities()
            current_entities.append(cur_state_ent)
            cur_state = env.getState()
            current_trajectory.append(cur_state)
        for t_i in range(config.episode_length):
            calc_start = time.time()
            # rearrange observations to be per agent, and convert to torch Variable
            torch_obs = [Variable(torch.Tensor(obs[i]).view(1, -1),
                                  requires_grad=False)
                         for i in range(maddpg.nagents)]
            # get actions as torch Variables
            torch_actions = maddpg.step(torch_obs, explore=False)
            # convert actions to numpy arrays
            actions = [ac.data.numpy().flatten() for ac in torch_actions]
            obs, rewards, dones, infos = env.step(actions)
            if config.store_traj:
                cur_state = env.getState()
                current_trajectory.append(cur_state)
            if config.benchmark:
                for i, info in enumerate(infos):
                    agent_info[-1][i].append(infos['n'])
            if config.sparse_reward:
                # accumulate rewards and only pay them out on the final step
                if t_i == 0:
                    total = np.array(rewards)
                if t_i != config.episode_length - 1:
                    total = total + np.array(rewards)
                    rewards = list(np.zeros(len(rewards)))
                else:
                    rewards = list(total)
            episode_rewards[t_i] = rewards
            if config.save_gifs:
                frames.append(env.render('rgb_array')[0])
            calc_end = time.time()
            elapsed = calc_end - calc_start
            if config.save_gifs:
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
            env.render('human')
        agent_info.append([[]])
        mean_rewards = np.mean(episode_rewards, axis=0)
        reward_info.append(mean_rewards)
        if config.store_traj:
            trajectories.append([current_entities, current_trajectory])

    if config.save_gifs:
        gif_num = 0
        while (gif_path / ('%i.gif' % gif_num)).exists():
            gif_num += 1
        imageio.mimsave(str(gif_path / ('%i.gif' % gif_num)),
                        frames, duration=ifi)

    run_dir = model_path.parent
    if config.benchmark:
        with open(run_dir / 'eval_info.pkl', 'wb') as fp:
            pickle.dump(agent_info, fp)
        with open(run_dir / 'eval_rew.pkl', 'wb') as fp:
            pickle.dump(reward_info, fp)
    if config.store_traj:
        with open(run_dir / 'static_trajectories_eval.pkl', 'wb') as fp:
            pickle.dump(trajectories, fp)

    env.close()