def evaluate(args, seed, policies_list, ob_rms=None, render=True, env=None,
             master=None, render_attn=True):
    """
    RL evaluation: supports evaluation through the training code as well as standalone use.
    policies_list should contain one policy per agent; len(policies_list) == num agents.
    """
    env = utils.make_single_env(args)
    master = setup_master(args, env)
    # if env is None or master is None:
    #     # if either is None, generate both of them
    #     master, env = setup_master(args, return_env=True)

    if seed is None:  # ensure the eval seed differs from the training seed
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)
    env.seed(seed)

    if ob_rms is not None:
        obs_mean, obs_std = ob_rms
    else:
        obs_mean = None
        obs_std = None

    master.load_models(policies_list)
    master.set_eval_mode()

    num_eval_episodes = args.num_eval_episodes
    all_episode_rewards = np.full((num_eval_episodes, env.n), 0.0)
    per_step_rewards = np.full((num_eval_episodes, env.n), 0.0)

    # TODO: provide support for recurrent policies and mask
    recurrent_hidden_states = None
    mask = None

    # world.dists at the end of each episode (simple_spread-style envs)
    final_min_dists = []
    num_success = 0
    episode_length = 0

    for t in range(num_eval_episodes):
        print('t', t)
        obs = env.reset()
        # obs = normalize_obs(obs, obs_mean, obs_std)
        done = [False] * env.n
        episode_rewards = np.full(env.n, 0.0)
        episode_steps = 0

        if render:
            attn = None if not render_attn else master.team_attn
            if attn is not None and len(attn.shape) == 3:
                attn = attn.max(0)
            env.render()  # attn=attn

        # while not np.all(done):
        for _ in range(args.num_env_steps):
            # First entry of each observation flags whether the agent is alive or dead.
            masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
            if not args.no_cuda:
                masks = masks.cuda()
            with torch.no_grad():
                # NOTE: `mask` is still the recurrent-policy placeholder (None);
                # the alive-agent `masks` computed above is currently unused here.
                actions = master.act(obs, mask)

            episode_steps += 1
            obs, reward, done, info = env.step(actions)
            # obs = normalize_obs(obs, obs_mean, obs_std)
            episode_rewards += np.array(reward)

            if render:
                time.sleep(0.1)
                attn = None if not render_attn else master.team_attn
                if attn is not None and len(attn.shape) == 3:
                    attn = attn.max(0)
                env.render()  # attn=attn
                if args.record_video:
                    time.sleep(0.08)

            if done:
                break

        per_step_rewards[t] = episode_rewards / episode_steps
        num_success += 0        ##* info['n'][0]['is_success']
        episode_length = 0      ##* (episode_length * t + info['n'][0]['world_steps']) / (t + 1)

        # for the simple_spread family of envs only
        if args.env_name == 'simple_spread':
            final_min_dists.append(env.world.min_dists)
        elif args.env_name == 'simple_formation' or args.env_name == 'simple_line':
            final_min_dists.append(env.world.dists)

        # if render:
        #     print("Ep {} | Success: {} \n Av per-step reward: {:.2f} | Ep Length {}".format(
        #         t, info['n'][0]['is_success'], per_step_rewards[t][0], info['n'][0]['world_steps']))

        # all_episode_rewards shape: num_eval_episodes x num_agents
        all_episode_rewards[t, :] = episode_rewards

        if args.record_video:
            input('Press enter to continue: ')

    return all_episode_rewards, per_step_rewards, final_min_dists, num_success, episode_length
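
# The commented-out normalize_obs(obs, obs_mean, obs_std) calls above assume an
# observation-normalization helper that is not defined in this file. Below is a
# minimal sketch of what such a helper could look like; the function name, the
# epsilon value, and treating the second element of the saved ob_rms tuple as a
# variance (train() stores (mean, var)) are assumptions, not part of the original code.
def normalize_obs(obs, obs_mean, obs_std, eps=1e-8):
    """Standardize observations with running statistics; no-op if stats are missing."""
    if obs_mean is None or obs_std is None:
        return obs
    # obs_std holds the running variance in the saved checkpoints, hence the sqrt
    return (obs - obs_mean) / (np.sqrt(obs_std) + eps)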
def train(args, policies_list, return_early=False):
    writer = SummaryWriter(args.log_dir)  # some issue in importing tensorboardX

    # env = utils.make_parallel_envs(args)
    env = utils.make_single_env(args)
    master = setup_master(args, env)

    # used during evaluation only
    eval_master, eval_env = setup_master(args, return_env=True)

    obs = env.reset()  # shape: num_agents (total) x obs_dim
    if args.continue_training:
        master.load_models(policies_list)
    n = len(master.all_agents)
    # final_rewards = torch.zeros([args.num_processes, n], device=args.device)

    # start simulations
    start = datetime.datetime.now()
    shift = int(args.ckpt) + 1 if args.continue_training else 0
    savedir = None  # guard against return_early before the first checkpoint is saved

    for j in range(shift, args.num_updates + shift):  # training iterations
        # End points of episodes within this rollout, used by master.wrap_horizon.
        # Each entry is (the step at which the episode ended) + 1.
        end_pts = []
        episode_rewards = torch.zeros([args.num_processes, n], device=args.device)
        print('j (num update)', j)

        ## data collection
        print('training')
        master.initialize_obs(obs)
        step = 0
        while step < args.num_steps:  # data-collection steps in each iteration
            # First entry of each observation flags whether the agent is alive or dead.
            masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
            if not args.no_cuda:
                masks = masks.cuda()
            with torch.no_grad():
                actions_list, attn_list = master.act(step, masks)  ## IMPORTANT

            agent_actions = np.array(actions_list).reshape(-1)
            obs, reward, done, info = env.step(agent_actions)
            # env.render(attn_list)  # attn_list = [[teammates_attn, opp_attn] for each team]
            # time.sleep(0.01)

            reward = torch.from_numpy(np.stack(reward)).float().to(args.device)
            # final_rewards bookkeeping (disabled); the original note questions whether the
            # update should use masks rather than (1 - masks):
            # episode_rewards *= masks; final_rewards += episode_rewards; final_rewards *= masks

            # Add the transition to the rollout storage; dead agents are masked out.
            master.update_rollout(obs, reward, masks)
            episode_rewards += reward * masks
            # Once masks has been used, it could instead be refreshed from `done`:
            # masks = torch.FloatTensor(1 - 1.0 * done).to(args.device)
            step += 1  ##* need to confirm this

            if done:
                end_pts.append(step)
                obs = env.reset()
                masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
                if not args.no_cuda:
                    masks = masks.cuda()
                master.initialize_new_episode(step, obs, masks)
                # break

        if not end_pts or end_pts[-1] != args.num_steps:
            end_pts.append(args.num_steps)
        master.wrap_horizon(end_pts)  ## computes the returns (reward + gamma*value), IMPORTANT
        # master.before_update()
        vals = master.update()  ## PPO update (multiple PPO epochs on the collected rollout), IMPORTANT
        value_loss = vals[:, 0]
        action_loss = vals[:, 1]
        dist_entropy = vals[:, 2]
        master.after_update()  ## IMPORTANT

        ## saving the trained models
        if j % args.save_interval == 0 and not args.test:
            print('saving')
            savedict = {
                'models': [agent.actor_critic.state_dict() for agent in master.all_agents]
            }
            ob_rms = (None, None) if env.ob_rms is None else (env.ob_rms[0].mean, env.ob_rms[0].var)
            savedict['ob_rms'] = ob_rms
            savedir = args.save_dir + '/ep' + str(j) + '.pt'
            print(savedir)
            torch.save(savedict, savedir)

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        ## logging data to tensorboardX
        if j % args.log_interval == 0:
            print('logging')
            end = datetime.datetime.now()
            seconds = (end - start).total_seconds()
            total_reward = episode_rewards.sum(dim=0).cpu().numpy()
            print('total_reward')
            print(total_reward)
            print("Updates {} | Num timesteps {} | Time {} | FPS {}\n"
                  "Total reward {}\nEntropy {:.4f} Value loss {:.4f} Policy loss {:.4f}\n"
                  .format(j, total_num_steps, str(end - start),
                          int(total_num_steps / seconds), total_reward,
                          dist_entropy[0], value_loss[0], action_loss[0]))
            if not args.test:
                for idx in range(n):
                    writer.add_scalar('agent' + str(idx) + '/training_reward', total_reward[idx], j)
                    print('idx', idx, 'total_reward[idx]', total_reward[idx])
                writer.add_scalar('all/value_loss', value_loss[0], j)
                writer.add_scalar('all/action_loss', action_loss[0], j)
                writer.add_scalar('all/dist_entropy', dist_entropy[0], j)

        # ## evaluation/validation
        # if args.eval_interval is not None and j % args.eval_interval == 0:
        #     print('evaluating')
        #     ob_rms = (None, None) if env.ob_rms is None else (env.ob_rms[0].mean, env.ob_rms[0].var)
        #     print('=======================================================================')
        #     _, eval_perstep_rewards, final_min_dists, num_success, eval_episode_len = evaluate(
        #         args, None, master.all_policies, ob_rms=ob_rms, env=eval_env,
        #         render=args.render, master=eval_master)
        #     print('Evaluation {:d} | Mean per-step reward {:.2f}'.format(
        #         j // args.eval_interval, eval_perstep_rewards.mean()))
        #     print('Num success {:d}/{:d} | Episode Length {:.2f}'.format(
        #         num_success, args.num_eval_episodes, eval_episode_len))
        #     if final_min_dists:
        #         print('Final_dists_mean {}'.format(np.stack(final_min_dists).mean(0)))
        #         print('Final_dists_var {}'.format(np.stack(final_min_dists).var(0)))
        #     print('=======================================================================\n')
        #     if not args.test:
        #         writer.add_scalar('all/eval_success', 100.0 * num_success / args.num_eval_episodes, j)
        #         writer.add_scalar('all/episode_length', eval_episode_len, j)
        #         for idx in range(n):
        #             writer.add_scalar('agent' + str(idx) + '/eval_per_step_reward',
        #                               eval_perstep_rewards.mean(0)[idx], j)
        #             if final_min_dists:
        #                 writer.add_scalar('agent' + str(idx) + '/eval_min_dist',
        #                                   np.stack(final_min_dists).mean(0)[idx], j)
        #
        #     # early stopping for curriculum training
        #     curriculum_success_thres = 0.9
        #     if return_early and num_success * 1. / args.num_eval_episodes > curriculum_success_thres:
        #         savedict = {'models': [agent.actor_critic.state_dict() for agent in master.all_agents]}
        #         ob_rms = (None, None) if env.ob_rms is None else (env.ob_rms[0].mean, env.ob_rms[0].var)
        #         savedict['ob_rms'] = ob_rms
        #         savedir = args.save_dir + '/ep' + str(j) + '.pt'
        #         torch.save(savedict, savedir)
        #         print('{} agents: training complete. Breaking.\n'.format(args.num_agents))
        #         break

    writer.close()
    if return_early:
        return savedir
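
# A minimal sketch of how train() might be driven from a launcher script, under the
# assumption that the project provides an argparse-based get_args() helper and a
# load-dir style argument for resuming; those names are hypothetical. Only train()'s
# signature and the savedict format ('models' / 'ob_rms') come from this file.
def _launch_training(get_args):
    args = get_args()  # hypothetical: however this project builds its argparse Namespace
    policies_list = None
    if args.continue_training:
        # resume from a checkpoint written by train() itself (see savedict above);
        # args.load_dir is an assumed argument name for the checkpoint path
        checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)
        policies_list = checkpoint['models']
    return train(args, policies_list)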
def test_fortattack(args, seed, policies_list, ob_rms):
    writer = SummaryWriter(args.log_dir)  # some issue in importing tensorboardX
    if seed is None:  # ensure the eval seed differs from the training seed
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)

    env = utils.make_single_env(args)
    master = setup_master(args, env)
    obs = env.reset()  # shape: num_agents (total) x obs_dim
    all_obs = [obs]
    # if ob_rms is not None:
    #     obs_mean, obs_std = ob_rms
    # else:
    #     obs_mean = None
    #     obs_std = None
    master.load_models(policies_list)
    master.set_eval_mode()
    n = len(master.all_agents)
    # final_rewards = torch.zeros([args.num_processes, n], device=args.device)

    # start simulations
    start = datetime.datetime.now()
    for j in range(args.num_eval_episodes + 100):  # evaluation episodes
        # Episode end points, usable by master.wrap_horizon; each entry is
        # (the step at which the episode ended) + 1.
        end_pts = []
        episode_rewards = torch.zeros([args.num_processes, n], device=args.device)
        print('j (episode)', j)
        print('end_pts at the beginning', end_pts)
        master.initialize_obs(obs)
        done = False
        step = 0
        # if args.record_video:
        #     video_path = 'out_files/videos'
        #     eval_env.startRecording(video_path)
        #     eval_env.recordFrame()
        while not done:  # run until the episode terminates
            # First entry of each observation flags whether the agent is alive or dead.
            masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
            if not args.no_cuda:
                masks = masks.cuda()
            with torch.no_grad():
                actions_list, attn_list = master.act(step, masks)  ## IMPORTANT

            agent_actions = np.array(actions_list).reshape(-1)
            obs, reward, done, info = env.step(agent_actions)
            all_obs.append(obs)
            # obs = normalize_obs(obs, obs_mean, obs_std)
            env.render(attn_list)  # attn_list = [[teammates_attn, opp_attn] for each team]
            path = 'out_files/Frames/generalize_5_{}_{}.png'.format(j, step)
            # env.saveFrame(attn_list, path)
            # time.sleep(0.06)

            reward = torch.from_numpy(np.stack(reward)).float().to(args.device)
            # Add the transition to the rollout storage; dead agents are masked out.
            master.update_rollout(obs, reward, masks)
            episode_rewards += reward * masks
            step += 1

            if done:
                end_pts.append(step)
                # time.sleep(1)
                obs = env.reset()
                masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
                if not args.no_cuda:
                    masks = masks.cuda()
                master.initialize_new_episode(step, obs, masks)
                time.sleep(1)
                if args.out_file is not None:
                    all_obs = np.array(all_obs)
                    print('all_obs', all_obs.shape)
                    print('path', args.out_file)
                    np.save(args.out_file, all_obs)
                    break

        # no PPO update in test mode:
        # if end_pts[-1] != args.num_steps:
        #     end_pts.append(args.num_steps)
        # master.wrap_horizon(end_pts)  ## computes the returns (reward + gamma*value), IMPORTANT
        # master.before_update()
        # vals = master.update()  ## PPO update, IMPORTANT; multiple PPO epochs on the last episode
        # value_loss = vals[:, 0]
        # action_loss = vals[:, 1]
        # dist_entropy = vals[:, 2]
        # master.after_update()  ## IMPORTANT
        # for agent in master.all_agents:
        #     print('after_update', agent.rollouts.obs.shape)
        print('episode_rewards')
        print(episode_rewards)
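
# A minimal sketch of how the trajectory dump saved via args.out_file above could be
# inspected offline. The expected shape (num_steps + 1, num_agents, obs_dim) and the
# reading of obs[:, 0] as the alive/dead flag follow the comments in this file; the
# helper name itself is hypothetical.
def inspect_trajectory(out_file):
    trajectory = np.load(out_file)                        # stacked per-step observations
    print('trajectory shape:', trajectory.shape)          # (num_steps + 1, num_agents, obs_dim)
    print('alive flags at last step:', trajectory[-1, :, 0])
    return trajectory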
# NOTE: this second definition of test_fortattack (ensemble evaluation of guard
# checkpoints against multiple attacker strategies) shadows the single-policy
# version defined above if both remain in the same module.
def test_fortattack(args, seed):
    writer = SummaryWriter(args.log_dir)  # some issue in importing tensorboardX
    if seed is None:  # ensure the eval seed differs from the training seed
        seed = np.random.randint(0, 100000)
    print("Evaluation Seed: ", seed)

    env = utils.make_single_env(args)
    master = setup_master(args, env)
    obs = env.reset()  # shape: num_agents (total) x obs_dim
    all_obs = [obs]
    # if ob_rms is not None:
    #     obs_mean, obs_std = ob_rms
    # else:
    #     obs_mean = None
    #     obs_std = None

    # stats = np.zeros((num_attacker_ckpts, 4 + 2 + 2))  # gameResult, num_alive for guards/attackers, rewards for guards/attackers
    guard_load_dir = os.path.join('./marlsave/save_new', args.guard_load_dir)
    guard_ckpt_files = [f for f in os.listdir(guard_load_dir)
                        if os.path.isfile(os.path.join(guard_load_dir, f)) and f.endswith('.pt')]
    # checkpoint files are named 'ep<iteration>.pt'; sort them by iteration number
    guard_ckpt_names = np.sort([int(f[2:-3]) for f in guard_ckpt_files])
    guard_ckpt_files = ['ep' + str(name) + '.pt' for name in guard_ckpt_names]
    num_attacker_ckpts = len(args.attacker_ckpts)
    num_episodes = args.num_eval_episodes
    # column 0: guard checkpoint id; columns 1..10: per-agent episode rewards
    data = np.zeros((len(guard_ckpt_files), num_attacker_ckpts * num_episodes, 11))

    # start simulations
    start = datetime.datetime.now()
    for k, ckpt in enumerate(guard_ckpt_files):
        checkpoint = torch.load(os.path.join(guard_load_dir, ckpt),
                                map_location=lambda storage, loc: storage)
        policies_list = checkpoint['models']
        ob_rms = checkpoint['ob_rms']
        master.load_models(policies_list)  ## we are done setting up the guards
        master.set_eval_mode()
        n = len(master.all_agents)
        n_guards = master.env.world.numGuards
        n_attackers = master.env.world.numAliveAttackers

        for i, attacker_ckpt in enumerate(args.attacker_ckpts):  # iterate through attacker strategies
            print('Playing against attacker strategy {}, ckpt {}'.format(i, attacker_ckpt))
            master.select_attacker(attacker_ckpt)
            # data = np.zeros((args.num_eval_episodes, 4 + 2 + 2))  # gameResult, num_alive for guards/attackers, rewards for guards/attackers
            for j in range(num_episodes):  # test runs for each strategy
                episode_rewards = torch.zeros([args.num_processes, n], device=args.device)
                master.initialize_obs(obs)
                step = 0
                done = False
                while not done:  # run until the episode is over
                    masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
                    if not args.no_cuda:
                        masks = masks.cuda()
                    with torch.no_grad():
                        actions_list, attn_list = master.act(step, masks)  ## IMPORTANT

                    agent_actions = np.array(actions_list).reshape(-1)
                    obs, reward, done, info = env.step(agent_actions)
                    all_obs.append(obs)
                    if args.render:
                        env.render(attn_list)  # attn_list = [[teammates_attn, opp_attn] for each team]
                        time.sleep(0.06)

                    reward = torch.from_numpy(np.stack(reward)).float().to(args.device)
                    master.update_rollout(obs, reward, masks)  ## adds the transition to the rollout storage
                    episode_rewards += reward * masks
                    step += 1

                    if done:
                        # print(master.env.world.gameResult)
                        # data[j, :2] = master.env.world.gameResult[:2]
                        # data[j, 2] = data[j, 0] + data[j, 1]
                        # data[j, 3] = master.env.world.gameResult[2]
                        # data[j, 4] = master.env.world.numAliveGuards
                        # data[j, 5] = master.env.world.numAliveAttackers
                        # data[j, 6] = np.average(episode_rewards[0, 0:n_guards])
                        # data[j, 7] = np.average(episode_rewards[0, n_guards:])
                        data[k, i * num_episodes + j, 0] = guard_ckpt_names[k]
                        data[k, i * num_episodes + j, 1:] = episode_rewards.cpu().numpy()
                        obs = env.reset()
                        masks = torch.FloatTensor(obs[:, 0])  ##* check the values of masks, agent alive or dead
                        if not args.no_cuda:
                            masks = masks.cuda()
                        if args.render:
                            time.sleep(2)
                        if args.out_file is not None:
                            all_obs = np.array(all_obs)
                            print('all_obs', all_obs.shape)
                            print('path', args.out_file)
                            np.save(args.out_file, all_obs)
                            break

                print('episode_rewards')
                print(episode_rewards)
                # time.sleep(2)

            # stats[i] = data.mean(axis=0)

    # stats = stats.round(2)
    # print('stats')
    # print(stats)
    # np.savetxt("marlsave/stats/stats_ensemble_strategies.csv", stats, delimiter=",")
    np.save(os.path.join(guard_load_dir, 'reward_data_ensemble'), data)
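
# A minimal sketch of how the reward_data_ensemble array saved above could be summarized
# per guard checkpoint. The column layout (column 0 = checkpoint id, columns 1: = per-agent
# episode rewards) mirrors how `data` is filled in; the num_guards default and the
# guard/attacker column split are assumptions for illustration.
def summarize_ensemble_rewards(guard_load_dir, num_guards=5):
    data = np.load(os.path.join(guard_load_dir, 'reward_data_ensemble.npy'))
    ckpt_ids = data[:, 0, 0]                              # same checkpoint id repeated per episode
    guard_rewards = data[:, :, 1:1 + num_guards]          # assumed: guards occupy the first reward columns
    mean_guard_reward = guard_rewards.mean(axis=(1, 2))   # average over episodes and guard agents
    for ckpt_id, mean_r in zip(ckpt_ids, mean_guard_reward):
        print('ep{}: mean guard reward {:.2f}'.format(int(ckpt_id), mean_r))
    return ckpt_ids, mean_guard_reward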