def evaluate_policy(actor_critic, envs, args, eval_log_dir, device):
    """Run 10 deterministic evaluation episodes and return their rewards.

    Builds a fresh evaluation vec-env, copies the observation-normalization
    statistics from the training envs, rolls the (possibly recurrent) policy
    deterministically until 10 episodes finish, prints the mean reward and
    returns the list of episode rewards.
    """
    eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes,
                              args.num_processes, args.gamma, eval_log_dir,
                              args.add_timestep, device, True)

    vec_norm = get_vec_normalize(eval_envs)
    if vec_norm is not None:
        # Freeze running statistics and reuse the ones learned in training.
        vec_norm.eval()
        vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

    eval_episode_rewards = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        args.num_processes, actor_critic.recurrent_hidden_state_size,
        device=device)
    eval_masks = torch.zeros(args.num_processes, 1, device=device)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs, eval_recurrent_hidden_states, eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, reward, done, infos = eval_envs.step(action)

        # BUG FIX: build the masks on the evaluation device. The original
        # used torch.FloatTensor(...), which always allocates on CPU and
        # breaks when `device` is a CUDA device.
        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32, device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    print(" Evaluation using {} episodes: mean reward {:.5f}\n".
          format(len(eval_episode_rewards), np.mean(eval_episode_rewards)))

    eval_envs.close()
    return eval_episode_rewards
def play_game(self, level):
    """Play deterministic evaluation episodes with the current policy.

    NOTE(review): `env_name`, `eval_log_dir`, `ob_rms`, `device` and
    `actor_critic` are resolved from the enclosing scope, not from `self` —
    confirm they are defined where this method is used. `level` is currently
    unused.
    """
    eval_envs = make_vec_envs(env_name, self.seed + self.num_processes,
                              self.num_processes, None, eval_log_dir,
                              device, True)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        self.num_processes,
        self.actor_critic.recurrent_hidden_state_size).to(self.device)
    eval_masks = torch.zeros(self.num_processes, 1).to(self.device)

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs, eval_recurrent_hidden_states, eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, _, done, infos = eval_envs.step(action)
        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32).to(device)

        # BUG FIX: the original never appended to eval_episode_rewards, so
        # the while-loop could never terminate. Collect finished episodes
        # here, exactly like the sibling evaluate() helpers do.
        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

        # BUG FIX: `done` is a per-process array; `if (done)` is ambiguous
        # (or always true for a non-empty list). Test it element-wise.
        if any(done):
            print("Done!")

    eval_envs.close()
def main(repeat_num,
         policy_path=("/Users/djrg/code/instincts/modular_rl_safety_gym/"
                      "trained_models/pulled_from_server/"
                      "double_rl_experiments/policy_plus_instinct/"
                      "ba00287951_0_dense_seesaw_phase/model_rl_policy.pt"),
         instinct_path=("/Users/djrg/code/instincts/modular_rl_safety_gym/"
                        "trained_models/pulled_from_server/"
                        "double_rl_experiments/policy_plus_instinct/"
                        "ba00287951_0_dense_seesaw_phase/"
                        "model_rl_instinct.pt")):
    """Visualise `repeat_num` rollouts of a saved policy + instinct pair.

    GENERALIZATION: the two checkpoint locations were hard-coded machine-
    specific absolute paths; they are now defaulted parameters (defaults
    unchanged), so other checkpoints can be evaluated without editing code.
    """
    args = get_args()
    print("start the train function")
    args.init_sigma = 0.6
    args.lr = 0.001
    device = torch.device("cpu")
    # plot_weight_histogram(parameters)
    actor_critic_policy = torch.load(policy_path)
    actor_critic_instinct = torch.load(instinct_path)

    # Init the environment
    env_name = "Safexp-PointGoal1-v0"
    eval_envs = make_vec_envs(env_name, np.random.randint(2 ** 32), 1,
                              args.gamma, None, device,
                              allow_early_resets=True,
                              normalize=args.norm_vectors)
    ob_rms = utils.get_vec_normalize(eval_envs)
    if ob_rms is not None:
        ob_rms = ob_rms.ob_rms

    for _ in range(repeat_num):
        fits, info = evaluate(
            EvalActorCritic(actor_critic_policy, actor_critic_instinct),
            ob_rms, eval_envs, 1, device, instinct_on=True, visualise=True)
        print(f"fitness = {fits.item()}, cost = {info['cost']}")
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes,
             eval_log_dir, device, custom_gym, gif=False):
    """Run 10 deterministic eval episodes, optionally dumping a replay gif.

    Frames are taken from the last 3 channels of process 0's observation
    (assumes an image observation with frame-stacking — TODO confirm).
    When `gif` is set, the replay is written with array2gif and logged to
    the wandb run held in `config.tensorboard`.

    FIX: now returns the mean episode reward, consistent with the other
    evaluate() helpers in this file (previously returned None; callers that
    ignored the return value are unaffected).
    """
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              None, eval_log_dir, device, True, custom_gym)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        # Freeze normalization and adopt the training statistics.
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size,
        device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    images = []
    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            images.append(obs[0, -3:, :].squeeze().cpu().numpy())
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs, eval_recurrent_hidden_states, eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, _, done, infos = eval_envs.step(action)
        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32, device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    # Capture the final frame as well.
    images.append(obs[0, -3:, :].squeeze().cpu().numpy())
    eval_envs.close()

    if gif:
        array2gif.write_gif(images, 'replay.gif', fps=4)
        config.tensorboard.run.log(
            {"video": wandb.Video('replay.gif', fps=4, format="gif")},
            commit=True)
        config.tensorboard.run.history._flush()

    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards)))
    return np.mean(eval_episode_rewards)
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes,
             eval_log_dir, device, action_sample=1, nr_episodes=10,
             deterministic=True):
    """Run `nr_episodes` evaluation episodes and return the mean reward.

    When `action_sample` is an int the action tensor is squeezed before
    stepping (single-sample case); otherwise the action is passed through
    unchanged.
    """
    eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
                              None, eval_log_dir, device, True)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        # Freeze normalization and adopt the training statistics.
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size,
        device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    while len(eval_episode_rewards) < nr_episodes:
        with torch.no_grad():
            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                obs, eval_recurrent_hidden_states, eval_masks,
                deterministic=deterministic)

        # IDIOM FIX: isinstance() instead of `type(...) is int`.
        if isinstance(action_sample, int):
            obs, reward, done, infos = eval_envs.step(action.squeeze())
        else:
            obs, reward, done, infos = eval_envs.step(action)

        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32, device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()

    print("Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards)))
    return np.mean(eval_episode_rewards)
def save_model(args, policy, envs, iteration, sub_dir='ckpt'):
    """Checkpoint `policy` (moved to CPU if needed) plus the envs' ob_rms.

    Writes to `<args.log_dir>/<sub_dir>/iteration_<iteration>.pt`.
    """
    ckpt_dir = os.path.join(args.log_dir, sub_dir)
    os.makedirs(ckpt_dir, exist_ok=True)

    if 'cuda' in args.device:
        # apparently a really ugly way to save to CPU
        policy = copy.deepcopy(policy).cpu()

    payload = [
        policy,
        getattr(get_vec_normalize(envs.envs), 'ob_rms', None),
    ]
    torch.save(payload,
               os.path.join(ckpt_dir, 'iteration_{}.pt'.format(iteration)))
def evaluate(actor_critic, ob_rms, args, device, logger, step):
    """Run 10 deterministic eval episodes; print and log reward statistics."""
    eval_envs = make_vec_envs(args, device, True)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    rewards = []
    obs = eval_envs.reset()
    hidden = torch.zeros(args.num_processes,
                         actor_critic.recurrent_hidden_state_size,
                         device=device)
    masks = torch.zeros(args.num_processes, 1, device=device)

    while len(rewards) < 10:
        with torch.no_grad():
            _, action, _, hidden = actor_critic.act(
                obs, hidden, masks, deterministic=True)

        # Step the envs and rebuild the done-masks on the right device.
        obs, _, done, infos = eval_envs.step(action)
        masks = torch.tensor([[0.0] if d else [1.0] for d in done],
                             dtype=torch.float32, device=device)

        rewards.extend(info['episode']['r']
                       for info in infos if 'episode' in info.keys())

    eval_envs.close()

    print(
        " Evaluation using {} episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
        .format(len(rewards), np.mean(rewards), np.median(rewards),
                np.min(rewards), np.max(rewards)))

    # Same four summaries as before, in the same order.
    for stat_name, stat_fn in (('mean', np.mean), ('median', np.median),
                               ('min', np.min), ('max', np.max)):
        logger.scalar_summary('eval/{}_reward'.format(stat_name),
                              stat_fn(rewards), step)
def evaluate_first_ep(actor_critic, ob_rms, num_processes, device,
                      eval_envs=None, eps=0.0):
    """Collect the first finished-episode reward of every eval process.

    Does NOT reset the envs: the rollout starts with one random action per
    process, then follows the (stochastic, eps-greedy) policy until each
    process has finished at least one episode. Returns
    ``{"eval_reward": [reward_of_process_0_first_episode, ...]}``.
    """
    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    first_rewards = dict({})

    # Do not reset - take a step with one random discrete action per process.
    random_actions = torch.tensor(
        [np.random.randint(eval_envs.action_space.n)
         for _ in range(num_processes)]).unsqueeze(1)
    obs, _, done, infos = eval_envs.step(random_actions)

    hidden = torch.zeros(num_processes,
                         actor_critic.recurrent_hidden_state_size,
                         device=device)
    masks = torch.zeros(num_processes, 1, device=device)

    while len(first_rewards) < num_processes:
        with torch.no_grad():
            _, action, _, hidden = actor_critic.act(
                obs, hidden, masks, deterministic=False, eps=eps)

        obs, _, done, infos = eval_envs.step(action)
        masks = torch.tensor([[0.0] if d else [1.0] for d in done],
                             dtype=torch.float32, device=device)

        for idx, info in enumerate(infos):
            # Keep only the FIRST finished episode of each process.
            if 'episode' in info.keys() and idx not in first_rewards:
                first_rewards[idx] = info['episode']['r']

    eval_envs.close()

    return {"eval_reward": list(first_rewards.values())}
def __init__(self, env_name, policy, true_scales, renders, processes=1,
             seeds=True, **kwargs):
    """Load a saved (actor_critic, ob_rms) checkpoint and build its vec env.

    `policy` is the directory containing `<env_name>.pt`; `renders` and any
    extra kwargs are forwarded to the env factory.
    """
    self.env_name = env_name
    self.policy = policy
    self.true_scales = true_scales
    self.processes = processes

    # Env-factory kwargs: rendering flag first, caller kwargs may override.
    self.extra_dict = {'render': renders, **kwargs}

    self.seed = 1
    torch.manual_seed(self.seed)
    if seeds:
        np.random.seed(self.seed)
        random.seed(self.seed)

    # CUDA is hard-disabled here; everything runs on CPU.
    is_cuda = False
    self.device = "cuda" if is_cuda else "cpu"

    # Checkpoint is stored as a (actor_critic, ob_rms) pair.
    path = os.path.join(self.policy, self.env_name + ".pt")
    if is_cuda:
        self.actor_critic, self.ob_rms = torch.load(path)
    else:
        self.actor_critic, self.ob_rms = torch.load(path,
                                                    map_location="cpu")

    self.recurrent_hidden_states = torch.zeros(
        1, self.actor_critic.recurrent_hidden_state_size)
    self.masks = torch.zeros(1, self.processes)

    self.extra_dict['scales'] = self.true_scales
    self.env = make_vec_envs(self.env_name, self.seed, self.processes, None,
                             None, device=self.device,
                             allow_early_resets=False, **self.extra_dict)

    vec_norm = get_vec_normalize(self.env)
    if vec_norm is not None:
        # Freeze normalization and install the checkpointed statistics.
        vec_norm.eval()
        vec_norm.ob_rms = self.ob_rms
def evaluate(actor_critic, ob_rms, env_name, seed, num_processes,
             eval_log_dir, device, eval_envs=None, clip_action=False,
             a_low=-1, a_high=1):
    """Run 10 deterministic episodes and return the list of their rewards.

    NOTE(review): `num_processes` and `seed` are unconditionally overridden
    to 1 below, so the corresponding arguments are ignored — looks like a
    debugging leftover; confirm before relying on either parameter.
    """
    num_processes = 1
    seed = 1
    if eval_envs is None:
        eval_envs = make_vec_envs(env_name, seed + num_processes,
                                  num_processes, None, eval_log_dir, device,
                                  True)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    rewards = []
    obs = eval_envs.reset()
    hidden = torch.zeros(num_processes,
                         actor_critic.recurrent_hidden_state_size,
                         device=device)
    masks = torch.zeros(num_processes, 1, device=device)

    while len(rewards) < 10:
        with torch.no_grad():
            _, action, _, hidden = actor_critic.act(
                obs, hidden, masks, deterministic=True)

        # Optionally clamp the action into [a_low, a_high] before stepping.
        step_action = (torch.clamp(action, min=a_low, max=a_high)
                       if clip_action else action)
        obs, _, done, infos = eval_envs.step(step_action)

        masks = torch.tensor([[0.0] if d else [1.0] for d in done],
                             dtype=torch.float32, device=device)
        for info in infos:
            if 'episode' in info.keys():
                rewards.append(info['episode']['r'])

    # The env may be caller-owned (eval_envs argument), so it is left open.
    return rewards
def save_model(model, envs, save_dir, model_name, use_cuda):
    """Save `model` (on CPU) together with the envs' ob_rms.

    The checkpoint is written to `<save_dir>/<model_name>.pt` as a
    `[model, ob_rms]` pair.
    """
    save_path = os.path.join(save_dir)
    # IDIOM FIX: exist_ok replaces the original try/except-OSError dance.
    os.makedirs(save_path, exist_ok=True)

    # A really ugly way to save a model to CPU
    model_to_save = model
    if use_cuda:
        model_to_save = copy.deepcopy(model).cpu()

    # FIX: the original reused the name `save_model` for this list, shadowing
    # the function itself; renamed to keep the function callable afterwards.
    payload = [model_to_save, getattr(get_vec_normalize(envs), 'ob_rms', None)]
    torch.save(payload, os.path.join(save_path, model_name + ".pt"))
def evaluate(actor_critic, obs_rms, eval_envs_dic, env_name, seed,
             num_processes, num_tasks, eval_log_dir, device, **kwargs):
    """Evaluate on `num_tasks` tasks, `num_processes` workers at a time.

    Each batch assigns one task id per worker, resets, then rolls the policy
    deterministically for kwargs["steps"] env steps, collecting the rewards
    of every episode that finishes. Returns the list of episode rewards.
    """
    eval_envs = eval_envs_dic[env_name]
    rewards = []

    for task_base in range(0, num_tasks, num_processes):
        # One task id per worker for this batch.
        for worker in range(num_processes):
            eval_envs.set_task_id(task_id=task_base + worker, indices=worker)

        vec_norm = utils.get_vec_normalize(eval_envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.obs_rms = obs_rms

        obs = eval_envs.reset()
        hidden = torch.zeros(num_processes,
                             actor_critic.recurrent_hidden_state_size,
                             device=device)
        masks = torch.zeros(num_processes, 1, device=device)

        # Fixed-length rollout instead of waiting for a reward quota.
        for _ in range(kwargs["steps"]):
            with torch.no_grad():
                _, action, _, hidden = actor_critic.act(
                    obs, hidden, masks, deterministic=True)

            obs, _, done, infos = eval_envs.step(action)
            masks = torch.tensor([[0.0] if d else [1.0] for d in done],
                                 dtype=torch.float32, device=device)

            for info in infos:
                if 'episode' in info.keys():
                    rewards.append(info['episode']['r'])

    # The shared eval env is caller-owned and intentionally left open.
    return rewards
def evaluate(actor_critic, env_name, seed, num_processes, eval_log_dir,
             device, num_evals):
    """Run `num_evals` deterministic episodes and return the mean reward.

    FIX: removed the dead `vec_norm = utils.get_vec_normalize(eval_envs)`
    local — it was assigned and never used, so no normalization statistics
    were synced here. NOTE(review): if this env uses observation
    normalization, syncing ob_rms from training may actually be required —
    confirm against the training setup.
    """
    eval_envs = mve(env_name, seed + num_processes, num_processes, None,
                    eval_log_dir, device, True, num_frame_stack=1)

    eval_episode_rewards = []

    obs = eval_envs.reset()
    eval_recurrent_hidden_states = torch.zeros(
        num_processes, actor_critic.recurrent_hidden_state_size,
        device=device)
    eval_masks = torch.zeros(num_processes, 1, device=device)

    while len(eval_episode_rewards) < num_evals:
        with torch.no_grad():
            # This actor_critic variant returns 6 values from act().
            _, action, _, eval_recurrent_hidden_states, _, _ = actor_critic.act(
                obs, eval_recurrent_hidden_states, eval_masks,
                deterministic=True)

        # Observe reward and next obs
        obs, _, done, infos = eval_envs.step(action)
        eval_masks = torch.tensor(
            [[0.0] if done_ else [1.0] for done_ in done],
            dtype=torch.float32, device=device)

        for info in infos:
            if 'episode' in info.keys():
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()

    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), np.mean(eval_episode_rewards)))
    return np.mean(eval_episode_rewards)
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") #envs = make_vec_envs(args.env_name, args.seed, args.num_processes, # args.gamma, args.log_dir, device, False) envs = make_parallel_env(args.env_name, args.num_processes, args.seed, True) ''' actor_critic = Policy( envs.observation_space[0].shape, envs.action_space[0], agent_num=args.agent_num, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) ''' actor_critic = [] for i in range(args.agent_num): ac = Policy( envs.observation_space[0].shape, envs.action_space[0], agent_num=args.agent_num, agent_i = i, base_kwargs={'recurrent': args.recurrent_policy}) ac.to(device) actor_critic.append(ac) if args.algo == 'a2c': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': ''' agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) ''' agent = [] for i in range(args.agent_num): agent.append(algo.PPO( actor_critic[i], i, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, model_dir = args.model_dir)) elif args.algo == 'acktr': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( 
envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format( args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset( file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) ''' rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space[0].shape, envs.action_space[0], actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(torch.tensor(obs[:,0,:])) rollouts.to(device) ''' rollouts = [] for i in range(args.agent_num): rollout = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space[0].shape, envs.action_space[0], actor_critic[i].recurrent_hidden_state_size, args.agent_num, i) rollouts.append(rollout) obs = envs.reset() # pdb.set_trace() for i in range(args.agent_num): rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1))) rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:])) rollouts[i].to(device) episode_rewards = deque(maxlen=10) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print(num_updates) for j in range(num_updates): #pdb.set_trace() if args.use_linear_lr_decay: # decrease learning rate linearly for i in range(args.agent_num): utils.update_linear_schedule(agent[i].optimizer, j, num_updates, agent[i].optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions value_list, action_list, action_log_prob_list, recurrent_hidden_states_list = [], [], [], [] with torch.no_grad(): for i in range(args.agent_num): #pdb.set_trace() value, action, action_log_prob, recurrent_hidden_states = actor_critic[i].act( rollouts[i].share_obs[step], rollouts[i].obs[step], rollouts[i].recurrent_hidden_states[step], 
rollouts[i].masks[step]) # import pdb; pdb.set_trace() value_list.append(value) action_list.append(action) action_log_prob_list.append(action_log_prob) recurrent_hidden_states_list.append(recurrent_hidden_states) # Obser reward and next obs action = [] for i in range(args.num_processes): one_env_action = [] for k in range(args.agent_num): one_hot_action = np.zeros(envs.action_space[0].n) one_hot_action[action_list[k][i]] = 1 one_env_action.append(one_hot_action) action.append(one_env_action) #start = time.time() #pdb.set_trace() obs, reward, done, infos = envs.step(action) # print(obs[0][0]) # pdb.set_trace() #end = time.time() #print("step time: ", end-start) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. ''' masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done[0]]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos[0]]) ''' masks = torch.ones(args.num_processes, 1) bad_masks = torch.ones(args.num_processes, 1) ''' rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) ''' #import pdb; pdb.set_trace() for i in range(args.agent_num): rollouts[i].insert(torch.tensor(obs.reshape(args.num_processes, -1)), torch.tensor(obs[:,i,:]), recurrent_hidden_states, action_list[i], action_log_prob_list[i], value_list[i], torch.tensor(reward[:, i].reshape(-1,1)), masks, bad_masks) #import pdb; pdb.set_trace() with torch.no_grad(): next_value_list = [] for i in range(args.agent_num): next_value = actor_critic[i].get_value( rollouts[i].share_obs[-1], rollouts[i].obs[-1], rollouts[i].recurrent_hidden_states[-1], rollouts[i].masks[-1]).detach() next_value_list.append(next_value) if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, 
utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) for i in range(args.agent_num): rollouts[i].compute_returns(next_value_list[i], args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) #import pdb; pdb.set_trace() for i in range(args.agent_num): value_loss, action_loss, dist_entropy = agent[i].update(rollouts[i]) if (i == 0): print("value loss: " + str(value_loss)) # print(value_loss) # pdb.set_trace() #rollouts.after_update() obs = envs.reset() # pdb.set_trace() for i in range(args.agent_num): rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1))) rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:])) rollouts[i].to(device) # save for every interval-th episode or for the last epoch #pdb.set_trace() if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) if not os.path.exists(save_path + args.model_dir): os.makedirs(save_path + args.model_dir) for i in range(args.agent_num): torch.save([ actor_critic[i], getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], save_path + args.model_dir + '/agent_%i' % (i+1) + ".pt") ''' if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) ''' '''
def main():
    # Training entry point: sets up TensorBoard/Visdom logging, builds the
    # vec envs + recurrent policy, picks the RL algorithm (a2c / ppo / acktr),
    # then runs the rollout/update loop with periodic checkpointing,
    # baseline-metric logging, and a one-episode evaluation pass.
    # NOTE(review): relies on names from the enclosing module scope
    # (`args`, `num_updates`, `eval_log_dir`, `log`, `train_vid_log_iter`,
    # `update_linear_schedule`, ...) — confirm they are defined there.
    tb_path = os.path.join(os.path.expanduser(args.log_dir),
                           "tensorboard_log")
    makedir_if_not_exists(tb_path)
    writer = SummaryWriter(tb_path)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    # p = multiprocessing.Process(target=_tb_task,args=(tb_path,5013) ,daemon=True)
    # p.start()
    if args.start_tb:
        _tb_task(tb_path, port=5013)
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)
    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)
    # Select the learning algorithm; acktr reuses the A2C class with a flag.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)
    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)
    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_eps = 0  # num training eps
    num_steps = 0  # num training eps
    for j in range(num_updates):
        # list of all values all eps in num updates
        num_steps_basline_info = defaultdict(list)
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)
        if args.algo == 'ppo' and args.use_linear_clip_decay:
            # Linearly anneal PPO's clipping range to zero over training.
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))
        env_basline_info = defaultdict(list)
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            for info in infos:
                # episode is done
                # add addisiotnal baseline rw info in infos:
                if 'basline_rw_mse' in info:
                    env_basline_info['rw_mse'].append(info['basline_rw_mse'])
                    env_basline_info['rw_rec'].append(info['basline_rw_rec'])
                if 'basline_rw_tcn' in info:
                    env_basline_info['rw_tcn'].append(info['basline_rw_tcn'])
                if 'episode' in info.keys():
                    # end of episode
                    episode_rewards.append(info['episode']['r'])
                    num_steps_basline_info['len_episode'].append(
                        info['episode']['l'])
                    # distance of the pushed block
                    num_steps_basline_info['push_distance'].append(
                        info['basline_rw_push_dist'])
                    # take mean over eps (per-step values summed per episode)
                    for k, step_vals in env_basline_info.items():
                        num_steps_basline_info[k].append(np.sum(step_vals))
                    # add baseline infos
                    num_eps += 1
                    env_basline_info = defaultdict(list)
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
        with torch.no_grad():
            # Bootstrap value for the final state of the rollout.
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()
        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        # write baseline finfos for tcn
        writer_step = total_num_steps
        for k, vals_step_eps in num_steps_basline_info.items():
            writer.add_scalar('basline/' + k, np.mean(vals_step_eps),
                              writer_step)
        writer.add_scalar('basline/episodes', num_eps, writer_step)
        len_eps = np.mean(num_steps_basline_info['len_episode'])
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            # NOTE(review): the format string has fewer placeholders than
            # arguments; the trailing args (dist_entropy, losses) are
            # silently dropped by str.format.
            log.info(
                "Updates {}, num timesteps {}, FPS {} Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, len eps {}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        int(len_eps), dist_entropy, value_loss, action_loss))
        if j == num_updates or (args.eval_interval is not None
                                and len(episode_rewards) > 1
                                and j % args.eval_interval == 0):
            # Temporarily redirect the env's video logging to an eval folder
            # via environment variables, restore afterwards.
            vid_log_dir = os.getenv('TCN_ENV_VID_LOG_FOLDER',
                                    '/tmp/env_tcn/train_vid')
            vid_log_inter = os.getenv('TCN_ENV_VID_LOG_INTERVAL',
                                      train_vid_log_iter)
            os.environ['TCN_ENV_VID_LOG_FOLDER'] = "eval_vid"
            # os.path.join(vid_log_dir,"../eval_vid/","interval_"+str(j))
            os.environ['TCN_ENV_VID_LOG_INTERVAL'] = '1'
            os.environ['TCN_ENV_EVAL_EPISODE'] = '1'
            with redirect_stdout(open(os.devnull, "w")):
                # no stdout
                with suppress_logging():
                    # eval envs
                    eval_envs = make_vec_envs(args.env_name,
                                              args.seed + args.num_processes,
                                              1, args.gamma, eval_log_dir,
                                              args.add_timestep, device,
                                              True)
                    vec_norm = get_vec_normalize(eval_envs)
                    if vec_norm is not None:
                        vec_norm.eval()
                        vec_norm.ob_rms = get_vec_normalize(envs).ob_rms
                    eval_episode_rewards = []
                    obs = eval_envs.reset()
                    eval_recurrent_hidden_states = torch.zeros(
                        args.num_processes,
                        actor_critic.recurrent_hidden_state_size,
                        device=device)
                    eval_masks = torch.zeros(args.num_processes, 1,
                                             device=device)
                    while len(eval_episode_rewards) < 1:
                        with torch.no_grad():
                            _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                                obs, eval_recurrent_hidden_states,
                                eval_masks, deterministic=True)
                        # Obser reward and next obs
                        obs, reward, done, infos = eval_envs.step(action)
                        eval_masks = torch.tensor(
                            [[0.0] if done_ else [1.0] for done_ in done],
                            dtype=torch.float32, device=device)
                        for info in infos:
                            if 'episode' in info.keys():
                                eval_episode_rewards.append(
                                    info['episode']['r'])
                    eval_envs.close()
            os.environ['TCN_ENV_VID_LOG_FOLDER'] = vid_log_dir
            os.environ['TCN_ENV_EVAL_EPISODE'] = '0'
            os.environ['TCN_ENV_VID_LOG_INTERVAL'] = vid_log_inter
            writer.add_scalar('eval/rw', np.mean(eval_episode_rewards), j)
            log.info(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards),
                    np.mean(eval_episode_rewards)))
        if j % args.vis_interval == 0:
            try:
                td_plot(writer, args.log_dir)
                # Sometimes monitor doesn't properly flush the outputs
                # win = visdom_plot(viz, win, args.log_dir, args.env_name,
                #                   args.algo, args.num_env_steps)
            except IOError:
                print("plt error")
                pass
def onpolicy_main():
    """Train an on-policy agent (A2C or PPO) with optional vision-net input.

    Reads configuration from module-level globals (``args``, ``env_kwargs``,
    ``knob_noisy``, ``pretrained_policy_load``, ``eval_log_dir``) rather than
    parameters. Logs losses/rewards to TensorBoard, periodically saves and
    evaluates the policy, and rebuilds the vector env every update when
    domain randomization is on.
    """
    print("onpolicy main")
    # Seed RNGs for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        # Trade cuDNN autotuning speed for deterministic kernels.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    summary_name = args.log_dir + '{0}_{1}'
    writer = SummaryWriter(summary_name.format(args.env_name, args.save_name))

    # Make vector env
    envs = make_vec_envs(
        args.env_name,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        env_kwargs=env_kwargs,
    )

    # Ugly way to reach attributes of the wrapped environment; the wrapper
    # nesting differs between the multi-process and single-process cases.
    if args.env_name.find('doorenv') > -1:
        if args.num_processes > 1:
            visionnet_input = envs.venv.venv.visionnet_input
            nn = envs.venv.venv.nn
            env_name = envs.venv.venv.xml_path
        else:
            visionnet_input = envs.venv.venv.envs[
                0].env.env.env.visionnet_input
            nn = envs.venv.venv.envs[0].env.env.env.nn
            env_name = envs.venv.venv.envs[0].env.env.env.xml_path
        # Observation template sized nn*2 + 3 — presumably joint pos + vel
        # plus a 3D target position; TODO confirm against the env definition.
        dummy_obs = np.zeros(nn * 2 + 3)
    else:
        dummy_obs = envs.observation_space
        visionnet_input = None
        nn = None

    if pretrained_policy_load:
        print("loading", pretrained_policy_load)
        actor_critic, ob_rms = torch.load(pretrained_policy_load)
    else:
        actor_critic = Policy(dummy_obs.shape,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})

    if visionnet_input:
        # Attach a vision model (kept in eval mode) that converts raw images
        # into the policy's low-dimensional inputs.
        visionmodel = load_visionmodel(env_name, args.visionmodel_path,
                                       VisionModelXYZ())
        actor_critic.visionmodel = visionmodel.eval()
    actor_critic.nn = nn
    actor_critic.to(device)

    # Disable the observation normalizer (keep its statistics frozen).
    vec_norm = get_vec_normalize(envs)
    vec_norm.eval()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              dummy_obs.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    full_obs = envs.reset()
    # First action_dim entries of the observation are treated as the current
    # joint state (used by the optional position-control mode below).
    initial_state = full_obs[:, :envs.action_space.shape[0]]

    if args.env_name.find('doorenv') > -1 and visionnet_input:
        obs = actor_critic.obs2inputs(full_obs, 0)
    else:
        if knob_noisy:
            obs = add_noise(full_obs, 0)
        else:
            obs = full_obs

    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        pos_control = False
        total_switches = 0
        prev_selection = ""
        for step in range(args.num_steps):
            # Sample an action from the current policy (no gradients needed
            # during rollout collection).
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            next_action = action
            if pos_control:
                # Position-control mode: interpret the action as a delta on
                # the current joint state and repeat it frame_skip times.
                frame_skip = 2
                if step % (512 / frame_skip - 1) == 0:
                    current_state = initial_state
                next_action = current_state + next_action
                for kk in range(frame_skip):
                    full_obs, reward, done, infos = envs.step(next_action)
                current_state = full_obs[:, :envs.action_space.shape[0]]
            else:
                full_obs, reward, done, infos = envs.step(next_action)

            # Convert image to obs if door env is using the vision net.
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                if knob_noisy:
                    obs = add_noise(full_obs, j)
                else:
                    obs = full_obs

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # Zero mask on episode end; bad_masks flags time-limit endings so
            # the return computation can treat them differently.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        writer.add_scalar("Value loss", value_loss, j)
        writer.add_scalar("action loss", action_loss, j)
        writer.add_scalar("dist entropy loss", dist_entropy, j)
        writer.add_scalar("Episode rewards", np.mean(episode_rewards), j)

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # Save policy together with the normalizer statistics needed to
            # reproduce observations at load time.
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(
                save_path,
                args.env_name + "_{}.{}.pt".format(args.save_name, j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE(review): the format string has 8 placeholders but 11
            # arguments; str.format silently ignores the trailing losses.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        DR = True  # Domain Randomization
        ################## for multiprocess world change ######################
        if DR:
            # Tear down and rebuild the vector envs so every update sees a
            # freshly randomized world.
            print("changing world")
            envs.close_extras()
            envs.close()
            del envs
            envs = make_vec_envs(
                args.env_name,
                args.seed,
                args.num_processes,
                args.gamma,
                args.log_dir,
                device,
                False,
                env_kwargs=env_kwargs,
            )
            full_obs = envs.reset()
            if args.env_name.find('doorenv') > -1 and visionnet_input:
                obs = actor_critic.obs2inputs(full_obs, j)
            else:
                obs = full_obs
def train_ppo_from_scratch(args):
    """Train a small feed-forward (hidden_size=32) PPO agent from scratch.

    Args:
        args: parsed experiment arguments (env name, seed, PPO
            hyper-parameters, logging/saving/eval intervals, ...).

    Returns:
        Tuple ``(episode_reward_means, episode_reward_times)``: the mean of
        the last (up to) 10 episode rewards recorded at each logging step,
        and the corresponding cumulative environment-step counts.
    """
    # Seed RNGs for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(2)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, True)

    actor_critic = Policy(  # 2-layer fully connected network
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={
            'recurrent': False,
            'hidden_size': 32
        })
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Sliding window over the most recent completed episodes.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    episode_reward_means = []
    episode_reward_times = []
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # Save model together with normalizer statistics.
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE(review): 8 placeholders, 11 format arguments — the loss
            # values at the end are silently dropped by str.format.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            episode_reward_means.append(np.mean(episode_rewards))
            episode_reward_times.append(total_num_steps)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    print(episode_reward_means, episode_reward_times)
    return episode_reward_means, episode_reward_times
def main():
    """A2C / PPO / ACKTR training entry point with optional GAIL imitation.

    Builds vectorized envs, trains the selected algorithm, and — when
    ``args.gail`` is set — replaces environment rewards with a learned
    discriminator reward trained on expert trajectories. Saves, logs, and
    evaluates periodically.
    """
    args = get_args()

    # Seed RNGs for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # "lab_" env names are translated into a registered gym id first.
    if args.env_name.startswith("lab_"):
        gym_name, flow_json = make_lab_env(args.env_name)
        args.env_name = gym_name

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            alpha=args.alpha,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(
            actor_critic,
            args.clip_param,
            args.ppo_epoch,
            args.num_mini_batch,
            args.value_loss_coef,
            args.entropy_coef,
            lr=args.lr,
            eps=args.eps,
            max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True)

    if args.gail:
        # GAIL here only supports flat (1-D) observation vectors.
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir, "trajs_{}.pt".format(
                args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(
            file_name, num_trajectories=4, subsample_frequency=20)
        # Only drop the last (partial) batch when there is more than one.
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly; ACKTR's lr lives on the
            # optimizer itself.
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                # Freeze observation-normalizer statistics after warm-up.
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            # Replace environment rewards with discriminator rewards.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # Save model together with normalizer statistics.
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE(review): 8 placeholders vs 11 arguments — trailing loss
            # values are ignored by str.format.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main():
    """Train an A2C/PPO/ACKTR agent whose encoder uses self-supervised
    keypoint attention, driven by the global ``config`` object.

    Supports loading pretrained weights (``config.MODEL_FILE``), resuming
    from a checkpoint (``config.RESUME``), periodic evaluation with
    best-model tracking, and TensorBoard logging.
    """
    args = get_args()

    # Seed RNGs for reproducibility.
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    if config.cuda and torch.cuda.is_available() and config.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    logger, final_output_dir, tb_log_dir = create_logger(config,
                                                         args.cfg,
                                                         'train',
                                                         seed=config.seed)
    eval_log_dir = final_output_dir + "_eval"
    utils.cleanup_log_dir(final_output_dir)
    utils.cleanup_log_dir(eval_log_dir)

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    writer = SummaryWriter(tb_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:" + config.GPUS if config.cuda else "cpu")

    # Frames are resized to 84x84 (standard Atari preprocessing size).
    width = height = 84
    envs = make_vec_envs(config.env_name,
                         config.seed,
                         config.num_processes,
                         config.gamma,
                         final_output_dir,
                         device,
                         False,
                         width=width,
                         height=height,
                         ram_wrapper=False)

    # create agent
    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base_kwargs={
            'recurrent': config.recurrent_policy,
            'hidden_size': config.hidden_size,
            'feat_from_selfsup_attention': config.feat_from_selfsup_attention,
            'feat_add_selfsup_attention': config.feat_add_selfsup_attention,
            'feat_mul_selfsup_attention_mask':
            config.feat_mul_selfsup_attention_mask,
            'selfsup_attention_num_keypoints':
            config.SELFSUP_ATTENTION.NUM_KEYPOINTS,
            'selfsup_attention_gauss_std': config.SELFSUP_ATTENTION.GAUSS_STD,
            'selfsup_attention_fix': config.selfsup_attention_fix,
            'selfsup_attention_fix_keypointer':
            config.selfsup_attention_fix_keypointer,
            'selfsup_attention_pretrain': config.selfsup_attention_pretrain,
            'selfsup_attention_keyp_maps_pool':
            config.selfsup_attention_keyp_maps_pool,
            'selfsup_attention_image_feat_only':
            config.selfsup_attention_image_feat_only,
            'selfsup_attention_feat_masked':
            config.selfsup_attention_feat_masked,
            'selfsup_attention_feat_masked_residual':
            config.selfsup_attention_feat_masked_residual,
            'selfsup_attention_feat_load_pretrained':
            config.selfsup_attention_feat_load_pretrained,
            'use_layer_norm': config.use_layer_norm,
            'selfsup_attention_keyp_cls_agnostic':
            config.SELFSUP_ATTENTION.KEYPOINTER_CLS_AGNOSTIC,
            'selfsup_attention_feat_use_ln':
            config.SELFSUP_ATTENTION.USE_LAYER_NORM,
            'selfsup_attention_use_instance_norm':
            config.SELFSUP_ATTENTION.USE_INSTANCE_NORM,
            'feat_mul_selfsup_attention_mask_residual':
            config.feat_mul_selfsup_attention_mask_residual,
            'bottom_up_form_objects': config.bottom_up_form_objects,
            'bottom_up_form_num_of_objects':
            config.bottom_up_form_num_of_objects,
            'gaussian_std': config.gaussian_std,
            'train_selfsup_attention': config.train_selfsup_attention,
            'block_selfsup_attention_grad':
            config.block_selfsup_attention_grad,
            'sep_bg_fg_feat': config.sep_bg_fg_feat,
            'mask_threshold': config.mask_threshold,
            'fix_feature': config.fix_feature
        })

    # init / load parameter
    if config.MODEL_FILE:
        logger.info('=> loading model from {}'.format(config.MODEL_FILE))
        state_dict = torch.load(config.MODEL_FILE)
        # Drop the action-distribution head so it is re-initialized for the
        # current action space.
        state_dict = OrderedDict(
            (_k, _v) for _k, _v in state_dict.items() if 'dist' not in _k)
        actor_critic.load_state_dict(state_dict, strict=False)
    elif config.RESUME:
        checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')
        if os.path.exists(checkpoint_file):
            logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
            checkpoint = torch.load(checkpoint_file)
            actor_critic.load_state_dict(checkpoint['state_dict'])
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_file, checkpoint['epoch']))

    actor_critic.to(device)

    if config.algo == 'a2c':
        agent = algo.A2C_ACKTR(
            actor_critic,
            config.value_loss_coef,
            config.entropy_coef,
            lr=config.lr,
            eps=config.eps,
            alpha=config.alpha,
            max_grad_norm=config.max_grad_norm,
            train_selfsup_attention=config.train_selfsup_attention)
    elif config.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         config.clip_param,
                         config.ppo_epoch,
                         config.num_mini_batch,
                         config.value_loss_coef,
                         config.entropy_coef,
                         lr=config.lr,
                         eps=config.eps,
                         max_grad_norm=config.max_grad_norm)
    elif config.algo == 'acktr':
        agent = algo.A2C_ACKTR(
            actor_critic,
            config.value_loss_coef,
            config.entropy_coef,
            acktr=True,
            train_selfsup_attention=config.train_selfsup_attention,
            max_grad_norm=config.max_grad_norm)

    # rollouts: environment
    rollouts = RolloutStorage(
        config.num_steps,
        config.num_processes,
        envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size,
        keep_buffer=config.train_selfsup_attention,
        buffer_size=config.train_selfsup_attention_buffer_size)

    if config.RESUME:
        # NOTE(review): checkpoint_file/checkpoint are only bound when the
        # RESUME branch above ran (i.e. MODEL_FILE was falsy) — if both
        # MODEL_FILE and RESUME are set this raises NameError; confirm.
        if os.path.exists(checkpoint_file):
            agent.optimizer.load_state_dict(checkpoint['optimizer'])

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        config.num_env_steps) // config.num_steps // config.num_processes

    best_perf = 0.0
    best_model = False

    print('num updates', num_updates, 'num steps', config.num_steps)

    for j in range(num_updates):

        if config.use_linear_lr_decay:
            # decrease learning rate linearly; ACKTR keeps its lr on the
            # optimizer object.
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if config.algo == "acktr" else config.lr)

        for step in range(config.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                # act() returns the hidden state bundled with keypoint
                # metadata — unpack and keep only the hidden state here.
                recurrent_hidden_states, meta = recurrent_hidden_states

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            objects_locs = []
            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            if objects_locs:
                objects_locs = torch.FloatTensor(objects_locs)
                # Rescale locations from [0, 1] to [-1, 1].
                objects_locs = objects_locs * 2 - 1
            else:
                objects_locs = None
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks,
                            objects_loc=objects_locs)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1],
            ).detach()

        rollouts.compute_returns(next_value, config.use_gae, config.gamma,
                                 config.gae_lambda,
                                 config.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # Auxiliary self-supervised attention training on image pairs drawn
        # from the rollout buffer (only after a 15-update warm-up).
        if config.train_selfsup_attention and j > 15:
            for _iter in range(config.num_steps // 5):
                frame_x, frame_y = rollouts.generate_pair_image()
                selfsup_attention_loss, selfsup_attention_output, image_b_keypoints_maps = \
                    agent.update_selfsup_attention(frame_x, frame_y, config.SELFSUP_ATTENTION)

        if j % config.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * config.num_processes * config.num_steps
            end = time.time()
            msg = 'Updates {}, num timesteps {}, FPS {} \n' \
                  'Last {} training episodes: mean/median reward {:.1f}/{:.1f} ' \
                  'min/max reward {:.1f}/{:.1f} ' \
                  'dist entropy {:.1f}, value loss {:.1f}, action loss {:.1f}\n'. \
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards), np.mean(episode_rewards),
                       np.median(episode_rewards), np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy, value_loss,
                       action_loss)
            if config.train_selfsup_attention and j > 15:
                msg = msg + 'selfsup attention loss {:.5f}\n'.format(
                    selfsup_attention_loss)
            logger.info(msg)

        if (config.eval_interval is not None and len(episode_rewards) > 1
                and j % config.eval_interval == 0):
            total_num_steps = (j + 1) * config.num_processes * config.num_steps
            ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            eval_mean_score, eval_max_score, eval_scores = evaluate(
                actor_critic, ob_rms, config.env_name, config.seed,
                config.num_processes, eval_log_dir, device,
                width=width, height=height)
            perf_indicator = eval_mean_score

            # Track whether this evaluation produced a new best model.
            if perf_indicator > best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            # record test scores
            with open(os.path.join(final_output_dir, 'test_scores'),
                      'a+') as f:
                out_s = "TEST: {}, {}, {}, {}\n".format(
                    str(total_num_steps), str(eval_mean_score),
                    str(eval_max_score),
                    [str(_eval_scores) for _eval_scores in eval_scores])
                print(out_s, end="", file=f)
                logger.info(out_s)
            writer.add_scalar('data/mean_score', eval_mean_score,
                              total_num_steps)
            writer.add_scalar('data/max_score', eval_max_score,
                              total_num_steps)
            writer.add_scalars('test', {'mean_score': eval_mean_score},
                               total_num_steps)

        # save for every interval-th episode or for the last epoch
        if (j % config.save_interval == 0
                or j == num_updates - 1) and config.save_dir != "":
            logger.info("=> saving checkpoint to {}".format(final_output_dir))
            epoch = j / config.save_interval
            # NOTE(review): perf_indicator is only assigned inside the eval
            # branch above; if a save triggers before the first evaluation
            # this raises NameError — confirm intended update schedule.
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': get_model_name(config),
                    'state_dict': actor_critic.state_dict(),
                    'perf': perf_indicator,
                    'optimizer': agent.optimizer.state_dict(),
                    'ob_rms': getattr(utils.get_vec_normalize(envs), 'ob_rms',
                                      None)
                }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(actor_critic.state_dict(), final_model_state_file)

    # export_scalars_to_json needs results from add scalars
    writer.export_scalars_to_json(os.path.join(tb_log_dir, 'all_scalars.json'))
    writer.close()
def main(repeat_num):
    """Load a pretrained policy/instinct pair and evaluate it repeatedly.

    Despite the startup message, this does no training: it restores models
    and normalizer statistics from hard-coded paths, then runs ``evaluate``
    ``repeat_num`` times with visualisation on, printing fitness and hazard
    collision counts for each run.

    Args:
        repeat_num: number of evaluation runs to perform.
    """
    args = get_args()
    print("start the train function")
    # Override CLI values with the settings the models were trained with.
    args.init_sigma = 0.6
    args.lr = 0.001

    device = torch.device("cpu")

    # Init the environment
    # env_name = "Safexp-PointGoal1-v0"
    eval_envs = make_vec_envs(env_name,
                              np.random.randint(2**32),
                              1,
                              args.gamma,
                              None,
                              device,
                              allow_early_resets=True,
                              normalize=args.norm_vectors)
    obs_shape = eval_envs.observation_space.shape
    actor_critic_policy = init_default_ppo(eval_envs, log(args.init_sigma))

    # Prepare modified action space for instinct: the instinct network sees
    # the observation concatenated with the policy action, and outputs one
    # extra (control) dimension.
    inst_action_space = deepcopy(eval_envs.action_space)
    inst_obs_shape = list(obs_shape)
    inst_obs_shape[0] = inst_obs_shape[0] + eval_envs.action_space.shape[0]
    inst_action_space.shape = list(inst_action_space.shape)
    inst_action_space.shape[0] = inst_action_space.shape[0] + 1
    inst_action_space.shape = tuple(inst_action_space.shape)
    actor_critic_instinct = Policy(tuple(inst_obs_shape),
                                   inst_action_space,
                                   init_log_std=log(args.init_sigma),
                                   base_kwargs={'recurrent': False})
    title = "baseline_pretrained_hh_10"
    # f = open(f"/Users/djgr/pulled_from_server/evaluate_instinct_all_inputs_task_switch_button/real_safety_tasks_easier/sweep_eval_hazard_param_BUTTON_more_space/{title}.csv", "w")

    # Overwrite the freshly built networks with pretrained checkpoints
    # (hard-coded absolute paths; commented lines are alternative runs).
    actor_critic_policy = torch.load(
        # f"/Users/djgr/pulled_from_server/evaluate_instinct_all_inputs_task_switch_button/real_safety_tasks_easier/sweep_eval_hazard_param_BOX_more_space_more_time/hh_10_baseline_centered_noHaz/model_rl_policy_latest.pt"
        "/home/calavera/pulled_from_server/evaluate_instinct_all_inputs_task_switch_button/real_safety_tasks_easier/sweep_eval_hazard_param_BOX_more_space/hh_10/model_rl_policy_latest.pt"
        # "/home/calavera/code/ITU_work/IR2L_master/pretrained_policy.pt"
    )
    actor_critic_instinct = torch.load(
        f"/home/calavera/pulled_from_server/evaluate_instinct_all_inputs_task_switch_button/real_safety_tasks_easier/sweep_eval_hazard_param_BOX_more_space/hh_10/model_rl_instinct_latest.pt"
    )
    # NOTE(review): ob_rms extracted from the env here is immediately
    # overwritten by the pickled statistics below — confirm intended.
    ob_rms = utils.get_vec_normalize(eval_envs)
    if ob_rms is not None:
        ob_rms = ob_rms.ob_rms
    ob_rms = pickle.load(
        open(
            f"/home/calavera/pulled_from_server/evaluate_instinct_all_inputs_task_switch_button/real_safety_tasks_easier/sweep_eval_hazard_param_BOX_more_space/hh_10/ob_rms.p",
            "rb"))

    for _ in range(repeat_num):
        fits, info = evaluate(
            # EvalActorCritic(actor_critic_policy, actor_critic_instinct, det_policy=True, det_instinct=True),
            EvalActorCritic(actor_critic_policy, actor_critic_instinct),
            ob_rms,
            eval_envs,
            1,
            reward_cost_combinator,
            device,
            instinct_on=True,
            visualise=True)
        visualise_values_over_path(info['plot_info'])
        # f.write(f"fitness; {fits.item()}; hazard_collisions; {info['hazard_collisions']}\n")
        # f.flush()
        print(f"{info['hazard_collisions']}")
        print(
            f"fitness; {fits.item()}; hazard_collisions; {info['hazard_collisions']}\n"
        )
def instinct_loop_ppo(
        args,
        learning_rate,
        num_steps,
        num_updates,
        inst_on,
        visualize,
        save_dir
):
    """Train the task policy with PPO while a frozen pretrained instinct
    network modulates its actions.

    Each update collects ``num_steps`` steps from ``NUM_PROC`` parallel
    envs, combines policy and instinct actions via
    ``policy_instinct_combinator``, updates ONLY the policy (asserted), then
    evaluates and checkpoints.

    Args:
        args: experiment arguments (PPO hyper-parameters, gamma, etc.).
        learning_rate: PPO learning rate for the policy optimizer.
        num_steps: rollout length per update.
        num_updates: number of PPO updates to run.
        inst_on: whether the instinct is active during evaluation.
        visualize: whether evaluation renders/visualises.
        save_dir: directory for TensorBoard logs and checkpoints.

    Returns:
        Tuple ``(last_fitness, 0, 0)``.
    """
    torch.set_num_threads(1)
    log_writer = SummaryWriter(save_dir, max_queue=1, filename_suffix="log")
    device = torch.device("cpu")
    env_name = ENV_NAME_BOX  # "Safexp-PointGoal1-v0"

    envs = make_vec_envs(env_name,
                         np.random.randint(2 ** 32),
                         NUM_PROC,
                         args.gamma,
                         None,
                         device,
                         allow_early_resets=True,
                         normalize=args.norm_vectors)
    eval_envs = make_vec_envs(env_name,
                              np.random.randint(2 ** 32),
                              1,
                              args.gamma,
                              None,
                              device,
                              allow_early_resets=True,
                              normalize=args.norm_vectors)

    actor_critic_policy = init_default_ppo(envs, log(args.init_sigma))

    # Prepare modified observation shape for instinct
    obs_shape = envs.observation_space.shape
    inst_action_space = deepcopy(envs.action_space)
    inst_obs_shape = list(obs_shape)
    inst_obs_shape[0] = inst_obs_shape[0] + envs.action_space.shape[0]
    # Prepare modified action space for instinct (one extra control output)
    inst_action_space.shape = list(inst_action_space.shape)
    inst_action_space.shape[0] = inst_action_space.shape[0] + 1
    inst_action_space.shape = tuple(inst_action_space.shape)
    # The instinct network is pretrained and is NOT updated in this loop.
    actor_critic_instinct = torch.load("pretrained_instinct_h100.pt")

    actor_critic_policy.to(device)
    actor_critic_instinct.to(device)

    agent_policy = algo.PPO(
        actor_critic_policy,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=learning_rate,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(num_steps, NUM_PROC, obs_shape,
                              envs.action_space,
                              actor_critic_policy.recurrent_hidden_state_size)

    obs = envs.reset()
    i_obs = make_instinct_input(
        obs, torch.zeros((NUM_PROC, envs.action_space.shape[0])
                         ))  # Add zero action to the observation
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    fitnesses = []
    best_fitness_so_far = float("-Inf")
    # NOTE(review): these are sized (num_steps+1, NUM_PROC, 1) but passed
    # per-step to the instinct act() below — confirm the expected shape.
    masks = torch.ones(num_steps + 1, NUM_PROC, 1)
    instinct_recurrent_hidden_states = torch.zeros(
        num_steps + 1, NUM_PROC,
        actor_critic_instinct.recurrent_hidden_state_size)

    for j in range(num_updates):
        training_collisions_current_update = 0
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                # (value, action, action_log_probs, rnn_hxs), (instinct_value, instinct_action, instinct_outputs_log_prob, i_rnn_hxs), final_action
                value, action, action_log_probs, recurrent_hidden_states = actor_critic_policy.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                    deterministic=False
                )
                instinct_value, instinct_action, instinct_outputs_log_prob, instinct_recurrent_hidden_states = actor_critic_instinct.act(
                    i_obs,
                    instinct_recurrent_hidden_states,
                    masks,
                    deterministic=False,
                )

            # Combine two networks
            final_action, i_control = policy_instinct_combinator(
                action, instinct_action)
            obs, reward, done, infos = envs.step(final_action)
            #envs.render()
            # Accumulate safety-constraint violations reported by the env.
            training_collisions_current_update += sum(
                [i['cost'] for i in infos])
            # Mix task reward with violation cost (scaled by instinct
            # control signal).
            modded_reward, violation_cost = reward_cost_combinator(
                reward, infos, NUM_PROC, i_control)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])

            # Instinct input = observation concatenated with policy action.
            # i_obs = torch.cat([obs, action], dim=1)
            i_obs = make_instinct_input(obs, action)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_probs, value, modded_reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value_policy = actor_critic_policy.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1].detach())

        rollouts.compute_returns(next_value_policy, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        print("training policy")
        # Instinct training phase — snapshot the instinct before/after the
        # policy update and assert it was NOT modified.
        p_before = deepcopy(actor_critic_instinct)
        val_loss, action_loss, dist_entropy = agent_policy.update(rollouts)
        p_after = deepcopy(actor_critic_instinct)
        assert compare_two_models(p_before,
                                  p_after), "policy changed when it shouldn't"
        rollouts.after_update()

        ob_rms = utils.get_vec_normalize(envs)
        if ob_rms is not None:
            ob_rms = ob_rms.ob_rms
        fits, info = evaluate(EvalActorCritic(actor_critic_policy,
                                              actor_critic_instinct),
                              ob_rms,
                              eval_envs,
                              NUM_PROC,
                              reward_cost_combinator,
                              device,
                              instinct_on=inst_on,
                              visualise=visualize)
        instinct_reward = info['instinct_reward']
        hazard_collisions = info['hazard_collisions']
        print(
            f"Step {j}, Fitness {fits.item()}, value_loss instinct = {val_loss}, action_loss instinct= {action_loss}, "
            f"dist_entropy instinct = {dist_entropy}")
        print(
            f"Step {j}, Cost {instinct_reward}")
        print("-----------------------------------------------------------------")

        # Tensorboard logging
        log_writer.add_scalar("Task reward", fits.item(), j)
        log_writer.add_scalar("cost/Training hazard collisions",
                              training_collisions_current_update, j)
        log_writer.add_scalar("cost/Instinct reward", instinct_reward, j)
        log_writer.add_scalar("cost/Eval hazard collisions",
                              hazard_collisions, j)
        log_writer.add_scalar("value loss", val_loss, j)
        log_writer.add_scalar("action loss", action_loss, j)
        log_writer.add_scalar("dist entropy", dist_entropy, j)

        fitnesses.append(fits)
        # Keep a copy of the best-scoring pair; always refresh "latest".
        if fits.item() > best_fitness_so_far:
            best_fitness_so_far = fits.item()
            torch.save(actor_critic_instinct,
                       join(save_dir, "model_rl_instinct.pt"))
            torch.save(actor_critic_policy,
                       join(save_dir, "model_rl_policy.pt"))
        torch.save(actor_critic_instinct,
                   join(save_dir, "model_rl_instinct_latest.pt"))
        torch.save(actor_critic_policy,
                   join(save_dir, "model_rl_policy_latest.pt"))
        torch.save(actor_critic_policy,
                   join(save_dir, f"model_rl_policy_latest_{j}.pt"))
        pickle.dump(ob_rms, open(join(save_dir, "ob_rms.p"), "wb"))

    return (fitnesses[-1]), 0, 0
def main():
    """Train a single agent with A2C / PPO / ACKTR and record the run.

    Reads module-level globals: ``args`` (parsed CLI), ``num_updates``,
    ``ARGUMENTS`` and the ``ALL_*`` result lists.  Periodically saves the
    model, optionally evaluates and plots via visdom, and finally stores
    all collected metrics with ``ro.Experiment``.
    """
    ARGUMENTS.update(vars(args))

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep,
                         device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # Note: an unrecognized --algo leaves `agent` undefined (NameError below).
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_lr_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         len(episode_rewards),
                         np.mean(episode_rewards),
                         np.median(episode_rewards),
                         np.min(episode_rewards),
                         np.max(episode_rewards),
                         dist_entropy, value_loss, action_loss))
            ALL_UPDATES.append(j)
            ALL_TIMESTEPS.append(total_num_steps)
            ALL_FPS.append(int(total_num_steps / (end - start)))
            ALL_MEAN_REWARDS.append(np.mean(episode_rewards))
            ALL_MEDIAN_REWARDS.append(np.median(episode_rewards))
            ALL_MIN_REWARDS.append(np.min(episode_rewards))
            ALL_MAX_REWARDS.append(np.max(episode_rewards))
            ALL_DIST_ENTROPY.append(dist_entropy)
            ALL_VALUE_LOSS.append(value_loss)
            ALL_ACTION_LOSS.append(action_loss)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # Reuse the shared helper defined above instead of duplicating
            # the whole evaluation loop inline (it was a byte-level copy).
            evaluate_policy(actor_critic, envs, args, eval_log_dir, device)

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass

    # Save the results
    name = (ARGUMENTS['env_name'] + '-' + ARGUMENTS['algo'] + '-'
            + ARGUMENTS['experiment'] + '-grad_noise'
            + str(ARGUMENTS['gradient_noise']))
    experiment = ro.Experiment(name, directory='results')
    data = {
        'updates': ALL_UPDATES,
        'timesteps': ALL_TIMESTEPS,
        'fps': ALL_FPS,
        'mean_rewards': ALL_MEAN_REWARDS,
        'median_rewards': ALL_MEDIAN_REWARDS,
        'min_rewards': ALL_MIN_REWARDS,
        'max_rewards': ALL_MAX_REWARDS,
        'dist_entropy': ALL_DIST_ENTROPY,
        'value_loss': ALL_VALUE_LOSS,
        'action_loss': ALL_ACTION_LOSS,
    }
    data.update(ARGUMENTS)
    result = data['mean_rewards'][-1]
    experiment.add_result(result, data)
def train(args, envs, encoder, agent, actor_critic, device):
    """Run a PPO training loop over vectorized environments, logging to wandb.

    Args:
        args: parsed CLI namespace; reads num_steps, num_processes,
            num_env_steps, ppo_use_linear_lr_decay, ppo_lr, ppo_gamma,
            use_proper_time_limits, save_interval, log_interval, env_name.
        envs: vectorized environments (reset()/step() in the VecEnv style).
        encoder: representation encoder — not used directly in this loop;
            presumably queried inside actor_critic. TODO(review): confirm it
            stays frozen during policy updates (see TODO below).
        agent: PPO updater exposing ``update(rollouts)`` and ``optimizer``.
        actor_critic: policy network; ``act``/``get_value`` are queried and
            its recurrent hidden-state size shapes the rollout storage.
        device: torch device the rollout tensors are moved to.
    """
    rollouts = RolloutStorage(
        args.num_steps,
        args.num_processes,
        envs.observation_space.shape,
        envs.action_space,
        actor_critic.recurrent_hidden_state_size,
    )

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Rolling window of the 10 most recent finished-episode returns.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):

        if args.ppo_use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates, args.ppo_lr)

        for step in range(args.num_steps):
            # Sample actions (no grad: rollout collection only).
            # Note: this dist_entropy is overwritten by agent.update() below.
            with torch.no_grad():
                value, action, action_log_probs, recurrent_hidden_states, actor_features, dist_entropy = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step],
                )

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            # TODO: Check that the encoder is not updated
            # TODO: Analyze features of vae and infonce-st encoder
            for info in infos:
                if "episode" in info.keys():
                    episode_rewards.append(info["episode"]["r"])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            # bad_masks distinguishes true terminations from time-limit cutoffs.
            bad_masks = torch.FloatTensor(
                [[0.0] if "bad_transition" in info.keys() else [1.0] for info in infos]
            )
            rollouts.insert(
                obs,
                recurrent_hidden_states,
                action,
                action_log_probs,
                value,
                reward,
                masks,
                bad_masks,
            )

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1],
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1],
            )

        # use_gae=False and gae_lambda=0.0 are hard-coded here: plain
        # discounted returns, no generalized advantage estimation.
        rollouts.compute_returns(
            next_value, False, args.ppo_gamma, 0.0, args.use_proper_time_limits
        )

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if j % args.save_interval == 0 or j == num_updates - 1:
            # Checkpoint into the wandb run directory so it is synced.
            torch.save(
                [actor_critic, getattr(utils.get_vec_normalize(envs), "ob_rms", None)],
                os.path.join(wandb.run.dir, args.env_name + ".pt"),
            )

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    len(episode_rewards),
                    np.mean(episode_rewards),
                    np.median(episode_rewards),
                    np.min(episode_rewards),
                    np.max(episode_rewards),
                )
            )
            wandb.log(
                {
                    "updates": j,
                    "total_num_steps": total_num_steps,
                    "fps": int(total_num_steps / (end - start)),
                    "episode_rewards_mean": np.mean(episode_rewards),
                    "episode_rewards_median": np.median(episode_rewards),
                    "episode_rewards_min": np.min(episode_rewards),
                    "episode_rewards_max": np.max(episode_rewards),
                    "entropy": dist_entropy,
                    "value_loss": value_loss,
                    "policy_loss": action_loss,
                }
            )
def main():
    """Jointly train two policies (one per game) sharing one CNN base.

    Builds two ``Policy`` heads over a single shared ``CNNBase`` and runs
    A2C on both environments in lockstep; each update steps both games,
    logs both reward windows, and periodically saves/evaluates both nets.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir + args.env_name)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    log_dir2 = os.path.expanduser(args.log_dir2 + args.env_name2)
    # BUG FIX: was `log_dir + "_eval"`, which pointed game 2's eval logs at
    # game 1's directory (and wiped it twice via cleanup_log_dir).
    eval_log_dir2 = log_dir2 + "_eval"
    utils.cleanup_log_dir(log_dir2)
    utils.cleanup_log_dir(eval_log_dir2)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    import json
    file_path = "config.json"
    setup_json = json.load(open(file_path, 'r'))
    env_conf = setup_json["Default"]
    # Pick the most specific config whose key appears in the env name.
    for i in setup_json.keys():
        if i in args.env_name:
            env_conf = setup_json[i]

    # 1 game
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, env_conf, False)
    # 2 game
    envs2 = make_vec_envs(args.env_name2, args.seed, args.num_processes,
                          args.gamma, args.log_dir2, device, env_conf, False)

    # Pretrained checkpoint; currently only loaded, weights not copied in
    # (the load_state_dict calls below are intentionally commented out).
    save_model, ob_rms = torch.load('./trained_models/PongNoFrameskip-v4.pt')

    from a2c_ppo_acktr.cnn import CNNBase
    # Single shared convolutional trunk for both policies.
    a = CNNBase(envs.observation_space.shape[0], recurrent=False)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base=a,
    )
    #actor_critic.load_state_dict(save_model.state_dict())
    actor_critic.to(device)

    actor_critic2 = Policy(
        envs2.observation_space.shape,
        envs2.action_space,
        base=a,
    )
    #actor_critic2.load_state_dict(save_model.state_dict())
    actor_critic2.to(device)

    # Note: only 'a2c' is supported here; any other --algo leaves `agent`
    # undefined and fails below with NameError.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, actor_critic2,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    rollouts2 = RolloutStorage(args.num_steps, args.num_processes,
                               envs2.observation_space.shape,
                               envs2.action_space,
                               actor_critic2.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    obs2 = envs2.reset()
    rollouts2.obs[0].copy_(obs2)
    rollouts2.to(device)

    episode_rewards = deque(maxlen=10)
    episode_rewards2 = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        # if args.use_linear_lr_decay:
        #     # decrease learning rate linearly
        #     utils.update_linear_schedule(
        #         agent.optimizer, j, num_updates,
        #         agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions for both games under no_grad.
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, _ = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                value2, action2, action_log_prob2, recurrent_hidden_states2, _ = actor_critic2.act(
                    rollouts2.obs[step],
                    rollouts2.recurrent_hidden_states[step],
                    rollouts2.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            obs2, reward2, done2, infos2 = envs2.step(action2)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
            for info2 in infos2:
                if 'episode' in info2.keys():
                    episode_rewards2.append(info2['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            masks2 = torch.FloatTensor([[0.0] if done_ else [1.0]
                                        for done_ in done2])
            bad_masks2 = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info2.keys() else [1.0]
                 for info2 in infos2])

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)
            rollouts2.insert(obs2, recurrent_hidden_states2, action2,
                             action_log_prob2, value2, reward2, masks2,
                             bad_masks2)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
            next_value2 = actor_critic2.get_value(
                rollouts2.obs[-1], rollouts2.recurrent_hidden_states[-1],
                rollouts2.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        rollouts2.compute_returns(next_value2, args.use_gae, args.gamma,
                                  args.gae_lambda, args.use_proper_time_limits)

        # One joint update over both rollout buffers.
        value_loss, action_loss, dist_entropy, value_loss2, action_loss2, dist_entropy2 = agent.update(
            rollouts, rollouts2)

        rollouts.after_update()
        rollouts2.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + ".pt"))
            # BUG FIX: was getattr(..., 'ob_rms2', None); VecNormalize has no
            # 'ob_rms2' attribute, so game 2 always checkpointed None stats.
            torch.save([
                actor_critic2,
                getattr(utils.get_vec_normalize(envs2), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name2 + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards2), np.mean(episode_rewards2),
                        np.median(episode_rewards2), np.min(episode_rewards2),
                        np.max(episode_rewards2), dist_entropy2, value_loss2,
                        action_loss2))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
            ob_rms2 = utils.get_vec_normalize(envs2).ob_rms
            evaluate(actor_critic2, ob_rms2, args.env_name2, args.seed,
                     args.num_processes, eval_log_dir2, device)
args.det = not args.non_det env = make_vec_envs(args.env_name, args.seed + 1000, 1, None, None, args.add_timestep, device='cpu', allow_early_resets=False) # Get a render function render_func = get_render_func(env) # We need to use the same statistics for normalization as used in training if args.load_model is not None: actor_critic, ob_rms = torch.load(args.load_model) else: actor_critic, ob_rms = torch.load(os.path.join(args.load_dir, args.env_name + "epoch_{:07d}.pt".format(args.load_epoch))) vec_norm = get_vec_normalize(env) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms recurrent_hidden_states = torch.zeros(1, actor_critic.recurrent_hidden_state_size) masks = torch.zeros(1, 1) if render_func is not None: render_func('human') obs = env.reset() if args.env_name.find('Bullet') > -1: import pybullet as p
def main():
    """Train A2C/PPO/ACKTR with an optional temporal-difference-module (TDM)
    intrinsic-reward bonus.

    Reads module-level globals ``args`` and ``num_updates``.  When
    ``args.use_tdm`` is set, a TemporalDifferenceModule is pretrained on
    random trajectories and then provides an intrinsic bonus that is added
    to the environment reward with a scheduled weight ``beta_func``.
    Checkpoints every million environment steps and once at the end.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep,
                         device, True)

    frame_skip = 4  # frame skip

    if args.tb_dir[-1] != '/':
        args.tb_dir = args.tb_dir + '/'
    logger = Logger(args.tb_dir)
    logger.write_settings(args)

    if args.use_tdm:
        # beta scheduler: weight of the intrinsic bonus over time
        if args.beta_schedule == 'const':
            beta_func = lambda x: float(args.beta_int)
        elif args.beta_schedule == 'sqrt':
            beta_func = lambda x: 1. / np.sqrt(x + 2)
        elif args.beta_schedule == 'log':
            beta_func = lambda x: 1. / np.log(x + 2)
        elif args.beta_schedule == 'linear':
            beta_func = lambda x: 1. / (x + 2)

        # bonus function variations
        if args.bonus_func == 'linear':
            bonus_func = lambda x: x + 1
        elif args.bonus_func == 'square':
            bonus_func = lambda x: (x + 1)**2
        elif args.bonus_func == 'sqrt':
            bonus_func = lambda x: (x + 1)**(1 / 2)
        elif args.bonus_func == 'log':
            bonus_func = lambda x: np.log(x + 1)

        # temporal difference module
        tdm = TemporalDifferenceModule(
            inputSize=2 * int(envs.observation_space.shape[0]),
            outputSize=args.time_intervals,
            num_fc_layers=int(args.num_layers),
            depth_fc_layers=int(args.fc_width),
            lr=float(args.opt_lr),
            buffer_max_length=args.buffer_max_length,
            buffer_RL_ratio=args.buffer_RL_ratio,
            frame_skip=frame_skip,
            tdm_epoch=args.tdm_epoch,
            tdm_batchsize=args.tdm_batchsize,
            logger=logger,
            bonus_func=bonus_func).to(device)

        # collect random trajectories
        sample_collector = CollectSamples(envs, args.num_processes,
                                          initial=True)
        tdm.buffer_rand = sample_collector.collect_trajectories(
            args.num_rollouts, args.steps_per_rollout)

        # initial training
        tdm.update()

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        # acting
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            # envs.render()
            obs_old = obs.clone()  # kept for the TDM bonus (s, s') pair
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])

            # compute intrinsic bonus and fold it into the extrinsic reward
            if args.use_tdm:
                tdm.symm_eval = True if step == args.num_steps - 1 else False
                reward_int = tdm.compute_bonus(obs_old, obs).float()
                reward += beta_func(
                    step + j * args.num_steps) * reward_int.cpu().unsqueeze(1)

                if (j % args.log_interval == 0) and (step == args.num_steps - 1):
                    logger.add_reward_intrinsic(
                        reward_int,
                        (j + 1) * args.num_steps * args.num_processes)

            # saving to buffer.
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        # saving to buffer and periodic updating of TDM parameters
        if args.use_tdm:
            tdm.buffer_RL_temp.append((rollouts.obs, rollouts.masks))
            if (j % args.num_steps == 0 and j > 0):
                tdm.update()

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # save every 1-million environment steps, and once at the last update
        if (((j + 1) * args.num_steps * args.num_processes) % 1000000 == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            # BUG FIX: the two branches were swapped — the final update was
            # getting the "_step_XM.pt" name and every intermediate
            # million-step checkpoint was overwriting "_final.pt".
            if j == num_updates - 1:
                save_here = os.path.join(save_path,
                                         args.env_name + "_final.pt")
            else:
                save_here = os.path.join(
                    save_path, args.env_name + "_step_{}M.pt".format(
                        (j + 1) * args.num_steps * args.num_processes
                        // 1000000))
            torch.save(save_model, save_here)  # saved policy.

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # printing outputs
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            logger.add_reward(episode_rewards,
                              (j + 1) * args.num_steps * args.num_processes)

    # persist accumulated tensorboard/logger state once training is done
    # NOTE(review): original placement was ambiguous in the mangled source —
    # confirm logger.save() belongs after the loop rather than inside it.
    logger.save()
def main():
    """Train A2C/PPO/ACKTR, keeping a rotating window of ~100 epoch snapshots.

    Reads module-level globals ``args``, ``num_updates`` and ``eval_log_dir``.
    Optionally warm-starts from ``args.load_policy`` (restoring normalization
    stats), writes per-update log lines to ``log_info.txt``, and saves an
    ``epoch_XXXXXXX.pt`` snapshot every save interval, deleting the oldest
    snapshot once more than 100 exist.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep,
                         device, False)

    if args.load_policy is not None:
        # Warm start: restore the policy and its observation-normalization
        # statistics so the resumed run sees identically scaled inputs.
        actor_critic, ob_rms = torch.load(args.load_policy)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.eval()
            vec_norm.ob_rms = ob_rms
    else:
        actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Window of at least 10, or one slot per worker when there are more.
    episode_rewards = deque(
        maxlen=(args.num_processes if args.num_processes > 10 else 10))

    start = time.time()
    snapshot_counter = 0
    last_delete = -1  # epoch index of the next snapshot to rotate out
    try:
        os.makedirs(os.path.join(args.save_dir, args.algo))
    except OSError:
        pass
    log_out_file = open(
        os.path.join(args.save_dir, args.algo, 'log_info.txt'), 'w')

    for j in range(num_updates):
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            if args.algo == "acktr":
                # use optimizer's learning rate since it's hard-coded in kfac.py
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       agent.optimizer.lr)
            else:
                update_linear_schedule(agent.optimizer, j, num_updates,
                                       args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param * (1 - j / float(num_updates))

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(
                save_model,
                os.path.join(save_path,
                             args.env_name + "epoch_{:07d}.pt".format(j)))
            snapshot_counter += 1
            last_delete += 1
            if snapshot_counter > 100:
                # BUG FIX: the rotation deleted 'epoch_*.py' but snapshots
                # are written as '.pt', so nothing was ever removed and the
                # directory grew without bound.  Also use os.remove instead
                # of shelling out to 'rm' (portable, no shell injection risk).
                stale = os.path.join(
                    save_path,
                    args.env_name + 'epoch_{:07d}.pt'.format(last_delete))
                try:
                    os.remove(stale)
                except OSError:
                    pass  # best-effort, mirrors the old silent 'rm' failure
                snapshot_counter -= 1

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            log_info = "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".\
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards), np.mean(episode_rewards),
                       np.median(episode_rewards), np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy, value_loss,
                       action_loss)
            print(log_info)
            sys.stdout.flush()
            log_out_file.write(log_info)
            log_out_file.flush()

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            log_out_file.write(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))
            log_out_file.flush()
            sys.stdout.flush()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_env_steps)
            except IOError:
                pass

    # Flush and release the log file now that training is finished.
    log_out_file.close()
def main():
    """Train an A2C / PPO / ACKTR (optionally GAIL) agent while recording
    per-step trajectories (states/actions/rewards/done/lengths) and dumping
    them as expert data to HDF5 files under ./data/<env_name>/.

    Side effects: creates log dirs, writes model checkpoints and reward
    curves under args.save_dir, writes trajectory .h5 files, prints progress.
    """
    all_episode_rewards = []  ### record (6/29)
    all_temp_rewards = []  ### record (6/29)
    args = get_args()

    # Seed everything for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    # Select the RL algorithm; ACKTR reuses the A2C class with acktr=True.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        # GAIL discriminator consumes concatenated (obs, action) vectors,
        # so the observation space must be flat.
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('num_updates ', num_updates)
    print('num_steps ', args.num_steps)

    # HDF5 trajectory-dump bookkeeping: `count` indexes the output files.
    count = 0
    h5_path = './data/' + args.env_name
    if not os.path.exists(h5_path):
        os.makedirs(h5_path)
    h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (count)

    data = {}
    data['states'] = []
    data['actions'] = []
    data['rewards'] = []
    data['done'] = []
    data['lengths'] = []

    episode_step = 0  # NOTE(review): assigned but never used below

    for j in range(num_updates):  ### num-steps
        # Per-update trajectory buffers (one entry appended per env step).
        temp_states = []
        temp_actions = []
        temp_rewards = []
        temp_done = []
        temp_lenthgs = []  # NOTE(review): unused (typo); `temp_lengths` is set after the step loop

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # One-time type/shape debug dump on the very first step.
            if j == 0 and step == 0:
                print('obs ', type(rollouts.obs[step]),
                      rollouts.obs[step].shape)
                print('hidden_states ',
                      type(rollouts.recurrent_hidden_states[step]),
                      rollouts.recurrent_hidden_states[step].shape)
                print('action ', type(action), action.shape)
                print('action prob ', type(action_log_prob),
                      action_log_prob.shape)
                print('-' * 20)

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            #print(infos)
            #print(reward)
            temp_states += [np.array(rollouts.obs[step].cpu())]
            temp_actions += [np.array(action.cpu())]
            #temp_rewards += [np.array(reward.cpu())]
            # for halfcheetah the env `reward` cannot be used directly !!  6/29
            temp_rewards += [np.array([infos[0]['myrewards']])]
            temp_done += [np.array(done)]

            if j == 0 and step == 0:
                print('obs ', type(obs), obs.shape)
                print('reward ', type(reward), reward.shape)
                print('done ', type(done), done.shape)
                print('infos ', len(infos))
                for k, v in infos[0].items():
                    print(k, v.shape)
                print()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    all_episode_rewards += [info['episode']['r']]  ### record (6/29)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        # Flatten the per-step buffers into arrays for this update.
        temp_lengths = len(temp_states)
        temp_states = np.concatenate(temp_states)
        temp_actions = np.concatenate(temp_actions)
        temp_rewards = np.concatenate(temp_rewards)
        temp_done = np.concatenate(temp_done)
        #print('temp_lengths',temp_lengths)
        #print('temp_states', temp_states.shape)
        #print('temp_actions', temp_actions.shape)
        #print('temp_rewards', temp_rewards.shape)

        # Only keep trajectories after 40% of training has elapsed
        # (presumably so the dumped data comes from a better policy).
        if j > int(0.4 * num_updates):
            data['states'] += [temp_states]
            data['actions'] += [temp_actions]
            data['rewards'] += [temp_rewards]
            data['lengths'] += [temp_lengths]
            data['done'] += [temp_done]
            #print('temp_lengths',data['lengths'].shape)
            #print('temp_states', data['states'].shape)
            #print('temp_actions', data['actions'].shape)
            #print('temp_rewards', data['rewards'].shape)

            # Flush to disk every 100 stored updates, then rotate to a new file.
            if args.save_expert and len(data['states']) >= 100:
                with h5py.File(h5_filename, 'w') as f:
                    f['states'] = np.array(data['states'])
                    f['actions'] = np.array(data['actions'])
                    f['rewards'] = np.array(data['rewards'])
                    f['done'] = np.array(data['done'])
                    f['lengths'] = np.array(data['lengths'])
                    #print('f_lengths',f['lengths'].shape)
                    #print('f_states', f['states'].shape)
                    #print('f_actions', f['actions'].shape)
                    #print('f_rewards', f['rewards'].shape)
                count += 1
                h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (
                    count)
                data['states'] = []
                data['actions'] = []
                data['rewards'] = []
                data['done'] = []
                data['lengths'] = []

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()
            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)
            # Replace env rewards with the discriminator's learned reward.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + "_%d.pt" % (args.seed)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            #np.save(os.path.join(save_path, args.env_name+"_%d"%(args.seed)), all_episode_rewards) ### save record 6/29
            #print(temp_rewards)
            print("temp rewards size", temp_rewards.shape, "mean",
                  np.mean(temp_rewards), "min", np.min(temp_rewards), "max",
                  np.max(temp_rewards))
            all_temp_rewards += [temp_rewards]
            # NOTE(review): `save_path` is only bound inside the save branch
            # above; if args.save_dir == "" this raises NameError — confirm.
            np.savez(
                os.path.join(save_path, args.env_name + "_%d" % (args.seed)),
                episode=all_episode_rewards,
                timestep=all_temp_rewards)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    # NOTE(review): start of a commented-out (triple-quoted) block; its
    # closing quotes lie beyond this chunk of the file.
    '''data['states'] = np.array(data['states'])
def main():
    """Train A2C / PPO / ACKTR (optionally with GAIL) on a robosuite 'lift'
    task, log training metrics to TensorBoard, and checkpoint the model
    whenever the running mean episode reward reaches a new best.
    """
    args = get_args()
    writer = SummaryWriter(os.path.join('logs', args.save_name), )

    # Seed everything for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # Vectorized robosuite lift env; torque vs. position control via args.vel.
    envs = make_vec_envs(
        basic_env.BasicFlatDiscreteEnv,
        args.seed,
        args.num_processes,
        args.gamma,
        args.log_dir,
        device,
        False,
        task='lift',
        gripper_type='RobotiqThreeFingerDexterousGripper',
        robot='Panda',
        controller='JOINT_TORQUE' if args.vel else 'JOINT_POSITION',
        horizon=1000,
        reward_shaping=True)

    actor_critic = Policy(
        envs.observation_space.shape,
        envs.action_space,
        base=Surreal,
        # base=OpenAI,
        # base=MLP_ATTN,
        base_kwargs={
            'recurrent': args.recurrent_policy,
            # 'dims': basic_env.BasicFlatEnv().modality_dims
            'config':
            dict(act='relu' if args.relu else 'tanh', rec=args.rec, fc=args.fc)
        })
    print(actor_critic)
    actor_critic.to(device)

    # Select the RL algorithm; ACKTR reuses the A2C class with acktr=True.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        # GAIL discriminator consumes concatenated (obs, action) vectors,
        # so the observation space must be flat.
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        # Drop the last partial batch only when more than one batch exists.
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=100)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    best_reward = 0
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)
        # NOTE(review): no global_step argument is passed here — confirm
        # this is intended (other add_scalar calls below do pass one).
        writer.add_scalar('lr', agent.optimizer.param_groups[0]['lr'])

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()
            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)
            # Replace env rewards with the discriminator's learned reward.
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        end = time.time()
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        if len(episode_rewards) > 1:
            # TensorBoard metrics, all keyed by total env steps.
            writer.add_scalar('loss/value', value_loss, total_num_steps)
            writer.add_scalar('loss/policy', action_loss, total_num_steps)
            writer.add_scalar('experiment/num_updates', j, total_num_steps)
            writer.add_scalar('experiment/FPS',
                              int(total_num_steps / (end - start)),
                              total_num_steps)
            writer.add_scalar('experiment/EPISODE MEAN',
                              np.mean(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MEDIAN',
                              np.median(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MIN',
                              np.min(episode_rewards), total_num_steps)
            # NOTE(review): tag spelling 'EPSIDOE' kept as-is so existing
            # TensorBoard runs remain comparable.
            writer.add_scalar('experiment/EPSIDOE MAX',
                              np.max(episode_rewards), total_num_steps)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if len(episode_rewards) > 1 and args.save_dir != "":
            rew = np.mean(episode_rewards)
            # Checkpoint only when the running mean reward improves.
            if rew > best_reward:
                best_reward = rew
                print('saved with best reward', rew)
                save_path = os.path.join(args.save_dir, args.algo)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
                ], os.path.join(save_path, args.save_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    writer.close()
def main(argv=None):
    """Record (image-only) rollout sequences with a trained policy.

    Loads ``<env-name>.pt`` from ``--policy-dir``, steps a vectorized env for
    ``--max-steps`` steps per batch, and writes the rendered RGB frames of
    each batch to an HDF5 file under ``--save-dir``.

    Args:
        argv: optional argument list for argparse (defaults to sys.argv).

    Returns:
        0 on success, 1 on a usage error (policy not found / no renderer).
    """
    parser = argparse.ArgumentParser(
        description='record (images only) sequences with a trained policy')
    parser.add_argument(
        '--save-dir',
        type=str,
        default=None,
    )
    parser.add_argument('-n', '--num-batches', type=int, default=1)
    parser.add_argument('-b', '--batch-size', type=int, default=2)
    parser.add_argument('-s', '--max-steps', type=int, default=1000)
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument(
        '--env-name',
        default='PongNoFrameskip-v4',
        help='environment to train on (default: PongNoFrameskip-v4)')
    parser.add_argument(
        '--policy-dir',
        default='./trained_models/',
        help='directory to save agent logs (default: ./trained_models/)')
    parser.add_argument('--non-det',
                        action='store_true',
                        default=False,
                        help='whether to use a non-deterministic policy')
    args = parser.parse_args(argv)
    args.det = not args.non_det

    policy_name = args.env_name + '.pt'
    if policy_name not in os.listdir(args.policy_dir):
        print('ERROR: could not find policy in provided policy-dir')
        found = [p for p in os.listdir(args.policy_dir) if '.pt' in p]
        if len(found):
            print(
                'Policies must be saved in the provided dir using the template: [env_name].pt'
            )
            # BUGFIX: the count was never interpolated — the literal '{}'
            # used to be printed because .format() was missing.
            print('Found {} policies for other environments though:'.format(
                len(found)))
            for f in found:
                print(f.split('.')[0])
        return 1

    env = make_vec_envs(args.env_name,
                        args.seed + 1000,
                        args.batch_size,
                        None,
                        None,
                        device='cpu',
                        allow_early_resets=False)

    # Get a render function
    # render_func = get_render_func(env)
    render_func = env.render

    # We need to use the same statistics for normalization as used in training
    actor_critic, ob_rms = torch.load(
        os.path.join(args.policy_dir, policy_name))

    vec_norm = get_vec_normalize(env)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    recurrent_hidden_states = torch.zeros(
        args.batch_size, actor_critic.recurrent_hidden_state_size)
    masks = torch.zeros(args.batch_size, 1)

    obs = env.reset()

    # For PyBullet envs, locate the torso body so the camera can track it.
    if args.env_name.find('Bullet') > -1:
        import pybullet as p

        torsoId = -1
        for i in range(p.getNumBodies()):
            if (p.getBodyInfo(i)[0].decode() == "torso"):
                torsoId = i

    if render_func is None:
        print('ERROR: No rendering possible')
        return 1

    util.create_dir(args.save_dir)

    rgb = render_func('rgb_array')  # initial img
    for bidx in range(args.num_batches):
        print('Generating batch {}/{}'.format(bidx + 1, args.num_batches))
        rgbs = []
        for step in tqdm(range(args.max_steps)):
            rgbs.append(rgb)
            with torch.no_grad():
                value, action, _, recurrent_hidden_states = actor_critic.act(
                    obs,
                    recurrent_hidden_states,
                    masks,
                    deterministic=args.det)

            # Obser reward and next obs
            obs, reward, done, _ = env.step(action)
            # Zero the recurrent masks for envs that just finished.
            masks.copy_(torch.from_numpy(~done).unsqueeze(1))

            if args.env_name.find('Bullet') > -1:
                if torsoId > -1:
                    distance = 5
                    yaw = 0
                    humanPos, humanOrn = p.getBasePositionAndOrientation(
                        torsoId)
                    p.resetDebugVisualizerCamera(distance, yaw, -20, humanPos)

            rgb = render_func('rgb_array')

        save_path = os.path.join(
            args.save_dir,
            '{}_{}x{}_batch{}.h5'.format(args.env_name, args.batch_size,
                                         args.max_steps, bidx))
        print('Saving batch {}/{} to {}'.format(bidx + 1, args.num_batches,
                                                save_path))
        with hf.File(save_path, 'w') as f:
            # NOTE(review): .T reverses ALL axes of the (step, H, W, C)
            # stack — confirm downstream readers expect that layout.
            f.create_dataset('images', data=np.array(rgbs).T)
    return 0