def rollout_episode(self, test=False, render=False):
    rollout = RolloutStorage(self.device)
    self.reset_env()
    step = 0
    done = False
    while not done:
        step += 1
        with torch.no_grad():
            value, action, action_logprob = self.actor_critic.act(
                self.current_obs, deterministic=test)
        cpu_actions = action.data.squeeze(1).cpu().numpy()[0]
        next_obs, reward, done, info = self.env.step(cpu_actions)
        next_obs = torch.Tensor(next_obs).view(1, -1).to(self.device)
        if render:
            self.env.render()
        # A constant reward scaling factor can be introduced to stabilise
        # training and prevent large value losses.
        r = reward * self.args.reward_scale
        done = done or step == self.args.episode_max_length
        mask = 1.0 if not done else 0.0
        rollout.insert(self.current_obs, action.data, r, value.data,
                       action_logprob.data, mask)
        self.current_obs.copy_(next_obs)
    if not test:
        next_value = self.actor_critic(self.current_obs)[0].data
        rollout.compute_returns(next_value, self.args.use_gae,
                                self.args.gamma, self.args.tau)
    self.episode_steps.append(step)
    return rollout
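# A minimal sketch of what `rollout.compute_returns(...)` above is assumed to
# compute: bootstrapped discounted returns, optionally with GAE. The names
# below (`rewards`, `values`, `masks`) are hypothetical stand-ins for the
# storage fields; `masks[t] == 0` marks a transition that ended the episode.
import torch

def compute_returns_sketch(rewards, values, masks, next_value,
                           gamma=0.99, tau=0.95, use_gae=True):
    """rewards/values/masks: [T] tensors; next_value: scalar tensor."""
    T = rewards.size(0)
    returns = torch.zeros(T)
    if use_gae:
        gae = 0.0
        for t in reversed(range(T)):
            v_next = next_value if t == T - 1 else values[t + 1]
            # TD residual; the mask zeroes the bootstrap at episode ends.
            delta = rewards[t] + gamma * v_next * masks[t] - values[t]
            gae = delta + gamma * tau * masks[t] * gae
            returns[t] = gae + values[t]
    else:
        running = next_value
        for t in reversed(range(T)):
            running = rewards[t] + gamma * running * masks[t]
            returns[t] = running
    return returns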
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")
    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    lmdb_idx = 0
    try:
        os.makedirs(os.path.join(args.lmdb_path, args.env_name))
        os.makedirs(os.path.join(args.lmdb_path, args.env_name, 'test'))
    except OSError:
        print('Directory already exists.')

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            # obs, reward, done, info = envs.step(cpu_actions)
            # This env wrapper also returns the unwrapped obs and reward.
            obs, reward, done, info, wr_obs, wr_reward = envs.step(cpu_actions)

            # Sample images
            # img = np.squeeze(np.transpose(obs[3], (1, 2, 0)), 2)
            for img, rwd in zip(wr_obs, wr_reward):
                if rwd > 0:
                    lmdb_idx += 1
                    convert_to_lmdb(img, rwd,
                                    os.path.join(args.lmdb_path,
                                                 args.env_name), lmdb_idx)
            # Evaluate unwrapped rewards
            # model = Model()
            # model.load(args.digit_checkpoint)
            # model.cuda()
            # accuracy = digit_eval(image, length_labels, digits_labels, model)
            # img.show()

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy, value_loss, action_loss))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
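# A minimal sketch of the episode-reward bookkeeping used above:
# `episode_rewards` accumulates per-process returns, and on `done` the total
# is moved into `final_rewards` and reset. All tensors are
# [num_processes, 1]; the values below are illustrative only.
import torch

episode_rewards = torch.zeros(4, 1)
final_rewards = torch.zeros(4, 1)
reward = torch.tensor([[1.0], [0.0], [2.0], [0.5]])
done = [False, True, False, False]

episode_rewards += reward
masks = torch.tensor([[0.0] if d else [1.0] for d in done])
final_rewards *= masks                           # keep totals of running episodes
final_rewards += (1 - masks) * episode_rewards   # record totals of finished ones
episode_rewards *= masks                         # reset finished episodes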
class a2c(object):
    def __init__(self, envs, hparams):
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        # self.next_state_pred_ = hparams['next_state_pred_']

        # Policy and value network
        # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
        # else:
        self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # Storage for rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'],
                                        eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'],
                                       momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.action_shape = 1

        if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams['grad_var_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        # value, action = self.actor_critic.act(current_state)
        # []  []  [P,1]  [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, next_state_pred):  # , done):
        self.rollouts.insert(step, current_state, action, value, reward,
                             masks, action_log_probs, dist_entropy)
        # self.rollouts.insert_state_pred(next_state_pred)
        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            # NOTE: `done` is not in scope here; it was removed from the
            # signature (see the commented parameter above) and would need to
            # be passed back in for this branch to run.
            self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * action_log_probs).mean()
        cost = (action_loss + value_loss * self.value_loss_coef -
                dist_entropy.mean() * self.entropy_coef)
        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(),
                                self.grad_clip)
        self.optimizer.step()
    # Average empowerment rather than average error
    def update2(self, discrim_error, discrim_error_reverse):
        # discrim_error: [S,P]
        discrim_error_reverse = discrim_error_reverse.view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)  # [S,P,1]
        discrim_error = discrim_error.view(self.num_steps,
                                           self.num_processes, 1)

        # val_to_maximize = (-discrim_error + discrim_error_reverse)/2. - action_log_probs.detach()  # [S,P,1]
        val_to_maximize = -discrim_error - action_log_probs.detach()  # [S,P,1]
        val_to_maximize = val_to_maximize.view(self.num_steps,
                                               self.num_processes)  # [S,P]

        discrim_error_unmodified = val_to_maximize.data.clone()
        discrim_error = val_to_maximize.data

        divide_by = torch.ones(self.num_processes).cuda()
        for step in reversed(range(discrim_error.size(0) - 1)):
            divide_by += 1
            ttmp = (discrim_error_unmodified[step + 1] * self.gamma *
                    torch.squeeze(self.rollouts.masks[step + 1]))
            discrim_error_unmodified[step] = ttmp + discrim_error_unmodified[step]
            discrim_error[step] = discrim_error_unmodified[step] / divide_by
            divide_by = divide_by * torch.squeeze(
                self.rollouts.masks[step + 1])

        val_to_maximize = Variable(
            discrim_error.view(self.num_steps, self.num_processes, 1))

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        baseline = torch.mean(val_to_maximize)
        advantages = val_to_maximize - baseline

        action_loss = -(advantages.detach() * action_log_probs).mean()
        cost = action_loss  # - dist_entropy.mean()*self.entropy_coef

        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(),
                                self.grad_clip)
        self.optimizer.step()
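# A standalone sketch of the reversed accumulation inside `update2` above: it
# turns per-step scores [S, P] into masked, discounted suffix sums averaged by
# the number of remaining steps. The function name and CPU tensors are
# hypothetical; the loop mirrors the `divide_by` logic in the method.
import torch

def reverse_discounted_average(err, masks, gamma=0.99):
    """err: [S, P] scores; masks: [S, P] with 0 marking episode boundaries."""
    out = err.clone()
    acc = err.clone()
    divide_by = torch.ones(err.size(1))
    for step in reversed(range(err.size(0) - 1)):
        divide_by += 1
        # Discounted suffix sum, cut off at episode boundaries by the mask.
        acc[step] = acc[step] + gamma * masks[step + 1] * acc[step + 1]
        out[step] = acc[step] / divide_by
        # A zero mask resets the horizon count for the next episode.
        divide_by = divide_by * masks[step + 1]
    return out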
def main():
    torch.set_num_threads(1)

    if args.vis:
        summary_writer = tf.summary.FileWriter(args.save_dir)

    envs = [make_env(i, args=args) for i in range(args.num_processes)]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1 and args.env_name not in [
            'OverCooked'
    ]:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    def get_onehot(num_class, action):
        one_hot = np.zeros(num_class)
        one_hot[action] = 1
        one_hot = torch.from_numpy(one_hot).float()
        return one_hot

    if args.policy_type == 'shared_policy':
        actor_critic = Policy(obs_shape, envs.action_space,
                              args.recurrent_policy)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]

        if args.cuda:
            actor_critic.cuda()

        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR(
                actor_critic,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                alpha=args.alpha,
                max_grad_norm=args.max_grad_norm,
            )
        elif args.algo == 'ppo':
            agent = algo.PPO(
                actor_critic,
                args.clip_param,
                args.ppo_epoch,
                args.num_mini_batch,
                args.value_loss_coef,
                args.entropy_coef,
                lr=args.lr,
                eps=args.eps,
                max_grad_norm=args.max_grad_norm,
            )
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR(
                actor_critic,
                args.value_loss_coef,
                args.entropy_coef,
                acktr=True,
            )

        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  obs_shape, envs.action_space,
                                  actor_critic.state_size)
        current_obs = torch.zeros(args.num_processes, *obs_shape)

        obs = envs.reset()
        update_current_obs(obs)
        rollouts.observations[0].copy_(current_obs)

        episode_reward_raw = 0.0
        final_reward_raw = 0.0

        if args.cuda:
            current_obs = current_obs.cuda()
            rollouts.cuda()

        # Try to load a checkpoint
        try:
            num_trained_frames = np.load(args.save_dir +
                                         '/num_trained_frames.npy')[0]
            try:
                actor_critic.load_state_dict(
                    torch.load(args.save_dir + '/trained_learner.pth'))
                print('Load learner previous point: Succeeded')
            except Exception as e:
                print('Load learner previous point: Failed')
        except Exception as e:
            num_trained_frames = 0
        print('Learner has been trained to step: ' + str(num_trained_frames))

        start = time.time()
        j = 0
        while True:
            if num_trained_frames > args.num_frames:
                break

            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, states = actor_critic.act(
                        rollouts.observations[step],
                        rollouts.states[step],
                        rollouts.masks[step],
                    )
                cpu_actions = action.squeeze(1).cpu().numpy()

                # Observe reward and next obs
                obs, reward_raw, done, info = envs.step(cpu_actions)
                episode_reward_raw += reward_raw[0]
                if done[0]:
                    final_reward_raw = episode_reward_raw
                    episode_reward_raw = 0.0
                reward = np.sign(reward_raw)
                reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                         1)).float()

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])

                if args.cuda:
                    masks = masks.cuda()

                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs)
                rollouts.insert(current_obs, states, action, action_log_prob,
                                value, reward, masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.observations[-1],
                    rollouts.states[-1],
                    rollouts.masks[-1],
                ).detach()

            rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                     args.tau)
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
            rollouts.after_update()

            num_trained_frames += (args.num_steps * args.num_processes)
            j += 1

            # Save checkpoint
            if j % args.save_interval == 0 and args.save_dir != "":
                try:
                    np.save(
                        args.save_dir + '/num_trained_frames.npy',
                        np.array([num_trained_frames]),
                    )
                    actor_critic.save_model(save_path=args.save_dir)
                except Exception as e:
                    print("Save checkpoint failed")

            # Print info
            if j % args.log_interval == 0:
                end = time.time()
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                print(
                    "[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours"
                    .format(
                        num_trained_frames, args.num_frames,
                        int(num_trained_frames / (end - start)),
                        final_reward_raw,
                        (end - start) / num_trained_frames *
                        (args.num_frames - num_trained_frames) / 60.0 / 60.0))

            # Visualize results
            if args.vis and j % args.vis_interval == 0:
                # We use tensorboard since it is better for comparing plots.
                summary = tf.Summary()
                summary.value.add(
                    tag='final_reward_raw',
                    simple_value=final_reward_raw,
                )
                summary.value.add(
                    tag='value_loss',
                    simple_value=value_loss,
                )
                summary.value.add(
                    tag='action_loss',
                    simple_value=action_loss,
                )
                summary.value.add(
                    tag='dist_entropy',
                    simple_value=dist_entropy,
                )
                summary_writer.add_summary(summary, num_trained_frames)
                summary_writer.flush()

    elif args.policy_type == 'hierarchical_policy':
        num_subpolicy = args.num_subpolicy
        update_interval = args.hierarchy_interval
        while len(num_subpolicy) < args.num_hierarchy - 1:
            num_subpolicy.append(num_subpolicy[-1])
        while len(update_interval) < args.num_hierarchy - 1:
            update_interval.append(update_interval[-1])
        if args.num_hierarchy == 1:
            update_interval = [1]
            num_subpolicy = [envs.action_space.n]
        # print(envs.action_space.n)
        # print(stop)

        actor_critic = {}
        rollouts = {}
        actor_critic['top'] = EHRL_Policy(obs_shape,
                                          space.Discrete(num_subpolicy[-1]),
                                          np.zeros(1), 128,
                                          args.recurrent_policy, 'top')
        rollouts['top'] = EHRL_RolloutStorage(
            int(args.num_steps / update_interval[-1]), args.num_processes,
            obs_shape, space.Discrete(num_subpolicy[-1]), np.zeros(1),
            actor_critic['top'].state_size)
        for hie_id in range(args.num_hierarchy - 1):
            if hie_id > 0:
                actor_critic[str(hie_id)] = EHRL_Policy(
                    obs_shape, space.Discrete(num_subpolicy[hie_id - 1]),
                    np.zeros(num_subpolicy[hie_id]), 128,
                    args.recurrent_policy, str(hie_id))
                rollouts[str(hie_id)] = EHRL_RolloutStorage(
                    int(args.num_steps / update_interval[hie_id - 1]),
                    args.num_processes, obs_shape,
                    space.Discrete(num_subpolicy[hie_id - 1]),
                    np.zeros(num_subpolicy[hie_id]),
                    actor_critic[str(hie_id)].state_size)
            else:
                actor_critic[str(hie_id)] = EHRL_Policy(
                    obs_shape, envs.action_space,
                    np.zeros(num_subpolicy[hie_id]), 128,
                    args.recurrent_policy, str(hie_id))
                rollouts[str(hie_id)] = EHRL_RolloutStorage(
                    args.num_steps, args.num_processes, obs_shape,
                    envs.action_space, np.zeros(num_subpolicy[hie_id]),
                    actor_critic[str(hie_id)].state_size)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        if args.cuda:
            for key in actor_critic:
                actor_critic[key].cuda()

        agent = {}
        for ac_key in actor_critic:
            if args.algo == 'a2c':
                agent[ac_key] = algo.A2C_ACKTR(
                    actor_critic[ac_key],
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    alpha=args.alpha,
                    max_grad_norm=args.max_grad_norm,
                )
            elif args.algo == 'ppo':
                agent[ac_key] = algo.PPO(
                    actor_critic[ac_key],
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.value_loss_coef,
                    args.entropy_coef,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm,
                )
            elif args.algo == 'acktr':
                agent[ac_key] = algo.A2C_ACKTR(
                    actor_critic[ac_key],
                    args.value_loss_coef,
                    args.entropy_coef,
                    acktr=True,
                )

        current_obs = torch.zeros(args.num_processes, *obs_shape)

        obs = envs.reset()
        update_current_obs(obs)
        for obs_key in rollouts:
            rollouts[obs_key].observations[0].copy_(current_obs)

        episode_reward_raw = 0.0
        final_reward_raw = 0.0

        if args.cuda:
            current_obs = current_obs.cuda()
            for rol_key in rollouts:
                rollouts[rol_key].cuda()

        # Try to load a checkpoint
        try:
            num_trained_frames = np.load(args.save_dir +
                                         '/num_trained_frames.npy')[0]
            try:
                for save_key in actor_critic:
                    actor_critic[save_key].load_state_dict(
                        torch.load(args.save_dir + '/trained_learner_' +
                                   save_key + '.pth'))
                print('Load learner previous point: Succeeded')
            except Exception as e:
                print('Load learner previous point: Failed')
        except Exception as e:
            num_trained_frames = 0
        print('Learner has been trained to step: ' + str(num_trained_frames))

        start = time.time()
        j = 0
        onehot_mem = {}
        reward_mem = {}
        if args.num_hierarchy > 1:
            update_flag = np.zeros(args.num_hierarchy - 1, dtype=np.uint8)
        else:
            update_flag = np.zeros(1, dtype=np.uint8)
        step_count = 0
        value = {}
        next_value = {}
        action = {}
        action_log_prob = {}
        states = {}

        while True:
            if num_trained_frames > args.num_frames:
                break

            step_count = 0
            for step in range(args.num_steps):
                if step_count % update_interval[-1] == 0:
                    with torch.no_grad():
                        value['top'], action['top'], action_log_prob['top'], states['top'] = actor_critic['top'].act(
                            rollouts['top'].observations[update_flag[-1]],
                            rollouts['top'].one_hot[update_flag[-1]],
                            rollouts['top'].states[update_flag[-1]],
                            rollouts['top'].masks[update_flag[-1]],
                        )
                    update_flag[-1] += 1
                    onehot_mem[str(args.num_hierarchy - 1)] = get_onehot(
                        num_subpolicy[-1], action['top'])
                    onehot_mem[str(args.num_hierarchy)] = get_onehot(1, 0)
                if len(update_interval) > 1:
                    for interval_id in range(len(update_interval) - 1):
                        if step_count % update_interval[interval_id] == 0:
                            with torch.no_grad():
                                value[str(interval_id + 1)], action[str(interval_id + 1)], \
                                    action_log_prob[str(interval_id + 1)], states[str(interval_id + 1)] = \
                                    actor_critic[str(interval_id + 1)].act(
                                        rollouts[str(interval_id + 1)].observations[update_flag[interval_id]],
                                        rollouts[str(interval_id + 1)].one_hot[update_flag[-1]],
                                        rollouts[str(interval_id + 1)].states[update_flag[interval_id]],
                                        rollouts[str(interval_id + 1)].masks[update_flag[interval_id]],
                                    )
                            update_flag[interval_id] += 1
                            onehot_mem[str(interval_id + 1)] = get_onehot(
                                num_subpolicy[interval_id],
                                action[str(interval_id + 1)])

                # Sample actions
                if args.num_hierarchy > 1:
                    with torch.no_grad():
                        value['0'], action['0'], action_log_prob['0'], states['0'] = actor_critic['0'].act(
                            rollouts['0'].observations[step],
                            rollouts['0'].one_hot[step],
                            rollouts['0'].states[step],
                            rollouts['0'].masks[step],
                        )
                    cpu_actions = action['0'].squeeze(1).cpu().numpy()
                else:
                    cpu_actions = action['top'].squeeze(1).cpu().numpy()

                # Observe reward and next obs
                obs, reward_raw, done, info = envs.step(cpu_actions)
                for reward_id in range(args.num_hierarchy - 1):
                    try:
                        reward_mem[str(reward_id)] += [reward_raw[0]]
                    except KeyError:
                        reward_mem[str(reward_id)] = [reward_raw[0]]
                episode_reward_raw += reward_raw[0]
                if done[0]:
                    final_reward_raw = episode_reward_raw
                    episode_reward_raw = 0.0
                reward = np.sign(reward_raw)
                reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                         1)).float()

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                if args.cuda:
                    masks = masks.cuda()

                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs)
                if args.num_hierarchy > 1:
                    rollouts['0'].insert(current_obs, states['0'],
                                         action['0'], onehot_mem['1'],
                                         action_log_prob['0'], value['0'],
                                         reward, masks)
                if step_count % update_interval[-1] == 0:
                    if args.num_hierarchy > 1:
                        reward_mean = np.mean(
                            np.array(reward_mem[str(args.num_hierarchy - 2)]))
                        reward_mean = torch.from_numpy(np.ones(1) *
                                                       reward_mean).float()
                        rollouts['top'].insert(
                            current_obs, states['top'], action['top'],
                            onehot_mem[str(args.num_hierarchy)],
                            action_log_prob['top'], value['top'], reward_mean,
                            masks)
                        reward_mem[str(args.num_hierarchy - 2)] = []
                    else:
                        rollouts['top'].insert(
                            current_obs, states['top'], action['top'],
                            onehot_mem[str(args.num_hierarchy)],
                            action_log_prob['top'], value['top'], reward,
                            masks)
                if len(update_interval) > 1:
                    for interval_id in range(len(update_interval) - 1):
                        if step_count % update_interval[interval_id] == 0 or done[0]:
                            reward_mean = np.mean(
                                np.array(reward_mem[str(interval_id)]))
                            reward_mean = torch.from_numpy(
                                np.ones(1) * reward_mean).float()
                            rollouts[str(interval_id + 1)].insert(
                                current_obs, states[str(interval_id + 1)],
                                action[str(interval_id + 1)],
                                onehot_mem[str(interval_id + 2)],
                                action_log_prob[str(interval_id + 1)],
                                value[str(interval_id + 1)], reward_mean,
                                masks)
                            reward_mem[str(interval_id)] = []
                step_count += 1

            if args.num_hierarchy > 1:
                with torch.no_grad():
                    next_value['0'] = actor_critic['0'].get_value(
                        rollouts['0'].observations[-1],
                        rollouts['0'].one_hot[-1],
                        rollouts['0'].states[-1],
                        rollouts['0'].masks[-1],
                    ).detach()
                rollouts['0'].compute_returns(next_value['0'], args.use_gae,
                                              args.gamma, args.tau)
                value_loss, action_loss, dist_entropy = agent['0'].update(
                    rollouts['0'], add_onehot=True)
                rollouts['0'].after_update()

            with torch.no_grad():
                next_value['top'] = actor_critic['top'].get_value(
                    rollouts['top'].observations[-1],
                    rollouts['top'].one_hot[-1],
                    rollouts['top'].states[-1],
                    rollouts['top'].masks[-1],
                ).detach()
            rollouts['top'].compute_returns(next_value['top'], args.use_gae,
                                            args.gamma, args.tau)
            if args.num_hierarchy > 1:
                _, _, _ = agent['top'].update(rollouts['top'],
                                              add_onehot=True)
            else:
                value_loss, action_loss, dist_entropy = agent['top'].update(
                    rollouts['top'], add_onehot=True)
            rollouts['top'].after_update()
            update_flag[-1] = 0

            if len(update_interval) > 1:
                for interval_id in range(len(update_interval) - 1):
                    with torch.no_grad():
                        next_value[str(interval_id + 1)] = actor_critic[str(
                            interval_id + 1)].get_value(
                                rollouts[str(interval_id + 1)].observations[-1],
                                rollouts[str(interval_id + 1)].one_hot[-1],
                                rollouts[str(interval_id + 1)].states[-1],
                                rollouts[str(interval_id + 1)].masks[-1],
                            ).detach()
                    rollouts[str(interval_id + 1)].compute_returns(
                        next_value[str(interval_id + 1)], args.use_gae,
                        args.gamma, args.tau)
                    _, _, _ = agent[str(interval_id + 1)].update(
                        rollouts[str(interval_id + 1)], add_onehot=True)
                    rollouts[str(interval_id + 1)].after_update()
                    update_flag[interval_id] = 0
            num_trained_frames += (args.num_steps * args.num_processes)
            j += 1

            # Save checkpoint
            if j % args.save_interval == 0 and args.save_dir != "":
                try:
                    np.save(
                        args.save_dir + '/num_trained_frames.npy',
                        np.array([num_trained_frames]),
                    )
                    for key_store in actor_critic:
                        actor_critic[key_store].save_model(
                            save_path=args.save_dir)
                except Exception as e:
                    print("Save checkpoint failed")

            # Print info
            if j % args.log_interval == 0:
                end = time.time()
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                print(
                    "[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours"
                    .format(
                        num_trained_frames, args.num_frames,
                        int(num_trained_frames / (end - start)),
                        final_reward_raw,
                        (end - start) / num_trained_frames *
                        (args.num_frames - num_trained_frames) / 60.0 / 60.0))

            # Visualize results
            if args.vis and j % args.vis_interval == 0:
                # We use tensorboard since it is better for comparing plots.
                summary = tf.Summary()
                summary.value.add(
                    tag='final_reward_raw',
                    simple_value=final_reward_raw,
                )
                summary.value.add(
                    tag='value_loss',
                    simple_value=value_loss,
                )
                summary.value.add(
                    tag='action_loss',
                    simple_value=action_loss,
                )
                summary.value.add(
                    tag='dist_entropy',
                    simple_value=dist_entropy,
                )
                summary_writer.add_summary(summary, num_trained_frames)
                summary_writer.flush()
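# A toy sketch of the hierarchical cadence used above: the top-level policy
# picks a subpolicy index every `update_interval` env steps, and the low level
# acts every step conditioned on a one-hot of that index. The random policies
# below are hypothetical stand-ins for `actor_critic['top']` and
# `actor_critic['0']`.
import numpy as np

num_subpolicy, update_interval, num_steps = 4, 5, 20
rng = np.random.default_rng(0)
goal = None
for step in range(num_steps):
    if step % update_interval == 0:
        goal = rng.integers(num_subpolicy)   # top-level decision, held fixed
    one_hot = np.zeros(num_subpolicy)
    one_hot[goal] = 1.0                      # conditioning for the low level
    action = rng.integers(3)                 # low-level primitive action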
def main():
    torch.manual_seed(args_seed)
    torch.cuda.manual_seed_all(args_seed)
    device = torch.device("cuda:0" if args_cuda else "cpu")
    train_log = Log(log_name + '_train_log')
    evl_log = Log(log_name + '_evaluation_log')
    torch.set_num_threads(1)

    envs = make_vec_envs(args_env_name, args_seed, args_num_processes,
                         device, gamma=args_gamma)
    if is_limit_action:
        envs.action_space.n = 3
    print('Number of Actions:', envs.action_space.n)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args_recurrent_policy})
    actor_critic.to(device)

    agent = PPO(actor_critic,
                args_clip_param,
                args_ppo_epoch,
                args_num_mini_batch,
                args_value_loss_coef,
                args_entropy_coef,
                lr=args_lr,
                eps=args_eps,
                max_grad_norm=args_max_grad_norm,
                use_clipped_value_loss=args_use_clipped_value_loss)

    rollouts = RolloutStorage(args_num_steps, args_num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    num_updates = int(args_num_env_steps) // args_num_steps // args_num_processes

    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):
        if args_use_linear_lr_decay:
            # Decrease the learning rate linearly
            update_linear_schedule(agent.optimizer, j, num_updates, args_lr)

        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            # When the action set is limited, the sampled indices are shifted
            # to skip the first action in the full space.
            if is_limit_action:
                obs, reward, done, infos = envs.step(action + 1)
            else:
                obs, reward, done, infos = envs.step(action)
            sum_re += reward

            if any(done):
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        sum_re[i] *= 0

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args_use_gae, args_gamma,
                                 args_gae_lambda)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = "E {}, N_steps {}, FPS {} mean/median" \
                        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
                        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
                            j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards),
                            np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss)
            train_log.log(logstring)

        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ob_rms = get_vec_normalize(envs).ob_rms
            ev_result = evaluate(actor_critic, ob_rms, args_env_name,
                                 args_seed, args_num_processes, device,
                                 is_limit_action=is_limit_action)
            ev_log_string = 'steps:' + str(total_num_steps) + '. ' + ev_result
            evl_log.log(ev_log_string)
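# `update_linear_schedule` above is assumed to anneal the optimizer's learning
# rate linearly to zero over training; a minimal sketch consistent with the
# call site (agent.optimizer, j, num_updates, args_lr):
def update_linear_schedule_sketch(optimizer, epoch, total_num_epochs,
                                  initial_lr):
    # Fraction of training completed scales the decay.
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr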
def main():
    global args
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    args.vis = not args.no_vis

    # Set options
    if args.path_opt is not None:
        with open(args.path_opt, 'r') as handle:
            options = yaml.load(handle)
    if args.vis_path_opt is not None:
        with open(args.vis_path_opt, 'r') as handle:
            vis_options = yaml.load(handle)
    print('## args')
    pprint(vars(args))
    print('## options')
    pprint(options)

    # Copy alg_%s and optim_%s into alg and optim depending on command-line options
    options['use_cuda'] = args.cuda
    options['trial'] = args.trial
    options['alg'] = options['alg_%s' % args.algo]
    options['optim'] = options['optim_%s' % args.algo]
    alg_opt = options['alg']
    alg_opt['algo'] = args.algo
    model_opt = options['model']
    env_opt = options['env']
    env_opt['env-name'] = args.env_name
    log_opt = options['logs']
    optim_opt = options['optim']
    model_opt['time_scale'] = env_opt['time_scale']
    if model_opt['mode'] in ['baselinewtheta', 'phasewtheta']:
        model_opt['theta_space_mode'] = env_opt['theta_space_mode']
        model_opt['theta_sz'] = env_opt['theta_sz']
    elif model_opt['mode'] in ['baseline_lowlevel', 'phase_lowlevel']:
        model_opt['theta_space_mode'] = env_opt['theta_space_mode']

    # Check asserts
    assert (model_opt['mode'] in [
        'baseline', 'baseline_reverse', 'phasesimple', 'phasewstate',
        'baselinewtheta', 'phasewtheta', 'baseline_lowlevel',
        'phase_lowlevel', 'interpolate', 'cyclic', 'maze_baseline',
        'maze_baseline_wphase'
    ])
    assert (args.algo in ['a2c', 'ppo', 'acktr'])
    if model_opt['recurrent_policy']:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    # Set seed - just make the seed the trial number
    seed = args.trial
    torch.manual_seed(seed)
    if args.cuda:
        torch.cuda.manual_seed(seed)

    # Initialization
    num_updates = int(optim_opt['num_frames']) // alg_opt['num_steps'] \
        // alg_opt['num_processes']
    torch.set_num_threads(1)

    # Print warning
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    # Set logging / load previous checkpoint
    logpath = os.path.join(log_opt['log_base'], model_opt['mode'],
                           log_opt['exp_name'], args.algo, args.env_name,
                           'trial%d' % args.trial)
    if len(args.resume) > 0:
        assert (os.path.isfile(os.path.join(logpath, args.resume)))
        ckpt = torch.load(os.path.join(logpath, 'ckpt.pth.tar'))
        start_update = ckpt['update_count']
    else:
        # Make directory, check before overwriting
        if os.path.isdir(logpath):
            if click.confirm(
                    'Logs directory already exists in {}. Erase?'.format(logpath),
                    default=False):
                os.system('rm -rf ' + logpath)
            else:
                return
        os.system('mkdir -p ' + logpath)
        start_update = 0
    # Save options and args
    with open(os.path.join(logpath, os.path.basename(args.path_opt)),
              'w') as f:
        yaml.dump(options, f, default_flow_style=False)
    with open(os.path.join(logpath, 'args.yaml'), 'w') as f:
        yaml.dump(vars(args), f, default_flow_style=False)

    # Save git info as well
    os.system('git status > %s' % os.path.join(logpath, 'git_status.txt'))
    os.system('git diff > %s' % os.path.join(logpath, 'git_diff.txt'))
    os.system('git show > %s' % os.path.join(logpath, 'git_show.txt'))

    # Set up plotting dashboard
    dashboard = Dashboard(options,
                          vis_options,
                          logpath,
                          vis=args.vis,
                          port=args.port)

    # If interpolate mode, choose states
    if options['model']['mode'] == 'phase_lowlevel' and \
            options['env']['theta_space_mode'] == 'pretrain_interp':
        all_states = torch.load(env_opt['saved_state_file'])
        s1 = random.choice(all_states)
        s2 = random.choice(all_states)
        fixed_states = [s1, s2]
    elif model_opt['mode'] == 'interpolate':
        all_states = torch.load(env_opt['saved_state_file'])
        s1 = all_states[env_opt['s1_ind']]
        s2 = all_states[env_opt['s2_ind']]
        fixed_states = [s1, s2]
    else:
        fixed_states = None

    # Create environments
    dummy_env = make_env(args.env_name, seed, 0, logpath, options,
                         args.verbose)
    dummy_env = dummy_env()
    envs = [
        make_env(args.env_name, seed, i, logpath, options, args.verbose,
                 fixed_states) for i in range(alg_opt['num_processes'])
    ]
    if alg_opt['num_processes'] > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Get theta_sz for models (if applicable)
    dummy_env.reset()
    if model_opt['mode'] == 'baseline_lowlevel':
        model_opt['theta_sz'] = dummy_env.env.theta_sz
    elif model_opt['mode'] == 'phase_lowlevel':
        model_opt['theta_sz'] = dummy_env.env.env.theta_sz
    if 'theta_sz' in model_opt:
        env_opt['theta_sz'] = model_opt['theta_sz']

    # Get observation shape
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * env_opt['num_stack'], *obs_shape[1:])

    # Do vec normalize, but mask out what we don't want altered
    if len(envs.observation_space.shape) == 1:
        ignore_mask = np.zeros(envs.observation_space.shape)
        if env_opt['add_timestep']:
            ignore_mask[-1] = 1
        if model_opt['mode'] in [
                'baselinewtheta', 'phasewtheta', 'baseline_lowlevel',
                'phase_lowlevel'
        ]:
            theta_sz = env_opt['theta_sz']
            if env_opt['add_timestep']:
                ignore_mask[-(theta_sz + 1):] = 1
            else:
                ignore_mask[-theta_sz:] = 1
        if args.finetune_baseline:
            ignore_mask = dummy_env.unwrapped._get_obs_mask()
            freeze_mask, _ = dummy_env.unwrapped._get_pro_ext_mask()
            if env_opt['add_timestep']:
                ignore_mask = np.concatenate([ignore_mask, [1]])
                freeze_mask = np.concatenate([freeze_mask, [0]])
            ignore_mask = (ignore_mask + freeze_mask > 0).astype(float)
            envs = ObservationFilter(envs,
                                     ret=alg_opt['norm_ret'],
                                     has_timestep=True,
                                     noclip=env_opt['step_plus_noclip'],
                                     ignore_mask=ignore_mask,
                                     freeze_mask=freeze_mask,
                                     time_scale=env_opt['time_scale'],
                                     gamma=env_opt['gamma'])
        else:
            envs = ObservationFilter(envs,
                                     ret=alg_opt['norm_ret'],
                                     has_timestep=env_opt['add_timestep'],
                                     noclip=env_opt['step_plus_noclip'],
                                     ignore_mask=ignore_mask,
                                     time_scale=env_opt['time_scale'],
                                     gamma=env_opt['gamma'])

    # Set up algo monitoring
    alg_filename = os.path.join(logpath, 'Alg.Monitor.csv')
    alg_f = open(alg_filename, "wt")
    alg_f.write('# Alg Logging %s\n' % json.dumps({
        "t_start": time.time(),
        'env_id': dummy_env.spec and dummy_env.spec.id,
        'mode': options['model']['mode'],
        'name': options['logs']['exp_name']
    }))
    alg_fields = ['value_loss', 'action_loss', 'dist_entropy']
    alg_logger = csv.DictWriter(alg_f, fieldnames=alg_fields)
    alg_logger.writeheader()
    alg_f.flush()

    # Create the policy network
    actor_critic = Policy(obs_shape, envs.action_space, model_opt)
    if args.cuda:
        actor_critic.cuda()

    # Create the agent
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               alg_opt['value_loss_coef'],
                               alg_opt['entropy_coef'],
                               lr=optim_opt['lr'],
                               eps=optim_opt['eps'],
                               alpha=optim_opt['alpha'],
                               max_grad_norm=optim_opt['max_grad_norm'])
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         alg_opt['clip_param'],
                         alg_opt['ppo_epoch'],
                         alg_opt['num_mini_batch'],
                         alg_opt['value_loss_coef'],
                         alg_opt['entropy_coef'],
                         lr=optim_opt['lr'],
                         eps=optim_opt['eps'],
                         max_grad_norm=optim_opt['max_grad_norm'])
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               alg_opt['value_loss_coef'],
                               alg_opt['entropy_coef'],
                               acktr=True)
    rollouts = RolloutStorage(alg_opt['num_steps'], alg_opt['num_processes'],
                              obs_shape, envs.action_space,
                              actor_critic.state_size)
    current_obs = torch.zeros(alg_opt['num_processes'], *obs_shape)

    # Update agent with loaded checkpoint
    if len(args.resume) > 0:
        # This should update both the policy network and the optimizer
        agent.load_state_dict(ckpt['agent'])
        # Set ob_rms
        envs.ob_rms = ckpt['ob_rms']
    elif len(args.other_resume) > 0:
        ckpt = torch.load(args.other_resume)
        # This should update the policy network
        agent.actor_critic.load_state_dict(ckpt['agent']['model'])
        # Set ob_rms
        envs.ob_rms = ckpt['ob_rms']
    elif args.finetune_baseline:
        # Load the model based on the trial number
        ckpt_base = options['lowlevel']['ckpt']
        ckpt_file = ckpt_base + '/trial%d/ckpt.pth.tar' % args.trial
        ckpt = torch.load(ckpt_file)
        # Make an "input mask" that tells the model which inputs were the
        # same as before and should be copied
        oldinput_mask, _ = dummy_env.unwrapped._get_pro_ext_mask()
        # This should update the policy network
        agent.actor_critic.load_state_dict_special(ckpt['agent']['model'],
                                                   oldinput_mask)
        # Set ob_rms
        old_rms = ckpt['ob_rms']
        old_size = old_rms.mean.size
        if env_opt['add_timestep']:
            old_size -= 1
        # Only copy the pro state part of it
        envs.ob_rms.mean[:old_size] = old_rms.mean[:old_size]
        envs.ob_rms.var[:old_size] = old_rms.var[:old_size]

    # Inline define our helper function for updating obs
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if env_opt['num_stack'] > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    # Reset our env and rollouts
    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)
    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([alg_opt['num_processes'], 1])
    final_rewards = torch.zeros([alg_opt['num_processes'], 1])

    # Update loop
    start = time.time()
    for j in range(start_update, num_updates):
        for step in range(alg_opt['num_steps']):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        # Update model and rollouts
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()
        rollouts.compute_returns(next_value, alg_opt['use_gae'],
                                 env_opt['gamma'], alg_opt['gae_tau'])
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # Add algo updates here
        alg_info = {}
        alg_info['value_loss'] = value_loss
        alg_info['action_loss'] = action_loss
        alg_info['dist_entropy'] = dist_entropy
        alg_logger.writerow(alg_info)
        alg_f.flush()

        # Save checkpoints
        total_num_steps = (j + 1) * alg_opt['num_processes'] * alg_opt['num_steps']
        # save_interval = log_opt['save_interval'] * alg_opt['log_mult']
        save_interval = 100
        if j % save_interval == 0:
            # Save all of our important information
            save_checkpoint(logpath,
                            agent,
                            envs,
                            j,
                            total_num_steps,
                            args.save_every,
                            final=False)

        # Print log
        log_interval = log_opt['log_interval'] * alg_opt['log_mult']
        if j % log_interval == 0:
            end = time.time()
            print(
                "{}: Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(options['logs']['exp_name'], j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy, value_loss, action_loss))

        # Do dashboard logging
        vis_interval = log_opt['vis_interval'] * alg_opt['log_mult']
        if args.vis and j % vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                dashboard.visdom_plot()
            except IOError:
                pass

    # Save final checkpoint
    save_checkpoint(logpath, agent, envs, j, total_num_steps,
                    args.save_every, final=False)

    # Close logging file
    alg_f.close()
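# A minimal sketch of the masked whitening that `ObservationFilter` is assumed
# to apply above: dimensions flagged in `ignore_mask` (e.g. the timestep or
# the theta one-hot) bypass the running-statistics normalization. The function
# name and signature are hypothetical.
import numpy as np

def masked_normalize(obs, mean, var, ignore_mask, eps=1e-8, clip=10.0):
    # Standard whitening with clipping, as in VecNormalize-style wrappers.
    norm = np.clip((obs - mean) / np.sqrt(var + eps), -clip, clip)
    # Flagged dimensions pass through untouched.
    return np.where(ignore_mask > 0, obs, norm)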
def main():
    global args
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    args.vis = not args.no_vis

    # Set options
    if args.path_opt is not None:
        with open(args.path_opt, 'r') as handle:
            options = yaml.load(handle)
    if args.vis_path_opt is not None:
        with open(args.vis_path_opt, 'r') as handle:
            vis_options = yaml.load(handle)
    print('## args')
    pprint(vars(args))
    print('## options')
    pprint(options)

    # Load the low level options
    lowlevel_optfile = options['lowlevel']['optfile']
    with open(lowlevel_optfile, 'r') as handle:
        ll_opt = yaml.load(handle)

    # Whether we should set the low level policy to be deterministic or not
    ll_deterministic = options['lowlevel']['deterministic']

    # Copy alg_%s and optim_%s into alg and optim depending on command-line options
    options['use_cuda'] = args.cuda
    options['trial'] = args.trial
    options['alg'] = options['alg_%s' % args.algo]
    options['optim'] = options['optim_%s' % args.algo]
    alg_opt = options['alg']
    alg_opt['algo'] = args.algo
    model_opt = options['model']
    env_opt = options['env']
    env_opt['env-name'] = args.env_name
    log_opt = options['logs']
    optim_opt = options['optim']
    # Save low level options in the option file (for logging purposes)
    options['lowlevel_opt'] = ll_opt

    # Pass necessary values in ll_opt
    assert (ll_opt['model']['mode'] in ['baseline_lowlevel', 'phase_lowlevel'])
    ll_opt['model']['theta_space_mode'] = ll_opt['env']['theta_space_mode']
    ll_opt['model']['time_scale'] = ll_opt['env']['time_scale']

    # If in many-module mode, load the low level policies we want
    if model_opt['mode'] == 'hierarchical_many':
        # Check asserts
        theta_obs_mode = ll_opt['env']['theta_obs_mode']
        theta_space_mode = ll_opt['env']['theta_space_mode']
        assert (theta_space_mode in [
            'pretrain_interp', 'pretrain_any', 'pretrain_any_far',
            'pretrain_any_fromstart'
        ])
        assert (theta_obs_mode == 'pretrain')

        # Get the theta size
        theta_sz = options['lowlevel']['num_load']
        ckpt_base = options['lowlevel']['ckpt']

        # Load checkpoints
        lowlevel_ckpts = []
        for ll_ind in range(theta_sz):
            if args.change_ll_offset:
                ll_offset = theta_sz * args.trial
            else:
                ll_offset = 0
            lowlevel_ckpt_file = ckpt_base + '/trial%d/ckpt.pth.tar' % (
                ll_ind + ll_offset)
            assert (os.path.isfile(lowlevel_ckpt_file))
            lowlevel_ckpts.append(torch.load(lowlevel_ckpt_file))
    # Otherwise it's one low level policy to load
    else:
        # Get theta_sz for the low level model
        theta_obs_mode = ll_opt['env']['theta_obs_mode']
        theta_space_mode = ll_opt['env']['theta_space_mode']
        assert (theta_obs_mode in ['ind', 'vector'])
        if theta_obs_mode == 'ind':
            if theta_space_mode == 'forward':
                theta_sz = 1
            elif theta_space_mode == 'simple_four':
                theta_sz = 4
            elif theta_space_mode == 'simple_eight':
                theta_sz = 8
            elif theta_space_mode == 'k_theta':
                theta_sz = ll_opt['env']['num_theta']
            else:
                raise NotImplementedError
        elif theta_obs_mode == 'vector':
            theta_sz = 2
        else:
            raise NotImplementedError
        ll_opt['model']['theta_sz'] = theta_sz
        ll_opt['env']['theta_sz'] = theta_sz

        # Load the low level policy params
        lowlevel_ckpt = options['lowlevel']['ckpt']
        assert (os.path.isfile(lowlevel_ckpt))
        lowlevel_ckpt = torch.load(lowlevel_ckpt)
    hl_action_space = spaces.Discrete(theta_sz)

    # Check asserts
    assert (args.algo in ['a2c', 'ppo', 'acktr', 'dqn'])
    assert (optim_opt['hierarchical_mode'] in
            ['train_highlevel', 'train_both'])
    if model_opt['recurrent_policy']:
        assert args.algo in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'
    assert (model_opt['mode'] in ['hierarchical', 'hierarchical_many'])

    # Set seed - just make the seed the trial number
    seed = args.trial + 1000  # Make it different than the low level seed
    torch.manual_seed(seed)
    if args.cuda:
        torch.cuda.manual_seed(seed)

    # Initialization
    num_updates = int(optim_opt['num_frames']) // alg_opt['num_steps'] \
        // alg_opt['num_processes'] // optim_opt['num_ll_steps']
    torch.set_num_threads(1)

    # Print warning
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    # Set logging / load previous checkpoint
    logpath = os.path.join(log_opt['log_base'], model_opt['mode'],
                           log_opt['exp_name'], args.algo, args.env_name,
                           'trial%d' % args.trial)
    if len(args.resume) > 0:
        assert (os.path.isfile(os.path.join(logpath, args.resume)))
        ckpt = torch.load(os.path.join(logpath, 'ckpt.pth.tar'))
        start_update = ckpt['update_count']
    else:
        # Make directory, check before overwriting
        if os.path.isdir(logpath):
            if click.confirm(
                    'Logs directory already exists in {}. Erase?'.format(logpath),
                    default=False):
                os.system('rm -rf ' + logpath)
            else:
                return
        os.system('mkdir -p ' + logpath)
        start_update = 0

    # Save options and args
    with open(os.path.join(logpath, os.path.basename(args.path_opt)),
              'w') as f:
        yaml.dump(options, f, default_flow_style=False)
    with open(os.path.join(logpath, 'args.yaml'), 'w') as f:
        yaml.dump(vars(args), f, default_flow_style=False)

    # Save git info as well
    os.system('git status > %s' % os.path.join(logpath, 'git_status.txt'))
    os.system('git diff > %s' % os.path.join(logpath, 'git_diff.txt'))
    os.system('git show > %s' % os.path.join(logpath, 'git_show.txt'))

    # Set up plotting dashboard
    dashboard = Dashboard(options,
                          vis_options,
                          logpath,
                          vis=args.vis,
                          port=args.port)

    # Create environments
    envs = [
        make_env(args.env_name, seed, i, logpath, options, args.verbose)
        for i in range(alg_opt['num_processes'])
    ]
    if alg_opt['num_processes'] > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Check if we use timestep in the low level
    if 'baseline' in ll_opt['model']['mode']:
        add_timestep = False
    elif 'phase' in ll_opt['model']['mode']:
        add_timestep = True
    else:
        raise NotImplementedError

    # Get shapes
    dummy_env = make_env(args.env_name, seed, 0, logpath, options,
                         args.verbose)
    dummy_env = dummy_env()
    s_pro_dummy = dummy_env.unwrapped._get_pro_obs()
    s_ext_dummy = dummy_env.unwrapped._get_ext_obs()
    if add_timestep:
        ll_obs_shape = (s_pro_dummy.shape[0] + theta_sz + 1, )
        ll_raw_obs_shape = (s_pro_dummy.shape[0] + 1, )
    else:
        ll_obs_shape = (s_pro_dummy.shape[0] + theta_sz, )
        ll_raw_obs_shape = (s_pro_dummy.shape[0], )
    ll_obs_shape = (ll_obs_shape[0] * env_opt['num_stack'], *ll_obs_shape[1:])
    hl_obs_shape = (s_ext_dummy.shape[0], )
    hl_obs_shape = (hl_obs_shape[0] * env_opt['num_stack'], *hl_obs_shape[1:])

    # Do vec normalize, but mask out what we don't want altered.
    # Also freeze all of the low level obs.
    ignore_mask = dummy_env.env._get_obs_mask()
    freeze_mask, _ = dummy_env.unwrapped._get_pro_ext_mask()
    freeze_mask = np.concatenate([freeze_mask, [0]])
    if ('normalize' in env_opt and not env_opt['normalize']) or args.algo == 'dqn':
        ignore_mask = 1 - freeze_mask
    if model_opt['mode'] == 'hierarchical_many':
        # Actually ignore both ignored values and the low level values.
        # That filtering will happen later.
        ignore_mask = (ignore_mask + freeze_mask > 0).astype(float)
        envs = ObservationFilter(envs,
                                 ret=alg_opt['norm_ret'],
                                 has_timestep=True,
                                 noclip=env_opt['step_plus_noclip'],
                                 ignore_mask=ignore_mask,
                                 freeze_mask=freeze_mask,
                                 time_scale=env_opt['time_scale'],
                                 gamma=env_opt['gamma'])
    else:
        envs = ObservationFilter(envs,
                                 ret=alg_opt['norm_ret'],
                                 has_timestep=True,
                                 noclip=env_opt['step_plus_noclip'],
                                 ignore_mask=ignore_mask,
                                 freeze_mask=freeze_mask,
                                 time_scale=env_opt['time_scale'],
                                 gamma=env_opt['gamma'])
    # Make our helper object for dealing with hierarchical observations
    hier_utils = HierarchyUtils(ll_obs_shape, hl_obs_shape, hl_action_space,
                                theta_sz, add_timestep)

    # Set up algo monitoring
    alg_filename = os.path.join(logpath, 'Alg.Monitor.csv')
    alg_f = open(alg_filename, "wt")
    alg_f.write('# Alg Logging %s\n' % json.dumps({
        "t_start": time.time(),
        'env_id': dummy_env.spec and dummy_env.spec.id,
        'mode': options['model']['mode'],
        'name': options['logs']['exp_name']
    }))
    alg_fields = ['value_loss', 'action_loss', 'dist_entropy']
    alg_logger = csv.DictWriter(alg_f, fieldnames=alg_fields)
    alg_logger.writeheader()
    alg_f.flush()

    ll_alg_filename = os.path.join(logpath, 'AlgLL.Monitor.csv')
    ll_alg_f = open(ll_alg_filename, "wt")
    ll_alg_f.write('# Alg Logging LL %s\n' % json.dumps({
        "t_start": time.time(),
        'env_id': dummy_env.spec and dummy_env.spec.id,
        'mode': options['model']['mode'],
        'name': options['logs']['exp_name']
    }))
    ll_alg_fields = ['value_loss', 'action_loss', 'dist_entropy']
    ll_alg_logger = csv.DictWriter(ll_alg_f, fieldnames=ll_alg_fields)
    ll_alg_logger.writeheader()
    ll_alg_f.flush()

    # Create the policy networks
    ll_action_space = envs.action_space
    if args.algo == 'dqn':
        model_opt['eps_start'] = optim_opt['eps_start']
        model_opt['eps_end'] = optim_opt['eps_end']
        model_opt['eps_decay'] = optim_opt['eps_decay']
        hl_policy = DQNPolicy(hl_obs_shape, hl_action_space, model_opt)
    else:
        hl_policy = Policy(hl_obs_shape, hl_action_space, model_opt)
    if model_opt['mode'] == 'hierarchical_many':
        ll_policy = ModularPolicy(ll_raw_obs_shape, ll_action_space, theta_sz,
                                  ll_opt)
    else:
        ll_policy = Policy(ll_obs_shape, ll_action_space, ll_opt['model'])
    # Load the previous ones here?
if args.cuda: hl_policy.cuda() ll_policy.cuda() # Create the high level agent if args.algo == 'a2c': hl_agent = algo.A2C_ACKTR(hl_policy, alg_opt['value_loss_coef'], alg_opt['entropy_coef'], lr=optim_opt['lr'], eps=optim_opt['eps'], alpha=optim_opt['alpha'], max_grad_norm=optim_opt['max_grad_norm']) elif args.algo == 'ppo': hl_agent = algo.PPO(hl_policy, alg_opt['clip_param'], alg_opt['ppo_epoch'], alg_opt['num_mini_batch'], alg_opt['value_loss_coef'], alg_opt['entropy_coef'], lr=optim_opt['lr'], eps=optim_opt['eps'], max_grad_norm=optim_opt['max_grad_norm']) elif args.algo == 'acktr': hl_agent = algo.A2C_ACKTR(hl_policy, alg_opt['value_loss_coef'], alg_opt['entropy_coef'], acktr=True) elif args.algo == 'dqn': hl_agent = algo.DQN(hl_policy, env_opt['gamma'], batch_size=alg_opt['batch_size'], target_update=alg_opt['target_update'], mem_capacity=alg_opt['mem_capacity'], lr=optim_opt['lr'], eps=optim_opt['eps'], max_grad_norm=optim_opt['max_grad_norm']) # Create the low level agent # If only training high level, make dummy agent (just does passthrough, doesn't change anything) if optim_opt['hierarchical_mode'] == 'train_highlevel': ll_agent = algo.Passthrough(ll_policy) elif optim_opt['hierarchical_mode'] == 'train_both': if args.algo == 'a2c': ll_agent = algo.A2C_ACKTR(ll_policy, alg_opt['value_loss_coef'], alg_opt['entropy_coef'], lr=optim_opt['ll_lr'], eps=optim_opt['eps'], alpha=optim_opt['alpha'], max_grad_norm=optim_opt['max_grad_norm']) elif args.algo == 'ppo': ll_agent = algo.PPO(ll_policy, alg_opt['clip_param'], alg_opt['ll_ppo_epoch'], alg_opt['num_mini_batch'], alg_opt['value_loss_coef'], alg_opt['entropy_coef'], lr=optim_opt['ll_lr'], eps=optim_opt['eps'], max_grad_norm=optim_opt['max_grad_norm']) elif args.algo == 'acktr': ll_agent = algo.A2C_ACKTR(ll_policy, alg_opt['value_loss_coef'], alg_opt['entropy_coef'], acktr=True) else: raise NotImplementedError # Make the rollout structures hl_rollouts = RolloutStorage(alg_opt['num_steps'], alg_opt['num_processes'], hl_obs_shape, hl_action_space, hl_policy.state_size) ll_rollouts = MaskingRolloutStorage(alg_opt['num_steps'], alg_opt['num_processes'], ll_obs_shape, ll_action_space, ll_policy.state_size) hl_current_obs = torch.zeros(alg_opt['num_processes'], *hl_obs_shape) ll_current_obs = torch.zeros(alg_opt['num_processes'], *ll_obs_shape) # Helper functions to update the current obs def update_hl_current_obs(obs): shape_dim0 = hl_obs_shape[0] obs = torch.from_numpy(obs).float() if env_opt['num_stack'] > 1: hl_current_obs[:, :-shape_dim0] = hl_current_obs[:, shape_dim0:] hl_current_obs[:, -shape_dim0:] = obs def update_ll_current_obs(obs): shape_dim0 = ll_obs_shape[0] obs = torch.from_numpy(obs).float() if env_opt['num_stack'] > 1: ll_current_obs[:, :-shape_dim0] = ll_current_obs[:, shape_dim0:] ll_current_obs[:, -shape_dim0:] = obs # Update agent with loaded checkpoint if len(args.resume) > 0: # This should update both the policy network and the optimizer ll_agent.load_state_dict(ckpt['ll_agent']) hl_agent.load_state_dict(ckpt['hl_agent']) # Set ob_rms envs.ob_rms = ckpt['ob_rms'] else: if model_opt['mode'] == 'hierarchical_many': ll_agent.load_pretrained_policies(lowlevel_ckpts) else: # Load low level agent ll_agent.load_state_dict(lowlevel_ckpt['agent']) # Load ob_rms from low level (but need to reshape it) old_rms = lowlevel_ckpt['ob_rms'] assert (old_rms.mean.shape[0] == ll_obs_shape[0]) # Only copy the pro state part of it (not including thetas or count) envs.ob_rms.mean[:s_pro_dummy. 
shape[0]] = old_rms.mean[:s_pro_dummy.shape[0]] envs.ob_rms.var[:s_pro_dummy.shape[0]] = old_rms.var[:s_pro_dummy. shape[0]] # Reset our env and rollouts raw_obs = envs.reset() hl_obs, raw_ll_obs, step_counts = hier_utils.seperate_obs(raw_obs) ll_obs = hier_utils.placeholder_theta(raw_ll_obs, step_counts) update_hl_current_obs(hl_obs) update_ll_current_obs(ll_obs) hl_rollouts.observations[0].copy_(hl_current_obs) ll_rollouts.observations[0].copy_(ll_current_obs) ll_rollouts.recent_obs.copy_(ll_current_obs) if args.cuda: hl_current_obs = hl_current_obs.cuda() ll_current_obs = ll_current_obs.cuda() hl_rollouts.cuda() ll_rollouts.cuda() # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([alg_opt['num_processes'], 1]) final_rewards = torch.zeros([alg_opt['num_processes'], 1]) # Update loop start = time.time() for j in range(start_update, num_updates): for step in range(alg_opt['num_steps']): # Step through high level action start_time = time.time() with torch.no_grad(): hl_value, hl_action, hl_action_log_prob, hl_states = hl_policy.act( hl_rollouts.observations[step], hl_rollouts.states[step], hl_rollouts.masks[step]) hl_cpu_actions = hl_action.squeeze(1).cpu().numpy() if args.profile: print('hl act %f' % (time.time() - start_time)) # Get values to use for Q learning hl_state_dqn = hl_rollouts.observations[step] hl_action_dqn = hl_action # Update last ll observation with new theta for proc in range(alg_opt['num_processes']): # Update last observations in memory last_obs = ll_rollouts.observations[ll_rollouts.steps[proc], proc] if hier_utils.has_placeholder(last_obs): new_last_obs = hier_utils.update_theta( last_obs, hl_cpu_actions[proc]) ll_rollouts.observations[ll_rollouts.steps[proc], proc].copy_(new_last_obs) # Update most recent observations (not necessarily the same) assert (hier_utils.has_placeholder( ll_rollouts.recent_obs[proc])) new_last_obs = hier_utils.update_theta( ll_rollouts.recent_obs[proc], hl_cpu_actions[proc]) ll_rollouts.recent_obs[proc].copy_(new_last_obs) assert (ll_rollouts.observations.max().item() < float('inf') and ll_rollouts.recent_obs.max().item() < float('inf')) # Given high level action, step through the low level actions death_step_mask = np.ones([alg_opt['num_processes'], 1]) # 1 means still alive, 0 means dead hl_reward = torch.zeros([alg_opt['num_processes'], 1]) hl_obs = [None for i in range(alg_opt['num_processes'])] for ll_step in range(optim_opt['num_ll_steps']): # Sample actions start_time = time.time() with torch.no_grad(): ll_value, ll_action, ll_action_log_prob, ll_states = ll_policy.act( ll_rollouts.recent_obs, ll_rollouts.recent_s, ll_rollouts.recent_masks, deterministic=ll_deterministic) ll_cpu_actions = ll_action.squeeze(1).cpu().numpy() if args.profile: print('ll act %f' % (time.time() - start_time)) # Observe reward and next obs raw_obs, ll_reward, done, info = envs.step( ll_cpu_actions, death_step_mask) raw_hl_obs, raw_ll_obs, step_counts = hier_utils.seperate_obs( raw_obs) ll_obs = [] for proc in range(alg_opt['num_processes']): if (ll_step == optim_opt['num_ll_steps'] - 1) or done[proc]: ll_obs.append( hier_utils.placeholder_theta( np.array([raw_ll_obs[proc]]), np.array([step_counts[proc]]))) else: ll_obs.append( hier_utils.append_theta( np.array([raw_ll_obs[proc]]), np.array([hl_cpu_actions[proc]]), np.array([step_counts[proc]]))) ll_obs = np.concatenate(ll_obs, 0) ll_reward = torch.from_numpy( np.expand_dims(np.stack(ll_reward), 1)).float() episode_rewards += ll_reward hl_reward += 
ll_reward # Update values for Q learning and update replay memory time.time() hl_next_state_dqn = torch.from_numpy(raw_hl_obs) hl_reward_dqn = ll_reward hl_isdone_dqn = done if args.algo == 'dqn': hl_agent.update_memory(hl_state_dqn, hl_action_dqn, hl_next_state_dqn, hl_reward_dqn, hl_isdone_dqn, death_step_mask) hl_state_dqn = hl_next_state_dqn if args.profile: print('dqn memory %f' % (time.time() - start_time)) # Update high level observations (only take most recent obs if we haven't see a done before now and thus the value is valid) for proc, raw_hl in enumerate(raw_hl_obs): if death_step_mask[proc].item() > 0: hl_obs[proc] = np.array([raw_hl]) # If done then clean the history of observations masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += ( 1 - masks ) * episode_rewards # TODO - actually not sure if I broke this logic, but this value is not used anywhere episode_rewards *= masks # TODO - I commented this out, which possibly breaks things if num_stack > 1. Fix later if necessary #if args.cuda: # masks = masks.cuda() #if current_obs.dim() == 4: # current_obs *= masks.unsqueeze(2).unsqueeze(2) #else: # current_obs *= masks # Update low level observations update_ll_current_obs(ll_obs) # Update low level rollouts ll_rollouts.insert(ll_current_obs, ll_states, ll_action, ll_action_log_prob, ll_value, ll_reward, masks, death_step_mask) # Update which ones have stepped to the end and shouldn't be updated next time in the loop death_step_mask *= masks # Update high level rollouts hl_obs = np.concatenate(hl_obs, 0) update_hl_current_obs(hl_obs) hl_rollouts.insert(hl_current_obs, hl_states, hl_action, hl_action_log_prob, hl_value, hl_reward, masks) # Check if we want to update lowlevel policy if ll_rollouts.isfull and all([ not hier_utils.has_placeholder( ll_rollouts.observations[ll_rollouts.steps[proc], proc]) for proc in range(alg_opt['num_processes']) ]): # Update low level policy assert (ll_rollouts.observations.max().item() < float('inf')) if optim_opt['hierarchical_mode'] == 'train_both': with torch.no_grad(): ll_next_value = ll_policy.get_value( ll_rollouts.observations[-1], ll_rollouts.states[-1], ll_rollouts.masks[-1]).detach() ll_rollouts.compute_returns(ll_next_value, alg_opt['use_gae'], env_opt['gamma'], alg_opt['gae_tau']) ll_value_loss, ll_action_loss, ll_dist_entropy = ll_agent.update( ll_rollouts) else: ll_value_loss = 0 ll_action_loss = 0 ll_dist_entropy = 0 ll_rollouts.after_update() # Update logger alg_info = {} alg_info['value_loss'] = ll_value_loss alg_info['action_loss'] = ll_action_loss alg_info['dist_entropy'] = ll_dist_entropy ll_alg_logger.writerow(alg_info) ll_alg_f.flush() # Update high level policy start_time = time.time() assert (hl_rollouts.observations.max().item() < float('inf')) if args.algo == 'dqn': hl_value_loss, hl_action_loss, hl_dist_entropy = hl_agent.update( alg_opt['updates_per_step'] ) # TODO - maybe log this loss properly else: with torch.no_grad(): hl_next_value = hl_policy.get_value( hl_rollouts.observations[-1], hl_rollouts.states[-1], hl_rollouts.masks[-1]).detach() hl_rollouts.compute_returns(hl_next_value, alg_opt['use_gae'], env_opt['gamma'], alg_opt['gae_tau']) hl_value_loss, hl_action_loss, hl_dist_entropy = hl_agent.update( hl_rollouts) hl_rollouts.after_update() if args.profile: print('hl update %f' % (time.time() - start_time)) # Update alg monitor for high level alg_info = {} alg_info['value_loss'] = hl_value_loss alg_info['action_loss'] = hl_action_loss 
alg_info['dist_entropy'] = hl_dist_entropy alg_logger.writerow(alg_info) alg_f.flush() # Save checkpoints total_num_steps = (j + 1) * alg_opt['num_processes'] * alg_opt[ 'num_steps'] * optim_opt['num_ll_steps'] if 'save_interval' in alg_opt: save_interval = alg_opt['save_interval'] else: save_interval = 100 if j % save_interval == 0: # Save all of our important information start_time = time.time() save_checkpoint(logpath, ll_agent, hl_agent, envs, j, total_num_steps) if args.profile: print('save checkpoint %f' % (time.time() - start_time)) # Print log log_interval = log_opt['log_interval'] * alg_opt['log_mult'] if j % log_interval == 0: end = time.time() print( "{}: Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(options['logs']['exp_name'], j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), hl_dist_entropy, hl_value_loss, hl_action_loss)) # Do dashboard logging vis_interval = log_opt['vis_interval'] * alg_opt['log_mult'] if args.vis and j % vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs dashboard.visdom_plot() except IOError: pass # Save final checkpoint save_checkpoint(logpath, ll_agent, hl_agent, envs, j, total_num_steps) # Close logging file alg_f.close() ll_alg_f.close()
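# Hedged sketch: the loop above back-patches the high-level action ("theta")
# into stored low-level observations once the high-level policy has acted,
# via HierarchyUtils. This is a minimal standalone illustration, not the
# repo's implementation; the layout [pro_state | theta slot | timestep] and
# the `theta_sz` slicing are assumptions.
import torch

def update_theta_sketch(ll_obs, theta, theta_sz, has_timestep=True):
    # ll_obs: 1D tensor assumed laid out as [pro_state | theta | (timestep)]
    end = ll_obs.shape[0] - 1 if has_timestep else ll_obs.shape[0]
    patched = ll_obs.clone()
    patched[end - theta_sz:end] = torch.as_tensor(theta, dtype=ll_obs.dtype)
    return patched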
def main(): print("#######") print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" #os.environ['CUDA_VISIBLE_DEVICES'] = "9" if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space,args.hid_size, args.feat_size,args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.use_cell: hs = HistoryCell(obs_shape[0], actor_critic.feat_size, 2*actor_critic.hidden_size, 1) ft = FutureCell(obs_shape[0], actor_critic.feat_size, 2 * actor_critic.hidden_size, 1) else: hs = History(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1) ft = Future(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1) if args.cuda: actor_critic=actor_critic.cuda() hs = hs.cuda() ft = ft.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, hs,ft,args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.hf_loss_coef,ac_lr=args.lr,hs_lr=args.lr,ft_lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, num_processes=args.num_processes, num_steps=args.num_steps, use_cell=args.use_cell, lenhs=args.lenhs,lenft=args.lenft, plan=args.plan, ac_intv=args.ac_interval, hs_intv=args.hs_interval, ft_intv=args.ft_interval ) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, feat_size=512) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() rec_x = [] rec_y = [] file = open('./rec/' + args.env_name + '_' + args.method_name + '.txt', 'w') hs_info = torch.zeros(args.num_processes, 2 * actor_critic.hidden_size).cuda() hs_ind = torch.IntTensor(args.num_processes, 1).zero_() epinfobuf = deque(maxlen=100) start_time = time.time() for j in range(num_updates): print('begin sample, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) for step in range(args.num_steps): # Sample actions with torch.no_grad(): rollouts.feat[step]=actor_critic.get_feat(rollouts.observations[step]) if 
args.use_cell: for i in range(args.num_processes): h = torch.zeros(1, 2 * actor_critic.hid_size).cuda() c = torch.zeros(1, 2 * actor_critic.hid_size).cuda() start_ind = max(hs_ind[i],step+1-args.lenhs) for ind in range(start_ind,step+1): h,c=hs(rollouts.feat[ind,i].unsqueeze(0),h,c) hs_info[i,:]=h.view(1,2*actor_critic.hid_size) del h,c gc.collect() else: for i in range(args.num_processes): start_ind = max(hs_ind[i], step + 1 - args.lenhs) hs_info[i,:]=hs(rollouts.feat[start_ind:step+1,i]) hidden_feat=actor_critic.cat(rollouts.feat[step],hs_info) value, action, action_log_prob, states = actor_critic.act( hidden_feat, rollouts.states[step]) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, infos = envs.step(cpu_actions) for info in infos: maybeepinfo = info.get('episode') if maybeepinfo: epinfobuf.extend([maybeepinfo['r']]) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) hs_ind = ((1-masks)*(step+1)+masks*hs_ind.float()).int() if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, hs_ind,states.data, action.data, action_log_prob.data, value.data, reward, masks) with torch.no_grad(): rollouts.feat[-1] = actor_critic.get_feat(rollouts.observations[-1]) if args.use_cell: for i in range(args.num_processes): h = torch.zeros(1, 2 * actor_critic.hid_size).cuda() c = torch.zeros(1, 2 * actor_critic.hid_size).cuda() start = max(hs_ind[i], step + 1 - args.lenhs) for ind in range(start, step + 1): h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c) hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size) del h,c else: for i in range(args.num_processes): start_ind = max(hs_ind[i], step + 1 - args.lenhs) hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i]) hidden_feat = actor_critic.cat(rollouts.feat[-1],hs_info) next_value = actor_critic.get_value(hidden_feat).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) rollouts.compute_ft_ind() print('begin update, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) value_loss, action_loss, dist_entropy = agent.update(rollouts) print('end update, time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)))) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps v_mean,v_median,v_min,v_max = safe(epinfobuf) print("Updates {}, num timesteps {},time {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". 
format(j, total_num_steps, time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), int(total_num_steps / (end - start_time)), v_mean, v_median, v_min, v_max, dist_entropy, value_loss, action_loss)) if not (v_mean==np.nan): rec_x.append(total_num_steps) rec_y.append(v_mean) file.write(str(total_num_steps)) file.write(' ') file.writelines(str(v_mean)) file.write('\n') if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass plot_line(rec_x, rec_y, './imgs/' + args.env_name + '_' + args.method_name + '.png', args.method_name, args.env_name, args.num_frames) file.close()
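# Hedged sketch: the sampling loop above re-runs a recurrent history cell
# over a sliding window of at most `lenhs` recent features to build hs_info.
# A minimal standalone version of that windowed recurrence; the shapes and
# the plain nn.LSTMCell stand in for the repo's HistoryCell and are
# assumptions, not its actual API.
import torch
import torch.nn as nn

def summarize_history(feats, cell, hid_size, start_ind, step, lenhs):
    # feats: [T, feat_size]; feeds feats[max(start_ind, step+1-lenhs):step+1]
    # through the cell and returns the final hidden state.
    h = torch.zeros(1, hid_size)
    c = torch.zeros(1, hid_size)
    lo = max(start_ind, step + 1 - lenhs)
    for ind in range(lo, step + 1):
        h, c = cell(feats[ind].unsqueeze(0), (h, c))
    return h

# e.g. cell = nn.LSTMCell(input_size=512, hidden_size=256)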
action_list = []
for _i in range(netD.required_code_length()):
    get_value = True if _i == netD.required_code_length() - 1 else False
    # value, action, h_state = cD.act(action, h_state, get_value=get_value)
    value, action, h_state, action_log_probs, dist_entropy = cD.act_and_evaluate(
        V(action.data), h_state, get_value=get_value)
    for cdx, _c in enumerate(action.data.squeeze(1).cpu().numpy()):
        codesD[cdx].append(_c)
    action_log_probs_list.append(action_log_probs)
    dist_entropy_list.append(dist_entropy)
    if args.ppo:
        action_list.append(action.data)
rolloutsD.insert(torch.stack(action_log_probs_list),
                 torch.stack(dist_entropy_list), value)
if args.ppo:
    rolloutsD.insert_actions(torch.stack(action_list))

for j in range(M2):
    _data = gen.__next__()
    netD.zero_grad()
    # optimizerCD.zero_grad()

    # train with real
    _data = _data.reshape(BATCH_SIZE, 3, 32, 32).transpose(0, 2, 3, 1)
    real_data = torch.stack([preprocess(item) for item in _data])
    if use_cuda:
        real_data = real_data.cuda(gpu)
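# Hedged sketch: the fragment above samples a discrete "code" one symbol at
# a time from cD, keeping per-step log-probs and entropies for a later PPO
# update. A minimal illustration of autoregressive sampling with log-prob
# bookkeeping; `logits_fn`, the zero initial action, and the fixed length
# are assumptions, not the repo's cD interface.
import torch
from torch.distributions import Categorical

def sample_code(logits_fn, h0, length):
    # logits_fn(prev_action, h) -> (logits, h_next); assumed callable.
    action = torch.zeros(1, dtype=torch.long)
    h, log_probs, actions = h0, [], []
    for _ in range(length):
        logits, h = logits_fn(action, h)
        dist = Categorical(logits=logits)
        action = dist.sample()
        log_probs.append(dist.log_prob(action))
        actions.append(action)
    return torch.stack(actions), torch.stack(log_probs)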
class PPOAgent(ResearchAgent):
    """The PPOAgent. Acts through the algorithm, not here."""

    def __init__(self, actor_critic, character=characters.Bomber, **kwargs):
        self._actor_critic = actor_critic
        super(PPOAgent, self).__init__(character, **kwargs)

    def cuda(self):
        self._actor_critic.cuda()
        if hasattr(self, "_rollout"):
            self._rollout.cuda()

    @property
    def model(self):
        return self._actor_critic

    @property
    def optimizer(self):
        return self._optimizer

    def set_eval(self):
        self._actor_critic.eval()

    def set_train(self):
        self._actor_critic.train()

    def _rollout_data(self, step, num_agent, num_agent_end=None):
        if num_agent_end is not None:
            assert num_agent_end > num_agent
            observations = Variable(
                self._rollout.observations[step, num_agent:num_agent_end])
            states = Variable(
                self._rollout.states[step, num_agent:num_agent_end])
            masks = Variable(
                self._rollout.masks[step, num_agent:num_agent_end])
        else:
            observations = Variable(
                self._rollout.observations[step, num_agent], volatile=True)
            states = Variable(self._rollout.states[step, num_agent],
                              volatile=True)
            masks = Variable(self._rollout.masks[step, num_agent],
                             volatile=True)
        return observations, states, masks

    def actor_critic_act(self, step, num_agent=0, deterministic=False):
        """Uses the actor_critic to take action.

        Args:
          step: The int timestep that we are acting.
          num_agent: Agent id that's running. Non-zero when agent has copies.

        Returns:
          See the actor_critic's act function in model.py.
        """
        # NOTE: Training uses this --> it uses act(..., deterministic=False).
        return self._actor_critic.act(*self.get_rollout_data(step, num_agent),
                                      deterministic=deterministic)

    def get_rollout_data(self, step, num_agent, num_agent_end=None):
        return self._rollout_data(step, num_agent, num_agent_end)

    def actor_critic_call(self, step, num_agent=0):
        observations, states, masks = self._rollout_data(step, num_agent)
        return self._actor_critic(observations, states, masks)[0].data

    def _evaluate_actions(self, observations, states, masks, actions):
        return self._actor_critic.evaluate_actions(observations, states,
                                                   masks, actions)

    def _optimize(self, value_loss, action_loss, dist_entropy, entropy_coef,
                  value_loss_coef, max_grad_norm, kl_loss=None, kl_factor=0,
                  only_value_loss=False, add_nonlin=False, use_is=False):
        self._optimizer.zero_grad()
        # Only update the value head (to be used when fine tuning a model
        # trained with BC without a value predictor) -- only at the beginning
        # of finetuning.
        if only_value_loss:
            loss = value_loss * value_loss_coef
            # Stop the gradients from flowing through the parameters that are
            # used to compute the actions (i.e. critic / policy head) and
            # only backprop the value loss through the value head (i.e. the
            # parameters used exclusively to predict the value).
            for p in self._actor_critic.parameters():
                p.requires_grad = False
            for p in self._actor_critic.critic_linear.parameters():
                p.requires_grad = True
            if add_nonlin:
                for p in self._actor_critic.fc_critic.parameters():
                    p.requires_grad = True
            loss.backward()
        else:
            loss = value_loss * value_loss_coef + action_loss \
                - dist_entropy * entropy_coef
            if kl_factor > 0 and not use_is:
                loss += kl_factor * kl_loss
            loss.backward()
        nn.utils.clip_grad_norm(self._actor_critic.parameters(),
                                max_grad_norm)
        self._optimizer.step()
        if hasattr(self, '_scheduler'):
            self._scheduler.step(loss)
        if only_value_loss:
            for p in self._actor_critic.parameters():
                p.requires_grad = True

    def halve_lr(self):
        for i, param_group in enumerate(self._optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * 0.5, 1e-7)
            param_group['lr'] = new_lr

    def compute_advantages(self, next_value_agents, use_gae, gamma, tau):
        for num_agent, next_value in enumerate(next_value_agents):
            self._rollout.compute_returns(next_value, use_gae, gamma, tau,
                                          num_agent)
        advantages = self._rollout.compute_advantages()
        diff = (advantages - advantages.mean())
        advantages = diff / (advantages.std() + 1e-5)
        return advantages

    def initialize(self, args, obs_shape, action_space,
                   num_training_per_episode, num_episodes, total_steps,
                   num_epoch, optimizer_state_dict, num_steps, uniform_v,
                   uniform_v_prior):
        params = self._actor_critic.parameters()
        self._optimizer = optim.Adam(params, lr=args.lr, eps=args.eps)
        if optimizer_state_dict:
            self._optimizer.load_state_dict(optimizer_state_dict)
        if args.use_lr_scheduler:
            self._scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self._optimizer, mode='min', verbose=True)
        self._rollout = RolloutStorage(num_steps, args.num_processes,
                                       obs_shape, action_space,
                                       self._actor_critic.state_size,
                                       num_training_per_episode)
        self.num_episodes = num_episodes
        self.total_steps = total_steps
        self.num_epoch = num_epoch
        self.uniform_v = uniform_v
        self.uniform_v_prior = uniform_v_prior

    def update_rollouts(self, obs, timestep):
        self._rollout.observations[timestep, :, :, :, :, :].copy_(obs)

    def insert_rollouts(self, step, current_obs, states, action,
                        action_log_prob, value, reward, mask,
                        action_log_prob_distr=None, dagger_prob_distr=None,
                        expert_action_log_prob=None,
                        training_action_log_prob=None):
        self._rollout.insert(
            step, current_obs, states, action, action_log_prob, value, reward,
            mask, action_log_prob_distr, dagger_prob_distr,
            expert_action_log_prob=expert_action_log_prob,
            training_action_log_prob=training_action_log_prob)

    def ppo(self, advantages, num_mini_batch, batch_size, num_steps,
            clip_param, entropy_coef, value_loss_coef, max_grad_norm,
            action_space, anneal=False, lr=1e-4, eps=1e-5, kl_factor=0,
            only_value_loss=False, add_nonlin=False, use_is=False,
            use_retrace=False, lambda_retrace=1.0):
        action_losses = []
        value_losses = []
        dist_entropies = []
        kl_losses = []
        kl_loss = None
        total_losses = []

        if hasattr(self._actor_critic, 'gru'):
            data_generator = self._rollout.recurrent_generator(
                advantages, num_mini_batch, batch_size, num_steps, kl_factor,
                use_is)
        else:
            data_generator = self._rollout.feed_forward_generator(
                advantages, num_mini_batch, batch_size, num_steps,
                action_space, kl_factor, use_is)

        for sample in data_generator:
            observations_batch, states_batch, actions_batch, return_batch, \
                masks_batch, old_action_log_probs_batch, adv_targ, \
                action_log_probs_distr_batch, dagger_probs_distr_batch, \
                expert_action_log_probs_batch, training_action_log_probs_batch \
                = sample

            # Reshape to do in a single forward pass for all steps
            result = self._evaluate_actions(Variable(observations_batch),
                                            Variable(states_batch),
                                            Variable(masks_batch),
                                            Variable(actions_batch))
            values, action_log_probs, dist_entropy, states = result

            adv_targ = Variable(adv_targ)
            ratio = action_log_probs
            ratio -= Variable(old_action_log_probs_batch)
            ratio = torch.exp(ratio)
            surr1 = ratio * adv_targ
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param)
            surr2 *= adv_targ
            action_loss = -torch.min(surr1, surr2).mean()
            value_loss = (Variable(return_batch) - values).pow(2).mean()
            total_loss = value_loss * value_loss_coef + action_loss \
                - dist_entropy * entropy_coef

            if kl_factor > 0 and not use_is:
                criterion = nn.KLDivLoss()
                kl_loss = criterion(Variable(action_log_probs_distr_batch),
                                    Variable(dagger_probs_distr_batch))
                total_loss += kl_factor * kl_loss

            self._optimize(value_loss, action_loss, dist_entropy,
                           entropy_coef, value_loss_coef, max_grad_norm,
                           kl_loss, kl_factor, only_value_loss, add_nonlin,
                           use_is)
            lr = self._optimizer.param_groups[0]['lr']

            action_losses.append(action_loss.data[0])
            value_losses.append(value_loss.data[0])
            dist_entropies.append(dist_entropy.data[0])
            if kl_factor > 0 and not use_is:
                kl_losses.append(kl_loss.data[0])
            total_losses.append(total_loss.data[0])

        return action_losses, value_losses, dist_entropies, \
            kl_losses, total_losses, lr

    def copy_ex_model(self):
        """Creates a copy without the model.

        This is for operating with homogenous training."""
        return PPOAgent(None, self._character,
                        num_processes=self._num_processes)

    def copy_with_model(self):
        """Creates a copy with the model.

        This is for operating with frozen backplay."""
        return PPOAgent(self._actor_critic, self._character,
                        num_processes=self._num_processes)

    def after_epoch(self):
        self._rollout.after_epoch()

    def set_new_model(self, model, cuda=False):
        self._actor_critic = model
        if cuda:
            self._actor_critic.cuda()
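# The ppo() method above implements the standard clipped surrogate. Stripped
# of batching, logging, and the KL/DAgger options, the core objective is a
# few lines; this hedged sketch mirrors the math used there (tensor shapes
# and the default clip value are assumptions).
import torch

def ppo_losses(new_log_probs, old_log_probs, advantages, returns, values,
               clip_param=0.2):
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    action_loss = -torch.min(surr1, surr2).mean()  # pessimistic L^CLIP
    value_loss = (returns - values).pow(2).mean()
    return action_loss, value_loss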
def main():
    log_name = 'ppo_no_input_process'
    train_log = Log(log_name + '_train_log')
    evl_log = Log(log_name + '_evaluation_log')
    torch.set_num_threads(1)
    envs = make_vec_envs(args_env_name, args_seed, args_num_processes)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space)
    agent = PPO(actor_critic, args_clip_param, args_ppo_epoch,
                args_num_mini_batch, args_value_loss_coef, args_entropy_coef,
                lr=args_lr, eps=args_eps, max_grad_norm=args_max_grad_norm)
    rollouts = RolloutStorage(args_num_steps, args_num_processes,
                              envs.observation_space.shape, envs.action_space)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    num_updates = int(
        args_num_env_steps) // args_num_steps // args_num_processes
    episode_rewards = deque(maxlen=10)
    start = time.time()
    sum_re = torch.zeros(args_num_processes, 1)

    for j in range(num_updates):
        for step in range(args_num_steps):
            with torch.no_grad():
                value, action, action_log_prob \
                    = actor_critic.act(rollouts.obs[step])
            obs, reward, done, infos = envs.step(action)
            sum_re += reward
            if any(done):
                for i in range(len(done)):
                    if done[i]:
                        episode_rewards.append(sum_re[i].item())
                        sum_re[i] *= 0
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, action, action_log_prob, value, reward,
                            masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1])

        rollouts.compute_returns(next_value, args_gamma)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        if j % args_log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            end = time.time()
            logstring = "E {}, N_steps {}, FPS {} mean/median" \
                        " {:.1f}/{:.1f}, min/max {:.1f}/{:.1f}" \
                        " Entropy {:.5f},V {:.5f},Action {:.5f}".format(
                            j, total_num_steps,
                            int(total_num_steps / (end - start)),
                            np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards),
                            np.max(episode_rewards),
                            dist_entropy, value_loss, action_loss)
            train_log.log(logstring)

        if (args_eval_interval is not None and len(episode_rewards) > 1
                and j % args_eval_interval == 0):
            total_num_steps = (j + 1) * args_num_processes * args_num_steps
            ev_result = evaluate(actor_critic, args_env_name, args_seed,
                                 args_num_processes)
            ev_log_string = 'steps:' + str(total_num_steps) + '. ' + ev_result
            evl_log.log(ev_log_string)
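# Hedged sketch: main() above separates true terminations (masks) from
# time-limit cutoffs (bad_masks) so returns can bootstrap through artificial
# episode ends. A minimal, self-contained version of that return rule,
# assuming bad_mask == 0 marks a 'bad_transition' from a time-limit wrapper;
# the exact recursion in the repo's RolloutStorage may index differently.
import torch

def compute_returns_with_bad_masks(rewards, values, masks, bad_masks,
                                   next_value, gamma):
    # rewards/values/masks/bad_masks: [T, N, 1]; next_value: [N, 1]
    T = rewards.shape[0]
    returns = torch.zeros_like(rewards)
    R = next_value
    for t in reversed(range(T)):
        R = rewards[t] + gamma * R * masks[t]
        # On a time-limit end, fall back to the critic's estimate instead of
        # treating the cutoff as a true terminal state.
        R = R * bad_masks[t] + (1 - bad_masks[t]) * values[t]
        returns[t] = R
    return returns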
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() viz_1 = Visdom() win = None win1 = None env_name_1 = 'HalfCheetahSmallFoot-v0' args.env_name = 'HalfCheetahSmallLeg-v0' envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] envs_1 = [ make_env(env_name_1, args.seed, i, args.log_dir_1) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) envs_1 = SubprocVecEnv(envs_1) else: envs = DummyVecEnv(envs) envs_1 = DummyVecEnv(envs_1) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) envs_1 = VecNormalize(envs_1) #same for both tasks obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = MLPPolicy(obs_shape[0], envs.action_space) actor_critic_1 = MLPPolicy(obs_shape[0], envs_1.action_space) #same for both tasks action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() actor_critic_1.cuda() optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) optimizer_1 = optim.RMSprop(actor_critic_1.parameters(), args.lr, eps=args.eps, alpha=args.alpha) #Different for both tasks rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) rollouts_1 = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs_1.action_space, actor_critic_1.state_size) current_obs_1 = torch.zeros(args.num_processes, *obs_shape) #Different update functions def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs def update_current_obs_1(obs): shape_dim0 = envs_1.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs_1[:, :-shape_dim0] = current_obs_1[:, shape_dim0:] current_obs_1[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) obs_1 = envs_1.reset() update_current_obs_1(obs_1) rollouts.observations[0].copy_(current_obs) rollouts_1.observations[0].copy_(current_obs_1) episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) episode_rewards_1 = torch.zeros([args.num_processes, 1]) final_rewards_1 = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() current_obs_1 = current_obs_1.cuda() rollouts_1.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions from branch 1 value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= 
masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) #Sample actions from branch 2 value_1, action_1, action_log_prob_1, states_1 = actor_critic_1.act( Variable(rollouts_1.observations[step], volatile=True), Variable(rollouts_1.states[step], volatile=True), Variable(rollouts_1.masks[step], volatile=True)) cpu_actions_1 = action_1.data.squeeze(1).cpu().numpy() obs_1, reward_1, done_1, info_1 = envs_1.step(cpu_actions_1) reward_1 = torch.from_numpy(np.expand_dims(np.stack(reward_1), 1)).float() episode_rewards_1 += reward_1 masks_1 = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done_1]) final_rewards_1 *= masks_1 final_rewards_1 += (1 - masks_1) * episode_rewards_1 episode_rewards_1 *= masks_1 if args.cuda: masks_1 = masks_1.cuda() if current_obs_1.dim() == 4: current_obs_1 *= masks_1.unsqueeze(2).unsqueeze(2) else: current_obs_1 *= masks_1 update_current_obs_1(obs_1) rollouts_1.insert(step, current_obs_1, states_1.data, action_1.data, action_log_prob_1.data, value_1.data, reward_1, masks_1) #Update for branch 1 next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() #share params branch 1 -> branch 2 actor_critic_1.a_fc1.weight.data = copy.deepcopy( actor_critic.a_fc1.weight.data) actor_critic_1.a_fc1.bias.data = copy.deepcopy( actor_critic.a_fc1.bias.data) actor_critic_1.v_fc1.weight.data = copy.deepcopy( actor_critic.v_fc1.weight.data) actor_critic_1.v_fc1.bias.data = copy.deepcopy( actor_critic.v_fc1.bias.data) #Update for branch 2 next_value_1 = actor_critic_1( Variable(rollouts_1.observations[-1], volatile=True), Variable(rollouts_1.states[-1], volatile=True), Variable(rollouts_1.masks[-1], volatile=True))[0].data rollouts_1.compute_returns(next_value_1, args.use_gae, args.gamma, args.tau) values_1, action_log_probs_1, dist_entropy_1, states_1 = actor_critic_1.evaluate_actions( Variable(rollouts_1.observations[:-1].view(-1, *obs_shape)), Variable(rollouts_1.states[0].view(-1, actor_critic_1.state_size)), Variable(rollouts_1.masks[:-1].view(-1, 1)), Variable(rollouts_1.actions.view(-1, action_shape))) values_1 = values_1.view(args.num_steps, args.num_processes, 1) action_log_probs_1 = action_log_probs_1.view(args.num_steps, args.num_processes, 1) advantages_1 = Variable(rollouts_1.returns[:-1]) - values_1 value_loss_1 = advantages_1.pow(2).mean() action_loss_1 = -(Variable(advantages_1.data) * action_log_probs_1).mean() 
        optimizer_1.zero_grad()
        (value_loss_1 * args.value_loss_coef + action_loss_1 -
         dist_entropy_1 * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic_1.parameters(),
                                args.max_grad_norm)
        optimizer_1.step()
        rollouts_1.after_update()

        # share params branch 2 -> branch 1
        actor_critic.a_fc1.weight.data = copy.deepcopy(
            actor_critic_1.a_fc1.weight.data)
        actor_critic.a_fc1.bias.data = copy.deepcopy(
            actor_critic_1.a_fc1.bias.data)
        actor_critic.v_fc1.weight.data = copy.deepcopy(
            actor_critic_1.v_fc1.weight.data)
        actor_critic.v_fc1.bias.data = copy.deepcopy(
            actor_critic_1.v_fc1.bias.data)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo,
                                     args.env_name + '_' + env_name_1)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_model_1 = actor_critic_1
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_model_1 = copy.deepcopy(actor_critic_1).cpu()
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]
            save_model_1 = [
                save_model_1,
                hasattr(envs_1, 'ob_rms') and envs_1.ob_rms or None
            ]
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            torch.save(save_model_1,
                       os.path.join(save_path, env_name_1 + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
            print(
                "Updates_1 {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards_1.mean(), final_rewards_1.median(),
                        final_rewards_1.min(), final_rewards_1.max(),
                        dist_entropy_1.data[0], value_loss_1.data[0],
                        action_loss_1.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
                win1 = visdom_plot(viz_1, win1, args.log_dir_1, env_name_1,
                                   args.algo)
            except IOError:
                pass
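# The two-branch loop above hard-syncs the first actor and critic layers
# between the two task policies after each update. The same sync can be
# written more compactly with state_dict copies on the shared submodules; a
# minimal sketch (the a_fc1/v_fc1 layer names follow the code above, the
# rest is an assumption):
def share_first_layers(src_policy, dst_policy):
    # Copies weight and bias of the shared first layers in one call each.
    dst_policy.a_fc1.load_state_dict(src_policy.a_fc1.state_dict())
    dst_policy.v_fc1.load_state_dict(src_policy.v_fc1.state_dict())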
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    print(args.cuda)
    print(args.num_steps)
    print(args.num_processes)
    print(args.lr)
    print(args.eps)
    print(args.alpha)
    print(args.use_gae)
    print(args.gamma)
    print(args.tau)
    print(args.value_loss_coef)
    print(args.entropy_coef)

    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom()
    #     win = None

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    # elif args.algo == 'ppo':
    #     optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    # elif args.algo == 'acktr':
    #     optimizer = KFACOptimizer(actor_critic)

    # rollouts has a self.states that is [steps, processes, obs];
    # steps is used to compute the expected reward
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)
    # Set the first state to the current state
    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    # if args.algo == 'ppo':
    #     old_model = copy.deepcopy(actor_critic)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions: make a prediction using the state that was put
            # into rollouts
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            # state.shape is [nProcesses, ndims, height, width]
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations. The final
            # rewards are only used for printing, but the mask is also used
            # in the storage: it clears the env that finished and resets its
            # episode_reward.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            # Insert all of that info into the current step of the storage
            rollouts.insert(step, current_state, action.data, value.data,
                            reward, masks)

        # Use the last state to predict the next value
        next_value = actor_critic(
            Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            # Update the running observation normalizer
            actor_critic.obs_filter.update(
                rollouts.states[:-1].view(-1, *obs_shape))

        # This computes R_t = r_t + gamma * r_{t+1} + ... + V(T) for each step
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            # NOTE: the action log probs could have been computed and stored
            # during sampling, and the value predictions are already stored.
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.states[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            # if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
            #     # Sampled fisher, see Martens 2014
            #     actor_critic.zero_grad()
            #     pg_fisher_loss = -action_log_probs.mean()
            #     value_noise = Variable(torch.randn(values.size()))
            #     if args.cuda:
            #         value_noise = value_noise.cuda()
            #     sample_values = values + value_noise
            #     vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()
            #     fisher_loss = pg_fisher_loss + vf_fisher_loss
            #     optimizer.acc_stats = True
            #     fisher_loss.backward(retain_graph=True)
            #     optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()
            # if args.algo == 'a2c':
            #     nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
            optimizer.step()

        # elif args.algo == 'ppo':
        #     advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        #     advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
        #     old_model.load_state_dict(actor_critic.state_dict())
        #     if hasattr(actor_critic, 'obs_filter'):
        #         old_model.obs_filter = actor_critic.obs_filter
        #     for _ in range(args.ppo_epoch):
        #         sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False)
        #         for indices in sampler:
        #             indices = torch.LongTensor(indices)
        #             if args.cuda:
        #                 indices = indices.cuda()
        #             states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
        #             actions_batch = rollouts.actions.view(-1, action_shape)[indices]
        #             return_batch = rollouts.returns[:-1].view(-1, 1)[indices]
        #             # Reshape to do in a single forward pass for all steps
        #             values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))
        #             _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))
        #             ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
        #             adv_targ = Variable(advantages.view(-1, 1)[indices])
        #             surr1 = ratio * adv_targ
        #             surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
        #             action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
        #             value_loss = (Variable(return_batch) - values).pow(2).mean()
        #             optimizer.zero_grad()
        #             (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
        #             optimizer.step()

        # The first state is now the last state of the previous rollout
        rollouts.states[0].copy_(rollouts.states[-1])

        # if j % args.save_interval == 0 and args.save_dir != "":
        #     save_path = os.path.join(args.save_dir, args.algo)
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass
        #     # A really ugly way to save a model to CPU
        #     save_model = actor_critic
        #     if args.cuda:
        #         save_model = copy.deepcopy(actor_critic).cpu()
        #     torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         final_rewards.mean(), final_rewards.median(),
                         final_rewards.min(), final_rewards.max(),
                         -dist_entropy.data[0], value_loss.data[0],
                         action_loss.data[0]))
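# compute_returns above unrolls the recursion R_t = r_t + gamma * mask_t *
# R_{t+1}, seeded with the critic's value of the last state. A minimal
# non-GAE version of that backward pass, as a hedged sketch (tensor shapes
# are assumptions):
import torch

def discounted_returns(rewards, masks, next_value, gamma):
    # rewards, masks: [T, N, 1]; next_value: [N, 1]
    returns = torch.zeros(rewards.shape[0] + 1, *next_value.shape)
    returns[-1] = next_value
    for t in reversed(range(rewards.shape[0])):
        returns[t] = rewards[t] + gamma * masks[t] * returns[t + 1]
    return returns[:-1]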
def main(args): env = GymEnvironment(args, gamma) env.env = env.env.unwrapped actor_critic = Policy(obs_shape, env.action_size, base_kwargs={'recurrent': False}) actor_critic.load_state_dict(torch.load('log/model.pt')) actor_critic.to(device) agent = PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch, value_loss_coef, entropy_coef, lr, eps, max_grad_norm) rollouts = RolloutStorage(num_steps, num_processes, obs_shape, env.action_space, actor_critic.recurrent_hidden_state_size) current_obs = torch.zeros(num_processes, *obs_shape) obs, _, _, _ = env.new_expt() obs = obs[np.newaxis, ...] current_obs[:, -1] = torch.from_numpy(obs) rollouts.obs[0].copy_(current_obs) current_obs = current_obs.to(device) rollouts.to(device) num_updates = math.ceil(args.max_timesteps / (num_processes * num_steps)) n_goal_reached = 0 n_episodes = 0 for j in range(num_updates): for step in range(num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() (obs, reward, done), goal_reached = env.act(action) reward = torch.from_numpy(np.expand_dims(np.stack([reward]), 1)).float() masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in [done]]) masks = masks.to(device) current_obs[:, :-1] = current_obs[:, 1:] if done: current_obs[:] = 0 current_obs[:, -1] = torch.from_numpy(obs) rollouts.insert(current_obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) if done: n_episodes += 1 env.new_expt() if goal_reached: n_goal_reached += 1 with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]).detach() rollouts.compute_returns(next_value, use_gae, gamma, tau, step) value_loss, action_loss, dist_entropy = agent.update(rollouts, step) rollouts.after_update() if j % log_interval == 0: total_num_steps = (j + 1) * num_processes * num_steps try: success = float(n_goal_reached) / n_episodes except ZeroDivisionError: success = 0. print( "Timesteps: {}, Goal reached : {} / {}, Success %: {}".format( total_num_steps, n_goal_reached, n_episodes, success)) if args.lang_coeff > 0: av_list = np.array(env.action_vectors_list) for k in range(len(spearman_corr_coeff_actions)): sr, _ = spearmanr(env.rewards_list, av_list[:, k]) print(k, sr)
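# Hedged sketch: the loop above maintains a rolling frame stack by shifting
# old frames toward the front, zeroing the stack on done, and writing the
# newest frame into the last slot. A minimal single-env version of that
# update (the stack-on-axis-1 layout is an assumption):
import torch

def push_frame(current_obs, frame, done):
    # current_obs: [N, num_stack, H, W]; frame: [N, H, W]; done: bool
    current_obs[:, :-1] = current_obs[:, 1:]
    if done:
        current_obs.zero_()
    current_obs[:, -1] = frame
    return current_obs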
class a2c(object):

    def __init__(self, hparams):
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        self.action_size = hparams['action_size']

        # Policy and Value network
        # if hparams['dropout'] == True:
        #     self.actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # elif len(envs.observation_space.shape) == 3:
        #     self.actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # else:
        #     self.actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)
        # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], envs.action_space)
        # else:
        #     self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        self.actor_critic = CNNPolicy(self.obs_shape[0], self.action_size)

        # For batch norm:
        # self.actor_critic.train()
        # self.actor_critic.eval()

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, self.action_size)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(), lr=hparams['lr'],
                eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(
                params=self.actor_critic.parameters(), lr=hparams['lr'],
                eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(
                params=self.actor_critic.parameters(), lr=hparams['lr'],
                momentum=hparams['mom'])
        else:
            print('no opt specified')

        # if hparams['gif_'] or hparams['ls_']:
        #     self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        value, action, action_log_probs, dist_entropy = \
            self.actor_critic.act(current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, done=None):
        self.rollouts.insert(step, current_state, action, value, reward,
                             masks, action_log_probs, dist_entropy)
        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            # `done` must be passed in for the trajectory action mask to reset
            self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1] / 255., volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        # The values, log probs and entropies were already stored during
        # sampling, so stack them instead of re-evaluating the actions.
        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)
        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = (action_loss + value_loss * self.value_loss_coef -
                dist_entropy.mean() * self.entropy_coef)
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(),
                                self.grad_clip)
        self.optimizer.step()
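# a2c.update() above defers to compute_returns(use_gae, gamma, tau). For
# reference, generalized advantage estimation reduces to one backward pass
# over the rollout; a minimal hedged sketch (shapes are assumptions, the
# repo's RolloutStorage may differ in indexing):
import torch

def gae_returns(rewards, values, masks, next_value, gamma, tau):
    # rewards/values/masks: [T, N, 1]; next_value: [N, 1]
    T = rewards.shape[0]
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        v_next = next_value if t == T - 1 else values[t + 1]
        delta = rewards[t] + gamma * v_next * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]
    return returns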
def main(): print("#######") print( "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = SubprocVecEnv([ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ]) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) current_state = torch.zeros(args.num_processes, *obs_shape) def update_current_state(state): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state state = envs.reset() update_current_state(state) rollouts.states[0].copy_(current_state) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda() rollouts.cuda() if args.algo == 'ppo': old_model = copy.deepcopy(actor_critic) for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action = actor_critic.act( Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next state state, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks update_current_state(state) rollouts.insert(step, current_state, action.data, value.data, reward, masks) next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view( -1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) old_model.load_state_dict(actor_critic.state_dict()) if hasattr(actor_critic, 'obs_filter'): old_model.obs_filter = actor_critic.obs_filter for _ in range(args.ppo_epoch): sampler = BatchSampler(SubsetRandomSampler( range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False) for indices in sampler: indices = torch.LongTensor(indices) if args.cuda: indices = indices.cuda() states_batch = rollouts.states[:-1].view( -1, *obs_shape)[indices] actions_batch = rollouts.actions.view( -1, action_shape)[indices] return_batch = rollouts.returns[:-1].view(-1, 1)[indices] # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(states_batch), Variable(actions_batch)) _, old_action_log_probs, _ = old_model.evaluate_actions( Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True)) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data)) adv_targ = Variable(advantages.view(-1, 1)[indices]) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() 
rollouts.states[0].copy_(rollouts.states[-1]) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: print( "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, (j + 1) * args.num_processes * args.num_steps, final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if j % args.vis_interval == 0: win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
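# --- Illustrative example (not part of the original source) ---
# A hedged sketch of what rollouts.compute_returns(...) computes when
# use_gae is False: discounted returns, bootstrapped from next_value, with
# masks (1.0 = episode continues, 0.0 = done) cutting the recursion at
# episode boundaries. Function and variable names are assumptions.
import torch

def discounted_returns(rewards, masks, next_value, gamma):
    # rewards, masks: (num_steps, N, 1); next_value: (N, 1)
    num_steps = rewards.size(0)
    returns = torch.zeros(num_steps + 1, *rewards.size()[1:])
    returns[-1] = next_value
    for step in reversed(range(num_steps)):
        returns[step] = rewards[step] + gamma * returns[step + 1] * masks[step]
    return returns[:-1]

rewards = torch.ones(3, 2, 1)
masks = torch.ones(3, 2, 1)
masks[1] = 0.0                      # both workers finished at step 1
ret = discounted_returns(rewards, masks, torch.zeros(2, 1), gamma=0.99)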
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") """ if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None """ envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=100) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) """ for info in infos: if 'episode' in info.keys(): print(reward) episode_rewards.append(info['episode']['r']) """ # FIXME: works only for environments with sparse rewards for idx, eps_done in enumerate(done): if eps_done: episode_rewards.append(reward[idx]) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": print('Saving model') print() save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs.venv, 'ob_rms') and envs.venv.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n" .format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), np.count_nonzero(np.greater(episode_rewards, 0)) / len(episode_rewards))) if args.eval_interval is not None and len( episode_rewards) > 1 and j % args.eval_interval == 0: eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) if eval_envs.venv.__class__.__name__ == "VecNormalize": eval_envs.venv.ob_rms = envs.venv.ob_rms # An ugly hack to remove updates def _obfilt(self, obs): if self.ob_rms: obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) return obs else: return obs eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv) eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) """ if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass """ envs.close()
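# --- Illustrative example (not part of the original source) ---
# Sketch of the GAE variant selected by args.use_gae above: advantages are
# exponentially weighted TD residuals (Schulman et al., 2016), and the
# stored returns are advantages plus value predictions. Names and shapes
# here are assumptions.
import torch

def gae_returns(rewards, masks, values, next_value, gamma, tau):
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for step in reversed(range(rewards.size(0))):
        delta = (rewards[step] + gamma * values[step + 1] * masks[step]
                 - values[step])
        gae = delta + gamma * tau * masks[step] * gae
        returns[step] = gae + values[step]
    return returns

T, N = 4, 2
ret = gae_returns(torch.rand(T, N, 1), torch.ones(T, N, 1),
                  torch.rand(T, N, 1), torch.rand(N, 1),
                  gamma=0.99, tau=0.95)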
def main(): print("###############################################################") print("#################### VISDOOM LEARNER START ####################") print("###############################################################") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None global envs envs = VecEnv( [make_env(i, args.config_path) for i in range(args.num_processes)], logging=True, log_dir=args.log_dir) obs_shape = envs.observation_space_shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if args.algo == 'a2c' or args.algo == 'acktr': actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape) elif args.algo == 'a2t': source_models = [] files = glob.glob(os.path.join(args.source_models_path, '*.pt')) for file in files: print(file, 'loading model...') source_models.append(torch.load(file)) actor_critic = A2TPolicy(obs_shape[0], envs.action_space_shape, source_models) elif args.algo == 'resnet': # args.num_stack = 3 actor_critic = ResnetPolicy(obs_shape[0], envs.action_space_shape) action_shape = 1 if args.cuda: actor_critic.cuda() if args.algo == 'a2c' or args.algo == 'resnet': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'a2t': a2t_params = [p for p in actor_critic.parameters() if p.requires_grad] optimizer = optim.RMSprop(a2t_params, args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space_shape) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space_shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action = actor_critic.act( Variable(rollouts.observations[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) # print ('Actions:', cpu_actions, 'Rewards:', reward) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, action.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.observations[:-1].view( -1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr', 'a2t', 'resnet']: values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c' or args.algo == 'resnet': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) elif args.algo == 'a2t': nn.utils.clip_grad_norm(a2t_params, args.max_grad_norm) optimizer.step() rollouts.observations[0].copy_(rollouts.observations[-1]) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: envs.log() end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, 'VizDoom', args.algo) except IOError: pass envs.close() time.sleep(5)
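# --- Illustrative example (not part of the original source) ---
# Sketch of the sampled-Fisher objective used by the acktr branches above
# ("Sampled fisher, see Martens 2014"): the policy term is the mean
# log-probability of the sampled actions, and the value term regresses the
# value head against its own prediction plus unit Gaussian noise, so that
# backward() lets KFACOptimizer accumulate curvature statistics. Dummy
# tensors; shapes are assumptions.
import torch

values = torch.randn(5, 4, 1, requires_grad=True)
action_log_probs = torch.randn(5, 4, 1, requires_grad=True)

pg_fisher_loss = -action_log_probs.mean()
sample_values = values + torch.randn(values.size())
vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()
fisher_loss = pg_fisher_loss + vf_fisher_loss
fisher_loss.backward()   # optimizer.acc_stats would be True around this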
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") if args.run_index is not None: load_params(args) try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) torch.cuda.manual_seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) random.seed(args.seed) os.environ['OMP_NUM_THREADS'] = '1' envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) num_heads = 1 if args.reward_predictor else len(args.gamma) assert len(envs.observation_space.shape) == 3 actor_critic = CNNPolicy(obs_shape[0], envs.action_space, use_rp=args.reward_predictor, num_heads=num_heads) assert envs.action_space.__class__.__name__ == "Discrete" action_shape = 1 if args.cuda: actor_critic.cuda() if not args.reward_predictor: model_params = actor_critic.parameters() else: lrs = [args.lr_rp, args.lr] model_params = [{ 'params': model_p, 'lr': p_lr } for model_p, p_lr in zip(actor_critic.param_groups, lrs)] optimizer = optim.RMSprop(model_params, args.lr, eps=args.eps, alpha=args.alpha) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, gamma=args.gamma, use_rp=args.reward_predictor) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() obs, raw_reward, done, info = envs.step(cpu_actions) if args.reward_noise > 0.0: stds = np.ones(raw_reward.shape) * args.reward_noise noise = np.random.normal(loc=0.0, scale=stds) reward = raw_reward + noise else: reward = raw_reward raw_reward = torch.from_numpy( np.expand_dims(np.stack(raw_reward), 1)).float() episode_rewards += raw_reward reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() if args.reward_predictor: p_hat = min(args.rp_burn_in, j) / args.rp_burn_in estimate_reward = ( 1 - p_hat ) * reward + p_hat * value[:, 0].unsqueeze(-1).data.cpu() reward = torch.cat([reward, estimate_reward], dim=-1) value = value.data else: value = value.data # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value) states = Variable(rollouts.states[0].view(-1, actor_critic.state_size)) masks = Variable(rollouts.masks[:-1].view(-1, 1)) obs = Variable(rollouts.observations[:-1].view(-1, *obs_shape)) actions = Variable(rollouts.actions.view(-1, action_shape)) values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( obs, states, masks, actions) returns_as_variable = Variable(rollouts.returns[:-1]) values = values.view(returns_as_variable.size()) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = returns_as_variable - values value_loss = advantages.pow(2).sum(-1).mean() action_loss = -(Variable(advantages[:, :, -1].unsqueeze(-1).data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, 'a2c') try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, " "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format( j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))
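# --- Illustrative example (not part of the original source) ---
# The reward-predictor path above blends the environment reward with the
# predictor head over a burn-in schedule: p_hat ramps from 0 to 1 during
# the first rp_burn_in updates. A tiny numeric sketch with made-up values.
import torch

rp_burn_in = 100
for j in (0, 50, 100, 200):
    p_hat = min(rp_burn_in, j) / rp_burn_in
    reward = torch.ones(4, 1)                 # raw (possibly noisy) reward
    value_head = torch.full((4, 1), 0.5)      # predictor head output
    estimate = (1 - p_hat) * reward + p_hat * value_head
    print(j, p_hat, estimate[0].item())       # 1.0 -> 0.75 -> 0.5 -> 0.5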
def main(): print("#######") print("WARNING: All rewards are not clipped or normalized ") print("#######") os.environ['OMP_NUM_THREADS'] = '1' envs = rafiki.Envs(args.num_processes, args.num_models, args.policy, args.beta, args.obs_size, args.max_latency, args.tau, args.cycle_len) obs_shape = envs.observation_space.shape actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() info_set = Info(args) for j in range(num_updates): for step in range(args.num_steps): logger.info('------------%d----------------' % j) # Sample actions with torch.no_grad(): action, probs, action_log_prob = actor_critic.act( Variable(rollouts.observations[step])) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs logger.info(probs) obs, reward, info = envs.step(cpu_actions) info_set.insert(info) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() update_current_obs(obs) rollouts.insert(step, current_obs, action.data, action_log_prob.data, reward) if args.algo in ['a2c', 'ppo']: action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) R = rollouts.rewards.detach() optimizer.zero_grad() policy_loss = -R.reshape(args.num_steps, args.num_processes).mul(action_log_probs) policy_loss = sum(policy_loss) / len(policy_loss) policy_loss.backward() # nn.utils.clip_grad_norm_(actor_critic.parameters(), args.max_grad_norm) optimizer.step() with torch.no_grad(): action, probs, action_log_prob = actor_critic.act( Variable(rollouts.observations[-1])) logger.info(probs) rollouts.after_update() if j % args.log_interval == 0: total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, reward {}, policy loss {}". format(j, total_num_steps, R.data, policy_loss.reshape(-1).data)) logger.info(args) info_set.show()
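# --- Illustrative example (not part of the original source) ---
# The variant above has no critic: its policy loss is a plain REINFORCE
# objective on the stored rewards, reduced with sum(...)/len(...). A
# scalar-mean sketch of the same idea, -E[R * log pi(a|s)], with shapes
# as assumptions.
import torch

num_steps, num_processes = 8, 2
R = torch.rand(num_steps, num_processes)                 # rewards, no grad
log_probs = torch.randn(num_steps, num_processes, requires_grad=True)
policy_loss = -(R * log_probs).mean()
policy_loss.backward()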
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() 
eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
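# --- Illustrative example (not part of the original source) ---
# Why the evaluation envs above copy ob_rms from the training envs:
# VecNormalize whitens observations with running statistics, and the eval
# copy must reuse (not update) the training statistics so the policy sees
# the same input distribution. A minimal stand-in, not baselines' code.
import numpy as np

class RunningMeanStd(object):
    def __init__(self, shape):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)

def obfilt(obs, ob_rms, clipob=10.0, epsilon=1e-8):
    return np.clip((obs - ob_rms.mean) / np.sqrt(ob_rms.var + epsilon),
                   -clipob, clipob)

train_rms = RunningMeanStd((4,))                  # updated during training
eval_obs = obfilt(np.random.randn(4), train_rms)  # evaluation: read-only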
class a2c(object):
    def __init__(self, envs, hparams):
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        # Policy and value network. Earlier variants also offered
        # CNNPolicy_dropout2, CNNPolicy2 and MLPPolicy here.
        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(
                self.obs_shape[0], envs.action_space)
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        # For batch norm: self.actor_critic.train() / self.actor_critic.eval()

        # Storage for rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=self.actor_critic.parameters(),
                                           lr=hparams['lr'], eps=hparams['eps'],
                                           alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print('no optimizer specified')

        # For a Discrete action space the stored action is a single index.
        self.action_shape = 1

        if hparams['gif_'] or hparams['ls_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        # Returns value [P,1], action [P,1], action_log_probs [P,1], dist_entropy
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, done=None):
        # 'done' had been commented out of the signature although the
        # trajectory-mask branch below uses it; it is restored here as an
        # optional argument.
        self.rollouts.insert(step, current_state, action, value, reward, masks,
                             action_log_probs, dist_entropy)
        if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)
        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = (action_loss + value_loss * self.value_loss_coef
                - dist_entropy.mean() * self.entropy_coef)
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()
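# --- Illustrative example (not part of the original source) ---
# The update_current_state / update_current_obs helpers in the mains above
# all implement the same frame-stacking idiom: shift the oldest frames out
# of the channel dimension and write the newest observation into the last
# slot. A self-contained sketch; shapes are assumptions.
import numpy as np
import torch

num_processes, frame_channels, num_stack = 4, 1, 4
current_obs = torch.zeros(num_processes, frame_channels * num_stack, 84, 84)

def update_current_obs(obs_np):
    obs = torch.from_numpy(obs_np).float()
    # clone() avoids the overlapping in-place copy the originals rely on
    current_obs[:, :-frame_channels] = current_obs[:, frame_channels:].clone()
    current_obs[:, -frame_channels:] = obs

update_current_obs(np.random.rand(num_processes, frame_channels, 84, 84))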
def main(): print("######") print("HELLO! Returns start with infinity values") print("######") os.environ['OMP_NUM_THREADS'] = '1' if args.random_task: env_params = { 'wt': np.round(np.random.uniform(0.5, 1.0), 2), 'x': np.round(np.random.uniform(-0.1, 0.1), 2), 'y': np.round(np.random.uniform(-0.1, 0.1), 2), 'z': np.round(np.random.uniform(0.15, 0.2), 2), } else: env_params = { 'wt': args.euclidean_weight, 'x': args.goal_x, 'y': args.goal_y, 'z': args.goal_z, } envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) envs = VecNormalize(envs, ob=False) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() actor_critic.input_norm.update(rollouts.observations[0]) last_return = -np.inf best_return = -np.inf best_models = None start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) actor_critic.input_norm.update(rollouts.observations[step + 1]) next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator(advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator(advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if args.vis and j % args.vis_interval == 0: last_return = plot(logger, 
args.log_dir) if last_return > best_return: best_return = last_return try: os.makedirs(os.path.dirname(args.save_path)) except OSError: pass info = { 'return': best_return, 'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon) } # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save((save_model, env_params, info), args.save_path) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, total_num_steps, int(total_num_steps / (end - start)), last_return, best_return, value_loss.data[0], action_loss.data[0]))
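# --- Illustrative example (not part of the original source) ---
# On the 'reward_norm' entry saved above: VecNormalize(envs, ob=False)
# still scales rewards by the running std of the discounted returns, so
# persisting sqrt(ret_rms.var + epsilon) lets a reloaded model map
# normalized rewards back to the environment's scale. Values made up.
import numpy as np

ret_var, epsilon = 4.0, 1e-8        # stand-ins for envs.ret_rms.var etc.
reward_norm = np.sqrt(ret_var + epsilon)
normalized_reward = 0.5             # what the agent was trained against
approx_true_reward = normalized_reward * reward_norm   # ~1.0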
def main(): os.environ['OMP_NUM_THREADS'] = '1' envs = UsbCamEnv(ENV_IMG_W, ENV_IMG_H, env_done_reward) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = MLPPolicy(obs_shape[0], envs.action_space) action_shape = envs.action_space.shape[0] print('+++++++++++++++++++++++++++++++++++++') print('obs_shape:', obs_shape) print('action_shape:', action_shape) print('+++++++++++++++++++++++++++++++++++++') if args.cuda: actor_critic.cuda() optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) current_state = torch.zeros(args.num_processes, *obs_shape) def update_current_state(state): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state state = envs.reset() update_current_state(state) rollouts.states[0].copy_(current_state) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda() rollouts.cuda() old_model = copy.deepcopy(actor_critic) for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.cpu().numpy() # Obser reward and next state state, reward, done, info = envs.step(cpu_actions) print('%3d [%3d %3d %3d %3d] %3d' % (step, int(envs.convert_2_real_action(cpu_actions)[0, 0]), int(envs.convert_2_real_action(cpu_actions)[0, 1]), int(envs.convert_2_real_action(cpu_actions)[0, 2]), int(envs.convert_2_real_action(cpu_actions)[0, 3]), reward[0])) if reward[0] >= search_done_reward: sys.exit() reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks update_current_state(state) rollouts.insert(step, current_state, action.data, value.data, reward, masks) next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape)) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) old_model.load_state_dict(actor_critic.state_dict()) if hasattr(actor_critic, 'obs_filter'): old_model.obs_filter = actor_critic.obs_filter for _ in range(args.ppo_epoch): sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False) for indices in sampler: indices = torch.LongTensor(indices) if args.cuda: indices = indices.cuda() states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices] actions_batch = rollouts.actions.view(-1, action_shape)[indices] return_batch = rollouts.returns[:-1].view(-1, 1)[indices] # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch)) _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True)) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data)) adv_targ = Variable(advantages.view(-1, 1)[indices]) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() rollouts.states[0].copy_(rollouts.states[-1]) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". format(j, j * args.num_processes * args.num_steps, final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), -dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))
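# --- Illustrative example (not part of the original source) ---
# Stand-alone sketch of the PPO pessimistic surrogate (L^CLIP) computed in
# the ppo_epoch loops above; dummy tensors, and clip_param=0.2 is an
# assumption.
import torch

clip_param = 0.2
adv_targ = torch.randn(32, 1)
action_log_probs = torch.randn(32, 1, requires_grad=True)
old_action_log_probs = action_log_probs.detach() + 0.1 * torch.randn(32, 1)

ratio = torch.exp(action_log_probs - old_action_log_probs)
surr1 = ratio * adv_targ
surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ
action_loss = -torch.min(surr1, surr2).mean()
action_loss.backward()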
def main(): print('Preparing parameters') torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") print('Creating envs: {}'.format(args.env_name)) envs = test_mp_envs(args.env_name, args.num_processes) print('Creating network') actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) print('Initializing PPO') agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) print('Memory') rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = [] num_episodes = [0 for _ in range(args.num_processes)] last_index = 0 print('Starting ! ') start = time.time() for j in tqdm(range(num_updates)): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.masks[step]) obs, reward, done, infos = envs.step(action) for info_num, info in enumerate(infos): if info_num == 0: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # end_episode_to_viz(writer, info, info_num, num_episodes[info_num]) num_episodes[info_num] += 1 plot_rewards(episode_rewards, args) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) losses = agent.update(rollouts) rollouts.after_update()
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    if args.render_game:
        mp.set_start_method('spawn')

    torch.set_num_threads(1)

    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            if os.path.isfile(f):
                os.remove(f)

    if 'MiniPacman' in args.env_name:
        from environment_model.mini_pacman.builder import MiniPacmanEnvironmentBuilder
        builder = MiniPacmanEnvironmentBuilder(args)
    else:
        from environment_model.latent_space.builder import LatentSpaceEnvironmentBuilder
        builder = LatentSpaceEnvironmentBuilder(args)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        visdom_plotter = VisdomPlotterA2C(viz, args.algo == 'i2a')

    if 'MiniPacman' in args.env_name:
        from gym_envs.envs_mini_pacman import make_custom_env
        envs = [
            make_custom_env(args.env_name, args.seed, i, args.log_dir,
                            grey_scale=args.grey_scale)
            for i in range(args.num_processes)
        ]
    elif args.algo == 'i2a' or args.train_on_200x160_pixel:
        from gym_envs.envs_ms_pacman import make_env_ms_pacman
        envs = [
            make_env_ms_pacman(env_id=args.env_name, seed=args.seed, rank=i,
                               log_dir=args.log_dir, grey_scale=False,
                               stack_frames=1, skip_frames=4)
            for i in range(args.num_processes)
        ]
    else:
        from envs import make_env
        envs = [
            make_env(args.env_name, args.seed, i, args.log_dir,
                     args.add_timestep) for i in range(args.num_processes)
        ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # The original had two identical i2a branches (with and without
    # 'MiniPacman' in the env name); they are collapsed here.
    if args.algo == 'i2a':
        actor_critic = builder.build_i2a_model(envs, args)
    elif 'MiniPacman' in args.env_name:
        actor_critic = builder.build_a2c_model(envs)
    elif args.train_on_200x160_pixel:
        from a2c_models.atari_model import AtariModel
        actor_critic = A2C_PolicyWrapper(
            AtariModel(obs_shape=obs_shape,
                       action_space=envs.action_space.n,
                       use_cuda=args.cuda))
    else:
        actor_critic = Policy(obs_shape, envs.action_space,
                              args.recurrent_policy)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.load_model:
        load_path = os.path.join(args.save_dir, args.algo)
        load_path = os.path.join(load_path, args.env_name + ".pt")
        if os.path.isfile(load_path):
            saved_state = torch.load(load_path,
                                     map_location=lambda storage, loc: storage)
            actor_critic.load_state_dict(saved_state)
        else:
            print("Can not load model", load_path, ". File does not exist.")
            return

    log_file = os.path.join(os.path.join(args.save_dir, args.algo),
                            args.env_name + ".log")
    if not os.path.exists(log_file) or not args.load_model:
        print("Log file: ", log_file)
        with open(log_file, 'w') as the_file:
            the_file.write('command line args: ' + " ".join(sys.argv) + '\n')

    if args.cuda:
        actor_critic.cuda()

    if args.render_game:
        load_path = os.path.join(args.save_dir, args.algo)
        test_process = TestPolicy(model=copy.deepcopy(actor_critic),
                                  load_path=load_path, args=args)

    if args.algo == 'i2a':
        agent = I2A_ALGO(actor_critic=actor_critic, obs_shape=obs_shape,
                         action_shape=action_shape, args=args)
    elif args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    if args.algo == 'i2a':
        rollouts = I2A_RolloutStorage(args.num_steps, args.num_processes,
                                      obs_shape, envs.action_space,
                                      actor_critic.state_size)
    else:
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  obs_shape, envs.action_space,
                                  actor_critic.state_size)

    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            if args.algo == 'i2a':
                # Sample actions
                value, action, action_log_prob, states, policy_action_prob, rollout_action_prob = actor_critic.act(
                    rollouts.observations[step].clone(),
                    rollouts.states[step], rollouts.masks[step])
            else:
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, states = actor_critic.act(
                        rollouts.observations[step], rollouts.states[step],
                        rollouts.masks[step])

            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) if args.algo == "i2a": rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks, policy_action_prob, rollout_action_prob) else: rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo == 'i2a': value_loss, action_loss, dist_entropy, distill_loss = agent.update( rollouts=rollouts) else: value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if args.vis: distill_loss_data = distill_loss if args.algo == 'i2a' else None visdom_plotter.append(dist_entropy, final_rewards.numpy().flatten(), value_loss, action_loss, distill_loss_data) if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic torch.save(save_model.state_dict(), os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps reward_info = "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"\ .format(final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max()) distill_loss = ", distill_loss {:.5f}".format( distill_loss) if args.algo == 'i2a' else "" loss_info = "value loss {:.5f}, policy loss {:.5f}{}"\ .format(value_loss, action_loss, distill_loss) entropy_info = "entropy {:.5f}".format(dist_entropy) info = "Updates {}, num timesteps {}, FPS {}, {}, {}, {}, time {:.5f} min"\ .format(j, total_num_steps, int(total_num_steps / (end - start)), reward_info, entropy_info, loss_info, (end - start) / 60.) with open(log_file, 'a') as the_file: the_file.write(info + '\n') print(info) if args.vis and j % args.vis_interval == 0: frames = j * args.num_processes * args.num_steps visdom_plotter.plot(frames)
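# --- Illustrative example (not part of the original source) ---
# The i2a branch above reports a distill_loss. In the I2A paper
# (Racaniere et al., 2017) the internal rollout policy is trained by
# distillation, a cross-entropy between the full policy's action
# distribution and the rollout policy's. A hedged sketch of that idea;
# this repo's exact implementation may differ.
import torch
import torch.nn.functional as F

policy_logits = torch.randn(4, 6)                 # full (model-free) head
rollout_logits = torch.randn(4, 6, requires_grad=True)
target = F.softmax(policy_logits, dim=-1).detach()
distill_loss = -(target * F.log_softmax(rollout_logits, dim=-1)).sum(-1).mean()
distill_loss.backward()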
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_numel, envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (
                advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    # PPO's pessimistic surrogate (L^CLIP)
                    action_loss = -torch.min(surr1, surr2).mean()

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                envs.ob_rms if hasattr(envs, 'ob_rms') else None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
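# --- Aside: the PPO update above, in isolation. A hedged sketch of the
# clipped surrogate L^CLIP (Schulman et al., 2017); the function name and
# tensor arguments are illustrative, not the repo's API.
import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    # Probability ratio pi_new(a|s) / pi_old(a|s), computed in log space.
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # Pessimistic bound: element-wise minimum, then the mean over the batch.
    return -torch.min(surr1, surr2).mean()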
def main():
    config = None
    args = get_args()
    config, checkpoint = get_config_and_checkpoint(args)

    set_random_seeds(args, config)
    eval_log_dir = args.save_dir + "_eval"
    try:
        os.makedirs(args.save_dir)
        os.makedirs(eval_log_dir)
    except OSError:
        pass

    now = datetime.datetime.now()
    experiment_name = args.experiment_name + '_' + now.strftime(
        "%Y-%m-%d_%H-%M-%S")

    # Create checkpoint file
    save_dir_model = os.path.join(args.save_dir, 'model', experiment_name)
    save_dir_config = os.path.join(args.save_dir, 'config', experiment_name)
    try:
        os.makedirs(save_dir_model)
        os.makedirs(save_dir_config)
    except OSError as e:
        # The logger is only created further down, so fall back to printing.
        print(e)
        exit()

    if args.config:
        shutil.copy2(args.config, save_dir_config)

    # Tensorboard Logging
    writer = SummaryWriter(
        os.path.join(args.save_dir, 'tensorboard', experiment_name))

    # Logger that writes to STDOUT and a file in the save_dir
    logger = setup_carla_logger(args.save_dir, experiment_name)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    norm_reward = not config.no_reward_norm
    norm_obs = not config.no_obs_norm

    assert not (config.num_virtual_goals > 0) or (
        config.reward_class == 'SparseReward'), \
        "Can't use HER with dense reward"

    obs_converter = CarlaObservationConverter(
        h=84, w=84, rel_coord_system=config.rel_coord_system)
    action_converter = CarlaActionsConverter(config.action_type)
    envs = make_vec_envs(obs_converter, action_converter, args.starting_port,
                         config.seed, config.num_processes, config.gamma,
                         device, config.reward_class,
                         num_frame_stack=1,
                         subset=config.experiments_subset,
                         norm_reward=norm_reward,
                         norm_obs=norm_obs,
                         apply_her=config.num_virtual_goals > 0,
                         video_every=args.video_interval,
                         video_dir=os.path.join(args.save_dir, 'video',
                                                experiment_name))

    if config.agent == 'forward':
        agent = agents.ForwardCarla()
    elif config.agent == 'a2c':
        agent = agents.A2CCarla(obs_converter, action_converter,
                                config.value_loss_coef, config.entropy_coef,
                                lr=config.lr, eps=config.eps,
                                alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm)
    elif config.agent == 'acktr':
        agent = agents.A2CCarla(obs_converter, action_converter,
                                config.value_loss_coef, config.entropy_coef,
                                lr=config.lr, eps=config.eps,
                                alpha=config.alpha,
                                max_grad_norm=config.max_grad_norm,
                                acktr=True)
    elif config.agent == 'ppo':
        agent = agents.PPOCarla(obs_converter, action_converter,
                                config.clip_param, config.ppo_epoch,
                                config.num_mini_batch, config.value_loss_coef,
                                config.entropy_coef, lr=config.lr,
                                eps=config.eps,
                                max_grad_norm=config.max_grad_norm)

    if checkpoint is not None:
        load_modules(agent.optimizer, agent.model, checkpoint)

    rollouts = RolloutStorage(config.num_steps, config.num_processes,
                              envs.observation_space, envs.action_space, 20,
                              config.num_virtual_goals,
                              config.rel_coord_system, obs_converter)

    obs = envs.reset()

    # Save the first observation
    obs = obs_to_dict(obs)
    rollouts.obs = obs_to_dict(rollouts.obs)
    for k in rollouts.obs:
        rollouts.obs[k][rollouts.step + 1].copy_(obs[k])
    rollouts.obs = dict_to_obs(rollouts.obs)
    rollouts.to(device)

    start = time.time()

    total_steps = 0
    total_episodes = 0
    total_reward = 0

    episode_reward = torch.zeros(config.num_processes)

    for j in range(config.num_updates):
        for step in range(config.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = agent.act(
                    rollouts.get_obs(step),
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, info = envs.step(action)

            # For logging purposes
            carla_rewards = torch.tensor([i['carla-reward'] for i in info],
                                         dtype=torch.float)
            episode_reward += carla_rewards
            total_reward += carla_rewards.sum().item()
            total_steps += config.num_processes

            if done.any():
                total_episodes += done.sum()
                torch_done = torch.tensor(done.astype(int)).byte()
                mean_episode_reward = episode_reward[torch_done].mean().item()
                logger.info('{} episode(s) finished with reward {}'.format(
                    done.sum(), mean_episode_reward))
                writer.add_scalar('train/mean_ep_reward_vs_steps',
                                  mean_episode_reward, total_steps)
                writer.add_scalar('train/mean_ep_reward_vs_episodes',
                                  mean_episode_reward, total_episodes)
                episode_reward[torch_done] = 0

            # If done then clean the history of observations.
            masks = torch.FloatTensor(1 - done)

            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward,
                            masks.unsqueeze(-1))

        if config.num_virtual_goals > 0:
            rollouts.apply_her(config.num_virtual_goals, device,
                               beta=config.beta)

        with torch.no_grad():
            next_value = agent.get_value(
                rollouts.get_obs(-1),  # Get last observation
                rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, config.use_gae, config.gamma,
                                 config.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "" \
                and config.agent != 'forward':
            save_path = os.path.join(save_dir_model, str(j) + '.pth.tar')
            save_modules(agent.optimizer, agent.model, args, config,
                         save_path)

        total_num_steps = (j + 1) * config.num_processes * config.num_steps

        if j % args.log_interval == 0:
            # Logging to the stdout/our logs
            end = time.time()
            logger.info('------------------------------------')
            logger.info('Episodes {}, Updates {}, num timesteps {}, FPS {}'
                        .format(total_episodes, j + 1, total_num_steps,
                                total_num_steps / (end - start)))
            logger.info('------------------------------------')

            # Logging to tensorboard
            writer.add_scalar('train/cum_reward_vs_steps', total_reward,
                              total_steps)
            writer.add_scalar('train/cum_reward_vs_updates', total_reward,
                              j + 1)

            if config.agent in ['a2c', 'acktr', 'ppo']:
                writer.add_scalar('debug/value_loss_vs_steps', value_loss,
                                  total_steps)
                writer.add_scalar('debug/value_loss_vs_updates', value_loss,
                                  j + 1)
                writer.add_scalar('debug/action_loss_vs_steps', action_loss,
                                  total_steps)
                writer.add_scalar('debug/action_loss_vs_updates', action_loss,
                                  j + 1)
                writer.add_scalar('debug/dist_entropy_vs_steps', dist_entropy,
                                  total_steps)
                writer.add_scalar('debug/dist_entropy_vs_updates',
                                  dist_entropy, j + 1)

            # Sample the last reward
            writer.add_scalar('debug/sampled_normalized_reward_vs_steps',
                              reward.mean(), total_steps)
            writer.add_scalar('debug/sampled_normalized_reward_vs_updates',
                              reward.mean(), j + 1)
            writer.add_scalar('debug/sampled_carla_reward_vs_steps',
                              carla_rewards.mean(), total_steps)
            writer.add_scalar('debug/sampled_carla_reward_vs_updates',
                              carla_rewards.mean(), j + 1)

        if args.eval_interval is not None and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name, args.starting_port,
                                      obs_converter,
                                      args.x + config.num_processes,
                                      config.num_processes, config.gamma,
                                      eval_log_dir, config.add_timestep,
                                      device, True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(config.num_processes,
                                                       20, device=device)
            eval_masks = torch.zeros(config.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = agent.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                carla_obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done])
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            logger.info(
                " Evaluation using {} episodes: mean reward {:.5f}\n".format(
                    len(eval_episode_rewards), np.mean(eval_episode_rewards)))
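# --- Aside: every variant in this file calls compute_returns(next_value,
# use_gae, gamma, tau). A reference sketch of GAE(gamma, tau) under the
# convention that masks[t] == 0 when step t ended an episode; this is an
# illustration of the technique, not the RolloutStorage method itself.
import torch

def gae_returns(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    # rewards, values, masks: [T, N, 1]; next_value: [N, 1]
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], 0)  # [T+1, N, 1]
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        # TD error; the mask stops bootstrapping across episode boundaries.
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]
    return returns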
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:  # when using image observations
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        # win = plot_rewards(viz, win, reward_step, 'pyBullet Reacher')
        episodic_reward_graph.append(final_rewards.numpy()[0][0])

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            print('save model!!')
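# --- Aside: why masks.unsqueeze(2).unsqueeze(2)? With image observations,
# current_obs is [num_processes, C * num_stack, H, W] while masks is
# [num_processes, 1]; two extra singleton dims let the 0/1 mask broadcast
# over the spatial axes. A shape-only sketch with made-up sizes:
import torch

current_obs = torch.randn(4, 12, 84, 84)        # 4 envs, 4-frame RGB stack
masks = torch.FloatTensor([[1.0], [0.0], [1.0], [1.0]])
current_obs *= masks.unsqueeze(2).unsqueeze(2)  # [4, 1, 1, 1] broadcasts
assert current_obs[1].abs().sum() == 0          # finished env's stack cleared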
def main():
    args = get_args()
    args.num_processes = 16
    args.env_name = 'BreakoutNoFrameskip-v4'

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape, envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = A2C_ACKTR(actor_critic, args.value_loss_coef,
                          args.entropy_coef, lr=args.lr, eps=args.eps,
                          alpha=args.alpha,
                          max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda,
                                 args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
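# --- Aside: bad_masks flags TimeLimit truncations ('bad_transition') so that
# compute_returns(..., use_proper_time_limits=True) does not treat the cutoff
# as a real terminal state. Roughly, for one step of the backward recursion
# (a sketch of the idea with illustrative names, not the exact RolloutStorage
# code):
def one_step_return(reward, next_return, value_pred, mask, bad_mask,
                    gamma=0.99):
    # mask == 0 at true terminals: stop bootstrapping through them.
    ret = reward + gamma * next_return * mask
    # bad_mask == 0 at time-limit cutoffs: fall back to the critic's own
    # estimate instead of pretending the state had zero future value.
    return ret * bad_mask + (1 - bad_mask) * value_pred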
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    print(args.cuda)
    print(args.num_steps)
    print(args.num_processes)
    print(args.lr)
    print(args.eps)
    print(args.alpha)
    print(args.use_gae)
    print(args.gamma)
    print(args.tau)
    print(args.value_loss_coef)
    print(args.entropy_coef)

    # Create environment
    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    hparams = {'cuda': args.cuda,
               'num_steps': args.num_steps,
               'num_processes': args.num_processes,
               'obs_shape': obs_shape,
               'lr': args.lr,
               'eps': args.eps,
               'alpha': args.alpha,
               'use_gae': args.use_gae,
               'gamma': args.gamma,
               'tau': args.tau,
               'value_loss_coef': args.value_loss_coef,
               'entropy_coef': args.entropy_coef}

    # Create agent
    # agent = a2c(envs, hparams)
    # rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
    # It has a self.state that is [steps, processes, obs];
    # steps is used to compute the expected reward.

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'],
                              eps=hparams['eps'], alpha=hparams['alpha'])

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)

    # Init state
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    # Set the first state to the current state.
    rollouts.states[0].copy_(current_state)

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    # Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):

            # Act
            # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            # state: [nProcesses, ndims, height, width]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            # reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            # These final rewards are only used for printing,
            # but the mask is used in the storage: it clears the env that
            # finished and resets its episode_reward.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])  # if an env is done
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            # Update state
            update_current_state(state)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks)
            rollouts.insert(step, current_state, action.data, value.data,
                            reward, masks)

        # Optimize agent
        # agent.update()
        # Use the last state to make a prediction of the next value.
        next_value = actor_critic(
            Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            # Update the running observation-normalization statistics.
            actor_critic.obs_filter.update(
                rollouts.states[:-1].view(-1, *obs_shape))

        # This computes R_t = r_t + gamma * r_{t+1} + ... + V(s_T) for each step.
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # Note: these action log probs could have been computed and stored
        # during the rollout, and the value predictions were already stored
        # there.
        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.states[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        optimizer.step()

        # The first state is now the last state of the previous update.
        rollouts.states[0].copy_(rollouts.states[-1])

        # # Save model
        # if j % args.save_interval == 0 and args.save_dir != "":
        #     save_path = os.path.join(args.save_dir, args.algo)
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass
        #     # A really ugly way to save a model to CPU
        #     save_model = actor_critic
        #     if args.cuda:
        #         save_model = copy.deepcopy(actor_critic).cpu()
        #     torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        # Print updates
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            if j % (args.log_interval * 30) == 0:
                print("Upts, n_timesteps, min/med/mean/max, FPS, Time")

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                  format(j, total_num_steps,
                         final_rewards.min(),
                         final_rewards.median(),
                         final_rewards.mean(),
                         final_rewards.max(),
                         int(total_num_steps / (end - start)),
                         end - start))
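# --- Aside: the update in the script above reduces to a single A2C
# objective. A compact, runnable restatement with dummy tensors (shapes match
# the loop above; the coefficients 0.5 and 0.01 are illustrative):
import torch

num_steps, num_processes = 5, 16
returns = torch.randn(num_steps, num_processes, 1)
values = torch.randn(num_steps, num_processes, 1, requires_grad=True)
action_log_probs = torch.randn(num_steps, num_processes, 1,
                               requires_grad=True)
dist_entropy = torch.tensor(1.2, requires_grad=True)

advantages = returns - values
value_loss = advantages.pow(2).mean()
# Detach the advantages for the policy term so the actor's gradient does not
# flow through the critic.
action_loss = -(advantages.detach() * action_log_probs).mean()

(value_loss * 0.5 + action_loss - dist_entropy * 0.01).backward()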
class a2c(object):

    def __init__(self, hparams):

        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions)

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, self.n_actions)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                    lr=hparams['lr'], eps=hparams['eps'])

        self.hparams = hparams

    def act(self, current_state):
        # value: [P, 1], action: [P, 1], action_log_probs: [P, 1],
        # dist_entropy: [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy):
        self.rollouts.insert(step, current_state, action, value, reward,
                             masks, action_log_probs, dist_entropy)
        # if 'traj_action_mask' in self.hparams and self.hparams['traj_action_mask']:
        #     self.actor_critic.reset_mask(done)

    def update(self):
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        # Values, log probs, and entropies were stored during the rollout, so
        # there is no need to call evaluate_actions here.
        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef \
            - dist_entropy.mean() * self.entropy_coef
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(),
                                self.grad_clip)
        self.optimizer.step()

    def no_update(self):
        # Same as update(), but the gradients are computed and then discarded
        # instead of being applied.
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef \
            - dist_entropy.mean() * self.entropy_coef
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(),
                                self.grad_clip)
        # Deliberately no optimizer.step() here.
        # self.optimizer.step()
        self.optimizer.zero_grad()
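# --- Aside: clip_grad_norm was renamed clip_grad_norm_ in PyTorch >= 0.4 (it
# always clipped in place). A minimal demonstration of the global-norm
# clipping used in update() and no_update() above:
import torch
import torch.nn as nn

layer = nn.Linear(4, 2)
layer(torch.randn(8, 4)).pow(2).mean().backward()
total_norm = nn.utils.clip_grad_norm_(layer.parameters(), max_norm=0.5)
# Gradients are rescaled in place so their global L2 norm is at most 0.5.
print(float(total_norm))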