def reset_envs(storage_length):
    """Rebuild rollout storage and zeroed bookkeeping tensors for a fresh run.

    Relies on module-level globals: `num_tasks`, `num_processes_per_task`,
    `obs_shape`, `num_stack`, `envs`, `loss`, `cuda`, plus `RolloutStorage`
    and `update_current_obs`.

    Returns a 10-tuple: (rollouts, current_obs, episode_rewards,
    final_rewards, episode_length, final_length, episode_terminations,
    final_terminations, master_terminations, final_master_terminations).
    """
    rollouts = RolloutStorage(num_tasks, storage_length, num_processes_per_task,
                              obs_shape, envs.action_space, loss)
    current_obs = torch.zeros(num_tasks, num_processes_per_task, *obs_shape)
    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, num_stack,
                       num_tasks, num_processes_per_task)
    # Seed every task's rollout buffer with the initial observation.
    for task in range(num_tasks):
        rollouts.obs[task, 0].copy_(current_obs[task])
    if cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    def stat_tensor():
        # One scalar accumulator per (task, process) pair, used below to
        # track running and finished-episode statistics.
        return torch.zeros([num_tasks, num_processes_per_task, 1])

    episode_rewards = stat_tensor()
    final_rewards = stat_tensor()
    episode_length = stat_tensor()
    final_length = stat_tensor()
    episode_terminations = stat_tensor()
    final_terminations = stat_tensor()
    master_terminations = stat_tensor()
    final_master_terminations = stat_tensor()
    return (rollouts, current_obs, episode_rewards, final_rewards,
            episode_length, final_length, episode_terminations,
            final_terminations, master_terminations,
            final_master_terminations)
def __init__(self, envs, hparams):
    """A2C agent setup: read hyperparameters, build the policy network and
    rollout storage, and create the optimizer named by hparams['opt']
    ('rms' | 'adam' | 'sgd').
    """
    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    self.opt = hparams['opt']
    self.grad_clip = hparams['grad_clip']
    # Policy choice: dropout CNN when requested, plain CNN for image
    # (rank-3) observations, MLP fallback otherwise.
    if hparams['dropout'] == True:
        print ('CNNPolicy_dropout2')
        actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
        # actor_critic = CNNPolicy_dropout(self.obs_shape[0], envs.action_space)
    elif len(envs.observation_space.shape) == 3:
        print ('CNNPolicy2')
        actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
        # actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)
    # Discrete actions are stored as a single index; continuous actions
    # need one slot per action dimension.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    self.action_shape = action_shape
    rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
    #it has a self.state that is [steps, processes, obs]
    #steps is used to compute expected reward
    if self.cuda:
        actor_critic.cuda()
        rollouts.cuda()
    if self.opt == 'rms':
        self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
    elif self.opt == 'adam':
        self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
    elif self.opt == 'sgd':
        self.optimizer = optim.SGD(params=actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
    else:
        # NOTE(review): no optimizer is created in this branch, so
        # self.optimizer stays unset and later use would raise.
        print ('no opt specified')
    self.actor_critic = actor_critic
    self.rollouts = rollouts
    # Auxiliary list-backed rollout storage.
    self.rollouts_list = RolloutStorage_list()
def __init__(self, envs, hparams):
    """Construct an A2C agent: copy hyperparameters onto the instance,
    build the policy network and rollout storage, and instantiate the
    optimizer selected by hparams['opt'] ('rms' | 'adam' | 'sgd')."""
    for key in ('use_gae', 'gamma', 'tau', 'obs_shape', 'num_steps',
                'num_processes', 'value_loss_coef', 'entropy_coef',
                'cuda', 'opt', 'grad_clip'):
        setattr(self, key, hparams[key])
    # Policy choice: dropout CNN when requested, plain CNN for image
    # (rank-3) observations, MLP fallback otherwise.
    if hparams['dropout'] == True:
        print ('CNNPolicy_dropout2')
        policy = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
    elif len(envs.observation_space.shape) == 3:
        print ('CNNPolicy2')
        policy = CNNPolicy2(self.obs_shape[0], envs.action_space)
    else:
        policy = MLPPolicy(self.obs_shape[0], envs.action_space)
    # One stored slot for a discrete action index, or one per continuous
    # action dimension.
    discrete = envs.action_space.__class__.__name__ == "Discrete"
    self.action_shape = 1 if discrete else envs.action_space.shape[0]
    # Rollout buffer; its state tensor is [steps, processes, obs] and the
    # steps axis is used to compute expected reward.
    storage = RolloutStorage(self.num_steps, self.num_processes,
                             self.obs_shape, envs.action_space)
    if self.cuda:
        policy.cuda()
        storage.cuda()
    if self.opt == 'rms':
        self.optimizer = optim.RMSprop(params=policy.parameters(),
                                       lr=hparams['lr'],
                                       eps=hparams['eps'],
                                       alpha=hparams['alpha'])
    elif self.opt == 'adam':
        self.optimizer = optim.Adam(params=policy.parameters(),
                                    lr=hparams['lr'],
                                    eps=hparams['eps'])
    elif self.opt == 'sgd':
        self.optimizer = optim.SGD(params=policy.parameters(),
                                   lr=hparams['lr'],
                                   momentum=hparams['mom'])
    else:
        print ('no opt specified')
    self.actor_critic = policy
    self.rollouts = storage
    # Auxiliary list-backed rollout storage.
    self.rollouts_list = RolloutStorage_list()
def __init__(self, envs, hparams):
    """PPO agent setup: read hyperparameters, build the policy network and
    rollout storage, create the Adam optimizer, and keep a frozen copy of
    the policy for the PPO probability ratio.
    """
    self.use_gae = hparams['use_gae']
    self.gamma = hparams['gamma']
    self.tau = hparams['tau']
    self.obs_shape = hparams['obs_shape']
    self.num_steps = hparams['num_steps']
    self.num_processes = hparams['num_processes']
    self.value_loss_coef = hparams['value_loss_coef']
    self.entropy_coef = hparams['entropy_coef']
    self.cuda = hparams['cuda']
    # PPO-specific knobs.
    self.ppo_epoch = hparams['ppo_epoch']
    self.batch_size = hparams['batch_size']
    self.clip_param = hparams['clip_param']
    # CNN for image (rank-3) observations, MLP otherwise.
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)
    # Discrete actions are stored as a single index; continuous actions
    # need one slot per action dimension.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    self.action_shape = action_shape
    rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
    #it has a self.state that is [steps, processes, obs]
    #steps is used to compute expected reward
    if self.cuda:
        actor_critic.cuda()
        rollouts.cuda()
    self.eps = hparams['eps']
    # self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
    self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
    # if hparams['lr_schedule'] == 'linear':
    # Endpoints kept for a linear LR schedule (the schedule itself is
    # currently disabled — see commented lines below).
    self.init_lr = hparams['lr']
    self.final_lr = hparams['final_lr']
    # lr_func = lambda epoch: max( init_lr*(1.-(epoch/500.)), final_lr)
    # self.optimizer2 = lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lr_func)
    # self.current_lr = hparams['lr']
    self.actor_critic = actor_critic
    self.rollouts = rollouts
    # Frozen snapshot of the policy, used to compute the "old" action
    # log-probabilities in the PPO ratio.
    self.old_model = copy.deepcopy(self.actor_critic)
def __init__(self, envs, hparams):
    """Set up a PPO agent: copy hyperparameters onto the instance, build
    the policy and rollout storage, create the Adam optimizer, and keep a
    frozen policy copy for the PPO probability ratio."""
    for key in ('use_gae', 'gamma', 'tau', 'obs_shape', 'num_steps',
                'num_processes', 'value_loss_coef', 'entropy_coef',
                'cuda', 'ppo_epoch', 'batch_size', 'clip_param'):
        setattr(self, key, hparams[key])
    # CNN for image (rank-3) observations, MLP otherwise.
    if len(envs.observation_space.shape) == 3:
        policy = CNNPolicy(self.obs_shape[0], envs.action_space)
    else:
        policy = MLPPolicy(self.obs_shape[0], envs.action_space)
    # One stored slot for a discrete action index, or one per continuous
    # action dimension.
    discrete = envs.action_space.__class__.__name__ == "Discrete"
    self.action_shape = 1 if discrete else envs.action_space.shape[0]
    # Rollout buffer; its state tensor is [steps, processes, obs] and the
    # steps axis is used to compute expected reward.
    storage = RolloutStorage(self.num_steps, self.num_processes,
                             self.obs_shape, envs.action_space)
    if self.cuda:
        policy.cuda()
        storage.cuda()
    self.eps = hparams['eps']
    self.optimizer = optim.Adam(params=policy.parameters(),
                                lr=hparams['lr'], eps=hparams['eps'])
    # Endpoints kept for a (currently unused) linear LR schedule.
    self.init_lr = hparams['lr']
    self.final_lr = hparams['final_lr']
    self.actor_critic = policy
    self.rollouts = storage
    # Frozen snapshot used for the "old" log-probs in the PPO ratio.
    self.old_model = copy.deepcopy(self.actor_critic)
def __init__(self, envs, cuda, num_steps, num_processes, obs_shape, lr, eps, alpha, use_gae, gamma, tau, value_loss_coef, entropy_coef):
    """A2C agent setup from explicit arguments: build the policy network,
    the RMSprop optimizer, and the rollout storage.

    Args mirror the usual A2C hyperparameters; `cuda` moves both the
    policy and the rollout storage to the GPU.
    """
    # CNN for image (rank-3) observations, MLP otherwise.
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    # Discrete actions are stored as a single index; continuous actions
    # need one slot per action dimension.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    if cuda:
        actor_critic.cuda()
    # BUG FIX: torch.optim.RMSprop's positional signature is
    # (params, lr, alpha, eps, ...). The previous positional call
    # RMSprop(params, lr, eps, alpha) silently assigned eps to alpha and
    # alpha to eps. Keyword arguments make the mapping explicit, matching
    # the keyword style used by the other agent constructors in this file.
    self.optimizer = optim.RMSprop(actor_critic.parameters(), lr=lr, eps=eps, alpha=alpha)
    rollouts = RolloutStorage(num_steps, num_processes, obs_shape, envs.action_space)
    #it has a self.state that is [steps, processes, obs]
    #steps is used to compute expected reward
    if cuda:
        rollouts.cuda()
    self.actor_critic = actor_critic
    self.rollouts = rollouts
    self.use_gae = use_gae
    self.gamma = gamma
    self.tau = tau
    self.obs_shape = obs_shape
    self.action_shape = action_shape
    self.num_steps = num_steps
    self.num_processes = num_processes
    self.value_loss_coef = value_loss_coef
    self.entropy_coef = entropy_coef
def main():
    """PPO training loop for the USB-camera environment.

    Builds the environment, policy, and rollout storage, then alternates
    between collecting `args.num_steps` of experience and running
    `args.ppo_epoch` epochs of clipped-surrogate minibatch updates.
    Depends on module-level `args`, `num_updates`, `env_done_reward`,
    `search_done_reward`, and the env/policy/storage classes.
    """
    os.environ['OMP_NUM_THREADS'] = '1'
    envs = UsbCamEnv(ENV_IMG_W, ENV_IMG_H, env_done_reward)
    obs_shape = envs.observation_space.shape
    # Stack `num_stack` frames along the leading (channel) dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    action_shape = envs.action_space.shape[0]
    print('+++++++++++++++++++++++++++++++++++++')
    print('obs_shape:', obs_shape)
    print('action_shape:', action_shape)
    print('+++++++++++++++++++++++++++++++++++++')
    if args.cuda:
        actor_critic.cuda()
    optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        # Shift the frame stack left and write the newest observation into
        # the last `shape_dim0` channels (mutates `current_state` in place).
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)
    rollouts.states[0].copy_(current_state)
    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()
    # Snapshot of the policy used for the "old" log-probs in the PPO ratio.
    old_model = copy.deepcopy(actor_critic)
    for j in range(num_updates):
        # ---- Rollout collection ----
        for step in range(args.num_steps):
            # Sample actions (volatile: inference only, no autograd graph).
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.cpu().numpy()
            # Observe reward and next state.
            state, reward, done, info = envs.step(cpu_actions)
            print('%3d [%3d %3d %3d %3d] %3d' % (step,
                int(envs.convert_2_real_action(cpu_actions)[0, 0]),
                int(envs.convert_2_real_action(cpu_actions)[0, 1]),
                int(envs.convert_2_real_action(cpu_actions)[0, 2]),
                int(envs.convert_2_real_action(cpu_actions)[0, 3]),
                reward[0]))
            # Stop the whole search once the target reward is reached.
            if reward[0] >= search_done_reward:
                sys.exit()
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            # Move finished episodes' running totals into final_rewards and
            # reset the running totals.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            # Zero the stacked frames of finished episodes (4-D = image obs).
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks
            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)
        # ---- Returns and advantages ----
        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data
        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        # Normalize advantages; 1e-5 guards against a zero std.
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
        old_model.load_state_dict(actor_critic.state_dict())
        if hasattr(actor_critic, 'obs_filter'):
            old_model.obs_filter = actor_critic.obs_filter
        # ---- PPO epochs over shuffled minibatches ----
        for _ in range(args.ppo_epoch):
            sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)),
                                   args.batch_size * args.num_processes, drop_last=False)
            for indices in sampler:
                indices = torch.LongTensor(indices)
                if args.cuda:
                    indices = indices.cuda()
                states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                return_batch = rollouts.returns[:-1].view(-1, 1)[indices]
                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))
                _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))
                ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                adv_targ = Variable(advantages.view(-1, 1)[indices])
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
                value_loss = (Variable(return_batch) - values).pow(2).mean()
                optimizer.zero_grad()
                (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                optimizer.step()
        # Carry the last state over as the seed of the next rollout.
        rollouts.states[0].copy_(rollouts.states[-1])
        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
        if j % args.log_interval == 0:
            print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, j * args.num_processes * args.num_steps,
                         final_rewards.mean(), final_rewards.median(),
                         final_rewards.min(), final_rewards.max(),
                         -dist_entropy.data[0], value_loss.data[0],
                         action_loss.data[0]))
class a2c(object):
    """A2C agent: wraps the policy/value network, rollout storage, and the
    optimizer selected by hparams['opt'] ('rms' | 'adam' | 'sgd')."""

    def __init__(self, envs, hparams):
        """Read hyperparameters, build the policy (optionally with a
        trajectory action mask) and rollout storage, and create the
        optimizer."""
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        # Policy and Value network.
        if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
            self.actor_critic = CNNPolicy_trajectory_action_mask(
                self.obs_shape[0], envs.action_space)
        else:
            self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        # Storing rollouts.
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, envs.action_space)
        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()
        # Optimizer.
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(), lr=hparams['lr'],
                eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print('no opt specified')
        # Actions are stored as a single index (discrete action spaces).
        self.action_shape = 1
        if hparams['gif_'] or hparams['ls_']:
            self.rollouts_list = RolloutStorage_list()
        self.hparams = hparams

    def act(self, current_state):
        """Run the policy on `current_state`; returns (value, action,
        action_log_probs, dist_entropy)."""
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        """Seed the rollout buffer with the initial observation."""
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, done=None):
        """Record one environment step in the rollout buffer.

        BUG FIX: the body referenced `done` without it being a parameter
        (the old signature comment `#, done` shows it was dropped), so any
        run with hparams['traj_action_mask'] set raised NameError. `done`
        is now an optional argument; existing callers are unaffected.
        """
        self.rollouts.insert(step, current_state, action, value, reward,
                             masks, action_log_probs, dist_entropy)
        if 'traj_action_mask' in self.hparams and self.hparams[
                'traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
        """One A2C update: bootstrap returns, form advantages, backprop the
        combined actor/critic/entropy cost, clip gradients, and step."""
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)
        # Per-step predictions were appended as lists during the rollout;
        # stack them back into [steps, processes, 1] tensors.
        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)
        # Reset the per-step lists for the next rollout.
        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # Advantages are detached (.data) so the policy gradient does not
        # flow through the value baseline.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()
        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean(
        ) * self.entropy_coef
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)
        self.optimizer.step()
class a2c(object):
    """A2C agent (n_actions variant): CNN policy, rollout storage, and an
    Adam optimizer, all configured from a hyperparameter dict."""

    def __init__(self, hparams):
        """Read hyperparameters and build the policy, rollout storage and
        optimizer."""
        self.obs_shape = hparams['obs_shape']
        self.n_actions = hparams['n_actions']
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        self.actor_critic = CNNPolicy(self.obs_shape[0], self.n_actions)
        # Storing rollouts.
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, self.n_actions)
        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()
        self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                    lr=hparams['lr'], eps=hparams['eps'])
        self.hparams = hparams

    def act(self, current_state):
        """Run the policy on `current_state`; returns (value, action,
        action_log_probs, dist_entropy)."""
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        """Seed the rollout buffer with the initial observation."""
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy):
        """Record one environment step in the rollout buffer."""
        self.rollouts.insert(step, current_state, action, value, reward,
                             masks, action_log_probs, dist_entropy)

    def _backward(self):
        """Shared loss computation for update()/no_update() (was duplicated
        verbatim in both): bootstrap returns, form advantages, zero grads,
        backprop the combined cost, and clip gradients. Callers decide
        whether to apply or discard the step."""
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)
        # Per-step predictions were appended as lists during the rollout;
        # stack them back into [steps, processes, 1] tensors.
        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)
        # Reset the per-step lists for the next rollout.
        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # Advantages are detached (.data) so the policy gradient does not
        # flow through the value baseline.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()
        self.optimizer.zero_grad()
        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean() * self.entropy_coef
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(), self.grad_clip)

    def update(self):
        """Compute the A2C loss and apply one optimizer step."""
        self._backward()
        self.optimizer.step()

    def no_update(self):
        """Compute the loss and gradients, then discard them without
        stepping (gradients are zeroed instead of applied). Note it still
        consumes and clears the rollout's per-step lists, presumably for
        diagnostics — confirm with callers."""
        self._backward()
        self.optimizer.zero_grad()
def main():
    """A2C training loop with optional learned reward predictor.

    Builds vectorized environments, a CNN policy (single- or multi-head),
    and rollout storage, then alternates experience collection with A2C
    updates, periodically saving and logging. Depends on module-level
    `args`, `num_updates`, and the env/policy/storage helpers.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")
    if args.run_index is not None:
        load_params(args)
    try:
        os.makedirs(args.log_dir)
    except OSError:
        # Log dir already exists: clear out stale monitor files.
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)
    # Seed every RNG in play for reproducibility.
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    os.environ['OMP_NUM_THREADS'] = '1'
    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    # Rank-1 observations get observation/return normalization.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)
    obs_shape = envs.observation_space.shape
    # Stack `num_stack` frames along the leading (channel) dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    # Without the reward predictor there is one value head per gamma.
    num_heads = 1 if args.reward_predictor else len(args.gamma)
    assert len(envs.observation_space.shape) == 3
    actor_critic = CNNPolicy(obs_shape[0],
                             envs.action_space,
                             use_rp=args.reward_predictor,
                             num_heads=num_heads)
    assert envs.action_space.__class__.__name__ == "Discrete"
    action_shape = 1
    if args.cuda:
        actor_critic.cuda()
    if not args.reward_predictor:
        model_params = actor_critic.parameters()
    else:
        # Separate learning rates for the reward-predictor and policy
        # parameter groups.
        lrs = [args.lr_rp, args.lr]
        model_params = [{
            'params': model_p,
            'lr': p_lr
        } for model_p, p_lr in zip(actor_critic.param_groups, lrs)]
    optimizer = optim.RMSprop(model_params,
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              obs_shape,
                              envs.action_space,
                              actor_critic.state_size,
                              gamma=args.gamma,
                              use_rp=args.reward_predictor)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the frame stack left and write the newest observation into
        # the last `shape_dim0` channels (mutates `current_obs` in place).
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)
    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()
    start = time.time()
    for j in range(num_updates):
        # ---- Rollout collection ----
        for step in range(args.num_steps):
            # Sample actions (volatile: inference only, no autograd graph).
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, raw_reward, done, info = envs.step(cpu_actions)
            # Optionally corrupt the observed reward with Gaussian noise.
            if args.reward_noise > 0.0:
                stds = np.ones(raw_reward.shape) * args.reward_noise
                noise = np.random.normal(loc=0.0, scale=stds)
                reward = raw_reward + noise
            else:
                reward = raw_reward
            # Episode statistics always track the clean (raw) reward.
            raw_reward = torch.from_numpy(
                np.expand_dims(np.stack(raw_reward), 1)).float()
            episode_rewards += raw_reward
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            if args.reward_predictor:
                # Blend observed reward with the predicted reward, ramping
                # toward the prediction over `rp_burn_in` updates.
                p_hat = min(args.rp_burn_in, j) / args.rp_burn_in
                estimate_reward = (
                    1 - p_hat
                ) * reward + p_hat * value[:, 0].unsqueeze(-1).data.cpu()
                reward = torch.cat([reward, estimate_reward], dim=-1)
                value = value.data
            else:
                value = value.data
            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # Move finished episodes' running totals into final_rewards and
            # reset the running totals.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            # Zero the stacked frames of finished episodes (4-D = image obs).
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value, reward, masks)
        # ---- A2C update ----
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data
        rollouts.compute_returns(next_value)
        states = Variable(rollouts.states[0].view(-1, actor_critic.state_size))
        masks = Variable(rollouts.masks[:-1].view(-1, 1))
        obs = Variable(rollouts.observations[:-1].view(-1, *obs_shape))
        actions = Variable(rollouts.actions.view(-1, action_shape))
        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            obs, states, masks, actions)
        returns_as_variable = Variable(rollouts.returns[:-1])
        values = values.view(returns_as_variable.size())
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = returns_as_variable - values
        # Value loss sums over heads before averaging; the policy gradient
        # uses only the last head's (detached) advantage.
        value_loss = advantages.pow(2).sum(-1).mean()
        action_loss = -(Variable(advantages[:, :, -1].unsqueeze(-1).data) *
                        action_log_probs).mean()
        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()
        rollouts.after_update()
        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, 'a2c')
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            # Save the observation-normalization stats alongside the model.
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(),
                    dist_entropy.data[0], value_loss.data[0],
                    action_loss.data[0]))
def main():
    """Entry point: configure, train, and checkpoint an RL agent.

    Parses command-line args plus YAML option files, prepares (or resumes
    from) the log directory, builds vectorized environments and the
    policy/agent for a2c/ppo/acktr, then runs the rollout/update loop with
    CSV, console, and visdom dashboard logging.
    """
    global args
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    args.vis = not args.no_vis

    # Set options
    # NOTE(review): 'options' stays unbound if args.path_opt is None — later
    # reads would raise NameError; presumably the flag is required in practice.
    if args.path_opt is not None:
        with open(args.path_opt, 'r') as handle:
            # NOTE(review): yaml.load without an explicit Loader is unsafe on
            # untrusted files and deprecated in PyYAML >= 5.1 — confirm the
            # option files are trusted.
            options = yaml.load(handle)
    if args.vis_path_opt is not None:
        with open(args.vis_path_opt, 'r') as handle:
            vis_options = yaml.load(handle)
    print('## args')
    pprint(vars(args))
    print('## options')
    pprint(options)

    # Put alg_%s and optim_%s to alg and optim depending on commandline
    options['use_cuda'] = args.cuda
    options['trial'] = args.trial
    options['alg'] = options['alg_%s' % args.algo]
    options['optim'] = options['optim_%s' % args.algo]
    alg_opt = options['alg']
    alg_opt['algo'] = args.algo
    model_opt = options['model']
    env_opt = options['env']
    env_opt['env-name'] = args.env_name
    log_opt = options['logs']
    optim_opt = options['optim']
    # Propagate env settings the model needs (time scale, theta space).
    model_opt['time_scale'] = env_opt['time_scale']
    if model_opt['mode'] in ['baselinewtheta', 'phasewtheta']:
        model_opt['theta_space_mode'] = env_opt['theta_space_mode']
        model_opt['theta_sz'] = env_opt['theta_sz']
    elif model_opt['mode'] in ['baseline_lowlevel', 'phase_lowlevel']:
        model_opt['theta_space_mode'] = env_opt['theta_space_mode']

    # Check asserts
    assert (model_opt['mode'] in [
        'baseline', 'baseline_reverse', 'phasesimple', 'phasewstate',
        'baselinewtheta', 'phasewtheta', 'baseline_lowlevel',
        'phase_lowlevel', 'interpolate', 'cyclic', 'maze_baseline',
        'maze_baseline_wphase'
    ])
    assert (args.algo in ['a2c', 'ppo', 'acktr'])
    if model_opt['recurrent_policy']:
        assert args.algo in ['a2c', 'ppo'
                             ], 'Recurrent policy is not implemented for ACKTR'

    # Set seed - just make the seed the trial number
    seed = args.trial
    torch.manual_seed(seed)
    if args.cuda:
        torch.cuda.manual_seed(seed)

    # Initialization
    num_updates = int(optim_opt['num_frames']
                      ) // alg_opt['num_steps'] // alg_opt['num_processes']
    torch.set_num_threads(1)

    # Print warning
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    # Set logging / load previous checkpoint
    logpath = os.path.join(log_opt['log_base'], model_opt['mode'],
                           log_opt['exp_name'], args.algo, args.env_name,
                           'trial%d' % args.trial)
    if len(args.resume) > 0:
        assert (os.path.isfile(os.path.join(logpath, args.resume)))
        ckpt = torch.load(os.path.join(logpath, 'ckpt.pth.tar'))
        start_update = ckpt['update_count']
    else:
        # Make directory, check before overwriting
        if os.path.isdir(logpath):
            # NOTE(review): default=False is an argument to str.format here,
            # not to click.confirm — if a non-interactive default was
            # intended it should be click.confirm(..., default=False).
            if click.confirm(
                    'Logs directory already exists in {}. Erase?'.format(
                        logpath, default=False)):
                os.system('rm -rf ' + logpath)
            else:
                return
        os.system('mkdir -p ' + logpath)
        start_update = 0

    # Save options and args
    with open(os.path.join(logpath, os.path.basename(args.path_opt)),
              'w') as f:
        yaml.dump(options, f, default_flow_style=False)
    with open(os.path.join(logpath, 'args.yaml'), 'w') as f:
        yaml.dump(vars(args), f, default_flow_style=False)

    # Save git info as well
    os.system('git status > %s' % os.path.join(logpath, 'git_status.txt'))
    os.system('git diff > %s' % os.path.join(logpath, 'git_diff.txt'))
    os.system('git show > %s' % os.path.join(logpath, 'git_show.txt'))

    # Set up plotting dashboard
    dashboard = Dashboard(options,
                          vis_options,
                          logpath,
                          vis=args.vis,
                          port=args.port)

    # If interpolate mode, choose states
    if options['model']['mode'] == 'phase_lowlevel' and options['env'][
            'theta_space_mode'] == 'pretrain_interp':
        all_states = torch.load(env_opt['saved_state_file'])
        s1 = random.choice(all_states)
        s2 = random.choice(all_states)
        fixed_states = [s1, s2]
    elif model_opt['mode'] == 'interpolate':
        all_states = torch.load(env_opt['saved_state_file'])
        s1 = all_states[env_opt['s1_ind']]
        s2 = all_states[env_opt['s2_ind']]
        fixed_states = [s1, s2]
    else:
        fixed_states = None

    # Create environments
    # dummy_env is a single un-vectorized copy used to query env internals
    # (theta_sz, observation masks, spec) that the vec wrappers hide.
    dummy_env = make_env(args.env_name, seed, 0, logpath, options,
                         args.verbose)
    dummy_env = dummy_env()
    envs = [
        make_env(args.env_name, seed, i, logpath, options, args.verbose,
                 fixed_states) for i in range(alg_opt['num_processes'])
    ]
    if alg_opt['num_processes'] > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Get theta_sz for models (if applicable)
    dummy_env.reset()
    if model_opt['mode'] == 'baseline_lowlevel':
        model_opt['theta_sz'] = dummy_env.env.theta_sz
    elif model_opt['mode'] == 'phase_lowlevel':
        model_opt['theta_sz'] = dummy_env.env.env.theta_sz
    if 'theta_sz' in model_opt:
        env_opt['theta_sz'] = model_opt['theta_sz']

    # Get observation shape (channels multiplied by the frame-stack count)
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * env_opt['num_stack'], *obs_shape[1:])

    # Do vec normalize, but mask out what we don't want altered
    # (timestep feature and any appended theta one-hot stay un-normalized).
    if len(envs.observation_space.shape) == 1:
        ignore_mask = np.zeros(envs.observation_space.shape)
        if env_opt['add_timestep']:
            ignore_mask[-1] = 1
        if model_opt['mode'] in [
                'baselinewtheta', 'phasewtheta', 'baseline_lowlevel',
                'phase_lowlevel'
        ]:
            theta_sz = env_opt['theta_sz']
            if env_opt['add_timestep']:
                ignore_mask[-(theta_sz + 1):] = 1
            else:
                ignore_mask[-theta_sz:] = 1
        if args.finetune_baseline:
            # When fine-tuning, freeze normalization on the proprioceptive
            # part so statistics carried over from the baseline stay valid.
            ignore_mask = dummy_env.unwrapped._get_obs_mask()
            freeze_mask, _ = dummy_env.unwrapped._get_pro_ext_mask()
            if env_opt['add_timestep']:
                ignore_mask = np.concatenate([ignore_mask, [1]])
                freeze_mask = np.concatenate([freeze_mask, [0]])
            ignore_mask = (ignore_mask + freeze_mask > 0).astype(float)
            envs = ObservationFilter(envs,
                                     ret=alg_opt['norm_ret'],
                                     has_timestep=True,
                                     noclip=env_opt['step_plus_noclip'],
                                     ignore_mask=ignore_mask,
                                     freeze_mask=freeze_mask,
                                     time_scale=env_opt['time_scale'],
                                     gamma=env_opt['gamma'])
        else:
            envs = ObservationFilter(envs,
                                     ret=alg_opt['norm_ret'],
                                     has_timestep=env_opt['add_timestep'],
                                     noclip=env_opt['step_plus_noclip'],
                                     ignore_mask=ignore_mask,
                                     time_scale=env_opt['time_scale'],
                                     gamma=env_opt['gamma'])

    # Set up algo monitoring (CSV of per-update losses)
    alg_filename = os.path.join(logpath, 'Alg.Monitor.csv')
    alg_f = open(alg_filename, "wt")
    alg_f.write('# Alg Logging %s\n' % json.dumps({
        "t_start": time.time(),
        'env_id': dummy_env.spec and dummy_env.spec.id,
        'mode': options['model']['mode'],
        'name': options['logs']['exp_name']
    }))
    alg_fields = ['value_loss', 'action_loss', 'dist_entropy']
    alg_logger = csv.DictWriter(alg_f, fieldnames=alg_fields)
    alg_logger.writeheader()
    alg_f.flush()

    # Create the policy network
    actor_critic = Policy(obs_shape, envs.action_space, model_opt)
    if args.cuda:
        actor_critic.cuda()

    # Create the agent
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               alg_opt['value_loss_coef'],
                               alg_opt['entropy_coef'],
                               lr=optim_opt['lr'],
                               eps=optim_opt['eps'],
                               alpha=optim_opt['alpha'],
                               max_grad_norm=optim_opt['max_grad_norm'])
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         alg_opt['clip_param'],
                         alg_opt['ppo_epoch'],
                         alg_opt['num_mini_batch'],
                         alg_opt['value_loss_coef'],
                         alg_opt['entropy_coef'],
                         lr=optim_opt['lr'],
                         eps=optim_opt['eps'],
                         max_grad_norm=optim_opt['max_grad_norm'])
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               alg_opt['value_loss_coef'],
                               alg_opt['entropy_coef'],
                               acktr=True)
    rollouts = RolloutStorage(alg_opt['num_steps'], alg_opt['num_processes'],
                              obs_shape, envs.action_space,
                              actor_critic.state_size)
    current_obs = torch.zeros(alg_opt['num_processes'], *obs_shape)

    # Update agent with loaded checkpoint
    if len(args.resume) > 0:
        # This should update both the policy network and the optimizer
        agent.load_state_dict(ckpt['agent'])
        # Set ob_rms
        envs.ob_rms = ckpt['ob_rms']
    elif len(args.other_resume) > 0:
        ckpt = torch.load(args.other_resume)
        # This should update both the policy network
        agent.actor_critic.load_state_dict(ckpt['agent']['model'])
        # Set ob_rms
        envs.ob_rms = ckpt['ob_rms']
    elif args.finetune_baseline:
        # Load the model based on the trial number
        ckpt_base = options['lowlevel']['ckpt']
        ckpt_file = ckpt_base + '/trial%d/ckpt.pth.tar' % args.trial
        ckpt = torch.load(ckpt_file)
        # Make "input mask" that tells the model which inputs were the same
        # from before and should be copied
        oldinput_mask, _ = dummy_env.unwrapped._get_pro_ext_mask()
        # This should update both the policy network
        agent.actor_critic.load_state_dict_special(ckpt['agent']['model'],
                                                   oldinput_mask)
        # Set ob_rms
        old_rms = ckpt['ob_rms']
        old_size = old_rms.mean.size
        if env_opt['add_timestep']:
            old_size -= 1
        # Only copy the pro state part of it
        envs.ob_rms.mean[:old_size] = old_rms.mean[:old_size]
        envs.ob_rms.var[:old_size] = old_rms.var[:old_size]

    # Inline define our helper function for updating obs
    # (mutates the enclosing current_obs in place: shifts the frame stack
    # left and writes the newest observation into the last slot).
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if env_opt['num_stack'] > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    # Reset our env and rollouts
    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)
    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([alg_opt['num_processes'], 1])
    final_rewards = torch.zeros([alg_opt['num_processes'], 1])

    # Update loop
    start = time.time()
    for j in range(start_update, num_updates):
        for step in range(alg_opt['num_steps']):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            #pdb.set_trace()
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        # Update model and rollouts
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()
        rollouts.compute_returns(next_value, alg_opt['use_gae'],
                                 env_opt['gamma'], alg_opt['gae_tau'])
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # Add algo updates here
        alg_info = {}
        alg_info['value_loss'] = value_loss
        alg_info['action_loss'] = action_loss
        alg_info['dist_entropy'] = dist_entropy
        alg_logger.writerow(alg_info)
        alg_f.flush()

        # Save checkpoints
        total_num_steps = (j +
                           1) * alg_opt['num_processes'] * alg_opt['num_steps']
        #save_interval = log_opt['save_interval'] * alg_opt['log_mult']
        save_interval = 100
        if j % save_interval == 0:
            # Save all of our important information
            save_checkpoint(logpath,
                            agent,
                            envs,
                            j,
                            total_num_steps,
                            args.save_every,
                            final=False)

        # Print log
        log_interval = log_opt['log_interval'] * alg_opt['log_mult']
        if j % log_interval == 0:
            end = time.time()
            print(
                "{}: Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(options['logs']['exp_name'], j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy, value_loss, action_loss))

        # Do dashboard logging
        vis_interval = log_opt['vis_interval'] * alg_opt['log_mult']
        if args.vis and j % vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                dashboard.visdom_plot()
            except IOError:
                pass

    # Save final checkpoint
    # NOTE(review): final=False here even though this is the final
    # checkpoint — confirm whether final=True was intended.
    save_checkpoint(logpath,
                    agent,
                    envs,
                    j,
                    total_num_steps,
                    args.save_every,
                    final=False)

    # Close logging file
    alg_f.close()
def main():
    """Train an actor-critic with multiple discount-factor value heads.

    One value head per gamma in ``args.gamma`` (optionally the first head is
    a learned reward predictor). Supports a2c and ppo updates, noisy
    actions/rewards, and logs per-gamma TD error to
    ``MSE_<gamma>_monitor.csv``.

    Relies on module-level ``args`` and ``num_updates``.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")
    print(args)
    # EAFP: create the log dir; if it already exists, clear stale monitors.
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    # One MSE monitor file per discount factor; header row carries the
    # expected number of updates.
    for gamma in args.gamma:
        with open(args.log_dir + '/MSE_' + str(gamma) + '_monitor.csv',
                  "wt") as monitor_file:
            monitor = csv.writer(monitor_file)
            monitor.writerow([
                'update', 'error',
                str(int(args.num_frames) // args.num_steps)
            ])
    os.environ['OMP_NUM_THREADS'] = '1'
    print("Using env {}".format(args.env_name))
    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)
    # Frame-stacked observation shape.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    # With a reward predictor the first "head" is the predictor itself, so
    # one fewer plain value head is created.
    num_heads = len(
        args.gamma) if not args.reward_predictor else len(args.gamma) - 1
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0],
                                 envs.action_space,
                                 num_heads=num_heads,
                                 hidden_size=args.hidden_size)
    else:
        actor_critic = MLPPolicy(obs_shape[0],
                                 envs.action_space,
                                 num_heads=num_heads,
                                 reward_predictor=args.reward_predictor,
                                 use_s=args.use_s,
                                 use_s_a=args.use_s_a,
                                 use_s_a_sprime=args.use_s_a_sprime)
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    if args.cuda:
        actor_critic.cuda()
    # Build per-parameter-group learning rates; the reward-predictor group
    # (last) gets its own lr_rp.
    lrs = [args.lr] * len(actor_critic.param_groups)
    if not args.reward_predictor:
        assert len(actor_critic.param_groups) == len(lrs)
        model_params = [{
            'params': model_p,
            'lr': args.lr
        } for model_p, lr in zip(actor_critic.param_groups, lrs)]
    else:
        model_params = [{
            'params': model_p,
            'lr': p_lr
        } for model_p, p_lr in zip(actor_critic.param_groups[:-1], lrs)]
        model_params.append({
            'params': actor_critic.param_groups[-1],
            'lr': args.lr_rp
        })
    if args.algo == 'a2c':
        optimizer = optim.RMSprop(model_params,
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(model_params, args.lr, eps=args.eps)
    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              obs_shape,
                              envs.action_space,
                              actor_critic.state_size,
                              gamma=args.gamma,
                              use_rp=args.reward_predictor)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs, obs_tensor):
        # Shift the frame stack left in place and append the newest frame.
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            obs_tensor[:, :-shape_dim0] = obs_tensor[:, shape_dim0:]
        obs_tensor[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs, current_obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    advantages_list = []

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    # NOTE(review): num_updates is not defined in this function — assumed to
    # be a module-level global; verify.
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            # NOTE(review): Variable(..., volatile=True) is the pre-0.4
            # PyTorch no-grad API; this code targets torch < 0.4.
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            cpu_actions = add_gaussian_noise(cpu_actions, args.action_noise)
            # Observe reward and next obs; keep the un-corrupted reward for
            # logging while the (optionally noisy) reward trains the agent.
            obs, raw_reward, done, info = envs.step(cpu_actions)
            reward = np.copy(raw_reward)
            reward = add_gaussian_noise(reward, args.reward_noise)
            reward = epsilon_greedy(reward, args.reward_epsilon,
                                    args.reward_high, args.reward_low)
            raw_reward = torch.from_numpy(
                np.expand_dims(np.stack(raw_reward), 1)).float()
            episode_rewards += raw_reward
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs, current_obs)
            if args.reward_predictor:
                # Blend observed and predicted reward; p_hat ramps from 0 to
                # 1 over rp_burn_in updates.
                r_hat = actor_critic.predict_reward(
                    Variable(rollouts.observations[step], volatile=True),
                    action, Variable(current_obs, volatile=True))
                p_hat = min(args.rp_burn_in, j) / args.rp_burn_in
                estimate_reward = (1 -
                                   p_hat) * reward + p_hat * r_hat.data.cpu()
                reward = torch.cat([reward, estimate_reward], dim=-1)
                value = torch.cat([r_hat, value], dim=-1).data
            else:
                value = value.data
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value, reward, masks,
                            raw_reward)
        # Bootstrap returns from the value of the final stored observation.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data
        if args.reward_predictor:
            if args.use_s or args.use_s_a:
                r_hat = actor_critic.predict_reward(
                    Variable(rollouts.observations[-1], volatile=True),
                    Variable(rollouts.actions[-1], volatile=True), None).data
                next_value = torch.cat([r_hat, next_value], dim=-1)
            else:
                # No s' available for the last step: pad the predictor head
                # with zeros.
                next_value = torch.cat([
                    torch.zeros(list(next_value.size())[:-1] + [1]),
                    next_value
                ],
                                       dim=-1)
        rollouts.compute_returns(next_value, args.use_gae, args.tau)
        if args.algo in ['a2c']:
            batch_states = Variable(rollouts.states[0].view(
                -1, actor_critic.state_size))
            batch_masks = Variable(rollouts.masks[:-1].view(-1, 1))
            batch_obs = Variable(rollouts.observations[:-1].view(
                -1, *obs_shape))
            batch_actions = Variable(rollouts.actions.view(-1, action_shape))
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                batch_obs, batch_states, batch_masks, batch_actions)
            if args.reward_predictor:
                batch_obs_prime = Variable(rollouts.observations[1:].view(
                    -1, *obs_shape))
                values = torch.cat([
                    actor_critic.predict_reward(batch_obs, batch_actions,
                                                batch_obs_prime), values
                ],
                                   dim=-1)
            returns_as_variable = Variable(rollouts.returns[:-1])
            # NOTE(review): batched_v_loss is a constant 0 placeholder — it
            # contributes nothing to the loss below; presumably left from an
            # earlier variant.
            batched_v_loss = 0
            values = values.view(returns_as_variable.size())
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)
            # Value loss sums squared errors over all heads; the policy
            # gradient uses only the last head's (detached) advantage.
            advantages = returns_as_variable - values
            value_loss = advantages.pow(2).sum(-1).mean()
            action_loss = -(Variable(advantages[:, :, -1].unsqueeze(-1).data)
                            * action_log_probs).mean()
            if args.reward_predictor:
                rp_error = (values[:, :, 0].data -
                            rollouts.raw_rewards).pow(2).mean()
                advantages_list.append([
                    rp_error,
                    advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0]
                ])
            else:
                advantages_list.append(
                    advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0])
            optimizer.zero_grad()
            (batched_v_loss + value_loss * args.value_loss_coef + action_loss
             - dist_entropy * args.entropy_coef).backward()
            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            optimizer.step()
        elif args.algo == 'ppo':
            # Normalized advantages from the last head drive the PPO update.
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = advantages[:, :, -1]
            advantages = (advantages - advantages.mean()) / (
                advantages.std() + 1e-5)
            for e in range(args.ppo_epoch):
                data_generator = rollouts.feed_forward_generator(
                    advantages, args.num_mini_batch)
                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ, observations_batch_prime, true_rewards_batch, \
                        noisy_observations_batch, true_observations_batch = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))
                    if args.reward_predictor:
                        values = torch.cat([
                            actor_critic.predict_reward(
                                Variable(observations_batch),
                                Variable(actions_batch),
                                Variable(observations_batch_prime)), values
                        ],
                                           dim=-1)
                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
                    td = (Variable(return_batch) - values).pow(2)
                    value_loss = td.sum(-1).mean()
                    if args.reward_predictor:
                        rp_error = (values[:, 0].data -
                                    true_rewards_batch).pow(2).mean()
                        advantages_list.append(
                            [rp_error, td[:, -1].mean(0).data.cpu().numpy()])
                    else:
                        advantages_list.append(
                            td[:, -1].mean(0).data.cpu().numpy())
                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(),
                    dist_entropy.data[0], value_loss.data[0],
                    action_loss.data[0]))
            # Flush accumulated per-gamma TD errors to the monitor files.
            if len(advantages_list) > 2:
                advantages_array = np.array(advantages_list).reshape(
                    -1, len(args.gamma)).T
                for g, gamma in enumerate(args.gamma):
                    with open(
                            args.log_dir + '/MSE_' + str(gamma) +
                            '_monitor.csv', "a") as monitor_file:
                        monitor = csv.writer(monitor_file)
                        monitor.writerow(
                            [total_num_steps,
                             np.mean(advantages_array[g])])
                advantages_list = []
def main():
    """Train a synchronous A2C actor-critic agent on vectorized envs.

    Reads hyperparameters from the module-level ``args`` namespace and a
    module-level ``num_updates``. Builds ``args.num_processes`` subprocess
    environments, collects ``args.num_steps``-step rollouts, performs A2C
    updates with RMSprop, and prints progress every ``args.log_interval``
    updates. Returns ``None``.
    """
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # Echo the hyperparameters used for this run.
    print(args.cuda)
    print(args.num_steps)
    print(args.num_processes)
    print(args.lr)
    print(args.eps)
    print(args.alpha)
    print(args.use_gae)
    print(args.gamma)
    print(args.tau)
    print(args.value_loss_coef)
    print(args.entropy_coef)

    # BUG FIX: removed a stray leftover identifier ('fdsafasd') here that
    # raised NameError on every call, making this function unrunnable.

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    # Frame-stacked observation shape: channels multiplied by num_stack.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # Pixel observations get a conv policy; flat observations get an MLP.
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # Discrete actions are stored as a single index; continuous as a vector.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    # Only a2c is supported here (ppo/acktr variants were removed).
    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)

    # rollouts.states is [num_steps + 1, num_processes, *obs_shape]; the
    # per-step entries are used to compute the expected return.
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        # Shift the frame stack left in place and append the newest frame.
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)
    # Seed rollout storage with the initial state.
    rollouts.states[0].copy_(current_state)

    # Per-process running episode returns; final_rewards holds the return of
    # the last completed episode and is only used for logging.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions from the state stored in the rollouts.
            # (Variable/volatile is the pre-0.4 PyTorch inference API.)
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state; state is [num_processes, ...].
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(
                np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If an episode finished, record its return and zero both the
            # accumulator and the stale frame stack for that process.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks
            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data,
                            reward, masks)

        # Bootstrap returns from the value of the last stored state.
        next_value = actor_critic(
            Variable(rollouts.states[-1], volatile=True))[0].data

        # Keep observation normalization statistics current, if the policy
        # carries a filter.
        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(
                rollouts.states[:-1].view(-1, *obs_shape))

        # R_t = r_t + gamma*r_{t+1} + ... + gamma^k * V(s_{t+k}) per step.
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.states[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            # Advantages are detached (.data) so the policy gradient does not
            # flow through the value head.
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()
            optimizer.step()

        # Carry the last state over as the first state of the next rollout.
        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
def main():
    """Train an A2C agent on a VizDoom scenario, optionally with intrinsic
    event-based rewards (Rarity of Events, ``args.roe``).

    Reads all configuration from the module-level ``args`` namespace, logs
    per-update statistics plus event-frequency/event-reward traces to text
    files, and periodically checkpoints the model.

    Fixes vs. the previous revision:
      * resuming with ``--roe`` no longer raises ``NameError`` (a stray bare
        ``e`` expression stood in for the unimplemented event-buffer load);
      * the resume branch now assigns ``start_step`` (was the typo
        ``start_steps``, which silently discarded the restored step count).
    """
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, "a2c")
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    # Log-file names encode scenario, reward type and agent id.
    reward_name = ""
    if args.roe:
        reward_name = "_event"
    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")
    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".log"
    log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventlog"
    log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(
        args.agent_id) + ".eventrewardlog"

    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, bots=args.bots)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    # Frame stacking multiplies the channel dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(
            os.path.join(save_path, log_file_name + ".pt"))
        filename = glob.glob(os.path.join(args.log_dir, log_file_name))[0]
        if args.roe:
            pass  # TODO: Load event buffer (was a bare `e`, which crashed)
        # Recover update/step counters from the last line of the old log.
        with open(filename) as file:
            lines = file.readlines()
            start_updates = int(lines[-1].strip().split(",")[0])
            start_step = int(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        if not args.debug:
            # Fresh run: clear any stale log files for this configuration.
            try:
                os.makedirs(args.log_dir)
            except OSError:
                files = glob.glob(os.path.join(args.log_dir, log_file_name))
                for f in files:
                    os.remove(f)
            with open(log_file_name, "w") as myfile:
                myfile.write("")
            files = glob.glob(os.path.join(args.log_dir, log_event_file_name))
            for f in files:
                os.remove(f)
            with open(log_event_file_name, "w") as myfile:
                myfile.write("")
            files = glob.glob(
                os.path.join(args.log_dir, log_event_reward_file_name))
            for f in files:
                os.remove(f)
            with open(log_event_reward_file_name, "w") as myfile:
                myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1  # discrete action space: one integer index per process

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the stacked-frame buffer left and append the new frame."""
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Create event buffer (shared SQL-backed one for QD runs).
    if args.qd:
        event_buffer = EventBufferSQLProxy(args.num_events, args.capacity,
                                           args.exp_id, args.agent_id)
    elif not args.resume:
        event_buffer = EventBuffer(args.num_events, args.capacity)
    else:
        event_buffer = pickle.load(
            open(log_file_name + "_event_buffer_temp.p", "rb"))

    event_episode_rewards = []

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info, events = envs.step(cpu_actions)

            intrinsic_reward = []
            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    # NOTE(review): event index 2 is presumably the
                    # scenario's goal event — confirm against env wrapper.
                    reward[i] = 1 if events[i][2] >= 1 else 0

            # Intrinsic reward = event rarity (ROE) or plain env reward.
            for e in events:
                if args.roe:
                    intrinsic_reward.append(event_buffer.intrinsic_reward(e))
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            events = torch.from_numpy(events).float()

            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Per-event reward snapshot (for the event-reward log).
            event_rewards = []
            for ei in range(0, args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)
            event_episode_rewards.append(event_rewards)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks
            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            # Record the event counts of every finished episode.
            for i in range(args.num_processes):
                if done[i]:
                    event_buffer.record_events(np.copy(
                        final_events[i].numpy()), frame=j * args.num_steps)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            # NOTE: the agent is trained on intrinsic_reward, not env reward.
            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data
        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # A2C loss over the whole rollout in one batched forward pass.
        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))
        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # advantages are detached for the policy term (actor gradient only).
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        # Checkpoint whenever the running mean reward improves.
        if final_rewards.mean() > best_final_rewards and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_final_rewards = final_rewards.mean()
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(
                save_model,
                os.path.join(save_path,
                             log_file_name.split(".log")[0] + ".pt"))

        if j % args.save_interval == 0 and args.save_dir != "" and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, log_file_name + "_temp.pt"))
            if isinstance(event_buffer, EventBuffer):
                pickle.dump(event_buffer,
                            open(log_file_name + "_event_buffer_temp.p", "wb"))

        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, mean/max intrinsic reward {:.5f}/{:.5f}"\
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(),
                        final_rewards.max(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.max()
                        )
            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps,
                        final_rewards.mean(),
                        final_rewards.std(),
                        final_intrinsic_rewards.mean(),
                        final_intrinsic_rewards.std())
            log_to_event_file = ','.join(
                map(str, event_buffer.get_event_mean().tolist())) + "\n"
            log_to_event_reward_file = ','.join(
                map(str, event_buffer.get_event_rewards().tolist())) + "\n"
            print(log)
            print(log_to_event_file)
            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)
            with open(log_event_file_name, "a") as myfile:
                myfile.write(str(total_num_steps) + "," + log_to_event_file)
            with open(log_event_reward_file_name, "a") as myfile:
                myfile.write(
                    str(total_num_steps) + "," + log_to_event_reward_file)

    envs.close()
    time.sleep(5)
def main():
    """A2C training loop (old Variable/volatile PyTorch API) with an unused
    target network kept for KL-regularization experiments.

    Configuration comes from the module-level ``args`` namespace; the number
    of updates comes from a module-level ``num_updates``.
    NOTE(review): ``num_updates`` is not defined in this function — it must
    be a module-level global; verify it exists before running.
    """
    print("#######")
    print(
        # NOTE(review): "monit`or" is a typo in this runtime string; left
        # untouched here because this edit only changes comments.
        "WARNING: All rewards are clipped or normalized so you need to use a monit`or (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # logger = Logger(algorithm_name = args.algo, environment_name = args.env_name, folder = args.folder)
    # logger.save_args(args)
    # print ("---------------------------------------")
    # print ('Saving to', logger.save_folder)
    # print ("---------------------------------------")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # 1-D observations (e.g. MuJoCo state vectors) get running normalization.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    # Frame stacking multiplies the leading (channel) dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # Image observations -> CNN policy; flat observations -> MLP policy.
    # The target network mirrors the policy for (currently disabled)
    # KL-constraint experiments further below.
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
        target_actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                        args.recurrent_policy)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
        target_actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # Hard-copy initial weights into the target network.
    for param, target_param in zip(actor_critic.parameters(),
                                   target_actor_critic.parameters()):
        target_param.data.copy_(param.data)

    # Discrete actions are stored as a single integer index per process.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    # NOTE(review): criterion is instantiated but never used below.
    actor_regularizer_criterion = nn.KLDivLoss()
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the stacked-frame buffer left and append the newest frame.
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions (volatile=True: inference only, no grad graph).
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # final_rewards holds the return of each process's last
            # *finished* episode; episode_rewards accumulates the ongoing one.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        # Bootstrap value for the state after the last rollout step.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # Re-evaluate the whole rollout in one batched forward pass.
        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))
        """ Used for KL Constraint in case of Continuous Action Stochastic Policies """
        # target_values, target_action_log_probs, target_dist_entropy, target_states, target_action_mean, target_action_std = target_actor_critic.evaluate_actions_mean_and_std(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
        #     Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
        #     Variable(rollouts.masks[:-1].view(-1, 1)),
        #     Variable(rollouts.actions.view(-1, action_shape)))
        # actor_regularizer_loss = (torch.log(action_std/target_action_std) + (action_std.pow(2) + (action_mean - target_action_mean).pow(2))/(2*target_action_std.pow(2)) - 0.5)

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        ### Loss with regularizer added
        ##action_loss = -(Variable(advantages.data) * action_log_probs).mean() + args.actor_lambda * actor_regularizer_loss.mean(0).sum()
        # Advantages are detached so the policy term only moves the actor.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        total_loss = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef
        total_loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()

        ## Exponential average for target updates
        #if (j%args.target_update_interval == 0):
        #    for param, target_param in zip(actor_critic.parameters(), target_actor_critic.parameters()):
        #        target_param.data.copy_(args.target_tau * param.data + (1 - args.target_tau) * target_param.data)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            # Bundle the observation-normalization stats with the model.
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]
            torch.save(save_model, os.path.join(save_path,
                                                args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

            # Collected for the (disabled) logger below.
            final_rewards_mean = [final_rewards.mean()]
            final_rewards_median = [final_rewards.median()]
            final_rewards_min = [final_rewards.min()]
            final_rewards_max = [final_rewards.max()]

            # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max)
            # logger.save()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
def train_a_gym_model(env, config):
    """Train a gym-type RL problem with PPO.

    Parameters
    ----------
    env : str or callable
        Environment identifier forwarded to ``make_env``.
    config : dict
        Hyper-parameters and bookkeeping options; every key is optional
        except ``'env_name'``. Defaults are read with ``dict.get`` below.

    Side effects: clears ``*.monitor.csv`` files in ``log_dir``, writes
    model checkpoints under ``save_dir``, optionally plots to visdom.
    """
    torch.set_num_threads(1)

    # --- bookkeeping / runtime options -----------------------------------
    seed = config.get('seed', 1)
    log_dir = config.get('log_dir', '/tmp/gym')
    log_interval = config.get('log_interval', 10)
    save_interval = config.get('save_interval', 100)
    save_dir = config.get('save_dir', 'trained_models/ppo')
    add_timestep = config.get('add_timestep', False)
    num_processes = config.get('num_processes', 4)
    gamma = config.get('gamma', 0.99)
    num_stack = config.get('num_stack', 1)
    recurrent_policy = config.get('recurrent_policy', False)
    cuda = config.get('cuda', True)
    vis = config.get('vis', True)
    vis_interval = config.get('vis_interval', 100)
    env_name = config['env_name']
    # Optional step-based checkpoints in addition to update-based ones.
    save_step = config.get('save_step', None)
    if save_step is not None:
        next_save_step = save_step

    # clean the log folder, if necessary
    try:
        os.makedirs(log_dir)
    except OSError:
        files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    if vis:
        from visdom import Visdom
        port = config.get('port', 8097)
        viz = Visdom(port=port)
        win = None

    envs = [
        make_env(env, seed, i, log_dir, add_timestep)
        for i in range(num_processes)
    ]
    if num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    # 1-D observations (state vectors) get running reward/obs normalization.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=gamma)

    obs_shape = envs.observation_space.shape
    # Frame stacking multiplies the leading (channel) dimension.
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space, recurrent_policy)

    # Discrete actions are stored as a single integer index per process.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if cuda:
        actor_critic.cuda()

    # --- PPO hyper-parameters --------------------------------------------
    clip_param = config.get('clip_param', 0.2)
    ppo_epoch = config.get('ppo_epoch', 4)
    num_mini_batch = config.get('num_mini_batch', 32)
    value_loss_coef = config.get('value_loss_coef', 0.5)
    entropy_coef = config.get('entropy_coef', 0.01)
    lr = config.get('lr', 1e-3)
    eps = config.get('eps', 1e-5)
    max_grad_norm = config.get('max_grad_norm', 0.5)
    use_gae = config.get('use_gae', False)
    tau = config.get('tau', 0.95)
    num_steps = config.get('num_steps', 100)
    num_frames = config.get('num_frames', 1e6)
    num_updates = int(num_frames) // num_steps // num_processes

    agent = algo.PPO(actor_critic,
                     clip_param,
                     ppo_epoch,
                     num_mini_batch,
                     value_loss_coef,
                     entropy_coef,
                     lr=lr,
                     eps=eps,
                     max_grad_norm=max_grad_norm)

    rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, num_stack)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])
    final_rewards = torch.zeros([num_processes, 1])

    if cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    def save_the_model(num=None):
        """num is additional information"""
        # save it after training
        save_path = save_dir
        try:
            os.makedirs(save_path)
        except OSError:
            pass
        # A really ugly way to save a model to CPU
        save_model = actor_critic
        if cuda:
            save_model = copy.deepcopy(actor_critic).cpu()
        # Bundle the observation-normalization stats with the model.
        save_model = [
            save_model,
            hasattr(envs, 'ob_rms') and envs.ob_rms or None
        ]
        if num is None:
            save_name = '%s.pt' % env_name
        else:
            save_name = '%s_at_%d.pt' % (env_name, int(num))
        torch.save(save_model, os.path.join(save_path, save_name))

    start = time.time()
    for j in range(1, 1 + num_updates):
        # ---- collect one rollout of num_steps transitions ---------------
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # final_rewards holds each process's last finished-episode
            # return; episode_rewards accumulates the ongoing episode.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, num_stack)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        # ---- bootstrap, compute returns, and run the PPO update ---------
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, use_gae, gamma, tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % save_interval == 0 and save_dir != "":
            save_the_model()

        if save_step is not None:
            total_num_steps = j * num_processes * num_steps
            if total_num_steps > next_save_step:
                save_the_model(total_num_steps)
                next_save_step += save_step

        if j % log_interval == 0:
            end = time.time()
            total_num_steps = j * num_processes * num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         final_rewards.mean(), final_rewards.median(),
                         final_rewards.min(), final_rewards.max(),
                         dist_entropy, value_loss, action_loss))
        if vis and j % vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, env_name, 'ppo',
                                  num_frames)
            except IOError:
                pass

    # finally save model again
    save_the_model()
class goal_agent(object):
    """Experimental goal-conditioned A2C-style agent.

    NOTE(review): this class looks unfinished — ``__init__`` references
    several names that are not defined anywhere visible (``input_sahpe`` is
    presumably a typo for ``input_shape``, and ``input_shape`` /
    ``output_shape`` / ``NN_model`` themselves are undefined), and
    ``self.actor_critic`` is used throughout but never assigned (the
    assignment is commented out below). Instantiating this class will raise
    ``NameError`` / ``AttributeError`` as written — confirm intent before use.
    """

    def __init__(self, hparams):
        # Training hyper-parameters (see A2C: advantage actor-critic).
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        # self.next_state_pred_ = hparams['next_state_pred_']

        # NOTE(review): `input_sahpe` is an undefined name (typo of
        # input_shape?); `input_shape`/`output_shape`/`NN_model` are also
        # undefined here — these four lines cannot run as written.
        self.frame_encoder = CNN_Model(input_sahpe, output_shape)
        self.goal_predictor = CNNPolicy(self.obs_shape[0],
                                        hparams['action_space'])
        self.policy = NN_Model(input_shape, output_shape)
        self.goal_encoder = NN_model(input_shape, output_shape)

        # # Policy and Value network
        # if 'traj_action_mask' in hparams and hparams['traj_action_mask']:
        #     self.actor_critic = CNNPolicy_trajectory_action_mask(self.obs_shape[0], hparams['action_space'])
        # else:
        #     self.actor_critic = CNNPolicy(self.obs_shape[0], hparams['action_space'])

        # Storing rollouts
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, hparams['action_space'])

        if self.cuda:
            # NOTE(review): self.actor_critic is never assigned above.
            self.actor_critic.cuda()
            self.rollouts.cuda()

        #Optimizer
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=self.actor_critic.parameters(),
                                        lr=hparams['lr'],
                                        eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=self.actor_critic.parameters(),
                                       lr=hparams['lr'],
                                       momentum=hparams['mom'])
        else:
            print('no opt specified')

        # Discrete action index per process.
        self.action_shape = 1

        # List-backed rollout storage only needed for these analysis modes.
        if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams[
                'grad_var_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        """Sample an action for the current batch of states.

        Returns (value, action, action_log_probs, dist_entropy).
        """
        # value, action = self.actor_critic.act(current_state)
        # [] [] [P,1] [P]
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        """Seed the rollout buffer with the initial observation batch."""
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy,
                    next_state_pred):  #, done):
        """Append one transition to the rollout buffer.

        NOTE(review): the branch below references ``done``, but the ``done``
        parameter is commented out of the signature — a NameError if
        'traj_action_mask' is enabled. Confirm whether the parameter should
        be restored.
        """
        self.rollouts.insert(step, current_state, action, value, reward,
                             masks, action_log_probs, dist_entropy)
        # self.rollouts.insert_state_pred(next_state_pred)

        if 'traj_action_mask' in self.hparams and self.hparams[
                'traj_action_mask']:
            self.actor_critic.reset_mask(done)

    def update(self):
        """One A2C update from the stored rollout.

        Uses the list-style buffers (value_preds / action_log_probs /
        dist_entropy stored per step during acting) and clears them after
        building the loss.
        """
        # Bootstrap value of the final state (volatile: inference only).
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data

        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        # Stack the per-step lists into [num_steps, num_processes, 1].
        values = torch.cat(self.rollouts.value_preds,
                           0).view(self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        # Reset per-rollout buffers for the next collection phase.
        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # Advantages are detached so the policy term only moves the actor.
        action_loss = -(advantages.detach() * action_log_probs).mean()

        cost = action_loss + value_loss * self.value_loss_coef - dist_entropy.mean(
        ) * self.entropy_coef  #*10.

        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(),
                                self.grad_clip)
        self.optimizer.step()
def main():
    """A2C/PPO/ACKTR training loop with adaptive reward rescaling.

    Driven entirely by the module-level ``args`` namespace and a
    module-level ``num_updates``. A running bias-corrected average of the
    episode return (``m_hat``) drives a scale search: when progress stalls
    for ``args.tolerance`` checks, the network is rescaled (``cinc``/
    ``cdec``) and the search counters reset.

    NOTE(review): ``R_ts`` and ``num_updates`` are not defined in this
    function — they must be module-level globals; verify before running.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # 1-D observations (state vectors) get running normalization.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    # Frame stacking multiplies the leading (channel) dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.load_model is not None:
        # Checkpoints are saved as [model, ob_rms]; take the model.
        actor_critic = torch.load(args.load_model)[0]
    else:
        actor_critic = Policy(obs_shape, envs.action_space,
                              args.recurrent_policy, args.hidden_size, args)

    # Discrete actions are stored as a single integer index per process.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               pop_art=args.pop_art)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, args.num_stack)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    scale = 1.
    current_pdrr = [0., 0.]
    last_update = 0
    ### parameters for adaptive reward scaling ###
    t_stop = 0  # checks since the running mean last improved
    beta = .99  # EMA decay for the reward average
    R_prev = -1e9  # best bias-corrected mean at the previous rescale
    m_max = -1e9  # best bias-corrected mean in the current phase
    m_t = 0  # EMA accumulator
    reverse = False  # True once the search direction flipped to shrinking
    last_scale_t = -1e9  # adaptive-time of the last rescale
    ###
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # reward *= args.reward_scaling
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # final_rewards holds each process's last finished-episode
            # return; episode_rewards accumulates the ongoing episode.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, args.num_stack)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # Adaptive time index: one tick per adaptive_interval updates.
        t = j // args.adaptive_interval
        if args.pop_art:
            value_loss, action_loss, dist_entropy = agent.pop_art_update(
                rollouts)
        else:
            # Freeze the actor for 100 adaptive ticks after each rescale.
            if t - last_scale_t > 100:
                value_loss, action_loss, dist_entropy = agent.update(
                    rollouts, update_actor=True)
            else:
                value_loss, action_loss, dist_entropy = agent.update(
                    rollouts, update_actor=False)
            # Slowly relax the clipping bound back up after a rescale.
            if agent.max_grad_norm < .5 and t - last_scale_t < 100:
                agent.max_grad_norm += 0.00001

        if j % args.adaptive_interval == 0 and j and t - last_scale_t > 100:
            t = j // args.adaptive_interval
            # Round-trip through str to coerce a 0-dim tensor to float.
            R_t = float('{}'.format(final_rewards.mean()))
            # NOTE(review): R_ts is not defined in this function — must be
            # a module-level list, otherwise this raises NameError.
            R_ts.append(R_t)
            assert type(R_t) == float
            t_stop += 1
            # Bias-corrected exponential moving average (Adam-style).
            m_t = beta * m_t + (1 - beta) * R_t
            m_hat = m_t / (1 - beta**t)
            print('m_hat :{}, t_stop: {}'.format(m_hat, t_stop))
            print('agent.max_grad_norm, ', agent.max_grad_norm)
            if m_hat > m_max:
                m_max = m_hat
                t_stop = 0
            # Stalled for too long: rescale the net and restart the search.
            if t_stop > args.tolerance:
                if reverse and m_max <= R_prev:
                    break
                elif reverse and m_max > R_prev:
                    agent.max_grad_norm = args.max_grad_norm_after
                    actor_critic.rescale(args.cdec)
                    scale *= args.cdec
                    agent.reinitialize()
                    last_scale_t = t
                elif not reverse and m_max <= R_prev:
                    # Growing stopped helping: flip to shrinking.
                    agent.max_grad_norm = args.max_grad_norm_after
                    actor_critic.rescale(args.cdec)
                    scale *= args.cdec
                    agent.reinitialize()
                    reverse = True
                    last_scale_t = t
                else:
                    agent.max_grad_norm = args.max_grad_norm_after
                    actor_critic.rescale(args.cinc)
                    scale *= args.cinc
                    agent.reinitialize()
                    last_scale_t = t
                R_prev = m_max
                # NOTE(review): rebinding the loop variable j here does NOT
                # restart the for-loop; only the counters are reset.
                j = t_stop = m_t = 0
                m_max = -1e9

        # if j % args.log_interval == 0:
        # this is used for testing saturation
        # relus = actor_critic.base_forward(
        #     rollouts.observations[:-1].view(-1, *rollouts.observations.size()[2:]))

        rollouts.after_update()

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            # relus = log_saturation(fname=args.saturation_log,
            #                        first=(j==0),
            #                        relus=[relu.cpu().detach().numpy() for relu in relus])
            # print("saturation", relus)
            # if j > 0:
            #     current_pdrr = incremental_update(current_pdrr, relus)
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, scale {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy, value_loss, action_loss, scale))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.plot_title,
                                  args.algo, args.num_frames)
            except IOError:
                pass
def main():
    """Train an RL agent (a2c / ppo / acktr / i2a) on MiniPacman- or MsPacman-style envs.

    All configuration is read from the module-level ``args`` namespace.
    The function builds vectorized environments, the policy model, the agent
    and rollout storage, optionally restores a checkpoint, then runs the
    standard sample -> compute-returns -> update loop for ``num_updates``
    iterations with periodic checkpointing, text logging and visdom plotting.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    # 'spawn' must be set before any subprocess is created when a separate
    # render/test process is used (see TestPolicy below).
    if args.render_game:
        mp.set_start_method('spawn')

    torch.set_num_threads(1)

    # Create the log dir; if it already exists, remove stale monitor CSVs so
    # reward curves from a previous run are not mixed into this one.
    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            if os.path.isfile(f):
                os.remove(f)

    # Environment-model builder: MiniPacman uses a pixel-space model,
    # everything else a latent-space model.
    if 'MiniPacman' in args.env_name:
        from environment_model.mini_pacman.builder import MiniPacmanEnvironmentBuilder
        builder = MiniPacmanEnvironmentBuilder(args)
    else:
        from environment_model.latent_space.builder import LatentSpaceEnvironmentBuilder
        builder = LatentSpaceEnvironmentBuilder(args)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None  # NOTE(review): `win` is never used below; plotting goes through visdom_plotter.
        visdom_plotter = VisdomPlotterA2C(viz, args.algo == 'i2a')

    # Build one env-constructor thunk per worker process.
    if 'MiniPacman' in args.env_name:
        from gym_envs.envs_mini_pacman import make_custom_env
        envs = [
            make_custom_env(args.env_name,
                            args.seed,
                            i,
                            args.log_dir,
                            grey_scale=args.grey_scale)
            for i in range(args.num_processes)
        ]
    elif args.algo == 'i2a' or args.train_on_200x160_pixel:
        from gym_envs.envs_ms_pacman import make_env_ms_pacman
        envs = [
            make_env_ms_pacman(env_id=args.env_name,
                               seed=args.seed,
                               rank=i,
                               log_dir=args.log_dir,
                               grey_scale=False,
                               stack_frames=1,
                               skip_frames=4)
            for i in range(args.num_processes)
        ]
    else:
        from envs import make_env
        envs = [
            make_env(args.env_name, args.seed, i, args.log_dir,
                     args.add_timestep) for i in range(args.num_processes)
        ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # 1-D observations (vector states) get running normalization.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Frame stacking multiplies the channel dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # Model selection. NOTE(review): the first two branches call the same
    # builder method, so the 'MiniPacman' distinction for i2a is currently
    # redundant — presumably kept for future divergence; confirm.
    if args.algo == 'i2a' and 'MiniPacman' in args.env_name:
        actor_critic = builder.build_i2a_model(envs, args)
    elif args.algo == 'i2a':
        actor_critic = builder.build_i2a_model(envs, args)
    elif 'MiniPacman' in args.env_name:
        actor_critic = builder.build_a2c_model(envs)
    elif args.train_on_200x160_pixel:
        from a2c_models.atari_model import AtariModel
        actor_critic = A2C_PolicyWrapper(
            AtariModel(obs_shape=obs_shape,
                       action_space=envs.action_space.n,
                       use_cuda=args.cuda))
    else:
        actor_critic = Policy(obs_shape, envs.action_space,
                              args.recurrent_policy)

    # Discrete actions are stored as a single index column; continuous
    # actions use the full action-vector width.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    # Optionally restore model weights. map_location keeps tensors on CPU so
    # a GPU-saved checkpoint loads on any machine; .cuda() below moves them.
    if args.load_model:
        load_path = os.path.join(args.save_dir, args.algo)
        load_path = os.path.join(load_path, args.env_name + ".pt")
        if os.path.isfile(load_path):
            # if args.cuda:
            saved_state = torch.load(load_path,
                                     map_location=lambda storage, loc: storage)
            actor_critic.load_state_dict(saved_state)
        else:
            print("Can not load model ", load_path, ". File does not exists")
            return

    # Start a fresh text log unless we are resuming from a checkpoint with an
    # existing log file.
    log_file = os.path.join(os.path.join(args.save_dir, args.algo),
                            args.env_name + ".log")
    if not os.path.exists(log_file) or not args.load_model:
        print("Log file: ", log_file)
        with open(log_file, 'w') as the_file:
            the_file.write('command line args: ' + " ".join(sys.argv) + '\n')

    if args.cuda:
        actor_critic.cuda()

    # NOTE(review): `test_process` is never referenced again — presumably the
    # TestPolicy constructor itself spawns the render process; confirm.
    if args.render_game:
        load_path = os.path.join(args.save_dir, args.algo)
        test_process = TestPolicy(model=copy.deepcopy(actor_critic),
                                  load_path=load_path,
                                  args=args)

    # Agent (update rule) selection.
    if args.algo == 'i2a':
        agent = I2A_ALGO(actor_critic=actor_critic,
                         obs_shape=obs_shape,
                         action_shape=action_shape,
                         args=args)
    elif args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    # i2a needs extra per-step storage (imagination action probabilities).
    if args.algo == 'i2a':
        rollouts = I2A_RolloutStorage(args.num_steps, args.num_processes,
                                      obs_shape, envs.action_space,
                                      actor_critic.state_size)
    else:
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  obs_shape, envs.action_space,
                                  actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack left and append the newest observation."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            if args.algo == 'i2a':
                # Sample actions (i2a also returns model-free and rollout
                # policy distributions used for the distillation loss).
                value, action, action_log_prob, states, policy_action_prob, rollout_action_prob = actor_critic.act(
                    rollouts.observations[step].clone(),
                    rollouts.states[step], rollouts.masks[step])
            else:
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, states = actor_critic.act(
                        rollouts.observations[step], rollouts.states[step],
                        rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # Latch the finished-episode reward and reset the running sum.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero out stacked frames for workers whose episode just ended.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            if args.algo == "i2a":
                rollouts.insert(current_obs, states, action, action_log_prob,
                                value, reward, masks, policy_action_prob,
                                rollout_action_prob)
            else:
                rollouts.insert(current_obs, states, action, action_log_prob,
                                value, reward, masks)

        # Bootstrap value for the state after the last step of the rollout.
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo == 'i2a':
            value_loss, action_loss, dist_entropy, distill_loss = agent.update(
                rollouts=rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if args.vis:
            distill_loss_data = distill_loss if args.algo == 'i2a' else None
            visdom_plotter.append(dist_entropy,
                                  final_rewards.numpy().flatten(), value_loss,
                                  action_loss, distill_loss_data)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            # NOTE(review): the state_dict is saved as-is (may contain CUDA
            # tensors); loading relies on the map_location above.
            save_model = actor_critic
            torch.save(save_model.state_dict(),
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            reward_info = "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"\
                .format(final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max())
            # NOTE(review): this rebinds `distill_loss` (a float for i2a) to a
            # string; safe only because agent.update() reassigns it next
            # iteration and the visdom append above runs before this block.
            distill_loss = ", distill_loss {:.5f}".format(
                distill_loss) if args.algo == 'i2a' else ""
            loss_info = "value loss {:.5f}, policy loss {:.5f}{}"\
                .format(value_loss, action_loss, distill_loss)
            entropy_info = "entropy {:.5f}".format(dist_entropy)
            info = "Updates {}, num timesteps {}, FPS {}, {}, {}, {}, time {:.5f} min"\
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)), reward_info,
                        entropy_info, loss_info, (end - start) / 60.)
            with open(log_file, 'a') as the_file:
                the_file.write(info + '\n')
            print(info)

        if args.vis and j % args.vis_interval == 0:
            frames = j * args.num_processes * args.num_steps
            visdom_plotter.plot(frames)
def main():
    """Jointly train two MLP policies on two HalfCheetah variants with weight sharing.

    Two independent A2C branches (envs / envs_1, actor_critic / actor_critic_1)
    are rolled out and updated in lock-step; after each branch's optimizer step
    the first actor/value layers (a_fc1 / v_fc1) are copied to the other
    branch, so those layers are effectively shared between the two tasks.
    All configuration comes from the module-level ``args`` namespace.

    Fix vs. previous revision: in the checkpoint block the second model was
    assigned to ``save_model`` instead of ``save_model_1``, so on the
    non-CUDA path ``save_model_1`` was unbound and saving raised NameError.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        viz_1 = Visdom()
        win = None
        win1 = None

    # Task pair: branch 0 trains SmallLeg, branch 1 trains SmallFoot.
    env_name_1 = 'HalfCheetahSmallFoot-v0'
    args.env_name = 'HalfCheetahSmallLeg-v0'
    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]
    envs_1 = [
        make_env(env_name_1, args.seed, i, args.log_dir_1)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
        envs_1 = SubprocVecEnv(envs_1)
    else:
        envs = DummyVecEnv(envs)
        envs_1 = DummyVecEnv(envs_1)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)
        envs_1 = VecNormalize(envs_1)

    # Same for both tasks (both variants share the observation space).
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    actor_critic_1 = MLPPolicy(obs_shape[0], envs_1.action_space)

    # Same for both tasks (continuous action spaces of equal width).
    action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()
        actor_critic_1.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    optimizer_1 = optim.RMSprop(actor_critic_1.parameters(),
                                args.lr,
                                eps=args.eps,
                                alpha=args.alpha)

    # Different for both tasks: each branch keeps its own rollout buffer and
    # stacked-observation tensor.
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)
    rollouts_1 = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                                envs_1.action_space, actor_critic_1.state_size)
    current_obs_1 = torch.zeros(args.num_processes, *obs_shape)

    # Different update functions (each closes over its own branch state).
    def update_current_obs(obs):
        """Shift branch-0 frame stack left and append the newest observation."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    def update_current_obs_1(obs):
        """Shift branch-1 frame stack left and append the newest observation."""
        shape_dim0 = envs_1.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs_1[:, :-shape_dim0] = current_obs_1[:, shape_dim0:]
        current_obs_1[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    obs_1 = envs_1.reset()
    update_current_obs_1(obs_1)

    rollouts.observations[0].copy_(current_obs)
    rollouts_1.observations[0].copy_(current_obs_1)

    # Running and latched episode rewards per worker, per branch.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_rewards_1 = torch.zeros([args.num_processes, 1])
    final_rewards_1 = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()
        current_obs_1 = current_obs_1.cuda()
        rollouts_1.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions from branch 1
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward
            # Mask is 0 where the episode ended; latch the episode reward.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

            # Sample actions from branch 2
            value_1, action_1, action_log_prob_1, states_1 = actor_critic_1.act(
                Variable(rollouts_1.observations[step], volatile=True),
                Variable(rollouts_1.states[step], volatile=True),
                Variable(rollouts_1.masks[step], volatile=True))
            cpu_actions_1 = action_1.data.squeeze(1).cpu().numpy()

            obs_1, reward_1, done_1, info_1 = envs_1.step(cpu_actions_1)
            reward_1 = torch.from_numpy(
                np.expand_dims(np.stack(reward_1), 1)).float()
            episode_rewards_1 += reward_1
            masks_1 = torch.FloatTensor([[0.0] if done_ else [1.0]
                                         for done_ in done_1])
            final_rewards_1 *= masks_1
            final_rewards_1 += (1 - masks_1) * episode_rewards_1
            episode_rewards_1 *= masks_1

            if args.cuda:
                masks_1 = masks_1.cuda()
            if current_obs_1.dim() == 4:
                current_obs_1 *= masks_1.unsqueeze(2).unsqueeze(2)
            else:
                current_obs_1 *= masks_1

            update_current_obs_1(obs_1)
            rollouts_1.insert(step, current_obs_1, states_1.data,
                              action_1.data, action_log_prob_1.data,
                              value_1.data, reward_1, masks_1)

        # Update for branch 1: bootstrap value, compute returns, A2C loss.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        # Policy gradient uses detached advantages (.data) so the value head
        # only learns through value_loss.
        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()
        rollouts.after_update()

        # Share params branch 1 -> branch 2 (first actor/value layers).
        actor_critic_1.a_fc1.weight.data = copy.deepcopy(
            actor_critic.a_fc1.weight.data)
        actor_critic_1.a_fc1.bias.data = copy.deepcopy(
            actor_critic.a_fc1.bias.data)
        actor_critic_1.v_fc1.weight.data = copy.deepcopy(
            actor_critic.v_fc1.weight.data)
        actor_critic_1.v_fc1.bias.data = copy.deepcopy(
            actor_critic.v_fc1.bias.data)

        # Update for branch 2 (mirrors branch 1).
        next_value_1 = actor_critic_1(
            Variable(rollouts_1.observations[-1], volatile=True),
            Variable(rollouts_1.states[-1], volatile=True),
            Variable(rollouts_1.masks[-1], volatile=True))[0].data
        rollouts_1.compute_returns(next_value_1, args.use_gae, args.gamma,
                                   args.tau)

        values_1, action_log_probs_1, dist_entropy_1, states_1 = actor_critic_1.evaluate_actions(
            Variable(rollouts_1.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts_1.states[0].view(-1, actor_critic_1.state_size)),
            Variable(rollouts_1.masks[:-1].view(-1, 1)),
            Variable(rollouts_1.actions.view(-1, action_shape)))

        values_1 = values_1.view(args.num_steps, args.num_processes, 1)
        action_log_probs_1 = action_log_probs_1.view(args.num_steps,
                                                     args.num_processes, 1)

        advantages_1 = Variable(rollouts_1.returns[:-1]) - values_1
        value_loss_1 = advantages_1.pow(2).mean()
        action_loss_1 = -(Variable(advantages_1.data) *
                          action_log_probs_1).mean()

        optimizer_1.zero_grad()
        (value_loss_1 * args.value_loss_coef + action_loss_1 -
         dist_entropy_1 * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic_1.parameters(),
                                args.max_grad_norm)
        optimizer_1.step()
        rollouts_1.after_update()

        # Share params branch 2 -> branch 1.
        actor_critic.a_fc1.weight.data = copy.deepcopy(
            actor_critic_1.a_fc1.weight.data)
        actor_critic.a_fc1.bias.data = copy.deepcopy(
            actor_critic_1.a_fc1.bias.data)
        actor_critic.v_fc1.weight.data = copy.deepcopy(
            actor_critic_1.v_fc1.weight.data)
        actor_critic.v_fc1.bias.data = copy.deepcopy(
            actor_critic_1.v_fc1.bias.data)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo,
                                     args.env_name + '_' + env_name_1)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            # BUGFIX: was `save_model = actor_critic_1`, which left
            # save_model_1 unbound on the non-CUDA path (NameError below).
            save_model_1 = actor_critic_1
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_model_1 = copy.deepcopy(actor_critic_1).cpu()

            # Also persist the observation-normalization statistics (if any).
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]
            save_model_1 = [
                save_model_1,
                hasattr(envs_1, 'ob_rms') and envs_1.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            torch.save(save_model_1, os.path.join(save_path,
                                                  env_name_1 + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
            print(
                "Updates_1 {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards_1.mean(), final_rewards_1.median(),
                        final_rewards_1.min(), final_rewards_1.max(),
                        dist_entropy_1.data[0], value_loss_1.data[0],
                        action_loss_1.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
                win1 = visdom_plot(viz_1, win1, args.log_dir_1, env_name_1,
                                   args.algo)
            except IOError:
                pass
def main():
    """Train a PPO/A2C agent whose policy consumes a history-summary feature.

    Per step, a History module (recurrent ``HistoryCell`` or feed-forward
    ``History``) summarizes up to ``args.lenhs`` past feature vectors per
    worker (restarting at episode boundaries via ``hs_ind``); the summary is
    concatenated with the current feature before acting. Progress is logged
    to stdout, a ``./rec/`` text file, an ``./imgs/`` plot and optionally
    visdom. Configuration comes from the module-level ``args`` namespace.

    Fix vs. previous revision: the NaN guard before recording the mean reward
    used ``v_mean == np.nan``, which is always False (NaN never compares
    equal), so NaN values were written to the record file; it now uses
    ``np.isnan``.
    """
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'
    #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    #os.environ['CUDA_VISIBLE_DEVICES'] = "9"

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Frame stacking multiplies the channel dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.hid_size, args.feat_size,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    # History/Future summarizers: recurrent-cell variant vs. feed-forward one.
    if args.use_cell:
        hs = HistoryCell(obs_shape[0], actor_critic.feat_size,
                         2 * actor_critic.hidden_size, 1)
        ft = FutureCell(obs_shape[0], actor_critic.feat_size,
                        2 * actor_critic.hidden_size, 1)
    else:
        hs = History(obs_shape[0], actor_critic.feat_size,
                     actor_critic.hidden_size, 2, 1)
        ft = Future(obs_shape[0], actor_critic.feat_size,
                    actor_critic.hidden_size, 2, 1)

    if args.cuda:
        actor_critic = actor_critic.cuda()
        hs = hs.cuda()
        ft = ft.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, hs, ft, args.clip_param,
                         args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef,
                         args.hf_loss_coef,
                         ac_lr=args.lr, hs_lr=args.lr, ft_lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm,
                         num_processes=args.num_processes,
                         num_steps=args.num_steps,
                         use_cell=args.use_cell,
                         lenhs=args.lenhs, lenft=args.lenft,
                         plan=args.plan,
                         ac_intv=args.ac_interval,
                         hs_intv=args.hs_interval,
                         ft_intv=args.ft_interval)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size,
                              feat_size=512)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack left and append the newest observation."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Reward-curve record (x = frames, y = mean episode reward).
    rec_x = []
    rec_y = []
    file = open('./rec/' + args.env_name + '_' + args.method_name + '.txt', 'w')

    # Per-worker history summary and per-worker index of the first step of the
    # current episode within the rollout (reset when an episode ends).
    # NOTE(review): hs_info is created on the GPU unconditionally — this code
    # path assumes CUDA is available even when args.cuda is False; confirm.
    hs_info = torch.zeros(args.num_processes,
                          2 * actor_critic.hidden_size).cuda()
    hs_ind = torch.IntTensor(args.num_processes, 1).zero_()
    epinfobuf = deque(maxlen=100)

    start_time = time.time()
    for j in range(num_updates):
        print('begin sample, time {}'.format(
            time.strftime("%Hh %Mm %Ss",
                          time.gmtime(time.time() - start_time))))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                rollouts.feat[step] = actor_critic.get_feat(
                    rollouts.observations[step])
                if args.use_cell:
                    # Re-run the recurrent cell over the visible history
                    # window [start_ind, step] for every worker.
                    for i in range(args.num_processes):
                        h = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                        c = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                        start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                        for ind in range(start_ind, step + 1):
                            h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c)
                        hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size)
                    del h, c
                    gc.collect()
                else:
                    for i in range(args.num_processes):
                        start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                        hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i])
                hidden_feat = actor_critic.cat(rollouts.feat[step], hs_info)
                value, action, action_log_prob, states = actor_critic.act(
                    hidden_feat, rollouts.states[step])
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(cpu_actions)

            # Monitor wrapper reports true episode rewards via info['episode'].
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfobuf.extend([maybeepinfo['r']])

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # Workers whose episode ended restart their history window at the
            # next step; the others keep their current start index.
            hs_ind = ((1 - masks) * (step + 1) + masks * hs_ind.float()).int()

            if args.cuda:
                masks = masks.cuda()
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, hs_ind, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        # Bootstrap: build the history summary for the post-rollout state and
        # query the value head.
        with torch.no_grad():
            rollouts.feat[-1] = actor_critic.get_feat(rollouts.observations[-1])
            if args.use_cell:
                for i in range(args.num_processes):
                    h = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                    c = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                    start = max(hs_ind[i], step + 1 - args.lenhs)
                    for ind in range(start, step + 1):
                        h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c)
                    hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size)
                del h, c
            else:
                for i in range(args.num_processes):
                    start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                    hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i])
            hidden_feat = actor_critic.cat(rollouts.feat[-1], hs_info)
            next_value = actor_critic.get_value(hidden_feat).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        rollouts.compute_ft_ind()

        print('begin update, time {}'.format(
            time.strftime("%Hh %Mm %Ss",
                          time.gmtime(time.time() - start_time))))
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        print('end update, time {}'.format(
            time.strftime("%Hh %Mm %Ss",
                          time.gmtime(time.time() - start_time))))

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Also persist the observation-normalization statistics (if any).
            save_model = [save_model,
                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            v_mean, v_median, v_min, v_max = safe(epinfobuf)
            print("Updates {}, num timesteps {},time {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         time.strftime("%Hh %Mm %Ss",
                                       time.gmtime(time.time() - start_time)),
                         int(total_num_steps / (end - start_time)),
                         v_mean, v_median, v_min, v_max,
                         dist_entropy, value_loss, action_loss))
            # BUGFIX: was `if not (v_mean == np.nan)`, which is always True
            # because NaN never compares equal to anything.
            if not np.isnan(v_mean):
                rec_x.append(total_num_steps)
                rec_y.append(v_mean)
                file.write(str(total_num_steps))
                file.write(' ')
                file.writelines(str(v_mean))
                file.write('\n')
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
    plot_line(rec_x, rec_y,
              './imgs/' + args.env_name + '_' + args.method_name + '.png',
              args.method_name, args.env_name, args.num_frames)
    file.close()
def main(): torch.set_num_threads(1) if args.vis: summary_writer = tf.summary.FileWriter(args.save_dir) envs = [make_env(i, args=args) for i in range(args.num_processes)] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1 and args.env_name not in [ 'OverCooked' ]: envs = VecNormalize(envs, gamma=args.gamma) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs def get_onehot(num_class, action): one_hot = np.zeros(num_class) one_hot[action] = 1 one_hot = torch.from_numpy(one_hot).float() return one_hot if args.policy_type == 'shared_policy': actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, ) elif args.algo == 'ppo': agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, ) elif args.algo == 'acktr': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True, ) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) episode_reward_raw = 0.0 final_reward_raw = 0.0 if args.cuda: current_obs = current_obs.cuda() 
rollouts.cuda() # try to load checkpoint try: num_trained_frames = np.load(args.save_dir + '/num_trained_frames.npy')[0] try: actor_critic.load_state_dict( torch.load(args.save_dir + '/trained_learner.pth')) print('Load learner previous point: Successed') except Exception as e: print('Load learner previous point: Failed') except Exception as e: num_trained_frames = 0 print('Learner has been trained to step: ' + str(num_trained_frames)) start = time.time() j = 0 while True: if num_trained_frames > args.num_frames: break for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step], ) cpu_actions = action.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward_raw, done, info = envs.step(cpu_actions) episode_reward_raw += reward_raw[0] if done[0]: final_reward_raw = episode_reward_raw episode_reward_raw = 0.0 reward = np.sign(reward_raw) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1], ).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() num_trained_frames += (args.num_steps * args.num_processes) j += 1 # save checkpoint if j % args.save_interval == 0 and args.save_dir != "": try: np.save( args.save_dir + '/num_trained_frames.npy', np.array([num_trained_frames]), ) actor_critic.save_model(save_path=args.save_dir) except Exception as e: print("Save checkpoint failed") # print info if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours" .format( num_trained_frames, args.num_frames, int(num_trained_frames / (end - start)), final_reward_raw, (end - start) / num_trained_frames * (args.num_frames - num_trained_frames) / 60.0 / 60.0)) # visualize results if args.vis and j % args.vis_interval == 0: '''we use tensorboard since its better when comparing plots''' summary = tf.Summary() summary.value.add( tag='final_reward_raw', simple_value=final_reward_raw, ) summary.value.add( tag='value_loss', simple_value=value_loss, ) summary.value.add( tag='action_loss', simple_value=action_loss, ) summary.value.add( tag='dist_entropy', simple_value=dist_entropy, ) summary_writer.add_summary(summary, num_trained_frames) summary_writer.flush() elif args.policy_type == 'hierarchical_policy': num_subpolicy = args.num_subpolicy update_interval = args.hierarchy_interval while len(num_subpolicy) < 
args.num_hierarchy - 1: num_subpolicy.append(num_subpolicy[-1]) while len(update_interval) < args.num_hierarchy - 1: update_interval.append(update_interval[-1]) if args.num_hierarchy == 1: update_interval = [1] num_subpolicy = [envs.action_space.n] # print(envs.action_space.n) # print(stop) actor_critic = {} rollouts = {} actor_critic['top'] = EHRL_Policy(obs_shape, space.Discrete(num_subpolicy[-1]), np.zeros(1), 128, args.recurrent_policy, 'top') rollouts['top'] = EHRL_RolloutStorage( int(args.num_steps / update_interval[-1]), args.num_processes, obs_shape, space.Discrete(num_subpolicy[-1]), np.zeros(1), actor_critic['top'].state_size) for hie_id in range(args.num_hierarchy - 1): if hie_id > 0: actor_critic[str(hie_id)] = EHRL_Policy( obs_shape, space.Discrete(num_subpolicy[hie_id - 1]), np.zeros(num_subpolicy[hie_id]), 128, args.recurrent_policy, str(hie_id)) rollouts[str(hie_id)] = EHRL_RolloutStorage( int(args.num_steps / update_interval[hie_id - 1]), args.num_processes, obs_shape, space.Discrete(num_subpolicy[hie_id - 1]), np.zeros(num_subpolicy[hie_id]), actor_critic[str(hie_id)].state_size) else: actor_critic[str(hie_id)] = EHRL_Policy( obs_shape, envs.action_space, np.zeros(num_subpolicy[hie_id]), 128, args.recurrent_policy, str(hie_id)) rollouts[str(hie_id)] = EHRL_RolloutStorage( args.num_steps, args.num_processes, obs_shape, envs.action_space, np.zeros(num_subpolicy[hie_id]), actor_critic[str(hie_id)].state_size) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: for key in actor_critic: actor_critic[key].cuda() agent = {} for ac_key in actor_critic: if args.algo == 'a2c': agent[ac_key] = algo.A2C_ACKTR( actor_critic[ac_key], args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, ) elif args.algo == 'ppo': agent[ac_key] = algo.PPO( actor_critic[ac_key], args.clip_param, args.ppo_epoch, args.num_mini_batch, 
args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, ) elif args.algo == 'acktr': agent[ac_key] = algo.A2C_ACKTR( actor_critic[ac_key], args.value_loss_coef, args.entropy_coef, acktr=True, ) current_obs = torch.zeros(args.num_processes, *obs_shape) obs = envs.reset() update_current_obs(obs) for obs_key in rollouts: rollouts[obs_key].observations[0].copy_(current_obs) episode_reward_raw = 0.0 final_reward_raw = 0.0 if args.cuda: current_obs = current_obs.cuda() for rol_key in rollouts: rollouts[rol_key].cuda() # try to load checkpoint try: num_trained_frames = np.load(args.save_dir + '/num_trained_frames.npy')[0] try: for save_key in actor_critic: actor_critic[save_key].load_state_dict( torch.load(args.save_dir + '/trained_learner_' + save_key + '.pth')) print('Load learner previous point: Successed') except Exception as e: print('Load learner previous point: Failed') except Exception as e: num_trained_frames = 0 print('Learner has been trained to step: ' + str(num_trained_frames)) start = time.time() j = 0 onehot_mem = {} reward_mem = {} if args.num_hierarchy > 1: update_flag = np.zeros(args.num_hierarchy - 1, dtype=np.uint8) else: update_flag = np.zeros(1, dtype=np.uint8) step_count = 0 value = {} next_value = {} action = {} action_log_prob = {} states = {} while True: if num_trained_frames > args.num_frames: break step_count = 0 for step in range(args.num_steps): if step_count % update_interval[-1] == 0: with torch.no_grad(): value['top'], action['top'], action_log_prob[ 'top'], states['top'] = actor_critic['top'].act( rollouts['top'].observations[update_flag[-1]], rollouts['top'].one_hot[update_flag[-1]], rollouts['top'].states[update_flag[-1]], rollouts['top'].masks[update_flag[-1]], ) update_flag[-1] += 1 onehot_mem[str(args.num_hierarchy - 1)] = get_onehot( num_subpolicy[-1], action['top']) onehot_mem[str(args.num_hierarchy)] = get_onehot(1, 0) if len(update_interval) > 1: for interval_id in 
range(len(update_interval) - 1): if step_count % update_interval[interval_id] == 0: with torch.no_grad(): value[str(interval_id+1)], action[str(interval_id+1)], action_log_prob[str(interval_id+1)], states[str(interval_id+1)] = \ actor_critic[str(interval_id+1)].act( rollouts[str(interval_id+1)].observations[update_flag[interval_id]], rollouts[str(interval_id+1)].one_hot[update_flag[-1]], rollouts[str(interval_id+1)].states[update_flag[interval_id]], rollouts[str(interval_id+1)].masks[update_flag[interval_id]], ) update_flag[interval_id] += 1 onehot_mem[str(interval_id + 1)] = get_onehot( num_subpolicy[interval_id], action[str(interval_id + 1)]) # Sample actions if args.num_hierarchy > 1: with torch.no_grad(): value['0'], action['0'], action_log_prob['0'], states[ '0'] = actor_critic['0'].act( rollouts['0'].observations[step], rollouts['0'].one_hot[step], rollouts['0'].states[step], rollouts['0'].masks[step], ) cpu_actions = action['0'].squeeze(1).cpu().numpy() else: cpu_actions = action['top'].squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward_raw, done, info = envs.step(cpu_actions) for reward_id in range(args.num_hierarchy - 1): try: reward_mem[str(reward_id)] += [reward_raw[0]] except Exception as e: reward_mem[str(reward_id)] = reward_raw[0] episode_reward_raw += reward_raw[0] if done[0]: final_reward_raw = episode_reward_raw episode_reward_raw = 0.0 reward = np.sign(reward_raw) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) if args.num_hierarchy > 1: rollouts['0'].insert(current_obs, states['0'], action['0'], onehot_mem['1'], action_log_prob['0'], value['0'], reward, masks) if step_count % update_interval[-1] == 0: if args.num_hierarchy > 1: reward_mean = np.mean( np.array(reward_mem[str(args.num_hierarchy - 2)])) reward_mean = torch.from_numpy( np.ones(1) * reward_mean).float() rollouts['top'].insert( current_obs, states['top'], action['top'], onehot_mem[str(args.num_hierarchy)], action_log_prob['top'], value['top'], reward_mean, masks) reward_mem[str(args.num_hierarchy - 2)] = [] else: rollouts['top'].insert( current_obs, states['top'], action['top'], onehot_mem[str(args.num_hierarchy)], action_log_prob['top'], value['top'], reward, masks) if len(update_interval) > 1: for interval_id in range(len(update_interval) - 1): if step_count % update_interval[ interval_id] == 0 or done[0]: reward_mean = np.mean( np.array(reward_mem[str(interval_id)])) reward_mean = torch.from_numpy( np.ones(1) * reward_mean).float() rollouts[str(interval_id + 1)].insert( current_obs, states[str(interval_id + 1)], action[str(interval_id + 1)], onehot_mem[str(interval_id + 2)], action_log_prob[str(interval_id + 1)], value[str(interval_id + 1)], reward_mean, masks) reward_mem[str(interval_id)] = [] step_count += 1 if args.num_hierarchy > 1: with torch.no_grad(): next_value['0'] = actor_critic['0'].get_value( rollouts['0'].observations[-1], rollouts['0'].one_hot[-1], rollouts['0'].states[-1], rollouts['0'].masks[-1], ).detach() rollouts['0'].compute_returns(next_value['0'], args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent['0'].update( rollouts['0'], add_onehot=True) rollouts['0'].after_update() with torch.no_grad(): next_value['top'] = 
actor_critic['top'].get_value( rollouts['top'].observations[-1], rollouts['top'].one_hot[-1], rollouts['top'].states[-1], rollouts['top'].masks[-1], ).detach() rollouts['top'].compute_returns(next_value['top'], args.use_gae, args.gamma, args.tau) if args.num_hierarchy > 1: _, _, _ = agent['top'].update(rollouts['top'], add_onehot=True) else: value_loss, action_loss, dist_entropy = agent['top'].update( rollouts['top'], add_onehot=True) rollouts['top'].after_update() update_flag[-1] = 0 if len(update_interval) > 1: for interval_id in range(len(update_interval) - 1): with torch.no_grad(): next_value[str(interval_id + 1)] = actor_critic[str( interval_id + 1)].get_value( rollouts[str(interval_id + 1)].observations[-1], rollouts[str(interval_id + 1)].one_hot[-1], rollouts[str(interval_id + 1)].states[-1], rollouts[str(interval_id + 1)].masks[-1], ).detach() rollouts[str(interval_id + 1)].compute_returns( next_value[str(interval_id + 1)], args.use_gae, args.gamma, args.tau) _, _, _ = agent[str(interval_id + 1)].update( rollouts[str(interval_id + 1)], add_onehot=True) rollouts[str(interval_id + 1)].after_update() update_flag[interval_id] = 0 num_trained_frames += (args.num_steps * args.num_processes) j += 1 # save checkpoint if j % args.save_interval == 0 and args.save_dir != "": try: np.save( args.save_dir + '/num_trained_frames.npy', np.array([num_trained_frames]), ) for key_store in actor_critic: actor_critic[key].save_model(save_path=args.save_dir) except Exception as e: print("Save checkpoint failed") # print info if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "[{}/{}], FPS {}, final_reward_raw {:.2f}, remaining {} hours" .format( num_trained_frames, args.num_frames, int(num_trained_frames / (end - start)), final_reward_raw, (end - start) / num_trained_frames * (args.num_frames - num_trained_frames) / 60.0 / 60.0)) # visualize results if args.vis and j % args.vis_interval == 0: '''we use 
tensorboard since its better when comparing plots''' summary = tf.Summary() summary.value.add( tag='final_reward_raw', simple_value=final_reward_raw, ) summary.value.add( tag='value_loss', simple_value=value_loss, ) summary.value.add( tag='action_loss', simple_value=action_loss, ) summary.value.add( tag='dist_entropy', simple_value=dist_entropy, ) summary_writer.add_summary(summary, num_trained_frames) summary_writer.flush()
def main():
    """Entry point: train a two-level (high-level / low-level) hierarchical policy.

    Reads options from YAML files given on the command line, loads a pretrained
    low-level policy (or several, in 'hierarchical_many' mode), builds the
    environments, policies, agents and rollout storages, then runs the update
    loop: each high-level action is held fixed for ``num_ll_steps`` low-level
    environment steps. Checkpoints, CSV loss monitors, and visdom plots are
    written under ``logpath``.
    """
    global args
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    args.vis = not args.no_vis

    # Set options
    # NOTE(review): yaml.load without an explicit Loader is deprecated and unsafe
    # on untrusted files; these are local config files, but consider safe_load.
    if args.path_opt is not None:
        with open(args.path_opt, 'r') as handle:
            options = yaml.load(handle)
    if args.vis_path_opt is not None:
        with open(args.vis_path_opt, 'r') as handle:
            vis_options = yaml.load(handle)
    print('## args')
    pprint(vars(args))
    print('## options')
    pprint(options)

    # Load the lowlevel opt and
    lowlevel_optfile = options['lowlevel']['optfile']
    with open(lowlevel_optfile, 'r') as handle:
        ll_opt = yaml.load(handle)

    # Whether we should set ll policy to be deterministic or not
    ll_deterministic = options['lowlevel']['deterministic']

    # Put alg_%s and optim_%s to alg and optim depending on commandline
    options['use_cuda'] = args.cuda
    options['trial'] = args.trial
    options['alg'] = options['alg_%s' % args.algo]
    options['optim'] = options['optim_%s' % args.algo]
    alg_opt = options['alg']
    alg_opt['algo'] = args.algo
    model_opt = options['model']
    env_opt = options['env']
    env_opt['env-name'] = args.env_name
    log_opt = options['logs']
    optim_opt = options['optim']
    options['lowlevel_opt'] = ll_opt  # Save low level options in option file (for logging purposes)

    # Pass necessary values in ll_opt
    assert (ll_opt['model']['mode'] in ['baseline_lowlevel', 'phase_lowlevel'])
    ll_opt['model']['theta_space_mode'] = ll_opt['env']['theta_space_mode']
    ll_opt['model']['time_scale'] = ll_opt['env']['time_scale']

    # If in many module mode, load the lowlevel policies we want
    if model_opt['mode'] == 'hierarchical_many':
        # Check asserts
        theta_obs_mode = ll_opt['env']['theta_obs_mode']
        theta_space_mode = ll_opt['env']['theta_space_mode']
        assert (theta_space_mode in [
            'pretrain_interp', 'pretrain_any', 'pretrain_any_far',
            'pretrain_any_fromstart'
        ])
        assert (theta_obs_mode == 'pretrain')

        # Get the theta size
        # In many-module mode theta_sz doubles as the number of pretrained
        # low-level policies to load (one checkpoint per module).
        theta_sz = options['lowlevel']['num_load']
        ckpt_base = options['lowlevel']['ckpt']

        # Load checkpoints
        lowlevel_ckpts = []
        for ll_ind in range(theta_sz):
            # Optionally offset which pretrained trials are used, by trial number.
            if args.change_ll_offset:
                ll_offset = theta_sz * args.trial
            else:
                ll_offset = 0
            lowlevel_ckpt_file = ckpt_base + '/trial%d/ckpt.pth.tar' % (
                ll_ind + ll_offset)
            assert (os.path.isfile(lowlevel_ckpt_file))
            lowlevel_ckpts.append(torch.load(lowlevel_ckpt_file))
    # Otherwise it's one ll polciy to load
    else:
        # Get theta_sz for low level model
        theta_obs_mode = ll_opt['env']['theta_obs_mode']
        theta_space_mode = ll_opt['env']['theta_space_mode']
        assert (theta_obs_mode in ['ind', 'vector'])
        if theta_obs_mode == 'ind':
            if theta_space_mode == 'forward':
                theta_sz = 1
            elif theta_space_mode == 'simple_four':
                theta_sz = 4
            elif theta_space_mode == 'simple_eight':
                theta_sz = 8
            elif theta_space_mode == 'k_theta':
                theta_sz = ll_opt['env']['num_theta']
            # NOTE(review): this 'vector' test sits inside the
            # theta_obs_mode == 'ind' branch, so it can never be true here —
            # the assert above allows 'vector', but that case falls through to
            # the outer raise. Looks like this elif was meant to be one level
            # out; confirm before relying on 'vector' mode.
            elif theta_obs_mode == 'vector':
                theta_sz = 2
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError
        ll_opt['model']['theta_sz'] = theta_sz
        ll_opt['env']['theta_sz'] = theta_sz

        # Load the low level policy params
        lowlevel_ckpt = options['lowlevel']['ckpt']
        assert (os.path.isfile(lowlevel_ckpt))
        lowlevel_ckpt = torch.load(lowlevel_ckpt)
    # High-level actions select which theta (low-level behavior) to run.
    hl_action_space = spaces.Discrete(theta_sz)

    # Check asserts
    assert (args.algo in ['a2c', 'ppo', 'acktr', 'dqn'])
    assert (optim_opt['hierarchical_mode'] in
            ['train_highlevel', 'train_both'])
    if model_opt['recurrent_policy']:
        assert args.algo in ['a2c', 'ppo'
                             ], 'Recurrent policy is not implemented for ACKTR'
    assert (model_opt['mode'] in ['hierarchical', 'hierarchical_many'])

    # Set seed - just make the seed the trial number
    seed = args.trial + 1000  # Make it different than lowlevel seed
    torch.manual_seed(seed)
    if args.cuda:
        torch.cuda.manual_seed(seed)

    # Initialization
    # One update consumes num_steps high-level steps, each of which runs
    # num_ll_steps environment steps in each of num_processes workers.
    num_updates = int(optim_opt['num_frames']) // alg_opt[
        'num_steps'] // alg_opt['num_processes'] // optim_opt['num_ll_steps']
    torch.set_num_threads(1)

    # Print warning
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    # Set logging / load previous checkpoint
    logpath = os.path.join(log_opt['log_base'], model_opt['mode'],
                           log_opt['exp_name'], args.algo, args.env_name,
                           'trial%d' % args.trial)
    if len(args.resume) > 0:
        assert (os.path.isfile(os.path.join(logpath, args.resume)))
        ckpt = torch.load(os.path.join(logpath, 'ckpt.pth.tar'))
        start_update = ckpt['update_count']
    else:
        # Make directory, check before overwriting
        if os.path.isdir(logpath):
            # NOTE(review): default=False is passed to str.format (where it is
            # silently ignored), not to click.confirm; click's own default is
            # already False, so behavior matches the apparent intent, but the
            # kwarg should be moved to the confirm() call.
            if click.confirm(
                    'Logs directory already exists in {}. Erase?'.format(
                        logpath, default=False)):
                os.system('rm -rf ' + logpath)
            else:
                return
        os.system('mkdir -p ' + logpath)
        start_update = 0

        # Save options and args
        with open(os.path.join(logpath, os.path.basename(args.path_opt)),
                  'w') as f:
            yaml.dump(options, f, default_flow_style=False)
        with open(os.path.join(logpath, 'args.yaml'), 'w') as f:
            yaml.dump(vars(args), f, default_flow_style=False)

        # Save git info as well
        os.system('git status > %s' % os.path.join(logpath, 'git_status.txt'))
        os.system('git diff > %s' % os.path.join(logpath, 'git_diff.txt'))
        os.system('git show > %s' % os.path.join(logpath, 'git_show.txt'))

    # Set up plotting dashboard
    dashboard = Dashboard(options,
                          vis_options,
                          logpath,
                          vis=args.vis,
                          port=args.port)

    # Create environments
    envs = [
        make_env(args.env_name, seed, i, logpath, options, args.verbose)
        for i in range(alg_opt['num_processes'])
    ]
    if alg_opt['num_processes'] > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Check if we use timestep in low level
    if 'baseline' in ll_opt['model']['mode']:
        add_timestep = False
    elif 'phase' in ll_opt['model']['mode']:
        add_timestep = True
    else:
        raise NotImplementedError

    # Get shapes
    # A throwaway single env is built just to query observation shapes/masks.
    dummy_env = make_env(args.env_name, seed, 0, logpath, options,
                         args.verbose)
    dummy_env = dummy_env()
    s_pro_dummy = dummy_env.unwrapped._get_pro_obs()
    s_ext_dummy = dummy_env.unwrapped._get_ext_obs()
    # Low-level obs = proprioceptive obs (+ theta one-hot) (+ timestep).
    if add_timestep:
        ll_obs_shape = (s_pro_dummy.shape[0] + theta_sz + 1, )
        ll_raw_obs_shape = (s_pro_dummy.shape[0] + 1, )
    else:
        ll_obs_shape = (s_pro_dummy.shape[0] + theta_sz, )
        ll_raw_obs_shape = (s_pro_dummy.shape[0], )
    # Account for frame stacking on the channel dimension.
    ll_obs_shape = (ll_obs_shape[0] * env_opt['num_stack'], *ll_obs_shape[1:])
    hl_obs_shape = (s_ext_dummy.shape[0], )
    hl_obs_shape = (hl_obs_shape[0] * env_opt['num_stack'], *hl_obs_shape[1:])

    # Do vec normalize, but mask out what we don't want altered
    # Also freeze all of the low level obs
    ignore_mask = dummy_env.env._get_obs_mask()
    freeze_mask, _ = dummy_env.unwrapped._get_pro_ext_mask()
    freeze_mask = np.concatenate([freeze_mask, [0]])
    if ('normalize' in env_opt
            and not env_opt['normalize']) or args.algo == 'dqn':
        ignore_mask = 1 - freeze_mask
    # NOTE(review): both branches below construct an identical
    # ObservationFilter — only the ignore_mask preparation differs; the
    # duplicated call could be hoisted.
    if model_opt['mode'] == 'hierarchical_many':
        # Actually ignore both ignored values and the low level values
        # That filtering will happen later
        ignore_mask = (ignore_mask + freeze_mask > 0).astype(float)
        envs = ObservationFilter(envs,
                                 ret=alg_opt['norm_ret'],
                                 has_timestep=True,
                                 noclip=env_opt['step_plus_noclip'],
                                 ignore_mask=ignore_mask,
                                 freeze_mask=freeze_mask,
                                 time_scale=env_opt['time_scale'],
                                 gamma=env_opt['gamma'])
    else:
        envs = ObservationFilter(envs,
                                 ret=alg_opt['norm_ret'],
                                 has_timestep=True,
                                 noclip=env_opt['step_plus_noclip'],
                                 ignore_mask=ignore_mask,
                                 freeze_mask=freeze_mask,
                                 time_scale=env_opt['time_scale'],
                                 gamma=env_opt['gamma'])

    # Make our helper object for dealing with hierarchical observations
    hier_utils = HierarchyUtils(ll_obs_shape, hl_obs_shape, hl_action_space,
                                theta_sz, add_timestep)

    # Set up algo monitoring
    # Two CSV monitors: one for high-level losses, one ("LL") for low-level.
    alg_filename = os.path.join(logpath, 'Alg.Monitor.csv')
    alg_f = open(alg_filename, "wt")
    alg_f.write('# Alg Logging %s\n' % json.dumps({
        "t_start": time.time(),
        'env_id': dummy_env.spec and dummy_env.spec.id,
        'mode': options['model']['mode'],
        'name': options['logs']['exp_name']
    }))
    alg_fields = ['value_loss', 'action_loss', 'dist_entropy']
    alg_logger = csv.DictWriter(alg_f, fieldnames=alg_fields)
    alg_logger.writeheader()
    alg_f.flush()
    ll_alg_filename = os.path.join(logpath, 'AlgLL.Monitor.csv')
    ll_alg_f = open(ll_alg_filename, "wt")
    ll_alg_f.write('# Alg Logging LL %s\n' % json.dumps({
        "t_start": time.time(),
        'env_id': dummy_env.spec and dummy_env.spec.id,
        'mode': options['model']['mode'],
        'name': options['logs']['exp_name']
    }))
    ll_alg_fields = ['value_loss', 'action_loss', 'dist_entropy']
    ll_alg_logger = csv.DictWriter(ll_alg_f, fieldnames=ll_alg_fields)
    ll_alg_logger.writeheader()
    ll_alg_f.flush()

    # Create the policy networks
    ll_action_space = envs.action_space
    if args.algo == 'dqn':
        # DQN needs the epsilon-greedy schedule passed through model options.
        model_opt['eps_start'] = optim_opt['eps_start']
        model_opt['eps_end'] = optim_opt['eps_end']
        model_opt['eps_decay'] = optim_opt['eps_decay']
        hl_policy = DQNPolicy(hl_obs_shape, hl_action_space, model_opt)
    else:
        hl_policy = Policy(hl_obs_shape, hl_action_space, model_opt)
    if model_opt['mode'] == 'hierarchical_many':
        ll_policy = ModularPolicy(ll_raw_obs_shape, ll_action_space, theta_sz,
                                  ll_opt)
    else:
        ll_policy = Policy(ll_obs_shape, ll_action_space, ll_opt['model'])
    # Load the previous ones here?
    if args.cuda:
        hl_policy.cuda()
        ll_policy.cuda()

    # Create the high level agent
    if args.algo == 'a2c':
        hl_agent = algo.A2C_ACKTR(hl_policy,
                                  alg_opt['value_loss_coef'],
                                  alg_opt['entropy_coef'],
                                  lr=optim_opt['lr'],
                                  eps=optim_opt['eps'],
                                  alpha=optim_opt['alpha'],
                                  max_grad_norm=optim_opt['max_grad_norm'])
    elif args.algo == 'ppo':
        hl_agent = algo.PPO(hl_policy,
                            alg_opt['clip_param'],
                            alg_opt['ppo_epoch'],
                            alg_opt['num_mini_batch'],
                            alg_opt['value_loss_coef'],
                            alg_opt['entropy_coef'],
                            lr=optim_opt['lr'],
                            eps=optim_opt['eps'],
                            max_grad_norm=optim_opt['max_grad_norm'])
    elif args.algo == 'acktr':
        hl_agent = algo.A2C_ACKTR(hl_policy,
                                  alg_opt['value_loss_coef'],
                                  alg_opt['entropy_coef'],
                                  acktr=True)
    elif args.algo == 'dqn':
        hl_agent = algo.DQN(hl_policy,
                            env_opt['gamma'],
                            batch_size=alg_opt['batch_size'],
                            target_update=alg_opt['target_update'],
                            mem_capacity=alg_opt['mem_capacity'],
                            lr=optim_opt['lr'],
                            eps=optim_opt['eps'],
                            max_grad_norm=optim_opt['max_grad_norm'])

    # Create the low level agent
    # If only training high level, make dummy agent (just does passthrough, doesn't change anything)
    if optim_opt['hierarchical_mode'] == 'train_highlevel':
        ll_agent = algo.Passthrough(ll_policy)
    elif optim_opt['hierarchical_mode'] == 'train_both':
        if args.algo == 'a2c':
            ll_agent = algo.A2C_ACKTR(ll_policy,
                                      alg_opt['value_loss_coef'],
                                      alg_opt['entropy_coef'],
                                      lr=optim_opt['ll_lr'],
                                      eps=optim_opt['eps'],
                                      alpha=optim_opt['alpha'],
                                      max_grad_norm=optim_opt['max_grad_norm'])
        elif args.algo == 'ppo':
            ll_agent = algo.PPO(ll_policy,
                                alg_opt['clip_param'],
                                alg_opt['ll_ppo_epoch'],
                                alg_opt['num_mini_batch'],
                                alg_opt['value_loss_coef'],
                                alg_opt['entropy_coef'],
                                lr=optim_opt['ll_lr'],
                                eps=optim_opt['eps'],
                                max_grad_norm=optim_opt['max_grad_norm'])
        elif args.algo == 'acktr':
            ll_agent = algo.A2C_ACKTR(ll_policy,
                                      alg_opt['value_loss_coef'],
                                      alg_opt['entropy_coef'],
                                      acktr=True)
    else:
        raise NotImplementedError

    # Make the rollout structures
    # MaskingRolloutStorage lets per-process insertion be masked out once a
    # process has hit `done` within the current high-level step.
    hl_rollouts = RolloutStorage(alg_opt['num_steps'],
                                 alg_opt['num_processes'], hl_obs_shape,
                                 hl_action_space, hl_policy.state_size)
    ll_rollouts = MaskingRolloutStorage(alg_opt['num_steps'],
                                        alg_opt['num_processes'],
                                        ll_obs_shape, ll_action_space,
                                        ll_policy.state_size)
    hl_current_obs = torch.zeros(alg_opt['num_processes'], *hl_obs_shape)
    ll_current_obs = torch.zeros(alg_opt['num_processes'], *ll_obs_shape)

    # Helper functions to update the current obs
    def update_hl_current_obs(obs):
        # Shift the frame stack left and append the newest high-level frame.
        shape_dim0 = hl_obs_shape[0]
        obs = torch.from_numpy(obs).float()
        if env_opt['num_stack'] > 1:
            hl_current_obs[:, :-shape_dim0] = hl_current_obs[:, shape_dim0:]
        hl_current_obs[:, -shape_dim0:] = obs

    def update_ll_current_obs(obs):
        # Shift the frame stack left and append the newest low-level frame.
        shape_dim0 = ll_obs_shape[0]
        obs = torch.from_numpy(obs).float()
        if env_opt['num_stack'] > 1:
            ll_current_obs[:, :-shape_dim0] = ll_current_obs[:, shape_dim0:]
        ll_current_obs[:, -shape_dim0:] = obs

    # Update agent with loaded checkpoint
    if len(args.resume) > 0:
        # This should update both the policy network and the optimizer
        ll_agent.load_state_dict(ckpt['ll_agent'])
        hl_agent.load_state_dict(ckpt['hl_agent'])
        # Set ob_rms
        envs.ob_rms = ckpt['ob_rms']
    else:
        if model_opt['mode'] == 'hierarchical_many':
            ll_agent.load_pretrained_policies(lowlevel_ckpts)
        else:
            # Load low level agent
            ll_agent.load_state_dict(lowlevel_ckpt['agent'])
            # Load ob_rms from low level (but need to reshape it)
            old_rms = lowlevel_ckpt['ob_rms']
            assert (old_rms.mean.shape[0] == ll_obs_shape[0])
            # Only copy the pro state part of it (not including thetas or count)
            envs.ob_rms.mean[:s_pro_dummy.
                             shape[0]] = old_rms.mean[:s_pro_dummy.shape[0]]
            envs.ob_rms.var[:s_pro_dummy.shape[0]] = old_rms.var[:s_pro_dummy.
                                                                 shape[0]]

    # Reset our env and rollouts
    raw_obs = envs.reset()
    # seperate_obs [sic] splits the raw obs into high-level obs, raw low-level
    # obs, and per-process step counters.
    hl_obs, raw_ll_obs, step_counts = hier_utils.seperate_obs(raw_obs)
    # The theta slot is filled with a placeholder until the first high-level
    # action chooses one (see update_theta in the loop below).
    ll_obs = hier_utils.placeholder_theta(raw_ll_obs, step_counts)
    update_hl_current_obs(hl_obs)
    update_ll_current_obs(ll_obs)
    hl_rollouts.observations[0].copy_(hl_current_obs)
    ll_rollouts.observations[0].copy_(ll_current_obs)
    ll_rollouts.recent_obs.copy_(ll_current_obs)
    if args.cuda:
        hl_current_obs = hl_current_obs.cuda()
        ll_current_obs = ll_current_obs.cuda()
        hl_rollouts.cuda()
        ll_rollouts.cuda()

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([alg_opt['num_processes'], 1])
    final_rewards = torch.zeros([alg_opt['num_processes'], 1])

    # Update loop
    start = time.time()
    for j in range(start_update, num_updates):
        for step in range(alg_opt['num_steps']):
            # Step through high level action
            start_time = time.time()
            with torch.no_grad():
                hl_value, hl_action, hl_action_log_prob, hl_states = hl_policy.act(
                    hl_rollouts.observations[step], hl_rollouts.states[step],
                    hl_rollouts.masks[step])
            hl_cpu_actions = hl_action.squeeze(1).cpu().numpy()
            if args.profile:
                print('hl act %f' % (time.time() - start_time))

            # Get values to use for Q learning
            hl_state_dqn = hl_rollouts.observations[step]
            hl_action_dqn = hl_action

            # Update last ll observation with new theta
            # The previous low-level step stored placeholder thetas; now that
            # the high-level action is known, write the chosen theta in.
            for proc in range(alg_opt['num_processes']):
                # Update last observations in memory
                last_obs = ll_rollouts.observations[ll_rollouts.steps[proc],
                                                    proc]
                if hier_utils.has_placeholder(last_obs):
                    new_last_obs = hier_utils.update_theta(
                        last_obs, hl_cpu_actions[proc])
                    ll_rollouts.observations[ll_rollouts.steps[proc],
                                             proc].copy_(new_last_obs)

                # Update most recent observations (not necessarily the same)
                assert (hier_utils.has_placeholder(
                    ll_rollouts.recent_obs[proc]))
                new_last_obs = hier_utils.update_theta(
                    ll_rollouts.recent_obs[proc], hl_cpu_actions[proc])
                ll_rollouts.recent_obs[proc].copy_(new_last_obs)
            # Placeholders are presumably encoded as inf — TODO confirm against
            # HierarchyUtils; this assert checks none remain.
            assert (ll_rollouts.observations.max().item() < float('inf')
                    and ll_rollouts.recent_obs.max().item() < float('inf'))

            # Given high level action, step through the low level actions
            death_step_mask = np.ones(
                [alg_opt['num_processes'],
                 1])  # 1 means still alive, 0 means dead
            hl_reward = torch.zeros([alg_opt['num_processes'], 1])
            hl_obs = [None for i in range(alg_opt['num_processes'])]
            for ll_step in range(optim_opt['num_ll_steps']):
                # Sample actions
                start_time = time.time()
                with torch.no_grad():
                    ll_value, ll_action, ll_action_log_prob, ll_states = ll_policy.act(
                        ll_rollouts.recent_obs, ll_rollouts.recent_s,
                        ll_rollouts.recent_masks,
                        deterministic=ll_deterministic)
                ll_cpu_actions = ll_action.squeeze(1).cpu().numpy()
                if args.profile:
                    print('ll act %f' % (time.time() - start_time))

                # Observe reward and next obs
                raw_obs, ll_reward, done, info = envs.step(
                    ll_cpu_actions, death_step_mask)
                raw_hl_obs, raw_ll_obs, step_counts = hier_utils.seperate_obs(
                    raw_obs)
                ll_obs = []
                for proc in range(alg_opt['num_processes']):
                    # On the last low-level step (or episode end), the next
                    # theta is unknown, so insert a placeholder; otherwise keep
                    # the current high-level choice.
                    if (ll_step
                            == optim_opt['num_ll_steps'] - 1) or done[proc]:
                        ll_obs.append(
                            hier_utils.placeholder_theta(
                                np.array([raw_ll_obs[proc]]),
                                np.array([step_counts[proc]])))
                    else:
                        ll_obs.append(
                            hier_utils.append_theta(
                                np.array([raw_ll_obs[proc]]),
                                np.array([hl_cpu_actions[proc]]),
                                np.array([step_counts[proc]])))
                ll_obs = np.concatenate(ll_obs, 0)
                ll_reward = torch.from_numpy(
                    np.expand_dims(np.stack(ll_reward), 1)).float()
                episode_rewards += ll_reward
                # The high-level reward is the sum of low-level rewards over
                # the macro-step.
                hl_reward += ll_reward

                # Update values for Q learning and update replay memory
                # NOTE(review): bare time.time() below discards its result —
                # probably a leftover `start_time = time.time()` for the
                # profiling print that follows.
                time.time()
                hl_next_state_dqn = torch.from_numpy(raw_hl_obs)
                hl_reward_dqn = ll_reward
                hl_isdone_dqn = done
                if args.algo == 'dqn':
                    hl_agent.update_memory(hl_state_dqn, hl_action_dqn,
                                           hl_next_state_dqn, hl_reward_dqn,
                                           hl_isdone_dqn, death_step_mask)
                    hl_state_dqn = hl_next_state_dqn
                if args.profile:
                    print('dqn memory %f' % (time.time() - start_time))

                # Update high level observations (only take most recent obs if we haven't see a done before now and thus the value is valid)
                for proc, raw_hl in enumerate(raw_hl_obs):
                    if death_step_mask[proc].item() > 0:
                        hl_obs[proc] = np.array([raw_hl])

                # If done then clean the history of observations
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                final_rewards *= masks
                final_rewards += (
                    1 - masks
                ) * episode_rewards  # TODO - actually not sure if I broke this logic, but this value is not used anywhere
                episode_rewards *= masks
                # TODO - I commented this out, which possibly breaks things if num_stack > 1. Fix later if necessary
                #if args.cuda:
                #    masks = masks.cuda()
                #if current_obs.dim() == 4:
                #    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                #else:
                #    current_obs *= masks

                # Update low level observations
                update_ll_current_obs(ll_obs)

                # Update low level rollouts
                ll_rollouts.insert(ll_current_obs, ll_states, ll_action,
                                   ll_action_log_prob, ll_value, ll_reward,
                                   masks, death_step_mask)

                # Update which ones have stepped to the end and shouldn't be updated next time in the loop
                death_step_mask *= masks

            # Update high level rollouts
            hl_obs = np.concatenate(hl_obs, 0)
            update_hl_current_obs(hl_obs)
            hl_rollouts.insert(hl_current_obs, hl_states, hl_action,
                               hl_action_log_prob, hl_value, hl_reward, masks)

            # Check if we want to update lowlevel policy
            # Only update once the storage is full AND no stored final obs
            # still has a placeholder theta (i.e. all thetas were resolved).
            if ll_rollouts.isfull and all([
                    not hier_utils.has_placeholder(
                        ll_rollouts.observations[ll_rollouts.steps[proc],
                                                 proc])
                    for proc in range(alg_opt['num_processes'])
            ]):
                # Update low level policy
                assert (ll_rollouts.observations.max().item() < float('inf'))
                if optim_opt['hierarchical_mode'] == 'train_both':
                    with torch.no_grad():
                        ll_next_value = ll_policy.get_value(
                            ll_rollouts.observations[-1],
                            ll_rollouts.states[-1],
                            ll_rollouts.masks[-1]).detach()
                    ll_rollouts.compute_returns(ll_next_value,
                                                alg_opt['use_gae'],
                                                env_opt['gamma'],
                                                alg_opt['gae_tau'])
                    ll_value_loss, ll_action_loss, ll_dist_entropy = ll_agent.update(
                        ll_rollouts)
                else:
                    # Passthrough mode: nothing to optimize, log zeros.
                    ll_value_loss = 0
                    ll_action_loss = 0
                    ll_dist_entropy = 0
                ll_rollouts.after_update()

                # Update logger
                alg_info = {}
                alg_info['value_loss'] = ll_value_loss
                alg_info['action_loss'] = ll_action_loss
                alg_info['dist_entropy'] = ll_dist_entropy
                ll_alg_logger.writerow(alg_info)
                ll_alg_f.flush()

        # Update high level policy
        start_time = time.time()
        assert (hl_rollouts.observations.max().item() < float('inf'))
        if args.algo == 'dqn':
            hl_value_loss, hl_action_loss, hl_dist_entropy = hl_agent.update(
                alg_opt['updates_per_step']
            )  # TODO - maybe log this loss properly
        else:
            with torch.no_grad():
                hl_next_value = hl_policy.get_value(
                    hl_rollouts.observations[-1], hl_rollouts.states[-1],
                    hl_rollouts.masks[-1]).detach()
            hl_rollouts.compute_returns(hl_next_value, alg_opt['use_gae'],
                                        env_opt['gamma'], alg_opt['gae_tau'])
            hl_value_loss, hl_action_loss, hl_dist_entropy = hl_agent.update(
                hl_rollouts)
            hl_rollouts.after_update()
        if args.profile:
            print('hl update %f' % (time.time() - start_time))

        # Update alg monitor for high level
        alg_info = {}
        alg_info['value_loss'] = hl_value_loss
        alg_info['action_loss'] = hl_action_loss
        alg_info['dist_entropy'] = hl_dist_entropy
        alg_logger.writerow(alg_info)
        alg_f.flush()

        # Save checkpoints
        total_num_steps = (j + 1) * alg_opt['num_processes'] * alg_opt[
            'num_steps'] * optim_opt['num_ll_steps']
        if 'save_interval' in alg_opt:
            save_interval = alg_opt['save_interval']
        else:
            save_interval = 100
        if j % save_interval == 0:
            # Save all of our important information
            start_time = time.time()
            save_checkpoint(logpath, ll_agent, hl_agent, envs, j,
                            total_num_steps)
            if args.profile:
                print('save checkpoint %f' % (time.time() - start_time))

        # Print log
        log_interval = log_opt['log_interval'] * alg_opt['log_mult']
        if j % log_interval == 0:
            end = time.time()
            print(
                "{}: Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(options['logs']['exp_name'], j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        hl_dist_entropy, hl_value_loss, hl_action_loss))

        # Do dashboard logging
        vis_interval = log_opt['vis_interval'] * alg_opt['log_mult']
        if args.vis and j % vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                dashboard.visdom_plot()
            except IOError:
                pass

    # Save final checkpoint
    # NOTE(review): j and total_num_steps are loop variables — if the update
    # loop runs zero iterations (start_update >= num_updates), this raises
    # NameError. Assumes at least one update is always performed.
    save_checkpoint(logpath, ll_agent, hl_agent, envs, j, total_num_steps)

    # Close logging file
    alg_f.close()
    ll_alg_f.close()
class a2c(object):
    """Synchronous advantage actor-critic (A2C) agent.

    Wraps a CNN actor-critic policy, a rollout buffer and an optimizer,
    and exposes the act / insert / update cycle used by the training loop.
    """

    def __init__(self, envs, hparams):
        """Build policy, rollout storage and optimizer.

        envs: vectorized environment (only ``action_space`` is read here).
        hparams: dict of training hyper-parameters; required keys are read
            explicitly below.
        """
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']

        # Shared-trunk policy and value network.
        self.actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)

        # Rollout buffer: num_steps transitions per process.
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, envs.action_space)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer selection.
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.action_shape = 1

        # List-based rollout storage used only by the visualization /
        # analysis modes (gif, landscape, vae, gradient-variance).
        if hparams['gif_'] or hparams['ls_'] or hparams['vae_'] or hparams[
                'grad_var_']:
            self.rollouts_list = RolloutStorage_list()

        self.hparams = hparams

    def act(self, current_state):
        """Run the policy on ``current_state``.

        Returns (value, action, action_log_probs, dist_entropy).
        """
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        """Seed the rollout buffer with the initial observation."""
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, next_state_pred,
                    done=None):
        """Record one transition into the rollout buffer.

        ``next_state_pred`` is accepted for interface compatibility but is
        not stored. ``done`` is new and optional: it is forwarded to the
        policy's trajectory-action mask. Previously this method referenced
        an undefined name ``done`` (the parameter had been commented out of
        the signature), so enabling ``traj_action_mask`` raised NameError.
        """
        self.rollouts.insert(step, current_state, action, value, reward,
                             masks, action_log_probs, dist_entropy)
        if 'traj_action_mask' in self.hparams and self.hparams[
                'traj_action_mask']:
            # Bug fix: `done` used to be undefined here.
            self.actor_critic.reset_mask(done)

    def update(self):
        """One A2C gradient step from the stored rollout.

        Bootstraps returns from the value of the final state, computes the
        advantage-weighted policy loss, value regression loss and entropy
        bonus, then clips gradients and steps the optimizer.
        """
        # Bootstrap value for the last state; volatile=True disables
        # autograd tracking (legacy pre-0.4 PyTorch idiom).
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1], volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        # Reshape the per-step lists into [num_steps, num_processes, 1].
        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        # Clear the per-step buffers for the next rollout.
        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # detach(): the policy gradient must not flow through the critic.
        action_loss = -(advantages.detach() * action_log_probs).mean()

        cost = (action_loss + value_loss * self.value_loss_coef
                - dist_entropy.mean() * self.entropy_coef)

        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(),
                                self.grad_clip)
        self.optimizer.step()

    def update2(self, discrim_error, discrim_error_reverse):
        """Policy update maximizing the averaged discounted quantity
        (-discriminator error - action log-prob) instead of env reward.

        discrim_error / discrim_error_reverse: [S, P] per-step errors.
        NOTE(review): ``discrim_error_reverse`` is reshaped but otherwise
        unused in this variant (earlier variants averaged it in).
        """
        discrim_error_reverse = discrim_error_reverse.view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)  # [S,P,1]
        discrim_error = discrim_error.view(self.num_steps,
                                           self.num_processes, 1)

        # Quantity to maximize per step; log-probs held fixed here.
        val_to_maximize = -discrim_error - action_log_probs.detach()  # [S,P,1]
        val_to_maximize = val_to_maximize.view(self.num_steps,
                                               self.num_processes)  # [S,P]

        # Backward accumulation of discounted error, divided by the number
        # of contributing steps; episode boundaries reset via masks.
        discrim_error_unmodified = val_to_maximize.data.clone()
        discrim_error = val_to_maximize.data
        divide_by = torch.ones(self.num_processes).cuda()
        for step in reversed(range(discrim_error.size(0) - 1)):
            divide_by += 1
            ttmp = discrim_error_unmodified[
                step + 1] * self.gamma * torch.squeeze(
                    self.rollouts.masks[step + 1])
            discrim_error_unmodified[step] = (
                ttmp + discrim_error_unmodified[step])
            discrim_error[step] = discrim_error_unmodified[step] / divide_by
            divide_by = divide_by * torch.squeeze(
                self.rollouts.masks[step + 1])
        val_to_maximize = Variable(
            discrim_error.view(self.num_steps, self.num_processes, 1))

        # Clear per-step buffers for the next rollout.
        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []
        self.rollouts.state_preds = []

        # Mean baseline for variance reduction (no learned critic used).
        baseline = torch.mean(val_to_maximize)
        advantages = val_to_maximize - baseline

        action_loss = -(advantages.detach() * action_log_probs).mean()
        cost = action_loss

        self.optimizer.zero_grad()
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(),
                                self.grad_clip)
        self.optimizer.step()
def main():
    """Train an actor-critic policy (a2c / ppo / acktr) on a goal-reaching
    task, saving the best-returning model to ``args.save_path``.

    Reads configuration from the module-level ``args`` and ``num_updates``.
    """
    print("######")
    print("HELLO! Returns start with infinity values")
    print("######")

    # Keep each worker single-threaded so vectorized envs don't oversubscribe.
    os.environ['OMP_NUM_THREADS'] = '1'

    # Goal parameters: sampled at random or taken from the CLI.
    if args.random_task:
        env_params = {
            'wt': np.round(np.random.uniform(0.5, 1.0), 2),
            'x': np.round(np.random.uniform(-0.1, 0.1), 2),
            'y': np.round(np.random.uniform(-0.1, 0.1), 2),
            'z': np.round(np.random.uniform(0.15, 0.2), 2),
        }
    else:
        env_params = {
            'wt': args.euclidean_weight,
            'x': args.goal_x,
            'y': args.goal_y,
            'z': args.goal_z,
        }

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)
    # ob=False: normalize returns only, not observations.
    envs = VecNormalize(envs, ob=False)

    # Stacked observation shape: num_stack frames along channel dim 0.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # CNN for image observations, MLP otherwise (no recurrent MLP support).
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # Discrete actions are stored as a single index column.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    # Optimizer depends on the algorithm.
    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the frame stack left and append the newest observation.
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Running input normalizer maintained by the policy.
    actor_critic.input_norm.update(rollouts.observations[0])

    last_return = -np.inf
    best_return = -np.inf
    best_models = None

    start = time.time()
    for j in range(num_updates):
        # ---- Collect a rollout of num_steps transitions per process ----
        for step in range(args.num_steps):
            # Sample actions (volatile=True: legacy no-grad inference).
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # Latch finished-episode returns into final_rewards.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero the stacked frames of finished episodes.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)
            actor_critic.input_norm.update(rollouts.observations[step + 1])

        # Bootstrap the return from the value of the final observation.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            # Re-evaluate the whole rollout in one batched forward pass.
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            # Variable(advantages.data) blocks gradient flow into the critic.
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            # Normalized advantages over the whole rollout.
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if args.vis and j % args.vis_interval == 0:
            last_return = plot(logger, args.log_dir)

        # Checkpoint whenever the plotted return improves.
        if last_return > best_return:
            best_return = last_return
            try:
                os.makedirs(os.path.dirname(args.save_path))
            except OSError:
                pass

            info = {
                'return': best_return,
                'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon)
            }

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save((save_model, env_params, info), args.save_path)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         last_return, best_return,
                         value_loss.data[0], action_loss.data[0]))
def main():
    """Train a policy via the ``agent`` abstraction (a2c / ppo / acktr)
    while harvesting positively-rewarded unwrapped frames into an LMDB
    dataset under ``args.lmdb_path``.

    Reads configuration from the module-level ``args`` and ``num_updates``.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Normalize only low-dimensional (vector) observations.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Stacked observation shape: num_stack frames along channel dim 0.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

    # Discrete actions are stored as a single index column.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    # The update rule is encapsulated in an agent object.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr, eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the frame stack left and append the newest observation.
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()

    # Prepare the LMDB output directories for harvested frames.
    lmdb_idx = 0
    try:
        os.makedirs(os.path.join(args.lmdb_path, args.env_name))
        os.makedirs(os.path.join(args.lmdb_path, args.env_name, 'test'))
    except:  # NOTE(review): bare except also hides real OS errors — consider OSError.
        print('Directory already exists.')

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs; this env wrapper additionally
            # returns the unwrapped observations and rewards.
            '''unwrapped obs, reward'''
            obs, reward, done, info, wr_obs, wr_reward = envs.step(cpu_actions)

            # Harvest positively-rewarded unwrapped frames into the LMDB.
            for img, rwd in zip(wr_obs, wr_reward):
                if rwd > 0:
                    lmdb_idx += 1
                    convert_to_lmdb(img, rwd,
                                    os.path.join(args.lmdb_path,
                                                 args.env_name), lmdb_idx)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # Latch finished-episode returns into final_rewards.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero the stacked frames of finished episodes.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        # Bootstrap the return from the value of the final observation.
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Bundle the observation normalizer (if any) with the model.
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model, os.path.join(save_path,
                                                args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy, value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
def main():
    """Evaluate a saved a2c Hopper policy for ``args.nb_episodes`` episodes
    and print per-episode returns plus their mean and standard deviation."""
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]
    envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Stacked observation shape: num_stack frames along channel dim 0.
    base_shape = envs.observation_space.shape
    obs_shape = (base_shape[0] * args.num_stack, *base_shape[1:])

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the frame stack left and append the newest observation.
        frame_channels = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-frame_channels] = current_obs[:, frame_channels:]
        current_obs[:, -frame_channels:] = obs

    update_current_obs(envs.reset())
    rollouts.observations[0].copy_(current_obs)

    # Load the pretrained checkpoint and switch to evaluation mode.
    actor_critic = torch.load(
        os.path.join("trained_models/a2c/HopperSmallFoot-v0.pt"))
    actor_critic.eval()

    if args.cuda:
        current_obs = current_obs.cuda()
        actor_critic.cuda()
        rollouts.cuda()

    reward_all = []
    count = 0
    while count < args.nb_episodes:
        running = True
        episode_return = 0
        # Roll one episode to completion with deterministic actions.
        while running:
            value, action = actor_critic.act(
                Variable(current_obs, volatile=True), deterministic=True)
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(
                np.expand_dims(np.stack(reward), 1)).float()
            episode_return += reward.cpu().numpy()[0][0]
            update_current_obs(obs)
            if done[0]:
                reward_all.append(episode_return)
                running = False
                count += 1
                update_current_obs(envs.reset())

    print(reward_all)
    # Discard the first (possibly partial) episode before aggregating.
    reward_all = reward_all[1:]
    print(len(reward_all))
    print(np.mean(reward_all), '+/-', np.std(reward_all))
class a2c(object):
    """A2C agent variant constructed from hyper-parameters only.

    Unlike the env-based variant, the action size is passed explicitly via
    ``hparams['action_size']``; observations are assumed to be image-scale
    (divided by 255 before the value bootstrap).
    """

    def __init__(self, hparams):
        """Build policy, rollout storage and optimizer from ``hparams``."""
        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']
        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']
        self.action_size = hparams['action_size']

        # Policy and value network (shared CNN trunk).
        self.actor_critic = CNNPolicy(self.obs_shape[0], self.action_size)

        # Rollout buffer: num_steps transitions per process.
        self.rollouts = RolloutStorage(self.num_steps, self.num_processes,
                                       self.obs_shape, self.action_size)

        if self.cuda:
            self.actor_critic.cuda()
            self.rollouts.cuda()

        # Optimizer selection.
        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'],
                alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(
                params=self.actor_critic.parameters(),
                lr=hparams['lr'],
                momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.hparams = hparams

    def act(self, current_state):
        """Run the policy on ``current_state``.

        Returns (value, action, action_log_probs, dist_entropy).
        """
        value, action, action_log_probs, dist_entropy = self.actor_critic.act(
            current_state)
        return value, action, action_log_probs, dist_entropy

    def insert_first_state(self, current_state):
        """Seed the rollout buffer with the initial observation."""
        self.rollouts.states[0].copy_(current_state)

    def insert_data(self, step, current_state, action, value, reward, masks,
                    action_log_probs, dist_entropy, done=None):
        """Record one transition into the rollout buffer.

        ``done`` is new and optional: it is forwarded to the policy's
        trajectory-action mask. Previously this method referenced an
        undefined name ``done`` (the parameter had been commented out of
        the signature), so enabling ``traj_action_mask`` raised NameError.
        """
        self.rollouts.insert(step, current_state, action, value, reward,
                             masks, action_log_probs, dist_entropy)
        if 'traj_action_mask' in self.hparams and self.hparams[
                'traj_action_mask']:
            # Bug fix: `done` used to be undefined here.
            self.actor_critic.reset_mask(done)

    def update(self):
        """One A2C gradient step from the stored rollout.

        Bootstraps returns from the value of the final state (observations
        are rescaled from [0, 255] to [0, 1] first), then applies the
        standard policy/value/entropy loss with gradient clipping.
        """
        # Bootstrap value; volatile=True is the legacy no-grad idiom.
        next_value = self.actor_critic(
            Variable(self.rollouts.states[-1] / 255., volatile=True))[0].data
        self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                      self.tau)

        # Reshape the per-step lists into [num_steps, num_processes, 1].
        values = torch.cat(self.rollouts.value_preds, 0).view(
            self.num_steps, self.num_processes, 1)
        action_log_probs = torch.cat(self.rollouts.action_log_probs).view(
            self.num_steps, self.num_processes, 1)
        dist_entropy = torch.cat(self.rollouts.dist_entropy).view(
            self.num_steps, self.num_processes, 1)

        # Clear the per-step buffers for the next rollout.
        self.rollouts.value_preds = []
        self.rollouts.action_log_probs = []
        self.rollouts.dist_entropy = []

        advantages = Variable(self.rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # Variable(advantages.data) blocks gradient flow into the critic.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        self.optimizer.zero_grad()
        cost = (action_loss + value_loss * self.value_loss_coef
                - dist_entropy.mean() * self.entropy_coef)
        cost.backward()
        nn.utils.clip_grad_norm(self.actor_critic.parameters(),
                                self.grad_clip)
        self.optimizer.step()
def main():
    """Train an agent with A2C / PPO / ACKTR, optionally with a recurrent policy.

    Relies on module-level `args` and `num_updates`.  Environments are
    vectorized; the rewards logged here are the clipped/normalized training
    rewards, not the true episode returns.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    # Subprocess workers only pay off with more than one environment.
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Low-dimensional (vector) observations get running normalization.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    # Stack num_stack frames along the channel dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    # Discrete actions are stored as a single index; continuous actions keep
    # their full dimensionality.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the frame stack left and append the newest observation.
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions (volatile=True: inference only, no graph kept).
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # Latch finished episodes' totals into final_rewards, then reset.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        # Bootstrap the return from the value of the last stored state.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            # Detach advantages so the policy gradient ignores the baseline.
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Bundle the observation-normalization stats (if any) with the model.
            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
            except IOError:
                pass
def main():
    """Train an agent with A2C / PPO / ACKTR (older, non-recurrent variant).

    Relies on module-level `args` and `num_updates`.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    obs_shape = envs.observation_space.shape
    # Stack num_stack frames along the channel dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # Discrete actions are stored as a single index; continuous actions keep
    # their full dimensionality.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        # Shift the frame stack left and append the newest frame.
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)
    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        # PPO keeps a frozen copy of the policy to evaluate the old log-probs.
        old_model = copy.deepcopy(actor_critic)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        # Bootstrap the return from the value of the last stored state.
        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.states[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            # Detach advantages so the policy gradient ignores the baseline.
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            old_model.load_state_dict(actor_critic.state_dict())
            if hasattr(actor_critic, 'obs_filter'):
                old_model.obs_filter = actor_critic.obs_filter

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(
                    SubsetRandomSampler(range(args.num_processes * args.num_steps)),
                    args.batch_size * args.num_processes,
                    drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()

                    states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                        Variable(states_batch), Variable(actions_batch))

                    _, old_action_log_probs, _ = old_model.evaluate_actions(
                        Variable(states_batch, volatile=True),
                        Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    optimizer.step()

        # Carry the last state over as the first state of the next rollout.
        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            print(
                "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, (j + 1) * args.num_processes * args.num_steps,
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        # BUGFIX: guard on args.vis — `viz`/`win` are only defined when
        # visualization is enabled, so this previously raised a NameError
        # whenever args.vis was False (matches the guard used by the other
        # main() above).
        if args.vis and j % args.vis_interval == 0:
            win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
        # NOTE(review): this chunk is a truncated fragment of yet another
        # training main() — its enclosing `def` header (above) and the tail of
        # the actor_critic.act(...) call (below) are missing from this part of
        # the file, so it is reproduced unchanged rather than restructured.
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = env.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # Per-process running and latched episode returns.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        print('cuda is available..3')
        current_obs = current_obs.cuda()
        rollouts.cuda()

    from visualize import plot_rewards
    episodic_reward_graph = []

    start = time.time()
    cnt = 0
    for j in range(num_updates):
        if j % 100 == 0:
            print('num update: ', j)
        for step in range(args.num_steps):
            # Sample actions without building an autograd graph.
            with torch.no_grad():
                value, action, action_log_prob, states, _, _ = actor_critic.act(
                    rollouts.observations[step],
                    rollouts.states[step],
def main():
    """Train a VizDoom agent with A2C / ACKTR / A2T / ResNet policies.

    Relies on module-level `args` and `num_updates`.  `envs` is declared
    global so other helpers in this file can reach it.
    """
    print("###############################################################")
    print("#################### VISDOOM LEARNER START ####################")
    print("###############################################################")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    global envs
    envs = VecEnv(
        [make_env(i, args.config_path) for i in range(args.num_processes)],
        logging=True,
        log_dir=args.log_dir)

    obs_shape = envs.observation_space_shape
    # Stack num_stack frames along the channel dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.algo == 'a2c' or args.algo == 'acktr':
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)
    elif args.algo == 'a2t':
        # A2T: transfer from previously trained source policies on disk.
        source_models = []
        files = glob.glob(os.path.join(args.source_models_path, '*.pt'))
        for file in files:
            print(file, 'loading model...')
            source_models.append(torch.load(file))
        actor_critic = A2TPolicy(obs_shape[0], envs.action_space_shape, source_models)
    elif args.algo == 'resnet':
        # args.num_stack = 3
        actor_critic = ResnetPolicy(obs_shape[0], envs.action_space_shape)

    # Actions are stored as a single index per step.
    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c' or args.algo == 'resnet':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'a2t':
        # Only optimize the parameters that A2T leaves trainable.
        a2t_params = [p for p in actor_critic.parameters() if p.requires_grad]
        optimizer = optim.RMSprop(a2t_params, args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # Shift the frame stack left and append the newest frame.
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # print ('Actions:', cpu_actions, 'Rewards:', reward)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data, reward, masks)

        # Bootstrap the return from the value of the last stored observation.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr', 'a2t', 'resnet']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            # Detach advantages so the policy gradient ignores the baseline.
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c' or args.algo == 'resnet':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
            elif args.algo == 'a2t':
                nn.utils.clip_grad_norm(a2t_params, args.max_grad_norm)

            optimizer.step()

        # Carry the last observation over as the start of the next rollout.
        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, 'VizDoom', args.algo)
            except IOError:
                pass

    envs.close()
    time.sleep(5)
def main():
    """Experimental A2C training main with the agent logic inlined.

    Relies on module-level `args` and `num_updates`.  Logging is reduced to a
    compact CSV-style line; model saving is disabled.
    """
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # Echo the hyperparameters being used for this run.
    print (args.cuda)
    print (args.num_steps)
    print (args.num_processes)
    print (args.lr)
    print (args.eps)
    print (args.alpha)
    print (args.use_gae)
    print (args.gamma)
    print (args.tau)
    print (args.value_loss_coef)
    print (args.entropy_coef)

    # Create environment
    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    obs_shape = envs.observation_space.shape
    # Stack num_stack frames along the channel dimension.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # Discrete actions are stored as a single index; continuous actions keep
    # their full dimensionality.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    # Collected hyperparameters (kept for the disabled a2c-agent path below).
    hparams = {'cuda': args.cuda,
               'num_steps': args.num_steps,
               'num_processes': args.num_processes,
               'obs_shape': obs_shape,
               'lr': args.lr,
               'eps': args.eps,
               'alpha': args.alpha,
               'use_gae': args.use_gae,
               'gamma': args.gamma,
               'tau': args.tau,
               'value_loss_coef': args.value_loss_coef,
               'entropy_coef': args.entropy_coef}

    # Create agent
    # agent = a2c(envs, hparams)

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'],
                              eps=hparams['eps'], alpha=hparams['alpha'])

    # Rollout storage holds [steps, processes, obs]; steps are used to compute
    # the expected reward.
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)

    # Init state
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        # Shift the frame stack left and append the newest frame.
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)
    rollouts.states[0].copy_(current_state)  # set the first state to current state

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    # Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Act
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state; state: [nProcesss, ndims, height, width]
            state, reward, done, info = envs.step(cpu_actions)

            # Record rewards
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.  The final
            # rewards are only used for printing; the mask clears the state of
            # each env that finished and resets its episode_reward.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            # Update state
            update_current_state(state)

            # Agent record step
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        # Optimize agent: use last state to make prediction of next value.
        next_value = actor_critic(
            Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        # This computes R = r + r + ... + V(t) for each step.
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        # NOTE(review): the action log-prob and value prediction could have
        # been cached at act() time instead of being recomputed here.
        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.states[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        # Detach advantages so the policy gradient ignores the baseline.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        optimizer.step()

        # The first state is now the last state of the previous rollout.
        rollouts.states[0].copy_(rollouts.states[-1])

        # (Model saving was disabled here.)

        # Print updates
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            # Re-print the column header every 30 log intervals.
            if j % (args.log_interval * 30) == 0:
                print("Upts, n_timesteps, min/med/mean/max, FPS, Time")
            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                  format(j, total_num_steps,
                         final_rewards.min(),
                         final_rewards.median(),
                         final_rewards.mean(),
                         final_rewards.max(),
                         int(total_num_steps / (end - start)),
                         end - start))
def main():
    """Policy-gradient training loop for the rafiki model-selection envs.

    Relies on module-level `args`, `num_updates`, `logger`, and `rafiki`.
    """
    print("#######")
    print("WARNING: All rewards are not clipped or normalized ")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    envs = rafiki.Envs(args.num_processes, args.num_models, args.policy,
                       args.beta, args.obs_size, args.max_latency, args.tau,
                       args.cycle_len)

    obs_shape = envs.observation_space.shape
    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # Discrete actions are stored as a single index; continuous actions keep
    # their full dimensionality.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        # No frame stacking here: just overwrite the newest observation slot.
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    info_set = Info(args)

    for j in range(num_updates):
        for step in range(args.num_steps):
            logger.info('------------%d----------------' % j)
            # Sample actions
            with torch.no_grad():
                action, probs, action_log_prob = actor_critic.act(
                    Variable(rollouts.observations[step]))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            logger.info(probs)
            obs, reward, info = envs.step(cpu_actions)
            info_set.insert(info)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, action_log_prob.data, reward)

        if args.algo in ['a2c', 'ppo']:
            action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            # NOTE(review): REINFORCE-style update — the raw per-step rewards
            # are used directly (no compute_returns call), so there is no
            # discounting, bootstrapping, or baseline here.
            R = rollouts.rewards.detach()

            optimizer.zero_grad()
            policy_loss = -R.reshape(args.num_steps, args.num_processes).mul(action_log_probs)
            policy_loss = sum(policy_loss) / len(policy_loss)
            policy_loss.backward()
            # nn.utils.clip_grad_norm_(actor_critic.parameters(), args.max_grad_norm)
            optimizer.step()

            # Log the post-update action distribution on the latest observation.
            with torch.no_grad():
                action, probs, action_log_prob = actor_critic.act(
                    Variable(rollouts.observations[-1]))
            logger.info(probs)

        rollouts.after_update()

        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, reward {}, policy loss {}".
                  format(j, total_num_steps, R.data, policy_loss.reshape(-1).data))
            logger.info(args)
            info_set.show()