import torch
import torch.nn as nn
import torch.optim as optim

# Policy and IDM are project-local model classes assumed to be importable from
# the surrounding repository.


def load_policy_model(args, environment, device, folder=None):
    parent_folder = './checkpoint/policy'
    path = folder if folder is not None else parent_folder
    model = Policy(environment['action'],
                   net=args.encoder,
                   pretrained=args.pretrained,
                   input=environment['input_size'])
    model.load_state_dict(torch.load(f'{path}/best_model.ckpt'))
    model = model.to(device)
    model.eval()
    return model
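# Hypothetical usage sketch for load_policy_model. The demo_-prefixed names and
# their values are illustrative assumptions, not taken from the source; the
# function only requires that <folder>/best_model.ckpt exists and that the
# environment dict provides 'action' and 'input_size'.
from argparse import Namespace

demo_args = Namespace(encoder='resnet', pretrained=True)
demo_env = {'action': 6, 'input_size': 84}
demo_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
demo_policy = load_policy_model(demo_args, demo_env, demo_device,
                                folder='./checkpoint/policy')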
# Model and action size
print('\nCreating Models')
action_dimension = environment['action']
inputs = environment['input_size'] * 2 if environment['input_size'] is not None else None
policy_model = Policy(action_dimension,
                      net=args.encoder,
                      pretrained=args.pretrained,
                      input=environment['input_size'])
idm_model = IDM(action_dimension,
                net=args.encoder,
                pretrained=args.pretrained,
                input=inputs)
policy_model.to(device)
idm_model.to(device)

# Optimizer and loss
print('\nCreating Optimizer and Loss')
print(f'IDM learning rate: {args.lr}\nPolicy learning rate: {args.policy_lr}')
idm_lr = args.lr
idm_criterion = nn.CrossEntropyLoss()
idm_optimizer = optim.Adam(idm_model.parameters(), lr=idm_lr)
policy_lr = args.policy_lr
policy_criterion = nn.CrossEntropyLoss()
policy_optimizer = optim.Adam(policy_model.parameters(), lr=policy_lr)

# Learning rate decay
print('Setting up Learning Rate Decay function and Schedulers')
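# The excerpt ends before the scheduler definitions, so this is only a minimal
# sketch of what the announced decay step could look like. It assumes
# torch.optim.lr_scheduler.StepLR; the step size and decay factor below are
# placeholders, not values from the source.
decay_steps, decay_rate = 30, 0.1
idm_scheduler = optim.lr_scheduler.StepLR(idm_optimizer,
                                          step_size=decay_steps,
                                          gamma=decay_rate)
policy_scheduler = optim.lr_scheduler.StepLR(policy_optimizer,
                                             step_size=decay_steps,
                                             gamma=decay_rate)
# Typically idm_scheduler.step() and policy_scheduler.step() would then be
# called once per training epoch.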
# Standard-library and third-party imports used by the Agent class below.
# Project-local modules (Policy, A2C_ACKTR, PPO, gail, utils, make_vec_envs,
# RolloutStorage, GridGame, ...) are assumed to be provided by the surrounding
# repository; the SummaryWriter import assumes the torch.utils.tensorboard writer.
import csv
import os
import pathlib
import time
from collections import deque

import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter


class Agent:
    """A2C/PPO/ACKTR agent with optional reconstruction auxiliary loss and GAIL."""

    #algorithm
    algo = 'a2c'  #a2c, ppo, acktr
    use_gae = False  #generalized advantage estimation
    gae_lambda = 0.95
    entropy_coef = 0.01  #weight maximizing action entropy loss
    value_loss_coef = 0.1  #.5 #weight value function loss
    max_grad_norm = 0.5  #max norm of gradients

    #ppo hyperparameters
    clip_param = 0.2  #ppo clip
    num_steps = 5  #steps before an update
    ppo_epoch = 4
    num_mini_batch = 32

    seed = 1
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    cuda_deterministic = False
    no_cuda = False
    use_proper_time_limits = False
    use_linear_lr_decay = False

    #experiment setup
    log_interval = 1  #log per n updates
    log_dir = os.path.expanduser('/tmp/gym')
    eval_log_dir = log_dir + "_eval"
    save_interval = 100
    eval_interval = None
    recurrent_policy = True

    #optimization, RMSprop and TD
    eps = 1e-5  #epsilon
    alpha = 0.99
    gamma = 0.99  #discount factor

    #imitation learning with gail
    gail_batch_size = 128
    gail_epoch = 5

    def __init__(self, env_def, processes=1, dir='.', version=0, lr=2e-4,
                 architecture='base', dropout=0, reconstruct=None, r_weight=.05):
        self.env_def = env_def
        self.num_processes = processes  #cpu processes
        self.lr = lr
        self.version = version
        self.save_dir = dir + '/trained_models/'

        #Setup
        pathlib.Path(self.save_dir).mkdir(parents=True, exist_ok=True)
        if (self.num_mini_batch > processes):
            self.num_mini_batch = processes
        self.writer = SummaryWriter()
        self.total_steps = 0

        #State
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
        if not self.no_cuda and torch.cuda.is_available() and self.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
        utils.cleanup_log_dir(self.log_dir)
        utils.cleanup_log_dir(self.eval_log_dir)
        torch.set_num_threads(1)

        self.level_path = None
        self.envs = None
        self.num_envs = -1
        self.set_envs(num_envs=1)

        if (version > 0):
            self.actor_critic = self.load(self.save_dir, version)
        else:
            self.actor_critic = Policy(
                self.envs.observation_space.shape,
                self.envs.action_space,
                base_kwargs={
                    'recurrent': self.recurrent_policy,
                    'shapes': list(reversed(self.env_def.model_shape)),
                    'dropout': dropout
                },
                model=architecture)
        self.actor_critic.to(self.device)

        #Reconstruction
        self.reconstruct = reconstruct is not None
        if (self.reconstruct):
            #layers = self.envs.observation_space.shape[0]
            #shapes = list(self.env_def.model_shape)
            #self.r_model = Decoder(layers, shapes=shapes).to(self.device)
            reconstruct.to(self.device)
            self.r_model = lambda x: reconstruct.adapter(reconstruct(x))
            #self.r_model = lambda x: reconstruct.adapter(reconstruct(x)).clamp(min=1e-6).log()
            #self.r_loss = nn.L1Loss() #nn.NLLLoss() #nn.MSELoss()
            # Weighted cross-entropy over the tile-channel dimension.
            self.r_loss = lambda pred, true: -r_weight * (true * torch.log(
                pred.clamp(min=1e-7, max=1 - 1e-7))).sum(dim=1).mean()
            self.r_optimizer = reconstruct.optimizer  #optim.Adam(reconstruct.parameters(), lr=.0001)

        if self.algo == 'a2c':
            self.agent = A2C_ACKTR(self.actor_critic,
                                   self.value_loss_coef,
                                   self.entropy_coef,
                                   lr=self.lr,
                                   eps=self.eps,
                                   alpha=self.alpha,
                                   max_grad_norm=self.max_grad_norm)
        elif self.algo == 'ppo':
            self.agent = PPO(self.actor_critic,
                             self.clip_param,
                             self.ppo_epoch,
                             self.num_mini_batch,
                             self.value_loss_coef,
                             self.entropy_coef,
                             lr=self.lr,
                             eps=self.eps,
                             max_grad_norm=self.max_grad_norm,
                             use_clipped_value_loss=False)
        elif self.algo == 'acktr':
            self.agent = A2C_ACKTR(self.actor_critic,
                                   self.value_loss_coef,
                                   self.entropy_coef,
                                   acktr=True)

        self.gail = False
        self.gail_experts_dir = './gail_experts'
        if self.gail:
            assert len(self.envs.observation_space.shape) == 1
            self.gail_discr = gail.Discriminator(
                self.envs.observation_space.shape[0] +
                self.envs.action_space.shape[0], 100, self.device)
            file_name = os.path.join(
                self.gail_experts_dir,
                "trajs_{}.pt".format(self.env_def.name.split('-')[0].lower()))
            self.gail_train_loader = torch.utils.data.DataLoader(
                gail.ExpertDataset(file_name,
                                   num_trajectories=4,
                                   subsample_frequency=20),
                batch_size=self.gail_batch_size,
                shuffle=True,
                drop_last=True)

        self.rollouts = RolloutStorage(
            self.num_steps, self.num_processes,
            self.envs.observation_space.shape, self.envs.action_space,
            self.actor_critic.recurrent_hidden_state_size)

    def load(self, path, version):
        policy = torch.load(os.path.join(path, "agent_{}.tar".format(version)))
        #utils.get_vec_normalize(self.envs).ob_rms = ob_rms
        self.actor_critic = policy
        return policy

    def save(self, path, version):
        #ob_rms = getattr(utils.get_vec_normalize(self.envs), 'ob_rms', None)
        torch.save(self.actor_critic,
                   os.path.join(path, "agent_{}.tar".format(version)))

    def report(self, version, total_num_steps, FPS, rewards):
        file_path = os.path.join(self.save_dir, "actor_critic_results.csv")
        add_header = not os.path.exists(file_path)
        if (len(rewards) > 0):
            mean, median, min, max = (np.mean(rewards), np.median(rewards),
                                      np.min(rewards), np.max(rewards))
        else:
            mean, median, min, max = np.nan, np.nan, np.nan, np.nan
        with open(file_path, 'a+') as results:
            writer = csv.writer(results)
            if (add_header):
                header = [
                    'update', 'total_steps', 'FPS', 'mean_reward',
                    'median_reward', 'min_reward', 'max_reward'
                ]
                writer.writerow(header)
            writer.writerow(
                (version, total_num_steps, FPS, mean, median, min, max))

    def set_envs(self, level_path=None, num_envs=None):
        num_envs = num_envs if num_envs else self.num_processes
        if (level_path != self.level_path or self.envs is None
                or num_envs != self.num_envs):
            if (self.envs is not None):
                self.envs.close()
            self.level_path = level_path
            self.envs = make_vec_envs(self.env_def, level_path, self.seed,
                                      num_envs, self.gamma, self.log_dir,
                                      self.device, True)
            self.num_envs = num_envs

    def update_reconstruction(self, rollouts):
        s, p, l, w, h = list(rollouts.obs.size())
        x = rollouts.obs.view(-1, l, w, h)
        hidden = rollouts.recurrent_hidden_states.view(s * p, -1)
        mask = rollouts.masks.view(s * p, -1)
        #y = x.argmax(1)
        y = x
        self.r_optimizer.zero_grad()
        self.agent.optimizer.zero_grad()
        _, predictions, _ = self.actor_critic.base(x, hidden, mask)
        reconstructions = self.r_model(predictions)
        loss = self.r_loss(reconstructions, y)
        loss.backward()
        self.r_optimizer.step()
        self.agent.optimizer.step()
        return loss

    def update_reconstruct_next(self, rollouts):
        #Mask frames that are not relevant
        mask = rollouts.masks.unfold(0, 2, 1).min(-1)[0]
        mask = mask.view(-1)
        mask = torch.nonzero(mask).squeeze()

        #Image Pairs
        l, w, h = list(rollouts.obs.size())[2:]
        img_pairs = rollouts.obs.unfold(0, 2, 1)  #128, 8, 14, 12, 16, 2
        img_pairs = img_pairs.view(-1, l, w, h, 2)
        img_pairs = img_pairs[mask]
        x = img_pairs[:, :, :, :, 0]
        y = img_pairs[:, :, :, :, 1]

        #Input hidden states
        hidden_size = rollouts.recurrent_hidden_states.size(2)
        hidden = rollouts.recurrent_hidden_states[:-1].view(-1, hidden_size)  #129, 8, 512
        hidden = hidden[mask]

        #Update model
        self.r_optimizer.zero_grad()
        mask = torch.ones_like(mask).float().unsqueeze(1)
        _, predictions, _ = self.actor_critic.base(x, hidden, mask)
        reconstructions = self.r_model(predictions)
        loss = self.r_loss(reconstructions, y)  #model -> x or x and a? x already contains action features
        loss.backward()
        self.r_optimizer.step()
        print(loss.item())  #add loss weight
        return loss

    def play(self, env, runs=1, visual=False):
        env = GridGame()
        reward_mean = 0
        for i in range(runs):
            score = self.play_game(env, visual)
            reward_mean += score / runs
        return reward_mean

    def play_game(self, env, visual=False):
        eval_envs = make_vec_envs(self.env_def.name,
                                  self.seed + self.num_processes,
                                  self.num_processes, None, self.eval_log_dir,
                                  self.device, True)
        vec_norm = utils.get_vec_normalize(eval_envs)
        if vec_norm is not None:
            vec_norm.eval()
            #vec_norm.ob_rms = ob_rms

        eval_episode_rewards = []
        obs = eval_envs.reset()
        eval_recurrent_hidden_states = torch.zeros(
            self.num_processes,
            self.actor_critic.recurrent_hidden_state_size).to(self.device)
        eval_masks = torch.zeros(self.num_processes, 1).to(self.device)

        while len(eval_episode_rewards) < 10:
            with torch.no_grad():
                _, action, _, eval_recurrent_hidden_states = self.actor_critic.act(
                    obs,
                    eval_recurrent_hidden_states,
                    eval_masks,
                    deterministic=True)

            # Observe reward and next obs
            obs, _, done, infos = eval_envs.step(action)
            eval_masks = torch.tensor(
                [[0.0] if done_ else [1.0] for done_ in done],
                dtype=torch.float32).to(self.device)
            for info in infos:
                if 'episode' in info.keys():
                    eval_episode_rewards.append(info['episode']['r'])
            if all(done):
                print("Done!")
        eval_envs.close()
        return np.mean(eval_episode_rewards)

    def train_agent(self, num_env_steps):
        env_name = self.env_def.name
        obs = self.envs.reset()
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        n = 30
        episode_rewards = deque(maxlen=n)
        episode_values = deque(maxlen=n)
        episode_end_values = deque(maxlen=n)
        episode_end_probs = deque(maxlen=n)
        episode_lengths = deque(maxlen=n)
        compile_est = deque(maxlen=n)
        first_steps = [True for i in range(self.num_processes)]

        start = time.time()
        num_updates = int(num_env_steps) // self.num_steps // self.num_processes
        for j in range(num_updates):
            if self.use_linear_lr_decay:
                # decrease learning rate linearly
                utils.update_linear_schedule(
                    self.agent.optimizer, j, num_updates,
                    self.agent.optimizer.lr if self.algo == "acktr" else self.lr)

            for step in range(self.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, Q, action, action_prob, action_log_prob, recurrent_hidden_states = \
                        self.actor_critic.act(
                            self.rollouts.obs[step],
                            self.rollouts.recurrent_hidden_states[step],
                            self.rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = self.envs.step(action)

                for i, first in enumerate(first_steps):
                    if first:
                        episode_values.append(value[i].item())
                    elif (done[i]):
                        episode_end_values.append(Q[i].item())
                        episode_end_probs.append(action_log_prob[i].item())
                first_steps = done

                for worker, info in enumerate(infos):
                    if 'episode' in info.keys():
                        r = info['episode']['r']
                        l = info['episode']['l']
                        episode_rewards.append(r)
                        episode_lengths.append(l)
                        if (r < -1):
                            compile_est.append(value[worker].item())

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                self.rollouts.insert(obs, recurrent_hidden_states, action,
                                     action_prob, action_log_prob, value, Q,
                                     reward, masks, bad_masks)

            with torch.no_grad():
                next_value = self.actor_critic.get_value(
                    self.rollouts.obs[-1],
                    self.rollouts.recurrent_hidden_states[-1],
                    self.rollouts.masks[-1]).detach()

            if self.gail:
                if j >= 10:
                    self.envs.venv.eval()
                gail_epoch = self.gail_epoch
                if j < 10:
                    gail_epoch = 100  # Warm up
                for _ in range(gail_epoch):
                    self.gail_discr.update(
                        self.gail_train_loader, self.rollouts,
                        utils.get_vec_normalize(self.envs)._obfilt)
                for step in range(self.num_steps):
                    self.rollouts.rewards[step] = self.gail_discr.predict_reward(
                        self.rollouts.obs[step], self.rollouts.actions[step],
                        self.gamma, self.rollouts.masks[step])

            self.rollouts.compute_returns(next_value, self.use_gae, self.gamma,
                                          self.gae_lambda,
                                          self.use_proper_time_limits)

            value_loss, action_loss, dist_entropy = self.agent.update(self.rollouts)

            if (self.reconstruct):
                recon_loss = self.update_reconstruction(self.rollouts)
                self.writer.add_scalar('generator/Reconstruction Loss',
                                       recon_loss.item(), self.total_steps)

            self.rollouts.after_update()

            #Tensorboard Reporting
            self.total_steps += self.num_processes * self.num_steps
            self.writer.add_scalar('value/Mean Reward',
                                   np.mean(episode_rewards), self.total_steps)
            self.writer.add_scalar('value/Episode Mean Length',
                                   np.mean(episode_lengths), self.total_steps)
            self.writer.add_scalar('policy/Action Loss', action_loss,
                                   self.total_steps)
            self.writer.add_scalar('value/Value Loss', value_loss,
                                   self.total_steps)
            self.writer.add_scalar('policy/Distribution Entropy', dist_entropy,
                                   self.total_steps)
            self.writer.add_scalar('value/Win Probability',
                                   np.mean(np.array(episode_rewards) > 0),
                                   self.total_steps)
            self.writer.add_scalar('value/Starting Value',
                                   np.mean(episode_values), self.total_steps)
            #self.writer.add_scalar('value/Ending Value', np.mean(episode_end_values), self.total_steps)
            self.writer.add_scalar('value/Log Probs',
                                   np.mean(episode_end_probs), self.total_steps)
            if (len(compile_est) > 0):
                self.writer.add_scalar('value/Compile Estimate',
                                       np.mean(compile_est), self.total_steps)

            # save for every interval-th episode or for the last epoch
            total_num_steps = (j + 1) * self.num_processes * self.num_steps
            end = time.time()
            if (j % self.save_interval == 0
                    or j == num_updates - 1) and self.save_dir != "":
                self.version += 1
                #self.save(self.version)
                self.report(self.version, total_num_steps,
                            int(total_num_steps / (end - start)),
                            episode_rewards)

            if j % self.log_interval == 0 and len(episode_rewards) > 1:
                print(
                    "Updates {}, num timesteps {}, FPS {} \n"
                    " Last {} training episodes: mean/median reward {:.1f}/{:.1f},"
                    " min/max reward {:.1f}/{:.1f}\n".format(
                        j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards)))
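# Self-contained sketch of the reconstruction loss defined in Agent.__init__ above:
# a cross-entropy between one-hot tile targets and predicted tile probabilities,
# summed over the channel dimension and scaled by r_weight. The tensor shapes
# below are illustrative assumptions, not values from the source.
import torch

r_weight = 0.05
r_loss = lambda pred, true: -r_weight * (
    true * torch.log(pred.clamp(min=1e-7, max=1 - 1e-7))).sum(dim=1).mean()

true = torch.zeros(2, 3, 2, 2)            # batch of 2 levels, 3 tile types, 2x2 grid
true[:, 0] = 1.0                          # every cell is tile type 0 (one-hot)
pred = torch.full((2, 3, 2, 2), 1.0 / 3)  # uniform prediction over tile types
print(r_loss(pred, true))                 # ≈ 0.05 * ln(3) ≈ 0.0549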
import os
import shutil

import deepmind_lab

# Discrete (an action-space type), Policy, Learner, Actor, RolloutStorage, and the
# CONFIG/LEVELS/q_batch/q_trace/actors/args objects are assumed to be defined by
# the surrounding script.

# Note: the original try-clause is not shown in this excerpt; creating the model
# directory with os.makedirs(args.model_dir) is an assumed reconstruction.
try:
    os.makedirs(args.model_dir)
except OSError:
    shutil.rmtree(args.model_dir)
    os.makedirs(args.model_dir)

# Probe the environment once to get the observation shape (HWC -> CHW).
env = deepmind_lab.Lab('tests/empty_room_test', ['RGB_INTERLEAVED'],
                       config=CONFIG)
env.reset()
obs_shape = env.observations()['RGB_INTERLEAVED'].shape
obs_shape = (obs_shape[2], obs_shape[0], obs_shape[1])
print('Observation Space: ', obs_shape)
action_space = Discrete(9)
env.close()

# Shared learner.
actor_critic = Policy(obs_shape, action_space)
actor_critic.to(args.device)
learner = Learner(args, q_batch, actor_critic)

# One actor, with its own policy copy and rollout storage, per level.
for i in range(len(LEVELS)):
    print('Build Actor {:d}'.format(i))
    rollouts = RolloutStorage(args.num_steps, 1, obs_shape, action_space,
                              actor_critic.recurrent_hidden_state_size)
    actor_critic = Policy(obs_shape, action_space)
    actor_critic.to(args.device)
    actor_name = 'actor_' + str(i)
    actor = Actor(args, q_trace, learner, actor_critic, rollouts, LEVELS[i],
                  actor_name)
    actors.append(actor)
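# Self-contained illustration of the shape handling above: DeepMind Lab's
# RGB_INTERLEAVED observations arrive as (height, width, channels), while the
# convolutional Policy is constructed with a channels-first shape. The 72x96
# resolution is an illustrative assumption, not a value from the source.
import numpy as np

hwc_frame = np.zeros((72, 96, 3), dtype=np.uint8)   # (H, W, C) as returned by the env
chw_shape = (hwc_frame.shape[2], hwc_frame.shape[0], hwc_frame.shape[1])
print(chw_shape)                                     # (3, 72, 96)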