class AgentA2C: def __init__(self, env, args): # Hyperparameters self.lr = 7e-4 self.gamma = 0.9 self.hidden_size = 512 self.update_freq = 5 self.n_processes = args.remotes self.seed = 42 self.max_steps = 1e9 self.grad_norm = 0.5 self.entropy_weight = 0.05 self.eps = np.finfo(np.float32).eps.item() ####################### NOTE: You need to implement self.recurrent = True # <- ActorCritic._forward_rnn() ####################### Please check a2c/actor_critic.py self.display_freq = 1000 self.save_freq = 1 self.save_dir = './ckpts/' torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.envs = env if self.envs == None: self.envs = MultiEnv() self.envs.configure(remotes=self.n_processes) self.device = torch.device("cuda:0" if use_cuda else "cpu") observation = self.envs.reset() self.obs_shape = np.transpose(observation[0], (2, 0, 1)).shape self.act_shape = args.action_space self.rollouts = RolloutStorage(self.update_freq, self.n_processes, self.obs_shape, self.act_shape, self.hidden_size) self.model = ActorCritic(self.obs_shape, self.act_shape, self.hidden_size, self.recurrent).to(self.device) self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5) if args.test_a2c: self.load_model('./ckpts/model_1239.pt') self.hidden = None self.init_game_setting() def _update(self): # R_t = reward_t + gamma * R_{t+1} with torch.no_grad(): next_value, _, _ = self.model(self.rollouts.obs[-1], self.rollouts.hiddens[-1], self.rollouts.masks[-1]) self.rollouts.returns[-1] = next_value.detach() for step in reversed(range(self.rollouts.rewards.size(0))): self.rollouts.returns[step] = self.rollouts.rewards[step] + \ (self.rollouts.returns[step + 1] * \ self.gamma * \ self.rollouts.masks[step + 1]) # Compute actor critic loss (value_loss, action_loss) # OPTIONAL: You can also maxmize entropy to encourage exploration # loss = value_loss + action_loss (- entropy_weight * entropy) values, action_probs, _ = self.model( self.rollouts.obs[:-1].view(-1, self.obs_shape[0], self.obs_shape[1], self.obs_shape[2]), self.rollouts.hiddens[0], self.rollouts.masks[:-1].view(-1, 1)) distribution = torch.distributions.Categorical(action_probs) log_probs = distribution.log_prob( self.rollouts.actions.flatten()).flatten() returns = self.rollouts.returns[:-1].flatten() values = values.flatten() value_loss = F.smooth_l1_loss(returns, values) advantages = returns - values action_loss = -(log_probs * advantages.detach()).mean() entropy = distribution.entropy().mean() loss = value_loss + action_loss + (-self.entropy_weight * entropy) # Update self.optimizer.zero_grad() loss.backward() clip_grad_norm_(self.model.parameters(), self.grad_norm) self.optimizer.step() # Clear rollouts after update (RolloutStorage.reset()) self.rollouts.reset() return loss.item() def _step(self, obs, hiddens, masks): with torch.no_grad(): # Sample actions from the output distributions # HINT: you can use torch.distributions.Categorical values, action_probs, hiddens = self.model(obs, hiddens, masks) actions = torch.distributions.Categorical(action_probs).sample() transformed_action = multiActionTransform(actions.cpu().numpy()) obs, rewards, dones, infos = self.envs.step(transformed_action) # Store transitions (obs, hiddens, actions, values, rewards, masks) # You need to convert arrays to tensors first # HINT: masks = (1 - dones) obs = torch.from_numpy(obs).to(self.device).permute(0, 3, 1, 2) masks = torch.from_numpy(1 - dones).to(self.device) rewards = torch.from_numpy(rewards).to(self.device) penalty_rewards = (1 - masks) * -10 
rewards = rewards + penalty_rewards.double() self.rollouts.insert(obs, hiddens, actions.unsqueeze(1), values, rewards.unsqueeze(1), masks.unsqueeze(1)) def train(self): print( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) print( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) print( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~START TRAINING~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) print( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) print( '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' ) running_reward = deque(maxlen=self.update_freq * 2) episode_rewards = torch.zeros(self.n_processes, 1).to(self.device) total_steps = 0 # Store first observation obs = torch.from_numpy(self.envs.reset()).to(self.device).permute( 0, 3, 1, 2) self.rollouts.obs[0].copy_(obs) self.rollouts.to(self.device) max_reward = 0.0 counter = 0 continual_crash = 0 while True: try: # Update once every n-steps for step in range(self.update_freq): self._step(self.rollouts.obs[step], self.rollouts.hiddens[step], self.rollouts.masks[step]) # Calculate episode rewards episode_rewards += self.rollouts.rewards[step] for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]): if m == 0: running_reward.append(r.item()) episode_rewards *= self.rollouts.masks[step + 1] loss = self._update() total_steps += self.update_freq * self.n_processes # Log & save model if len(running_reward) == 0: avg_reward = 0 else: avg_reward = sum(running_reward) / len(running_reward) if total_steps % self.display_freq == 0: print( 'Steps: %d/%d | Avg reward: %f | Max reward: %f' % (total_steps, self.max_steps, avg_reward, max_reward)) with open('a2c_log.txt', 'a') as fout: fout.write(str(avg_reward) + '\n') if total_steps % self.save_freq == 0: self.save_model('model_{}.pt'.format(counter), avg_reward) counter += 1 if avg_reward > max_reward: max_reward = avg_reward self.save_model('model_max_{}.pt'.format(counter), max_reward) counter += 1 if total_steps >= self.max_steps: break continual_crash = 0 except Exception as e: continual_crash += 1 if continual_crash >= 10: print( '============================================================================================================================================' ) print(e) print("Crashed 10 times -- stopping u suck") print( '============================================================================================================================================' ) raise e else: print( '#############################################################################################################################################' ) print(e) print("Env crash, making new env") print( '#############################################################################################################################################' ) time.sleep(60) self.envs = MultiEnv(resize=(250, 150)) self.envs.configure(remotes=self.n_processes) time.sleep(60) def save_model(self, filename, max_reward): if not os.path.isdir(self.save_dir): os.mkdir(self.save_dir) print('model saved: ' + filename + ' (' + str(max_reward) + ')') torch.save(self.model, 
                   os.path.join(self.save_dir, filename))

    def load_model(self, path):
        if use_cuda:
            self.model = torch.load(path)
        else:
            self.model = torch.load(path, map_location='cpu')

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        with torch.no_grad():
            observation = torch.from_numpy(observation).float().permute(
                0, 3, 1, 2).to(self.device)
            _, action_prob, hidden = self.model(
                observation, self.hidden, torch.ones(1, 1).to(self.device))
            self.hidden = hidden
            action = torch.distributions.Categorical(action_prob).sample()
        return action.cpu().numpy()
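# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original class above): the masked
# n-step return that AgentA2C._update() computes over its rollout,
# R_t = r_t + gamma * R_{t+1} * mask_{t+1}, bootstrapped from a critic
# estimate of the state after the last stored step. Shapes follow the
# rollout layout used above: (n_steps, n_processes, 1); function name and
# toy numbers are made up for illustration.
import torch

def compute_n_step_returns(rewards, masks, next_value, gamma=0.9):
    # rewards: (T, N, 1), masks: (T + 1, N, 1), next_value: (N, 1)
    T = rewards.size(0)
    returns = torch.zeros(T + 1, *rewards.shape[1:])
    returns[-1] = next_value
    for t in reversed(range(T)):
        returns[t] = rewards[t] + gamma * returns[t + 1] * masks[t + 1]
    return returns[:-1]

# Example with 2 steps, 1 process: an episode ending after step 0
# (mask_1 = 0) keeps the bootstrap from leaking across the boundary,
# so R_0 = 1.0 while R_1 = 2.0 + 0.9 * 5.0 = 6.5.
rewards = torch.tensor([[[1.0]], [[2.0]]])
masks = torch.tensor([[[1.0]], [[0.0]], [[1.0]]])
print(compute_n_step_returns(rewards, masks, torch.tensor([[5.0]])))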
class AgentMario: def __init__(self, env, args): # Hyperparameters self.lr = 7e-4 self.gamma = 0.9 self.hidden_size = 512 self.update_freq = 5 self.n_processes = 16 self.seed = 7122 self.max_steps = 1e7 self.grad_norm = 0.5 self.entropy_weight = 0.05 ####################### NOTE: You need to implement self.recurrent = True # <- ActorCritic._forward_rnn() ####################### Please check a2c/actor_critic.py self.display_freq = 4000 self.save_freq = 100000 self.save_dir = './checkpoints/' torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.envs = env if self.envs == None: self.envs = make_vec_envs('SuperMarioBros-v0', self.seed, self.n_processes) self.device = torch.device("cuda:0" if use_cuda else "cpu") self.obs_shape = self.envs.observation_space.shape self.act_shape = self.envs.action_space.n self.rollouts = RolloutStorage(self.update_freq, self.n_processes, self.obs_shape, self.act_shape, self.hidden_size) self.model = ActorCritic(self.obs_shape, self.act_shape, self.hidden_size, self.recurrent).to(self.device) self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5) self.hidden = None self.init_game_setting() #### def calc_actual_state_values(self, rewards, dones): R = [] rewards.reverse() # If we happen to end the set on a terminal state, set next return to zero if dones[-1] == True: next_return = 0 # If not terminal state, bootstrap v(s) using our critic # TODO: don't need to estimate again, just take from last value of v(s) estimates else: s = torch.from_numpy(self.rollouts.obs[-1]).float().unsqueeze( 0) #states next_return = self.model.get_state_value(Variable(s)).data[0][0] # Backup from last state to calculate "true" returns for each state in the set R.append(next_return) dones.reverse() for r in range(1, len(rewards)): if not dones[r]: this_return = rewards[r] + next_return * self.gamma else: this_return = 0 R.append(this_return) next_return = this_return R.reverse() state_values_true = Variable(torch.FloatTensor(R)).unsqueeze(1) return state_values_true #### def _update(self): # TODO: Compute returns # R_t = reward_t + gamma * R_{t+1} state_values_true = self.calc_actual_state_values( self.rollouts.rewards, self.rollouts.dones ) #(rewards, dones)#from storage: obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy()); obs =state? 
# TODO: # Compute actor critic loss (value_loss, action_loss) # OPTIONAL: You can also maxmize entropy to encourage exploration # loss = value_loss + action_loss (- entropy_weight * entropy) s = Variable(torch.FloatTensor(self.rollouts.obs)) action_probs, state_values_est, hiddens = self.model( s) #action_probs, state_values_est action_log_probs = action_probs.log() a = Variable(torch.LongTensor(self.rollouts.actions).view(-1, 1)) chosen_action_log_probs = action_log_probs.gather(1, a) # This is also the TD error advantages = state_values_true - state_values_est entropy = (action_probs * action_log_probs).sum(1).mean() action_loss = (chosen_action_log_probs * advantages).mean() value_loss = advantages.pow(2).mean() loss = value_loss + action_loss - 0.0001 * entropy #entropy_weight = 0.0001 # Update self.optimizer.zero_grad() loss.backward() clip_grad_norm_(self.model.parameters(), self.grad_norm) self.optimizer.step() # TODO: # Clear rollouts after update (RolloutStorage.reset()) RolloutStorage.reset() ## return loss.item() def _step(self, obs, hiddens, masks): with torch.no_grad(): pass # TODO: # Sample actions from the output distributions # HINT: you can use torch.distributions.Categorical actions, values, hiddens = self.make_action(obs, hiddens, masks) #print("##################################*****************",actions.cpu().numpy(),type(actions.cpu().numpy()),actions.cpu().numpy().shape) #print("##################################*****************",actions.max(1)[0].item()) obs, rewards, dones, infos = self.envs.step( actions.max(1)[0]) #.numpy().max(0)[0].item()) # TODO: # Store transitions (obs, hiddens, actions, values, rewards, masks) # You need to convert arrays to tensors first # HINT: masks = (1 - dones) self.rollouts.to(device) masks = 1 - dones self.rollouts.insert(obs, hiddens, actions, values, rewards, masks) self.rollouts.to(device) def train(self): print('Start training') running_reward = deque(maxlen=10) episode_rewards = torch.zeros(self.n_processes, 1).to(self.device) total_steps = 0 # Store first observation obs = torch.from_numpy(self.envs.reset()).to(self.device) self.rollouts.obs[0].copy_(obs) #torch.Size([16, 4, 84, 84]) self.rollouts.to(self.device) while True: # Update once every n-steps for step in range(self.update_freq): print("# ******************step***********************", step) #print("self.rollouts.actions[step]", self.rollouts.actions[step]) # print("self.rollouts.obs[step]", self.rollouts.hiddens[step]) # print("self.rollouts.obs[step]", self.rollouts.masks[step]) self._step(self.rollouts.obs[step], self.rollouts.hiddens[step], self.rollouts.masks[step]) # Calculate episode rewards episode_rewards += self.rollouts.rewards[step] for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]): if m == 0: running_reward.append(r.item()) episode_rewards *= self.rollouts.masks[step + 1] loss = self._update() total_steps += self.update_freq * self.n_processes # Log & save model if len(running_reward) == 0: avg_reward = 0 else: avg_reward = sum(running_reward) / len(running_reward) if total_steps % self.display_freq == 0: print('Steps: %d/%d | Avg reward: %f' % (total_steps, self.max_steps, avg_reward)) if total_steps % self.save_freq == 0: self.save_model('model.pt') if total_steps >= self.max_steps: break def save_model(self, filename): torch.save(self.model, os.path.join(self.save_dir, filename)) def load_model(self, path): self.model = torch.load(path) def init_game_setting(self): if self.recurrent: self.hidden = torch.zeros(1, 
                                  self.hidden_size).to(self.device)

    def make_action(self, observation, hiddens, masks, test=False):
        # TODO: Use your model to choose an action
        # if test == True:
        #     observation = torch.from_numpy(observation).permute(2, 0, 1).unsqueeze(0).to(device)
        #     print("!!!!!!!!!!!!!!", observation.shape)
        # state = torch.from_numpy(observation).float().unsqueeze(0)
        values, action_probs, hiddens = self.model(observation, hiddens, masks)
        # m = Categorical(action_probs)
        # action = m.sample()
        # self.saved_actions.append(m.log_prob(action))
        return action_probs, values, hiddens
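# --------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): the bootstrapped
# return that calc_actual_state_values() builds from python lists of
# rewards and done flags. The original loop appears to skip the most
# recent reward; the version below keeps every step. `bootstrap_value`
# is assumed to be V(s_T) from the critic (0 if the last step was
# terminal), and rollouts are cleared per instance (self.rollouts.reset())
# as in the other agents in this file, not via RolloutStorage.reset().
def bootstrapped_returns(rewards, dones, bootstrap_value, gamma=0.9):
    returns = []
    next_return = 0.0 if dones[-1] else bootstrap_value
    for r, d in zip(reversed(rewards), reversed(dones)):
        next_return = r + gamma * next_return * (1.0 - float(d))
        returns.append(next_return)
    returns.reverse()
    return returns

# Example: a 3-step fragment whose second step ends an episode.
print(bootstrapped_returns([1.0, 1.0, 1.0], [False, True, False], 4.0))
# -> [1.9, 1.0, 4.6]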
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' envs = [ make_env(args.env_name, seed=args.seed, digit=args.digit, rank=i, log_dir=args.log_dir, use_patience=args.use_patience) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) print(obs_shape) actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) episode_lengths = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks episode_lengths += torch.ones(episode_lengths.size()) episode_lengths *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic.get_value( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True)).data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": 
save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, Episode lengths {:.2f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0], episode_lengths.mean())) if j > 0 and j % args.vis_interval == 0: pass
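# --------------------------------------------------------------------------
# Illustrative sketch (not part of main() above): the clipped surrogate
# objective used in the PPO branch, isolated on plain tensors. surr1 and
# surr2 mirror the variables of the same name above, and clip_param plays
# the role of args.clip_param; the function name is made up.
import torch

def ppo_clipped_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -torch.min(surr1, surr2).mean()  # pessimistic surrogate (L^CLIP)

# Example: a ratio far above 1 + clip_param gets clipped, so a sample with
# positive advantage contributes at most (1 + clip_param) * advantage.
new_lp = torch.tensor([0.0, -0.1])
old_lp = torch.tensor([-1.0, -0.1])
adv = torch.tensor([1.0, -0.5])
print(ppo_clipped_loss(new_lp, old_lp, adv))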
def run(self):
    # (16, 4, 84, 84)
    current_obs = np.zeros([NUM_PROCESSES, *self.obs_shape])
    episode_rewards = np.zeros([NUM_PROCESSES, 1])
    final_rewards = np.zeros([NUM_PROCESSES, 1])

    # torch.Size([16, 1, 84, 84])
    obs = self.env.reset()
    # store the newest obs at the front of the frame stack
    current_obs[:, :1] = obs

    storage = RolloutStorage(NUM_ADVANCED_STEP, NUM_PROCESSES,
                             self.obs_shape, current_obs)

    for j in tqdm(range(NUM_UPDATES)):
        for step in range(NUM_ADVANCED_STEP):
            # with torch.no_grad():
            _, cpu_actions = self.actor_critic.predict(
                storage.observations[step] / 255)
            action = np.argmax(np.array(
                [np.random.multinomial(1, x) for x in cpu_actions]), axis=1)

            # obs size: (16, 1, 84, 84)
            obs, reward, done, info = self.env.step(action)
            reward = reward.reshape(-1, 1)
            episode_rewards += reward
            final_rewards[done] = episode_rewards[done]
            episode_rewards[done] = 0

            # zero out the current state for workers that are done
            current_obs[done] = 0

            # stack the frames
            current_obs[:, 1:] = current_obs[:, :-1]  # overwrite slots 2-4 with slots 1-3
            current_obs[:, :1] = obs  # put the newest obs in slot 1

            # insert this step's transition into the storage object
            storage.insert(current_obs, action, reward, done)

        # compute the state value predicted from the state at the last advanced step
        # with torch.no_grad():
        input_obs = storage.observations[-1] / 255
        next_value, _ = self.actor_critic.predict(input_obs)

        # compute the discounted return for every step
        storage.compute_discounted_rewards(next_value)

        # update the network and the storage
        self.global_brain.update(storage)
        storage.after_update()

        # log: print intermediate progress
        if j % 100 == 0:
            print(
                "finished frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                .format(j * NUM_PROCESSES * NUM_ADVANCED_STEP,
                        final_rewards.mean(), np.median(final_rewards),
                        final_rewards.min(), final_rewards.max()))

        # save the network weights
        if j % 12500 == 0:
            self.actor_critic.save('weight_' + str(j) + '.pth')

    # end of the run loop
    self.actor_critic.save('weight_end.pth')
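# --------------------------------------------------------------------------
# Illustrative sketch (not part of run() above): the frame-stacking shift
# used for current_obs, shown on a small array. Note the two conventions in
# this file: main() above keeps the newest frame in the last channels
# (update_current_obs), while run() stores the newest frame first, so the
# shift moves slots 1..k-1 into 2..k before writing slot 1.
import numpy as np

stack = np.arange(4).reshape(1, 4, 1, 1).astype(np.float32)  # frames 0..3
new_frame = np.full((1, 1, 1, 1), 9.0, dtype=np.float32)

stack[:, 1:] = stack[:, :-1]   # shift older frames back by one slot
stack[:, :1] = new_frame       # newest frame goes into the first slot
print(stack.reshape(-1))       # -> [9., 0., 1., 2.]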
class A2C: def __init__(self, env_name="BipedalWalker-v2", num_steps=5, num_workers=10, num_updates=10000, log_frequency=10, use_gae=True, gamma=0.99, tau=0.95, entropy_coef=0.01): observation_space, action_space = get_env_info(env_name) self.num_steps = num_steps self.num_updates = num_updates self.log_frequency = log_frequency self.use_gae = use_gae self.gamma = gamma self.tau = tau self.entropy_coef = entropy_coef self.max_grad_norm = 0.5 self.simulator = RolloutCollector(env_name, num_workers) self.eval_env = gym.make(env_name) self.obs_dim, self.action_dim = observation_space.shape[ 0], action_space.shape[0] self.storage = RolloutStorage(num_steps, num_workers, observation_space.shape, action_space) self.policy = Actor(self.obs_dim, self.action_dim) self.V = Critic(self.obs_dim) self.actor_optimizer = optim.Adam(self.policy.parameters(), lr=5e-4) self.critic_optimizer = optim.Adam(self.V.parameters(), lr=5e-4) # track statistics self.episode_count = 0 def get_actions(self, obs_n): with torch.no_grad(): obs_batch = torch.FloatTensor(np.stack(obs_n)) dist = self.policy(obs_batch) action_sample = dist.sample() values = self.V(obs_batch) action_n = [ action_sample[i].numpy() for i in range(len(action_sample)) ] return action_n, action_sample, values def update_storage(self, obs, actions, rewards, values, dones): self.episode_count += torch.sum(dones).item() masks = 1 - dones self.storage.insert(obs, actions, values, rewards, masks) def set_initial_observations(self, observations): self.storage.obs[0].copy_(observations) def compute_advantages(self): advantages = self.storage.returns[:-1] - self.storage.values[:-1] # standardize the advantages advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) return advantages def update(self): with torch.no_grad(): next_value = self.V(self.storage.obs[-1]) self.storage.compute_returns(next_value, self.use_gae, self.gamma, self.tau) self.storage.returns.mul_(0.1) advantages = self.compute_advantages() obs_batch, actions_batch, values_batch, return_batch, adv_targ = self.storage.build_batch( advantages) # Update the policy self.actor_optimizer.zero_grad() action_dist = self.policy(obs_batch) action_log_probs = action_dist.log_prob(actions_batch) objective = torch.mean(adv_targ * action_log_probs) policy_loss = -objective # compute the value loss self.critic_optimizer.zero_grad() value_loss = F.mse_loss(self.V(obs_batch), return_batch) # compute other losses entropy_loss = -torch.mean(action_dist.entropy()) # sum the losses, backprop, and step net_loss = policy_loss + value_loss + self.entropy_coef * entropy_loss net_loss.backward() nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) nn.utils.clip_grad_norm_(self.V.parameters(), self.max_grad_norm) self.critic_optimizer.step() self.actor_optimizer.step() return value_loss.detach().item( ), -policy_loss.detach().item(), -entropy_loss.detach().item() def evaluate(self, n=20, render=False): env = self.eval_env action_bounds = [env.action_space.low, env.action_space.high] all_rewards = [] for i in range(n): episode_rewards = [] state = env.reset() terminal = False while not terminal: dist = self.policy(torch.FloatTensor(state).view(1, -1)) action = dist.sample().numpy().reshape(-1) action = np.clip(action, action_bounds[0], action_bounds[1]) next_state, reward, terminal, info = env.step(action) episode_rewards.append(reward) state = next_state if render: fps = 8.0 env.render() time.sleep(1 / fps) all_rewards.append(np.sum(episode_rewards)) all_rewards = 
np.array(all_rewards) env.reset() return all_rewards def __iter__(self): obs_n = self.simulator.reset() for u in range(self.num_updates): self.set_initial_observations(torch.FloatTensor(np.stack(obs_n))) for t in range(self.num_steps): # Compute actions using policy given latest observation action_n, actions, values = self.get_actions(obs_n) # Give action to each worker and take an environment step obs_n, reward_n, done_n = self.simulator.step(action_n) observations = torch.FloatTensor(np.stack(obs_n)) rewards = torch.FloatTensor(np.vstack(reward_n)) dones = torch.FloatTensor(np.vstack(done_n)) # Update the storage self.update_storage(observations, actions, rewards, values, dones) value_loss, objective, mean_policy_entropy = self.update() self.storage.after_update() if (u + 1) % self.log_frequency == 0: eval_episode_returns = self.evaluate() yield self.episode_count, eval_episode_returns, value_loss, objective, mean_policy_entropy
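# --------------------------------------------------------------------------
# Illustrative sketch (not part of the A2C class above): a common
# formulation of the generalized advantage estimator (GAE) that
# storage.compute_returns() is asked to apply when use_gae=True:
# delta_t = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t), and the
# advantage is a (gamma * tau)-discounted sum of the deltas. Function name
# and toy values are made up for illustration.
import torch

def gae_returns(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    # rewards, values: (T, N, 1); masks: (T + 1, N, 1); next_value: (N, 1)
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * masks[t + 1] - values[t]
        gae = delta + gamma * tau * masks[t + 1] * gae
        returns[t] = gae + values[t]
    return returns

rewards = torch.ones(5, 1, 1)
values = torch.zeros(5, 1, 1)
masks = torch.ones(6, 1, 1)
print(gae_returns(rewards, values, masks, torch.zeros(1, 1)).squeeze())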
class AgentMario: def __init__(self, env, args): # Hyperparameters self.lr = 7e-4 self.gamma = 0.9 self.hidden_size = 512 self.update_freq = 5 self.n_processes = 64 self.seed = 7122 self.max_steps = 1e7 self.grad_norm = 0.5 self.entropy_weight = 0.05 ####################### NOTE: You need to implement self.recurrent = False # <- ActorCritic._forward_rnn() ####################### Please check a2c/actor_critic.py self.display_freq = 4000 self.save_freq = 100000 self.save_dir = './checkpoints/' torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.envs = env if self.envs == None: self.envs = make_vec_envs('SuperMarioBros-v0', self.seed, self.n_processes) self.device = torch.device("cuda:1" if use_cuda else "cpu") self.obs_shape = self.envs.observation_space.shape self.act_shape = self.envs.action_space.n self.rollouts = RolloutStorage(self.update_freq, self.n_processes, self.obs_shape, self.act_shape, self.hidden_size) self.model = ActorCritic(self.obs_shape, self.act_shape, self.hidden_size, self.recurrent).to(self.device) self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5) if args.test_mario: self.load_model(os.path.join('mario.pt')) print('finish model loading ...') self.hidden = None self.init_game_setting() def _update(self): # TODO: Compute returns # R_t = reward_t + gamma * R_{t+1} rewards = self.rollouts.rewards obs = self.rollouts.obs hiddens = self.rollouts.hiddens masks = self.rollouts.masks actions = self.rollouts.actions preds = self.rollouts.value_preds # 5 x 16 x 1 Vt = preds[:-1] Vt_1 = self.gamma * preds[1:] * masks[:-1] # 5 x 16 from torch.autograd import Variable Advantage = Variable((rewards - (Vt-Vt_1)), requires_grad=False) R = Advantage.squeeze(-1) # TODO: # Compute actor critic loss (value_loss, action_loss) # OPTIONAL: You can also maxmize entropy to encourage exploration # loss = value_loss + action_loss (- entropy_weight * entropy) entropys = [] logP = [] Q_values = [] for idx, (ob, hidden, mask) in enumerate(zip(obs, hiddens, masks)): value, action_prob, _ = self.model(ob, hidden, mask) Q_values.append(value) if idx != obs.size(0)-1: m = Categorical(action_prob) logP.append(m.log_prob(actions[idx].squeeze(-1))) entropys.append(torch.mean(m.entropy())) logP = torch.stack(logP,0) action_loss = torch.mean(-R * logP) print(action_loss) Q_values = torch.stack(Q_values, 0) Qt = Q_values[:-1] Qt_1 = rewards + self.gamma * preds[1:] * masks[:-1] mse = torch.nn.MSELoss() value_loss = mse(Qt, Qt_1) print(value_loss) entropys = sum(entropys)/len(entropys) print(entropys) loss = value_loss + action_loss - entropys print(loss) # Update self.optimizer.zero_grad() loss.backward() clip_grad_norm_(self.model.parameters(), self.grad_norm) self.optimizer.step() # TODO: # Clear rollouts after update (RolloutStorage.reset()) self.rollouts.reset() return loss.item() def _step(self, obs, hiddens, masks): from torch.autograd import Variable from torch.distributions import Categorical import numpy as np with torch.no_grad(): # TODO: # Sample actions from the output distributions # HINT: you can use torch.distributions.Categorical values, action_probs, hiddens = self.model(obs, hiddens, masks) m = Categorical(action_probs) actions = m.sample() obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy()) # TODO: # Store transitions (obs, hiddens, actions, values, rewards, masks) # You need to convert arrays to tensors first # HINT: masks = (1 - dones) obs = Variable(torch.FloatTensor(np.float32(obs))) rewards = 
Variable(torch.FloatTensor(np.float32(rewards))) dones = Variable(torch.FloatTensor(np.float32(dones))).unsqueeze(1) masks = torch.ones(masks.shape) - dones self.rollouts.insert(obs, hiddens, actions.unsqueeze(-1), values, rewards.unsqueeze(-1), masks) def train(self): # logging import logging logging.basicConfig(filename="mario_reward.log", level=logging.INFO) print('Start training') running_reward = deque(maxlen=10) episode_rewards = torch.zeros(self.n_processes, 1).to(self.device) total_steps = 0 # Store first observation obs = torch.from_numpy(self.envs.reset()).to(self.device) self.rollouts.obs[0].copy_(obs) self.rollouts.to(self.device) while True: # Update once every n-steps for step in range(self.update_freq): self._step( self.rollouts.obs[step], self.rollouts.hiddens[step], self.rollouts.masks[step]) # Calculate episode rewards episode_rewards += self.rollouts.rewards[step] for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]): if m == 0: running_reward.append(r.item()) episode_rewards *= self.rollouts.masks[step + 1] loss = self._update() total_steps += self.update_freq * self.n_processes # Log & save model if len(running_reward) == 0: avg_reward = 0 else: avg_reward = sum(running_reward) / len(running_reward) if total_steps % self.display_freq == 0: logging.info("{},{}".format(total_steps, avg_reward)) print('Steps: %d/%d | Avg reward: %f'% (total_steps, self.max_steps, avg_reward)) if total_steps % self.save_freq == 0: self.save_model('model.pt') if total_steps >= self.max_steps: break def save_model(self, filename): torch.save(self.model, os.path.join(self.save_dir, filename)) def load_model(self, path): self.model = torch.load(path, map_location=torch.device('cpu')) def init_game_setting(self): if self.recurrent: self.hidden = torch.zeros(1, self.hidden_size).to(self.device) def make_action(self, observation, test=False): # TODO: Use you model to choose an action from torch.autograd import Variable observation = Variable(torch.from_numpy(observation).float().unsqueeze(0)).to(self.device) value, action_prob, hidden = self.model(observation, observation, observation) m = Categorical(action_prob) action = torch.argmax(m.probs).data.tolist() return action
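# --------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): the one-step TD error
# that the Advantage tensor assembled in _update() from preds, rewards and
# masks corresponds to, written out directly:
# A_t = r_t + gamma * V(s_{t+1}) * mask - V(s_t). Function name and toy
# values are made up for illustration.
import torch

def td_advantage(rewards, values, next_values, masks, gamma=0.9):
    # all tensors shaped (T, N, 1); masks are 1 - dones
    return rewards + gamma * next_values * masks - values

rewards = torch.tensor([[[1.0]]])
values = torch.tensor([[[0.5]]])
next_values = torch.tensor([[[2.0]]])
masks = torch.tensor([[[1.0]]])
print(td_advantage(rewards, values, next_values, masks))  # 1 + 0.9*2 - 0.5 = 2.3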
class AgentMario: def __init__(self, env, args): # Hyperparameters self.lr = 7e-4 self.gamma = 0.9 self.hidden_size = 512 self.update_freq = 5 self.n_processes = 16 self.seed = 7122 self.max_steps = 1e7 self.grad_norm = 0.5 self.entropy_weight = 0.05 ####################### NOTE: You need to implement self.recurrent = True # <- ActorCritic._forward_rnn() ####################### Please check a2c/actor_critic.py self.display_freq = 4000 self.save_freq = 100000 self.save_dir = './checkpoints/' torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.envs = env if self.envs == None: self.envs = make_vec_envs('SuperMarioBros-v0', self.seed, self.n_processes) self.device = torch.device("cuda:0" if use_cuda else "cpu") self.obs_shape = self.envs.observation_space.shape self.act_shape = self.envs.action_space.n self.rollouts = RolloutStorage(self.update_freq, self.n_processes, self.obs_shape, self.act_shape, self.hidden_size) self.model = ActorCritic(self.obs_shape, self.act_shape, self.hidden_size, self.recurrent).to(self.device) self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5) self.hidden = None self.init_game_setting() def _update(self): # TODO: Compute returns # R_t = reward_t + gamma * R_{t+1} embed() # running_add = next_value[-1] actions, policies, values, returns, advantages = process_rollout(args, steps, cuda) for step in range(self.update_freq): # TODO: # Compute actor critic loss (value_loss, action_loss) # OPTIONAL: You can also maxmize entropy to encourage exploration # loss = value_loss + action_loss (- entropy_weight * entropy) loss = actor_loss.mean() + 0.5 * critic_loss - self.entropy_weight * entropy.mean() # Update self.optimizer.zero_grad() loss.backward() clip_grad_norm_(self.model.parameters(), self.grad_norm) self.optimizer.step() # TODO: # Clear rollouts after update (RolloutStorage.reset()) self.rollouts.reset() return loss.item() def _step(self, obs, hiddens, masks): with torch.no_grad(): # TODO: # Sample actions from the output distributions # HINT: you can use torch.distributions.Categorical values, action_probs, hiddens = self.model(obs, hiddens, masks) m = Categorical(action_probs) actions = m.sample() obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy()) # TODO: # Store transitions (obs, hiddens, actions, values, rewards, masks) # You need to convert arrays to tensors first # HINT: masks = (1 - dones) obs = torch.Tensor(obs) rewards = torch.Tensor(rewards) masks = torch.Tensor(1-dones) self.rollouts.insert(obs, hiddens, actions.unsqueeze(1), values, rewards.unsqueeze(1), masks.unsqueeze(1)) def train(self): print('Start training') running_reward = deque(maxlen=10) episode_rewards = torch.zeros(self.n_processes, 1).to(self.device) total_steps = 0 # Store first observation obs = torch.from_numpy(self.envs.reset()).to(self.device) self.rollouts.obs[0].copy_(obs) self.rollouts.to(self.device) while True: # Update once every n-steps for step in range(self.update_freq): self._step( self.rollouts.obs[step], self.rollouts.hiddens[step], self.rollouts.masks[step]) # Calculate episode rewards episode_rewards += self.rollouts.rewards[step] for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]): if m == 0: running_reward.append(r.item()) episode_rewards *= self.rollouts.masks[step + 1] loss = self._update() total_steps += self.update_freq * self.n_processes # Log & save model if len(running_reward) == 0: avg_reward = 0 else: avg_reward = sum(running_reward) / len(running_reward) if total_steps % self.display_freq == 0: 
print('Steps: %d/%d | Avg reward: %f' %
                          (total_steps, self.max_steps, avg_reward))
                if total_steps % self.save_freq == 0:
                    self.save_model('model.pt')
                if total_steps >= self.max_steps:
                    break

    def save_model(self, filename):
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        self.model = torch.load(path)

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        # Use the model to choose an action (a minimal completion of the
        # original TODO, assuming `observation` is an unbatched (4, 84, 84)
        # frame stack as in the other AgentMario implementations in this file).
        with torch.no_grad():
            obs = torch.from_numpy(observation).float().unsqueeze(0).to(self.device)
            mask = torch.ones(1, 1).to(self.device)
            _, action_probs, self.hidden = self.model(obs, self.hidden, mask)
            action = Categorical(action_probs).sample()
        return action.item()
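# --------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): one way the loss
# described by the TODO comments in _update() could be assembled,
# loss = value_loss + action_loss - entropy_weight * entropy, given
# flattened action probabilities, chosen actions, predicted values and
# n-step returns. A sketch under those assumptions, not the assignment's
# reference solution; the function name and toy values are made up.
import torch

def actor_critic_loss(action_probs, actions, values, returns, entropy_weight=0.05):
    dist = torch.distributions.Categorical(action_probs)
    log_probs = dist.log_prob(actions)
    advantages = (returns - values).detach()  # no critic gradient through the actor term
    action_loss = -(log_probs * advantages).mean()
    value_loss = (returns - values).pow(2).mean()
    entropy = dist.entropy().mean()
    return value_loss + action_loss - entropy_weight * entropy

probs = torch.tensor([[0.7, 0.3], [0.4, 0.6]])
acts = torch.tensor([0, 1])
vals = torch.tensor([0.5, 1.0])
rets = torch.tensor([1.0, 0.0])
print(actor_critic_loss(probs, acts, vals, rets))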
class AgentMario: #actor agent def __init__(self, env, args): # Hyperparameters self.lr = 7e-4 self.gamma = 0.99 self.hidden_size = 512 self.update_freq = 5 self.n_processes = 16 self.seed = 7122 self.max_steps = 1e7 self.grad_norm = 0.5 self.entropy_weight = 0.05 ####################### NOTE: You need to implement self.recurrent = False # <- ActorCritic._forward_rnn() ####################### Please check a2c/actor_critic.py if args.test_mario: self.load_model('./checkpoints/model.pt') self.display_freq = 4000 self.save_freq = 10000 self.save_dir = './checkpoints/' torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.envs = env if self.envs == None: self.envs = make_vec_envs('SuperMarioBros-v0', self.seed, self.n_processes) self.device = torch.device("cuda:0" if use_cuda else "cpu") self.obs_shape = self.envs.observation_space.shape self.act_shape = self.envs.action_space.n #print(self.obs_shape) #(4, 84, 84) #print(self.act_shape) #12 self.rollouts = RolloutStorage(self.update_freq, self.n_processes, self.obs_shape, self.act_shape, self.hidden_size) self.model = ActorCritic(self.obs_shape, self.act_shape, self.hidden_size, self.recurrent).to(self.device) self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5) self.hidden = None self.init_game_setting() def _update(self): # TODO: Compute returns #print(self.rollouts.obs.size()) #torch.Size([6, 16, 4, 84, 84]) obs_shape = self.rollouts.obs.size()[2:] #print(obs_shape) #torch.Size([4, 84, 84]) #print(self.rollouts.actions.size()) #torch.Size([5, 16, 1]) action_shape = self.rollouts.actions.size()[-1] #print(action_shape) #1 num_steps, num_processes, _ = self.rollouts.rewards.size() #see https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/algo/a2c_acktr.py line 38-43 #input() # R_t = reward_t + gamma * R_{t+1} discounted_return = torch.zeros(self.update_freq, self.n_processes, 1).to(self.device) #print(self.rollouts.rewards) for t in range(self.update_freq - 1, -1, -1): discounted_return[t] = self.rollouts.rewards[ t] + self.gamma * self.rollouts.value_preds[t + 1] #print(t) #print(self.rollouts.masks[t]) #print(self.rollouts.obs[:-1]) # [:-1] means don't take the last element #print(self.rollouts.obs[:-1].shape)#torch.Size([5, 16, 4, 84, 84]) # print(self.rollouts.obs[:-1].view(-1, *obs_shape).shape)# torch.Size([80, 4, 84, 84]) n_steps*n_processes, 4, 84, 84 #print(self.rollouts.hiddens[0].shape)#torch.Size([16, 512]) #print(self.rollouts.hiddens[0].view(-1, self.model.hidden_size).shape) #torch.Size([16, 512]) #print(self.rollouts.masks[:-1].view(-1, 1).shape) #torch.Size([80, 1]) values, action_probs, hiddens = self.model( self.rollouts.obs[:-1].view(-1, *obs_shape), self.rollouts.hiddens[0].view(-1, self.model.hidden_size), self.rollouts.masks[:-1].view(-1, 1)) #print(values.shape) #torch.Size([5, 16, 1]) #print(action_probs.shape) #torch.Size([5, 16, 12]) #print(hiddens.shape) #torch.Size([16, 512]) values = values.view(num_steps, num_processes, 1) action_probs = action_probs.view(num_steps, num_processes, -1) #print(action_probs) #print(action_probs.gather(2 ,self.rollouts.actions)) #print(action_probs.gather(2 ,self.rollouts.actions).shape) #torch.Size([5, 16, 1]) #m=Categorical(action_probs) action_probs = action_probs.gather(2, self.rollouts.actions) #print(m) #print(self.rollouts.actions) #print(action_probs) action_log_probs = action_probs.log() #action_log_probs = m.log_prob(self.rollouts.actions.view(-1, action_shape)) #print(action_log_probs) 
#print(action_log_probs.shape) #torch.Size([5, 16, 1]) #input() #deal with self.rollouts.actions later! #=self.model(self.rollouts.obs) #print(self.rollouts.rewards.shape) #torch.Size([5, 16, 1]) #print(self.rollouts.value_preds.shape)#torch.Size([6, 16, 1]) advantages = discounted_return - values #not so sure, advantage= r_t+gamma* V(s_t+1) - V(s_t) ????? #print(advantages) #print(advantages.shape) #torch.Size([5, 16, 1]) #print(self.rollouts.action_log_probs.shape) #torch.Size([5, 16, 1]) #input() #self.gamma* # TODO: #value loss is the critic loss; action loss is the actor loss # Compute actor critic loss (value_loss, action_loss) # OPTIONAL: You can also maxmize entropy to encourage exploration #use output entropy as regularization for pi(s) # loss = value_loss + action_loss (- entropy_weight * entropy) #see https://github.com/jcwleo/mario_rl/blob/master/mario_a2c.py line 260-267 critic_loss = advantages.pow(2).mean() #print(critic_loss.grad) #print(critic_loss) #tensor(1.2946, device='cuda:0', grad_fn=<MeanBackward1>) #print(critic_loss.shape) #torch.Size([]) #https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/tree/master/a2c_ppo_acktr -->USEFUL actor_loss = -(advantages * action_log_probs).mean() #print(actor_loss.grad) #print(actor_loss) #tensor(1.1621, device='cuda:0', grad_fn=<NegBackward>) #print(actor_loss.shape) #torch.Size([]) #input() loss = actor_loss + critic_loss #print(loss) #tensor(2.4567, device='cuda:0', grad_fn=<AddBackward0>) #print(loss.shape) #input() # Update self.optimizer.zero_grad() loss.backward() clip_grad_norm_(self.model.parameters(), self.grad_norm) self.optimizer.step() # TODO: # Clear rollouts after update (RolloutStorage.reset()) self.rollouts.reset() return loss.item() def _step(self, obs, hiddens, masks): #_step is just 1 step with torch.no_grad(): #16 is n_processes, meaning 16 workers, means batch_size is 16(?) #print("obs.shape", obs.shape) #torch.Size([16, 4, 84, 84]) #print(hiddens.shape) #torch.Size([16, 512]) #print(masks.shape) #torch.Size([16, 1]) #self.model has 3 inputs #I think we should for loop 16 times to get the state of each worker #which is WRONG! 
#for i in range(self.n_processes): values, action_probs, hiddens = self.model(obs, hiddens, masks) #values : V(st) obs: st #print(values.shape) # #print(hiddens.shape) #print(action_probs) #torch.Size([1, 16, 12]) #print(action_probs.shape) #torch.Size([16, 12]) #action_probs means F.softmax(policy) m = Categorical(action_probs) #print(m) #Categorical(probs: torch.Size([16, 12])) actions = m.sample() #print(m.log_prob(actions).shape) #input() action_log_probs = m.log_prob(actions).unsqueeze(1) #print(m.log_prob(actions)) #print(m.log_prob(actions).shape) #torch.Size([1, 16]) #input() #print(actions)#tensor([[9, 4, 8, 6, 4, 3, 9, 3, 0, 3, 5, 5, 1, 0, 2, 5]], device='cuda:0') #print(actions.shape) #torch.Size([16]) actions = actions.squeeze(0) #print(actions.cpu().numpy()) #[ 0 0 1 4 4 2 8 8 0 4 7 7 6 11 9 3] #input() #if you don't use recurrent, you don't need hidden and masks #values, action_provs, hiddens =self.model(obs, hiddens, masks) #actions=self.make_actions(obs) # TODO: # Sample actions from the output distributions # HINT: you can use torch.distributions.Categorical #see https://github.com/jcwleo/mario_rl/blob/master/mario_a2c.py line 256-257 #see https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/main.py line 113~132 obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy()) #obs here is s_t+1 #the step you're calling here is in shmem_vec_env.py step_async #you are inputing 16 actions to 16 environments #print(dones) #[False False False False False False False False False False False False # False False False False] #print(1-dones) #[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] #print(infos) #input() #rewards : rt, truly obtain when taking actions at values = values.squeeze(0) actions = actions.unsqueeze(1) obs = torch.from_numpy(obs) rewards = torch.from_numpy(rewards).unsqueeze(1) #print(rewards.shape) masks = torch.from_numpy(1 - dones).unsqueeze(1) # TODO: self.rollouts.insert(obs, hiddens, actions, action_log_probs, values, rewards, masks) # Store transitions (obs: s_t+1, hiddens, actions:a_t , values: V(s_t), rewards: r_t, masks) # You need to convert arrays to tensors first # HINT: masks = (1 - dones) def train(self): print('Start training') running_reward = deque(maxlen=10) episode_rewards = torch.zeros(self.n_processes, 1).to(self.device) total_steps = 0 # Store first observation obs = torch.from_numpy(self.envs.reset()).to(self.device) #print(obs.shape) #torch.Size([16, 4, 84, 84]) self.rollouts.obs[0].copy_(obs) self.rollouts.to(self.device) #print(obs.shape) #torch.Size([16, 4, 84, 84]) #print(self.rollouts.obs.shape) #torch.Size([6, 16, 4, 84, 84]) # 6 is n_steps+1 --> see ../a2c/storage.py while True: # Update once every n-steps for step in range(self.update_freq): self._step(self.rollouts.obs[step], self.rollouts.hiddens[step], self.rollouts.masks[step]) # Calculate episode rewards episode_rewards += self.rollouts.rewards[step] for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]): #print(r) #print(m) if m == 0: running_reward.append(r.item()) episode_rewards *= self.rollouts.masks[step + 1] loss = self._update() #update here total_steps += self.update_freq * self.n_processes # Log & save model if len(running_reward) == 0: avg_reward = 0 else: avg_reward = sum(running_reward) / len(running_reward) if total_steps % self.display_freq == 0: print('Steps: %d/%d | Avg reward: %f' % (total_steps, self.max_steps, avg_reward)) if total_steps % self.save_freq == 0: self.save_model('model.pt') if total_steps >= self.max_steps: break def 
save_model(self, filename): print("Save the model to ", self.save_dir) torch.save(self.model, os.path.join(self.save_dir, filename)) def load_model(self, path): print("Load the model from ", path) self.model = torch.load(path) def init_game_setting(self): if self.recurrent: self.hidden = torch.zeros(1, self.hidden_size).to(self.device) def make_action(self, observation, test=False): # TODO: Use you model to choose an action #self.load_model("./checkpoints/model.pt") #load the model somewhere else! #print(observation.shape) #(4, 84, 84) #print(observation) observation = torch.from_numpy(observation).to( self.device).unsqueeze(0) #when do we call this function??? -->../test/py line 41 will call this function #you also need to differentiate test=True and test=False #see https://github.com/jcwleo/mario_rl/blob/master/mario_a2c.py line 170 #see https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/evaluation.py line 20-31 eval_recurrent_hidden_states = torch.zeros(self.n_processes, self.model.hidden_size, device=self.device) eval_masks = torch.zeros(self.n_processes, 1, device=self.device) _, action_probs, _ = self.model(observation, eval_recurrent_hidden_states, eval_masks) #print(action_probs) #print(action_probs.shape) #torch.Size([1, 12]) #print(action_probs.max(1)[1]) #print(action_probs.max(1)[1].item()) action = action_probs.max(1)[1].item() #print(action) return action
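# --------------------------------------------------------------------------
# Illustrative note (not part of the class above): in the actor term it is
# usual to stop gradients through the advantage so the policy loss does not
# also push on the critic; _update() above multiplies `advantages` by
# `action_log_probs` without a detach, while main() earlier in this file
# uses Variable(advantages.data) for the same purpose. A minimal sketch of
# the distinction, with stand-in tensors for the actor and critic outputs:
import torch

values = torch.tensor([0.5, 1.0], requires_grad=True)       # stands in for critic output
returns = torch.tensor([1.0, 0.0])
log_probs = torch.tensor([-0.4, -0.7], requires_grad=True)  # stands in for actor output

advantages = returns - values
actor_loss = -(advantages.detach() * log_probs).mean()  # gradient reaches the actor only
critic_loss = advantages.pow(2).mean()                   # gradient reaches the critic only
(actor_loss + critic_loss).backward()
print(values.grad, log_probs.grad)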
class AgentMario: def __init__(self, env, args): # Hyperparameters self.lr = 7e-4 self.gamma = 0.9 self.hidden_size = 512 self.update_freq = 5 self.n_processes = 16 self.seed = 7122 self.max_steps = 6e6 self.grad_norm = 0.5 self.entropy_weight = 0.05 if args.test_mario: self.load_model('./checkpoints/model.pt') ####################### NOTE: You need to implement self.recurrent = True # <- ActorCritic._forward_rnn() ####################### Please check a2c/actor_critic.py self.display_freq = 4000 self.save_freq = 100000 self.save_dir = './checkpoints/' torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.envs = env if self.envs == None: self.envs = make_vec_envs('SuperMarioBros-v0', self.seed, self.n_processes) self.device = torch.device("cuda:0" if use_cuda else "cpu") self.obs_shape = self.envs.observation_space.shape self.act_shape = self.envs.action_space.n self.rollouts = RolloutStorage(self.update_freq, self.n_processes, self.obs_shape, self.act_shape, self.hidden_size) self.model = ActorCritic(self.obs_shape, self.act_shape, self.hidden_size, self.recurrent).to(self.device) self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5) self.hidden = None self.init_game_setting() def _update(self): # TODO: Compute returns # R_t = reward_t + gamma * R_{t+1} for step in reversed(range(self.rollouts.rewards.size(0))): self.rollouts.returns[step] = self.rollouts.returns[step+1] * \ self.gamma * self.rollouts.masks[step+1] + self.rollouts.rewards[step] # TODO: # Compute actor critic loss (value_loss, action_loss) # OPTIONAL: You can also maxmize entropy to encourage exploration # loss = value_loss + action_loss (- entropy_weight * entropy) obs_shape = self.rollouts.obs.size()[2:] action_shape = self.rollouts.actions.size()[-1] num_steps, num_processes, _ = self.rollouts.rewards.size() values, action_probs, hiddens = self.model( self.rollouts.obs[:-1].view(-1, *obs_shape), self.rollouts.hiddens[0].view(-1, 512), self.rollouts.masks[:-1].view(-1, 1)) m = Categorical(action_probs) log_probs = m.log_prob(self.rollouts.actions.view(-1)) entropys = m.entropy().mean() values = values.view(num_steps, num_processes, 1) log_probs = log_probs.view(num_steps, num_processes, 1) advantages = self.rollouts.returns[:-1] - values value_loss = advantages.pow(2).mean() action_loss = -(advantages.detach() * log_probs).mean() loss = (value_loss + action_loss) - (entropys * self.entropy_weight) # Update self.optimizer.zero_grad() loss.backward() clip_grad_norm_(self.model.parameters(), self.grad_norm) self.optimizer.step() # TODO: # Clear rollouts after update (RolloutStorage.reset()) self.rollouts.reset() return loss.item() def _step(self, obs, hiddens, masks): with torch.no_grad(): values, action_probs, hiddens = self.model(obs, hiddens, masks) m = Categorical(action_probs) actions = m.sample() # TODO: # Sample actions from the output distributions # HINT: you can use torch.distributions.Categorical obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy()) # TODO: # Store transitions (obs, hiddens, actions, values, rewards, masks) # You need to convert arrays to tensors first # HINT: masks = (1 - dones) masks = torch.FloatTensor([[0.0] if done else [1.0] for done in dones]) obs = torch.from_numpy(obs).to(self.device) rewards = torch.from_numpy(rewards).unsqueeze(1).to(self.device) actions = actions.unsqueeze(1) self.rollouts.insert(obs, hiddens, actions, values, rewards, masks) def train(self): print('Start training') running_reward = deque(maxlen=10) episode_rewards = 
torch.zeros(self.n_processes, 1).to(self.device) total_steps = 0 # Store first observation obs = torch.from_numpy(self.envs.reset()).to(self.device) self.rollouts.obs[0].copy_(obs) self.rollouts.to(self.device) x_value = [] y_value = [] while True: # Update once every n-steps for step in range(self.update_freq): self._step(self.rollouts.obs[step], self.rollouts.hiddens[step], self.rollouts.masks[step]) # Calculate episode rewards episode_rewards += self.rollouts.rewards[step] for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]): if m == 0: running_reward.append(r.item()) episode_rewards *= self.rollouts.masks[step + 1] loss = self._update() total_steps += self.update_freq * self.n_processes # Log & save model if len(running_reward) == 0: avg_reward = 0 else: avg_reward = sum(running_reward) / len(running_reward) if total_steps % self.display_freq == 0: print('Steps: %d/%d | Avg reward: %f' % (total_steps, self.max_steps, avg_reward)) x_value.append(total_steps) y_value.append(avg_reward) if total_steps % self.save_freq == 0: self.save_model('model.pt') # if avg_reward > 5000: # self.save_model('model.pt') # x_value.append(total_steps) # y_value.append(avg_reward) # break if total_steps >= self.max_steps: break self.save_curve(x_value, y_value, 'mario_curve') # def save_curve(self, x_values, y_values, title): # # tmp = {title: # { # 'x': x_values, # 'y': y_values # } # } # # if os.path.isfile('./mario.json'): # with open('mario.json', 'r') as f: # file = json.load(f) # file.update(tmp) # with open('mario.json', 'w') as f: # json.dump(file, f) # else: # with open('mario.json', 'w') as f: # json.dump(tmp, f) def save_model(self, filename): torch.save(self.model, os.path.join(self.save_dir, filename)) def load_model(self, path): self.model = torch.load(path) def init_game_setting(self): if self.recurrent: self.hidden = torch.zeros(1, self.hidden_size).to(self.device) def make_action(self, observation, test=False): # TODO: Use you model to choose an action if test: # self.load_model('./checkpoints/model.pt') with torch.no_grad(): obs = torch.from_numpy(observation).to(self.device) self.rollouts.obs[0].copy_(obs) self.rollouts.to(self.device) _, action_probs, self.rollouts.hiddens[0] = self.model( self.rollouts.obs[0], self.rollouts.hiddens[0], self.rollouts.masks[0]) m = Categorical(action_probs) action = m.sample().cpu().numpy() return action[0]
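# --------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): what the
# clip_grad_norm_(self.model.parameters(), self.grad_norm) call in
# _update() does — it rescales the gradients so their global L2 norm is at
# most grad_norm (0.5 here) before optimizer.step().
import torch
from torch.nn.utils import clip_grad_norm_

w = torch.nn.Parameter(torch.tensor([3.0, 4.0]))
(w.sum() * 2).backward()                  # gradient is [2, 2], norm sqrt(8) ~ 2.83
total_norm = clip_grad_norm_([w], max_norm=0.5)
print(total_norm, w.grad)                 # ~2.83, gradient rescaled to norm 0.5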
class AgentA2C: def __init__(self, env, args): self.use_gae = True self.use_standard = False # Hyperparameters self.lr = 7e-4 self.gamma = 0.90 self.tau = 0.95 self.hidden_size = 512 self.update_freq = 5 self.n_processes = 16 self.seed = 7122 self.max_steps = 1e7 self.grad_norm = 0.5 self.clip_param = 0.2 self.entropy_weight = 0.05 ####################### NOTE: You need to implement self.recurrent = False # <- ActorCritic._forward_rnn() ####################### Please check a2c/actor_critic.py self.display_freq = 4000 self.save_freq = 20000 if args.test_a2c: if args.model_path == None: raise Exception('give --model_path') else: if args.folder_name == None: raise Exception('give --folder_name') self.model_dir = os.path.join('./model', args.folder_name) if not os.path.exists(self.model_dir): os.mkdir(self.model_dir) self.plot = {'reward': []} torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) self.envs = env if self.envs == None: self.envs = make_vec_envs('SuperMarioBros-v0', self.seed, self.n_processes) self.device = torch.device("cuda:0" if use_cuda else "cpu") self.obs_shape = self.envs.observation_space.shape self.act_shape = self.envs.action_space.n self.rollouts = RolloutStorage(self.update_freq, self.n_processes, self.obs_shape, self.act_shape, self.hidden_size) self.model = ActorCritic(self.obs_shape, self.act_shape, self.hidden_size, self.recurrent) self.ppo_epochs = 4 self.ppo_batch_size = 5 if args.test_a2c: self.load_model(args.model_path) self.model = self.model.to(self.device) self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5) self.hidden = None self.init_game_setting() def ppo_iter(self, mini_batch_size, states, hiddens, masks, actions, log_probs, returns, advantage): batch_size = states.size(0) for _ in range(batch_size // mini_batch_size): rand_ids = np.random.randint(0, batch_size, mini_batch_size) yield states[rand_ids, :], hiddens[rand_ids, :], masks[ rand_ids, :], actions[rand_ids, :], log_probs[ rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :] def _update(self): # R_t = reward_t + gamma * R_{t+1} with torch.no_grad(): Return = self.model.get_estimate_returns(self.rollouts.obs[-1], self.rollouts.hiddens[-1], self.rollouts.masks[-1]) self.rollouts.value_preds[-1].copy_(Return) self.rollouts.returns[-1].copy_(Return * self.rollouts.masks[-1]) if self.use_standard: self.rollouts.rewards = ( self.rollouts.rewards - self.rollouts.rewards.mean()) / self.rollouts.rewards.std() if self.use_gae: gae = 0 for r in reversed(range(len(self.rollouts.rewards))): delta = self.rollouts.rewards[r] \ + self.gamma * self.rollouts.value_preds[r+1] * self.rollouts.masks[r+1] \ - self.rollouts.value_preds[r] gae = delta + self.gamma * self.tau * self.rollouts.masks[ r + 1] * gae Return = gae + self.rollouts.value_preds[r] self.rollouts.returns[r].copy_(Return) else: for r in reversed(range(len(self.rollouts.rewards))): Return = self.rollouts.rewards[ r] + self.gamma * Return * self.rollouts.masks[r + 1] self.rollouts.returns[r].copy_(Return) # Compute actor critic loss (value_loss, action_loss) # OPTIONAL: You can also maxmize entropy to encourage exploration # loss = value_loss + action_loss (- entropy_weight * entropy) #action_probs = self.rollouts.action_probs.view(self.n_processes * self.update_freq, -1) #est_returns = self.rollouts.value_preds[:-1].view(self.n_processes * self.update_freq, -1) with torch.no_grad(): est_returns, log_probs, _ = self.model( self.rollouts.obs[:-1].view( self.n_processes * self.update_freq, *self.obs_shape), 
self.rollouts.hiddens[:-1].view( self.n_processes * self.update_freq, -1), self.rollouts.masks[:-1].view( self.n_processes * self.update_freq, -1), ) states = self.rollouts.obs[:-1] hiddens = self.rollouts.hiddens[:-1] masks = self.rollouts.masks[:-1] actions = self.rollouts.actions returns = self.rollouts.returns[:-1] est_returns = est_returns.view(self.update_freq, self.n_processes, -1) log_probs = log_probs.gather( 1, actions.view(self.n_processes * self.ppo_batch_size, -1)).view(self.update_freq, self.n_processes, -1) advantages = returns - est_returns all_loss = [] for _ in range(self.ppo_epochs): for state, hidden, mask, action, old_log_probs, return_, advantage in self.ppo_iter( self.ppo_batch_size, states, hiddens, masks, actions, log_probs, returns, advantages): action = action.view(self.n_processes * self.ppo_batch_size, -1) return_ = return_.view(self.n_processes * self.ppo_batch_size, -1) state = state.view(self.n_processes * self.ppo_batch_size, *self.obs_shape) hidden = hidden.view(self.n_processes * self.ppo_batch_size, -1) mask = mask.view(self.n_processes * self.ppo_batch_size, -1) old_log_probs = old_log_probs.view( self.n_processes * self.ppo_batch_size, -1) advantage = advantage.view( self.n_processes * self.ppo_batch_size, -1) value, new_log_probs, _ = self.model(state, hidden, mask) ratio = (new_log_probs.gather(1, action).log() - old_log_probs.log()).exp() surr1 = ratio * advantage surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * advantage # action loss (Policy) action_loss = -torch.min(surr1, surr2).mean() # value loss (DQN) value_loss = (return_ - value).pow(2).mean() # entropy entropy = (new_log_probs * new_log_probs.log()).sum(1).mean() # loss loss = 0.5 * value_loss + action_loss - self.entropy_weight * entropy # Update self.optimizer.zero_grad() loss.backward() clip_grad_norm_(self.model.parameters(), self.grad_norm) self.optimizer.step() all_loss.append(loss.item()) # Clear rollouts after update (RolloutStorage.reset()) self.rollouts.reset() return sum(all_loss) / len(all_loss) def _step(self, obs, hiddens, masks): with torch.no_grad(): values, action_probs, hiddens = self.model(obs, hiddens, masks) actions = Categorical(action_probs.detach()).sample() # Sample actions from the output distributions obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy()) obs = torch.from_numpy(obs) rewards = torch.from_numpy(rewards).unsqueeze(1) masks = torch.from_numpy(1 - (dones)).unsqueeze(1) actions = actions.unsqueeze(1) self.rollouts.insert( obs, #next hiddens, #next actions, #now action_probs, #now values, #now rewards, #now masks) #next # Store transitions (obs, hiddens, actions, values, rewards, masks) def train(self): print('Start training') running_reward = deque(maxlen=10) episode_rewards = torch.zeros(self.n_processes, 1).to(self.device) total_steps = 0 best_reward = 0 # Store first observation obs = torch.from_numpy(self.envs.reset()).to(self.device) self.rollouts.obs[0].copy_(obs) self.rollouts.to(self.device) while True: # Update once every n-steps for step in range(self.update_freq): self._step(self.rollouts.obs[step], self.rollouts.hiddens[step], self.rollouts.masks[step]) # Calculate episode rewards episode_rewards += self.rollouts.rewards[step] for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]): if m == 0: running_reward.append(r.item()) episode_rewards *= self.rollouts.masks[step + 1] loss = self._update() total_steps += self.update_freq * self.n_processes # Log & save model if len(running_reward) == 0: 
avg_reward = 0 else: avg_reward = sum(running_reward) / len(running_reward) self.plot['reward'].append(avg_reward) print('Steps: %d/%d | Avg reward: %f | Loss: %f' % (total_steps, self.max_steps, avg_reward, loss), end='\r') if total_steps % self.display_freq == 0: print('Steps: %d/%d | Avg reward: %f' % (total_steps, self.max_steps, avg_reward)) if total_steps % self.save_freq == 0: with open(os.path.join(self.model_dir, 'plot.json'), 'w') as f: json.dump(self.plot, f) #if int(avg_reward) > best_reward: best_reward = int(avg_reward) self.save_model( os.path.join( self.model_dir, 's{}_r{}_model.pt'.format(total_steps, best_reward))) if total_steps >= self.max_steps: break def save_model(self, path): torch.save( { 'model': self.model, 'optimizer': self.optimizer.state_dict() }, path) def load_model(self, path): print('Load model from', path) self.model = torch.load(path)['model'] def init_game_setting(self): if self.recurrent: self.hidden = torch.zeros(1, self.hidden_size).to(self.device) def make_action(self, observation, test=False): obs = torch.FloatTensor([observation]).to(self.device) #self.rollouts.obs[0].copy_(obs) #self.rollouts.to(self.device) with torch.no_grad(): action_probs, _ = self.model.get_action_probs(obs, None, None) action = action_probs.max(1)[1].item() return action
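# --------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): the entropy bonus the
# comments describe. The `entropy` variable in _update() above is
# (p * log p).sum(1), i.e. the *negative* entropy, so the sign of the bonus
# term deserves care; written from a Categorical distribution, the intent
# "loss = value_loss + action_loss - entropy_weight * entropy" looks like:
import torch

probs = torch.tensor([[0.7, 0.2, 0.1]])
dist = torch.distributions.Categorical(probs)
entropy = dist.entropy().mean()                          # H = -sum p log p >= 0
negative_entropy = (probs * probs.log()).sum(1).mean()
print(entropy, -negative_entropy)                        # the two agree

entropy_weight = 0.05
bonus_term = -entropy_weight * entropy   # added to the loss so minimizing it raises H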