def evaluate_actor_critic(params, path):
    model = ActorCritic(params.stack_size, get_action_space())
    model.load_state_dict(torch.load(path))
    model.eval()

    env = gym.make('CarRacing-v0')
    env_wrapper = EnvironmentWrapper(env, params.stack_size)

    total_reward = 0
    num_of_episodes = 100

    for episode in range(num_of_episodes):
        state = env_wrapper.reset()
        state = torch.Tensor([state])
        done = False
        score = 0
        while not done:
            probs, _, _ = model(state)
            action = get_actions(probs)
            state, reward, done = env_wrapper.step(action[0])
            print(probs.detach().numpy(), "\n", action, reward)
            state = torch.Tensor([state])
            score += reward
            env_wrapper.render()
        print('Episode: {0} Score: {1:.2f}'.format(episode, score))
        total_reward += score
    return total_reward / num_of_episodes
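The ActorCritic model loaded above is defined elsewhere in the project; the sketch below only illustrates the interface the evaluation code relies on, namely a forward pass that returns (probs, log_probs, value) for a stack of input frames. The layer sizes and the 84x84 input resolution are assumptions for illustration, not the project's actual architecture.

import torch
import torch.nn as nn
import torch.nn.functional as F


class ActorCritic(nn.Module):
    """Hypothetical network matching the (probs, log_probs, value) outputs used above."""

    def __init__(self, stack_size, num_of_actions):
        super().__init__()
        # shared convolutional trunk over the stacked frames
        # (84x84 grayscale input is assumed here)
        self.conv = nn.Sequential(
            nn.Conv2d(stack_size, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        self.fc = nn.Linear(64 * 7 * 7, 512)
        self.actor = nn.Linear(512, num_of_actions)   # policy head
        self.critic = nn.Linear(512, 1)               # value head

    def forward(self, x):
        features = self.conv(x)
        features = F.relu(self.fc(features.view(features.size(0), -1)))
        logits = self.actor(features)
        probs = F.softmax(logits, dim=-1)
        log_probs = F.log_softmax(logits, dim=-1)
        value = self.critic(features)
        return probs, log_probs, value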
def run(self):
    num_of_updates = self.params.num_of_steps / self.params.steps_per_update
    self.current_observation = torch.Tensor([self.environment.reset()])

    for update in range(int(num_of_updates)):
        self.storage.reset_storage()
        # synchronize with global model
        self.model.load_state_dict(self.global_model.state_dict())
        for step in range(self.params.steps_per_update):
            probs, log_probs, value = self.model(self.current_observation)
            action = get_actions(probs)[0]
            action_log_prob, entropy = self.compute_action_log_and_entropy(
                probs, log_probs)

            state, reward, done = self.environment.step(action)
            if done:
                state = self.environment.reset()
            done = torch.Tensor([done])
            self.current_observation = torch.Tensor([state])
            self.storage.add(step, value, reward, action_log_prob,
                             entropy, done)

        # bootstrap the returns with the critic's estimate of the last state
        _, _, last_value = self.model(self.current_observation)
        expected_reward = self.storage.compute_expected_reward(
            last_value, self.params.discount_factor)
        advantages = torch.tensor(expected_reward) - self.storage.values
        value_loss = advantages.pow(2).mean()
        if self.params.use_gae:
            gae = self.storage.compute_gae(last_value,
                                           self.params.discount_factor,
                                           self.params.gae_coef)
            policy_loss = -(torch.tensor(gae) *
                            self.storage.action_log_probs).mean()
        else:
            policy_loss = -(advantages *
                            self.storage.action_log_probs).mean()

        self.optimizer.zero_grad()
        loss = policy_loss - self.params.entropy_coef * self.storage.entropies.mean() + \
            self.params.value_loss_coef * value_loss
        # loss = torch.abs(loss)
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(),
                                 self.params.max_norm)
        self._share_gradients()
        self.optimizer.step()

        if update % 20 == 0:
            print('Process: {}. Update: {}. Loss: {}'.format(
                self.process_num, update, loss))
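The update above depends on the rollout storage's compute_expected_reward and compute_gae helpers. The following is a minimal sketch of what they are assumed to do for this single-environment worker: n-step discounted returns bootstrapped from the critic's last value, and Generalized Advantage Estimation over the stored rollout. The buffer layout (preallocated (steps, 1) tensors) and the detached bootstrap value are assumptions; only the attribute names, method names, and arguments come from the code above.

import torch


class Storage:
    """Hypothetical rollout buffer matching the calls made in run() above."""

    def __init__(self, steps_per_update):
        self.steps = steps_per_update
        self.reset_storage()

    def reset_storage(self):
        self.values = torch.zeros(self.steps, 1)
        self.rewards = torch.zeros(self.steps, 1)
        self.dones = torch.zeros(self.steps, 1)
        self.action_log_probs = torch.zeros(self.steps, 1)
        self.entropies = torch.zeros(self.steps, 1)

    def add(self, step, value, reward, action_log_prob, entropy, done):
        self.values[step] = torch.as_tensor(value).view(-1)
        self.rewards[step] = torch.as_tensor(reward).view(-1)
        self.dones[step] = done.view(-1)
        self.action_log_probs[step] = torch.as_tensor(action_log_prob).view(-1)
        self.entropies[step] = torch.as_tensor(entropy).view(-1)

    def compute_expected_reward(self, last_value, discount_factor):
        # n-step discounted returns, bootstrapped from the critic's last value
        # and cut off at episode boundaries via the done mask
        expected = torch.zeros(self.steps + 1, 1)
        expected[-1] = last_value.detach().view(-1)
        for step in reversed(range(self.steps)):
            expected[step] = self.rewards[step] + \
                discount_factor * expected[step + 1] * (1 - self.dones[step])
        return expected[:-1]

    def compute_gae(self, last_value, discount_factor, gae_coef):
        # Generalized Advantage Estimation over the stored rollout
        values = torch.cat([self.values.detach(),
                            last_value.detach().view(1, 1)])
        gae = torch.zeros(1)
        out = torch.zeros(self.steps, 1)
        for step in reversed(range(self.steps)):
            delta = self.rewards[step] + \
                discount_factor * values[step + 1] * (1 - self.dones[step]) - \
                values[step]
            gae = delta + discount_factor * gae_coef * (1 - self.dones[step]) * gae
            out[step] = gae
        return out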
def run(self):
    # number of updates per environment
    num_of_updates = self.params.num_of_steps / self.params.steps_per_update
    self.current_observations = self.parallel_environments.reset()
    print(self.current_observations.size())

    for update in range(int(num_of_updates)):
        self.storage.reset_storage()
        for step in range(self.params.steps_per_update):
            probs, log_probs, value = self.actor_critic(
                self.current_observations)
            actions = get_actions(probs)
            action_log_probs, entropies = self.compute_action_logs_and_entropies(
                probs, log_probs)

            states, rewards, dones = self.parallel_environments.step(actions)
            rewards = rewards.view(-1, 1)
            dones = dones.view(-1, 1)

            self.current_observations = states
            self.storage.add(step, value, rewards, action_log_probs,
                             entropies, dones)

        # bootstrap the returns with the critic's estimate of the last states
        _, _, last_values = self.actor_critic(self.current_observations)
        expected_rewards = self.storage.compute_expected_rewards(
            last_values, self.params.discount_factor)
        advantages = torch.tensor(expected_rewards) - self.storage.values
        value_loss = advantages.pow(2).mean()
        policy_loss = -(advantages * self.storage.action_log_probs).mean()

        self.optimizer.zero_grad()
        loss = policy_loss - self.params.entropy_coef * self.storage.entropies.mean() + \
            self.params.value_loss_coef * value_loss
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                 self.params.max_norm)
        self.optimizer.step()

        if update % 300 == 0:
            torch.save(self.actor_critic.state_dict(), self.model_path)
        if update % 100 == 0:
            print('Update: {}. Loss: {}'.format(update, loss))
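get_actions and get_action_space are assumed to map the policy's categorical output onto a small discretized set of CarRacing controls. The action table and sampling below are a hedged sketch for illustration only; the project's actual action set and the way actions are handed to the environment wrappers may differ.

import torch

# Hypothetical discretization of CarRacing's continuous (steer, gas, brake)
# controls; the project's actual action table may differ.
ACTIONS = [
    (-1.0, 0.0, 0.0),   # steer left
    (1.0, 0.0, 0.0),    # steer right
    (0.0, 1.0, 0.0),    # accelerate
    (0.0, 0.0, 0.8),    # brake
    (0.0, 0.0, 0.0),    # no-op
]


def get_action_space():
    return len(ACTIONS)


def get_actions(probs):
    # sample one discrete action index per environment from the policy output
    indices = torch.multinomial(probs, num_samples=1).squeeze(1)
    return [ACTIONS[i] for i in indices.cpu().numpy()]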
def actor_critic_inference(params, path):
    model = ActorCritic(params.stack_size, get_action_space())
    model.load_state_dict(torch.load(path))
    model.eval()

    env = gym.make('CarRacing-v0')
    env_wrapper = EnvironmentWrapper(env, params.stack_size)

    state = env_wrapper.reset()
    state = torch.Tensor([state])
    done = False
    total_score = 0
    while not done:
        probs, _, _ = model(state)
        action = get_actions(probs)
        print(action)
        state, reward, done = env_wrapper.step(action[0])
        state = torch.Tensor([state])
        total_score += reward
        env_wrapper.render()
    return total_score
def run(self):
    num_of_updates = self.params.num_of_steps / self.params.steps_per_update
    self.current_observation = torch.Tensor([self.environment.reset()])
    reward_episode = 0  # NEW

    for update in range(int(num_of_updates)):
        self.storage.reset_storage()
        # synchronize with global model
        self.model.load_state_dict(self.global_model.state_dict())
        for step in range(self.params.steps_per_update):
            probs, log_probs, value = self.model(self.current_observation)
            action = get_actions(probs)[0]
            action_log_prob, entropy = self.compute_action_log_and_entropy(
                probs, log_probs)

            state, reward, done = self.environment.step(action)
            reward_episode += reward  # NEW
            if done:
                # NEW: log the finished episode's score
                self.log_reward = np.append(self.log_reward, reward_episode)
                print('Process: {}. Episode {} score: {}.'.format(
                    self.process_num, len(self.log_reward) - 1,
                    self.log_reward[-1]))
                reward_episode = 0
                state = self.environment.reset()
            done = torch.Tensor([done])
            self.current_observation = torch.Tensor([state])
            self.storage.add(step, value, reward, action_log_prob,
                             entropy, done)

        _, _, last_value = self.model(self.current_observation)
        expected_reward = self.storage.compute_expected_reward(
            last_value, self.params.discount_factor)
        advantages = torch.tensor(expected_reward) - self.storage.values
        value_loss = advantages.pow(2).mean()
        if self.params.use_gae:
            gae = self.storage.compute_gae(last_value,
                                           self.params.discount_factor,
                                           self.params.gae_coef)
            policy_loss = -(torch.tensor(gae) *
                            self.storage.action_log_probs).mean()
        else:
            policy_loss = -(advantages *
                            self.storage.action_log_probs).mean()

        self.optimizer.zero_grad()
        loss = policy_loss - self.params.entropy_coef * self.storage.entropies.mean() + \
            self.params.value_loss_coef * value_loss
        if self.autosave:  # NEW: keep the loss for periodic averaging
            self.log_tmp = np.append(self.log_tmp, loss.detach().numpy())
        loss.backward()
        nn.utils.clip_grad_norm_(self.model.parameters(),
                                 self.params.max_norm)
        self._share_gradients()
        self.optimizer.step()

        # NEW: decay the learning rate every 2% of the total updates
        # by recreating the optimizer with the reduced rate
        if update % (int(num_of_updates / 50)) == 0:
            self.lr *= 0.85
            self.optimizer = Adam(self.global_model.parameters(), lr=self.lr)
            print('Process: {}. Learning rate: {}.'.format(
                self.process_num, self.lr))

        if update % 60 == 0:
            print('Process: {}. Update: {}. Loss: {}'.format(
                self.process_num, update, loss))
            # NEW: checkpoint the global model and log the mean loss
            if self.autosave:
                torch.save(self.global_model.state_dict(),
                           'models/a3c{}.pt'.format(update))
                self.log_loss.append(np.mean(self.log_tmp))
                self.log_tmp = np.array([])
                print('Mean loss over the last 60 updates: {}'.format(
                    self.log_loss[-1]))