def evaluate_actor_critic(params, path, num_of_episodes=100):
    """Evaluate a trained ActorCritic policy over several full episodes.

    Args:
        params: configuration object providing ``stack_size`` (frame stack
            depth) and ``game`` (gym environment id).
        path: filesystem path to a saved ``state_dict`` checkpoint.
        num_of_episodes: how many episodes to run (default 100, matching
            the previous hard-coded value).

    Returns:
        The mean episode score across ``num_of_episodes`` episodes.
    """
    model = ActorCritic(params.stack_size, get_action_space())
    # map_location lets a checkpoint saved on GPU load on a CPU-only host;
    # nothing below moves tensors to GPU, so CPU inference is the contract.
    model.load_state_dict(torch.load(path, map_location='cpu'))
    model.eval()

    env = gym.make(params.game)
    env_wrapper = EnvironmentWrapper(env, params.stack_size)

    total_reward = 0
    for episode in range(num_of_episodes):
        state = torch.Tensor([env_wrapper.reset()])
        done = False
        score = 0
        while not done:
            probs, _, _ = model(state)
            action = get_actions(probs)
            state, reward, done = env_wrapper.step(action[0])
            state = torch.Tensor([state])
            score += reward
            env_wrapper.render()
        print('Episode: {0} Score: {1:.2f}'.format(episode, score))
        total_reward += score
    return total_reward / num_of_episodes
def run(self):
    """Worker training loop (A3C-style).

    Repeatedly synchronizes the local model with the global model, rolls
    out ``steps_per_update`` environment steps while recording values,
    log-probs, entropies and rewards, then computes the actor-critic loss
    (optionally with GAE), backpropagates, shares gradients with the
    global model and steps the shared optimizer.

    Side effects: mutates ``self.current_observation``, ``self.storage``,
    the local and global model parameters, and prints progress every 20
    updates.
    """
    # Integer division: total steps are partitioned into fixed-size rollouts.
    num_of_updates = self.params.num_of_steps // self.params.steps_per_update
    self.current_observation = torch.Tensor([self.environment.reset()])

    for update in range(num_of_updates):
        self.storage.reset_storage()
        # Synchronize the local model with the global model's weights.
        R = 0  # raw (undiscounted) score accumulated over this rollout
        self.model.load_state_dict(self.global_model.state_dict())

        # --- Rollout phase: collect steps_per_update transitions ---
        for step in range(self.params.steps_per_update):
            probs, log_probs, value = self.model(self.current_observation)
            action = get_actions(probs)[0]
            action_log_prob, entropy = self.compute_action_log_and_entropy(
                probs, log_probs)

            state, reward, done = self.environment.step(action)
            R += reward
            if done:
                # Episode ended mid-rollout: restart the environment.
                state = self.environment.reset()
            done = torch.Tensor([done])
            self.current_observation = torch.Tensor([state])
            self.storage.add(step, value, reward, action_log_prob,
                             entropy, done)

        # --- Learning phase: bootstrap from the value of the last state ---
        _, _, last_value = self.model(self.current_observation)
        expected_reward = self.storage.compute_expected_reward(
            last_value, self.params.discount_factor)
        advantages = torch.tensor(expected_reward) - self.storage.values
        value_loss = advantages.pow(2).mean()
        if self.params.use_gae:
            gae = self.storage.compute_gae(last_value,
                                           self.params.discount_factor,
                                           self.params.gae_coef)
            policy_loss = -(torch.tensor(gae) *
                            self.storage.action_log_probs).mean()
        else:
            policy_loss = -(advantages * self.storage.action_log_probs).mean()

        self.optimizer.zero_grad()
        # Entropy bonus encourages exploration; value loss is weighted down.
        loss = policy_loss \
            - self.params.entropy_coef * self.storage.entropies.mean() \
            + self.params.value_loss_coef * value_loss
        loss.backward()
        # clip_grad_norm_ is the in-place, non-deprecated replacement for
        # the removed nn.utils.clip_grad_norm.
        nn.utils.clip_grad_norm_(self.model.parameters(),
                                 self.params.max_norm)
        self._share_gradients()
        self.optimizer.step()

        if update % 20 == 0:
            print('Process: {}. Update: {}. Loss: {}. Score: {}'.format(
                self.process_num, update, loss, R))
def actor_critic_inference(params, path):
    """Run a trained ActorCritic policy for one episode and return its score.

    Args:
        params: configuration object providing ``stack_size`` and ``game``.
        path: filesystem path to a saved ``state_dict`` checkpoint.

    Returns:
        The total accumulated reward of the single rendered episode.
    """
    model = ActorCritic(params.stack_size, get_action_space())
    # map_location lets a checkpoint saved on GPU load on a CPU-only host.
    model.load_state_dict(torch.load(path, map_location='cpu'))
    model.eval()

    env = gym.make(params.game)
    env_wrapper = EnvironmentWrapper(env, params.stack_size)

    state = torch.Tensor([env_wrapper.reset()])
    done = False
    total_score = 0
    while not done:
        probs, _, _ = model(state)
        action = get_actions(probs)
        state, reward, done = env_wrapper.step(action[0])
        state = torch.Tensor([state])
        total_score += reward
        env_wrapper.render()
    return total_score