Example #1
import gym
import torch

def evaluate_actor_critic(params, path):
    # ActorCritic, get_action_space, get_actions and EnvironmentWrapper are
    # project-local helpers assumed to be importable alongside this function.
    # Rebuild the network and restore the trained weights before evaluation.
    model = ActorCritic(params.stack_size, get_action_space())
    model.load_state_dict(torch.load(path))
    model.eval()

    game = params.game
    env = gym.make(game)
    env_wrapper = EnvironmentWrapper(env, params.stack_size)

    total_reward = 0
    num_of_episodes = 100

    for episode in range(num_of_episodes):
        state = env_wrapper.reset()
        state = torch.Tensor([state])
        done = False
        score = 0
        while not done:
            probs, _, _ = model(state)
            action = get_actions(probs)
            state, reward, done = env_wrapper.step(action[0])
            state = torch.Tensor([state])
            score += reward
            env_wrapper.render()
        print('Episode: {0} Score: {1:.2f}'.format(episode, score))
        total_reward += score
    return total_reward / num_of_episodes
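
This example (and the inference example further down) relies on a get_actions helper that turns the policy's probability output into environment actions. A minimal sketch of what such a helper could look like, assuming the model emits a softmax distribution over a discrete action space (the real helper is not shown in these snippets):

import torch

def get_actions(probs):
    # Sample one action index per batch element from the policy distribution.
    # Assumes probs has shape (batch, num_actions) and each row sums to 1.
    distribution = torch.distributions.Categorical(probs=probs)
    return distribution.sample().cpu().numpy()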
Example #2
    def run(self):
        num_of_updates = self.params.num_of_steps / self.params.steps_per_update
        self.current_observation = torch.Tensor([self.environment.reset()])

        for update in range(int(num_of_updates)):
            self.storage.reset_storage()
            # synchronize the local model with the shared global model
            self.model.load_state_dict(self.global_model.state_dict())
            R = 0
            for step in range(self.params.steps_per_update):
                probs, log_probs, value = self.model(self.current_observation)
                action = get_actions(probs)[0]
                action_log_prob, entropy = self.compute_action_log_and_entropy(
                    probs, log_probs)

                state, reward, done = self.environment.step(action)
                R += reward
                if done:
                    state = self.environment.reset()
                done = torch.Tensor([done])
                self.current_observation = torch.Tensor([state])
                self.storage.add(step, value, reward, action_log_prob, entropy,
                                 done)

            _, _, last_value = self.model(self.current_observation)
            expected_reward = self.storage.compute_expected_reward(
                last_value, self.params.discount_factor)
            advantages = torch.tensor(expected_reward) - self.storage.values
            value_loss = advantages.pow(2).mean()
            if self.params.use_gae:
                gae = self.storage.compute_gae(last_value,
                                               self.params.discount_factor,
                                               self.params.gae_coef)
                policy_loss = -(torch.tensor(gae) *
                                self.storage.action_log_probs).mean()
            else:
                policy_loss = -(advantages *
                                self.storage.action_log_probs).mean()

            self.optimizer.zero_grad()
            loss = policy_loss - self.params.entropy_coef * self.storage.entropies.mean() + \
                self.params.value_loss_coef * value_loss
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.params.max_norm)
            self._share_gradients()
            self.optimizer.step()

            if update % 20 == 0:
                print('Process: {}. Update: {}. Loss: {}. Score: {}'.format(
                    self.process_num, update, loss.item(), R))
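
The worker's update step depends on the rollout storage's return and advantage computations. As a standalone sketch of the math that compute_gae is assumed to perform, here is Generalized Advantage Estimation over plain Python lists of per-step rewards, value estimates, and done flags (the name and signature are illustrative, not the storage's actual API):

def compute_gae(rewards, values, dones, last_value, discount_factor, gae_coef):
    # Generalized Advantage Estimation, computed backwards over the rollout.
    # last_value bootstraps the return beyond the final stored step.
    advantages = [0.0] * len(rewards)
    next_value = last_value
    running_gae = 0.0
    for step in reversed(range(len(rewards))):
        mask = 1.0 - float(dones[step])  # no bootstrapping across episode ends
        delta = rewards[step] + discount_factor * next_value * mask - values[step]
        running_gae = delta + discount_factor * gae_coef * running_gae * mask
        advantages[step] = running_gae
        next_value = values[step]
    return advantages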
Example #3
def actor_critic_inference(params, path):
    model = ActorCritic(params.stack_size, get_action_space())
    model.load_state_dict(torch.load(path))
    model.eval()

    game = params.game
    env = gym.make(game)
    env_wrapper = EnvironmentWrapper(env, params.stack_size)

    state = env_wrapper.reset()
    state = torch.Tensor([state])
    done = False
    total_score = 0
    while not done:
        probs, _, _ = model(state)
        action = get_actions(probs)
        print(action)
        state, reward, done = env_wrapper.step(action[0])
        state = torch.Tensor([state])
        total_score += reward
        env_wrapper.render()
    return total_score
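
A hypothetical way to call this entry point; the Params fields below simply mirror the attributes the function reads, and the environment id and checkpoint path are placeholders rather than values taken from the examples:

class Params:
    game = 'CarRacing-v0'   # placeholder Gym environment id
    stack_size = 4          # number of stacked frames fed to the network

if __name__ == '__main__':
    total_score = actor_critic_inference(Params(), 'actor_critic.pt')
    print('Total score for one episode: {:.2f}'.format(total_score))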