Code example #1
    def improve_policy(self, **kwargs):

        policy_changed = True
        i = 0
        while policy_changed:
            policy_changed = False
            for s in self.action_states:
                self.env.set_state(s)
                best_score = float('-inf')  # ensure best_action is always assigned, even when all scores are negative
                for a in self.env.actions[s]:
                    reward = self.env.move(a)
                    next_score = reward + self.discount * self.state_values[
                        self.env.current_state()]
                    if next_score > best_score:
                        best_action = a
                        best_score = next_score
                    self.env.undo_move(a)
                if best_action != self.policy[s]:
                    self.policy[s] = best_action
                    policy_changed = True
            self.update_state_value_function(policy='self', **kwargs)

            i += 1
            print("Number of iterations: ", i)
            grid_world.print_policy(self.policy, self.env)
            grid_world.print_values(self.state_values, self.env)
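
The loop above relies on update_state_value_function(policy='self', **kwargs) for the policy-evaluation step, which is not part of this listing. The sketch below shows one plausible shape for it: iterative policy evaluation under the current policy until the largest value change falls below a convergence threshold. The body and the self.epsilon attribute are assumptions for illustration, not the original implementation.

    # Hypothetical policy-evaluation helper (not in the original listing).
    def update_state_value_function(self, policy='self', **kwargs):

        delta = float('inf')
        while delta > self.epsilon:  # assumed convergence threshold
            delta = 0
            for s in self.action_states:
                self.env.set_state(s)
                a = self.policy[s]  # evaluate the current policy
                reward = self.env.move(a)
                new_value = reward + self.discount * self.state_values[
                    self.env.current_state()]
                self.env.undo_move(a)
                delta = max(delta, abs(new_value - self.state_values[s]))
                self.state_values[s] = new_value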
Code example #2
    def run(self, nb_iter=10000, **kwargs):

        while self.cur_episode < nb_iter:
            self.episode_function(**kwargs)
            self.update_explore_threshold()
        grid_world.print_policy(self.policy, self.env)
        state_values = self.compute_state_values_from_action_values()
        grid_world.print_values(state_values, self.env)
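
run() calls two helpers that are not shown here: the exploration schedule and the conversion from action values to state values. The following is a minimal sketch under the assumption that action values live in a dict self.action_values[s][a]; the storage layout, the 1/t decay, and the place where cur_episode is advanced are guesses, not the original code.

    def update_explore_threshold(self):
        # assumed 1 / t decay of the exploration rate; also advances the
        # episode counter (the original may do this inside episode_function)
        self.cur_episode += 1
        self.explore_threshold = 1.0 / (self.cur_episode + 1)

    def compute_state_values_from_action_values(self):
        # V(s) = max_a Q(s, a), assuming Q is stored as self.action_values[s][a]
        return {s: max(q.values()) for s, q in self.action_values.items()}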
Code example #3
    def improve_policy(self, nb_iter=1000, **kwargs):

        for t in range(nb_iter):
            states, actions, rewards = self.perform_episode(**kwargs)
            self.update_state_value_function(states, actions, rewards)
            self.update_policy()
            self.update_explore_threshold(t + 1)
        grid_world.print_policy(self.policy, self.env)
        grid_world.print_values(self.state_values, self.env)
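
update_policy() is also omitted from the excerpt. A plausible sketch is a greedy one-step lookahead over the current state values, mirroring the inner loop of improve_policy in code example #1; the body below, including its use of self.discount, is an illustration rather than the original method.

    # Hypothetical greedy policy-improvement step (illustration only).
    def update_policy(self):
        for s in self.action_states:
            self.env.set_state(s)
            best_score, best_action = float('-inf'), None
            for a in self.env.actions[s]:
                reward = self.env.move(a)
                score = reward + self.discount * self.state_values[
                    self.env.current_state()]
                self.env.undo_move(a)
                if score > best_score:
                    best_score, best_action = score, a
            self.policy[s] = best_action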
Code example #4
    def perform_value_iteration(self, wind, wind_force=0.5):

        deltas = [1]
        t = 0
        while max(deltas) > self.epsilon:
            deltas = [None] * len(self.action_states)
            for i, s in enumerate(self.action_states):
                best_action_value = float('-inf')  # ensure best_action is always assigned
                self.env.set_state(s)
                old_value = self.state_values[s]
                self.state_values[s] = 0
                for a in self.env.actions[s]:
                    if wind == 'random':
                        # the intended action succeeds with probability
                        # 1 - wind_force; otherwise one of the other actions
                        # is executed uniformly at random
                        if self.policy[s] == a:
                            p_a = 1 - wind_force
                        else:
                            p_a = wind_force / (len(self.env.actions[s]) - 1)
                    elif wind == 'right':
                        if a == 'R' and a in self.env.actions[s]:
                            p_a = wind_force
                        else:
                            p_a = 0
                        if self.policy[s] == a:
                            p_a += (1 - wind_force)
                    else:
                        p_a = int(self.policy[s] == a)
                    reward = self.env.move(a)
                    next_state = self.env.current_state()
                    action_value = reward + self.discount * self.state_values[
                        next_state]
                    if action_value > best_action_value:
                        best_action_value = action_value
                        best_action = a
                    self.state_values[s] += p_a * action_value
                    self.env.undo_move(a)
                self.policy[s] = best_action
                deltas[i] = np.abs(self.state_values[s] - old_value)

            t += 1
            print("Number of iterations: ", t)
            grid_world.print_policy(self.policy, self.env)
            grid_world.print_values(self.state_values, self.env)
Code example #5


if __name__ == "__main__":

    optimizer = PolicyOptimizer(environment=grid_world.negative_grid())
    grid_world.print_policy(optimizer.policy, optimizer.env)
    grid_world.print_values(optimizer.state_values, optimizer.env)
    optimizer.perform_value_iteration(wind=None)

    # Windy Gridworld: each action has a 50% chance to fail, in which case another
    # action (chosen at random) is performed instead; see the sampling sketch after this block.
    optimizer = PolicyOptimizer(environment=grid_world.negative_grid())
    grid_world.print_policy(optimizer.policy, optimizer.env)
    grid_world.print_values(optimizer.state_values, optimizer.env)
    optimizer.perform_value_iteration(
        wind='right',
        wind_force=0.26)  # 0.25 is roughly the wind force at which the optimal policy changes
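
For intuition, the wind settings above can also be read as a rule for sampling which action is actually executed. The helper below does not appear in the original code; it is an illustrative stand-alone function that mirrors the probabilities assigned by the 'right' and 'random' branches of perform_value_iteration, assuming the intended action succeeds with probability 1 - wind_force.

import random

def sample_windy_action(intended, available, wind='right', wind_force=0.5):
    # Illustration only: draw the action that actually gets executed.
    if wind == 'right' and 'R' in available and random.random() < wind_force:
        return 'R'  # the wind pushes the agent to the right
    if wind == 'random' and random.random() < wind_force:
        others = [a for a in available if a != intended]
        if others:
            return random.choice(others)  # a different action is executed instead
    return intended  # otherwise the intended action succeeds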
Code example #6
    def update_state_value_function(self, states, rewards, t):

        # backward TD(0) sweep over one episode; the step size decays as
        # 1 / log(t + 2) with the episode index t
        deltas = [None] * (len(states) - 1)
        for i, (s, s_prime, r) in enumerate(zip(reversed(states[:-1]),
                                                reversed(states[1:]),
                                                reversed(rewards))):
            new_value = self.state_values[s] + 1/(np.log(t+2)) * \
                        (r + self.discount_factor * self.state_values[s_prime] - self.state_values[s])
            deltas[i] = np.abs(new_value - self.state_values[s])
            self.state_values[s] = new_value

    def solve_prediction_problem(self, max_iter=10000):

        state_values = {}
        for t in tqdm(range(max_iter)):
            states, actions, rewards = self.play_game()
            self.update_state_value_function(states, rewards, t)
            if t % 1000 == 0:
                state_values[t] = copy.deepcopy(self.state_values)
        return state_values


if __name__ == "__main__":

    a = Agent(grid_world.standard_grid(), policy='random', discount_factor=1.0)
    state_values = a.solve_prediction_problem()
    for k, v in state_values.items():
        print(k)
        grid_world.print_values(v, a.env)
    a = Agent(grid_world.standard_grid(), policy='win-from-start')
    state_values = a.solve_prediction_problem()
    for k, v in state_values.items():
        print(k)
        grid_world.print_values(v, a.env)
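
solve_prediction_problem depends on play_game(), which is not included in this excerpt. Below is a minimal sketch of an episode generator consistent with how its return values are consumed above: a rollout that records the visited states, the actions taken, and the rewards received. The random start, the max_steps cap, the use of the standard-library random module, and the self.action_states attribute are assumptions for illustration, not the original method (and the sketch ignores the policy argument the Agent is constructed with).

    # Hypothetical episode generator (illustration only); assumes `import random`.
    def play_game(self, max_steps=100):

        self.env.set_state(random.choice(self.action_states))  # assumed random start
        states = [self.env.current_state()]
        actions, rewards = [], []
        for _ in range(max_steps):
            s = self.env.current_state()
            if s not in self.env.actions:  # terminal state reached
                break
            a = random.choice(self.env.actions[s])  # random behaviour policy
            r = self.env.move(a)
            actions.append(a)
            rewards.append(r)
            states.append(self.env.current_state())
        return states, actions, rewards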