# Driver fragment: visualize the final MDP solution, then solve the same
# gridworld with Q-learning and visualize its result.
# NOTE(review): `gw`, `shape`, `start`, `utility_grids`, `policy_grids`,
# `plt`, `plot_convergence`, and `QLearner` are all defined elsewhere in
# the file — presumably `gw` is a GridWorldMDP; confirm against context.

# Final policy/utility from the MDP solver (last iteration slice).
gw.plot_policy(utility_grids[:, :, -1])
plot_convergence(utility_grids, policy_grids)
plt.show()

# Tabular Q-learner over the flattened grid: one state per cell, 4 actions.
ql = QLearner(num_states=(shape[0] * shape[1]),
              num_actions=4,
              learning_rate=0.8,
              discount_rate=0.9,
              random_action_prob=0.5,          # initial epsilon
              random_action_decay_rate=0.99,   # epsilon decay per step
              dyna_iterations=0)               # plain Q-learning, no Dyna planning

# Q-learning works on flat state indices, so convert the (row, col) start.
start_state = gw.grid_coordinates_to_indices(start)

iterations = 1000
flat_policies, flat_utilities = ql.learn(start_state,
                                         gw.generate_experience,
                                         iterations=iterations)

# Reshape the flat per-iteration results back into (rows, cols, iteration)
# grids so they can be plotted like the MDP solver output above.
new_shape = (gw.shape[0], gw.shape[1], iterations)
ql_utility_grids = flat_utilities.reshape(new_shape)
ql_policy_grids = flat_policies.reshape(new_shape)

print('Final result of QLearning:')
print(ql_policy_grids[:, :, -1])
print(ql_utility_grids[:, :, -1])

plt.figure()
gw.plot_policy(ql_utility_grids[:, :, -1], ql_policy_grids[:, :, -1])
plot_convergence(ql_utility_grids, ql_policy_grids)
plt.show()
def solve(self):
    """Build the gridworld, solve it with Value/Policy Iteration and
    Q-learning, and plot policies plus time/steps/reward convergence.

    Fixes over the previous version:
      * ``zip(*coords)`` was subscripted (``coords[0]``) — a TypeError on
        Python 3, where ``zip`` returns an iterator; also crashed when the
        trap/obstacle list was empty.
      * ``dtype=np.bool`` — removed in NumPy 1.24; plain ``bool`` is correct.
    """

    def mask_from_cells(cells):
        # Boolean grid with True at every (row, col) in `cells`.
        # Direct fancy indexing replaces the old sparse.coo_matrix round
        # trip and is safe for an empty coordinate list.
        mask = np.zeros(self.shape, dtype=bool)
        if cells:
            rows, cols = zip(*cells)
            mask[list(rows), list(cols)] = True
        return mask

    def pad_to_full_length(series):
        # Extend a per-iteration metric out to self.iterations[1] samples
        # by repeating its final value, so the MDP-solver curves share an
        # x-axis with the (longer) Q-learning curves.
        pad = np.empty(self.iterations[1] - self.iterations[0])
        pad.fill(series[-1])
        return np.concatenate((series, pad))

    # Reward layout: default everywhere, then goal/trap overrides;
    # obstacle cells carry zero reward.
    reward_grid = np.zeros(self.shape) + self.default_reward
    reward_grid[self.goal] = self.goal_reward
    trap_mask = mask_from_cells(self.traps)
    reward_grid[trap_mask] = self.trap_reward
    obstacle_mask = mask_from_cells(self.obstacles)
    reward_grid[obstacle_mask] = 0

    # Episodes terminate at the goal or in any trap.
    terminal_mask = np.zeros_like(reward_grid, dtype=bool)
    terminal_mask[self.goal] = True
    terminal_mask[trap_mask] = True

    gw = GridWorldMDP(start=self.start,
                      reward_grid=reward_grid,
                      obstacle_mask=obstacle_mask,
                      terminal_mask=terminal_mask,
                      action_probabilities=[
                          (-1, 0.1),  # veer one step counter-clockwise
                          (0, 0.8),   # intended direction
                          (1, 0.1),   # veer one step clockwise
                      ],
                      no_action_probability=0.0)

    size_label = str(self.shape[0]) + 'x' + str(self.shape[1])

    # Show the untouched grid (zero utilities, no policy) before solving.
    utility_grid = np.zeros(self.shape)
    gw.plot_policy(utility_grid, None, size_label + ' Gridworld')

    mdp_solvers = {
        'Value Iteration': gw.run_value_iterations,
        'Policy Iteration': gw.run_policy_iterations,
    }

    time_results = []
    steps_results = []
    reward_results = []

    for solver_name, solver_fn in mdp_solvers.items():
        print('Solving {}:'.format(solver_name))
        title = size_label + ' Gridworld - ' + solver_name
        (policy_grids, utility_grids, time_stamps,
         num_steps, total_reward) = solver_fn(iterations=self.iterations[0],
                                              discount=0.5,
                                              title=title)

        # MDP solvers run iterations[0] steps; pad every metric out to
        # iterations[1] for comparison with Q-learning.
        time_results.append(pad_to_full_length(time_stamps))
        steps_results.append(pad_to_full_length(num_steps))
        reward_results.append(pad_to_full_length(total_reward))

        gw.plot_policy(utility_grids[:, :, -1], None, title)
        plot_convergence(utility_grids, policy_grids, title)

    # Q-learning over the same grid. Hyperparameters were hand-tuned;
    # an earlier grid search over lr/epsilon/decay was removed.
    ql = QLearner(num_states=(self.shape[0] * self.shape[1]),
                  num_actions=4,
                  obstacle_mask=obstacle_mask,
                  terminal_mask=terminal_mask,
                  learning_rate=0.8,
                  discount_rate=0.975,
                  random_action_prob=0.5,
                  random_action_decay_rate=0.89,
                  dyna_iterations=0)

    print('Solving QLearning:')
    # Q-learning works on flat state indices; convert the (row, col) start.
    start_state = gw.grid_coordinates_to_indices(self.start)
    title = size_label + ' Gridworld - Q Learning'
    iterations = self.iterations[1]
    (flat_policies, flat_utilities, time_stamps,
     num_steps, total_reward) = ql.learn(
        start_state,
        gw,
        iterations=iterations,
        title=size_label + '/QL/' + title)

    # Reshape flat per-iteration results into (rows, cols, iteration) grids.
    new_shape = (gw.shape[0], gw.shape[1], iterations)
    ql_utility_grids = flat_utilities.reshape(new_shape)
    ql_policy_grids = flat_policies.reshape(new_shape)

    # Q-learning already ran the full iterations[1] steps — no padding.
    time_results.append(time_stamps)
    steps_results.append(num_steps)
    reward_results.append(total_reward)

    gw.plot_policy(ql_utility_grids[:, :, -1],
                   ql_policy_grids[:, :, -1], title)
    # The last two iteration slices are excluded from the convergence plot;
    # presumably they are partial/unstable — TODO confirm.
    plot_convergence(ql_utility_grids[:, :, 0:-2],
                     ql_policy_grids[:, :, 0:-2], title)

    plot_time(np.array(time_results), size_label + ' Gridworld - Time')
    plot_num_steps(np.array(steps_results),
                   size_label + ' Gridworld - # Steps')
    plot_reward(np.array(reward_results), size_label + ' Gridworld - Reward')