Example #1
    terminal_mask = np.zeros_like(reward_grid, dtype=bool)  # np.bool is removed in modern NumPy; use the builtin bool
    terminal_mask[goal_z, goal_y, goal_x] = True
    # terminal_mask[trap] = True

    obstacle_mask = np.zeros_like(reward_grid, dtype=bool)
    obstacle_mask[obstacle_z, obstacle_y, obstacle_x] = True

    # pdb.set_trace()
    gw = GridWorldMDP(
        start=start,
        reward_grid=reward_grid,
        obstacle_mask=obstacle_mask,
        terminal_mask=terminal_mask,
        action_probabilities=[
            # up, right, down, left, float, sink, stay
            (0, .8),
            (1, .8),
            (2, .6),
            (3, .6),
            (4, .9),
            (5, .9),
            (6, .2)
        ],
        no_action_probability=0.0)

    # -----Random Policy -----#
    random = random_sovler()

    # # -----For Value Iteration -----#
    # mdp_solvers = {'Value Iteration': gw.run_value_iterations}
    # discount_ =1
    # for solver_name, solver_fn in mdp_solvers.items():
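The fragment stops right after the random-policy baseline is created, and `random_sovler()` itself is not defined in it. As a rough, self-contained illustration of what such a baseline might compute on this 3-D grid (seven moves: up, right, down, left, float, sink, stay), the sketch below estimates the discounted return of a uniform random policy by Monte Carlo rollouts. All names and transition rules here are assumptions made for illustration, not the GridWorldMDP API.

import numpy as np

def random_policy_return(reward_grid, obstacle_mask, terminal_mask, start,
                         discount=0.9, max_steps=200, episodes=100, seed=None):
    """Estimate the discounted return of a uniform random policy by rollouts."""
    rng = np.random.default_rng(seed)
    # (dz, dy, dx) offsets for: up, right, down, left, float, sink, stay
    moves = [(0, -1, 0), (0, 0, 1), (0, 1, 0), (0, 0, -1),
             (1, 0, 0), (-1, 0, 0), (0, 0, 0)]
    returns = []
    for _ in range(episodes):
        z, y, x = start
        total, disc = 0.0, 1.0
        for _ in range(max_steps):
            dz, dy, dx = moves[rng.integers(len(moves))]
            nz, ny, nx = z + dz, y + dy, x + dx
            # Assumed convention: bounce back if the move leaves the grid or
            # hits an obstacle, then collect the reward of the cell landed in.
            if (0 <= nz < reward_grid.shape[0]
                    and 0 <= ny < reward_grid.shape[1]
                    and 0 <= nx < reward_grid.shape[2]
                    and not obstacle_mask[nz, ny, nx]):
                z, y, x = nz, ny, nx
            total += disc * reward_grid[z, y, x]
            disc *= discount
            if terminal_mask[z, y, x]:
                break
        returns.append(total)
    return float(np.mean(returns))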
Example #2
    reward_grid[goal] = goal_reward
    reward_grid[trap] = trap_reward
    reward_grid[obstacle] = 0

    terminal_mask = np.zeros_like(reward_grid, dtype=bool)
    terminal_mask[goal] = True
    terminal_mask[trap] = True

    obstacle_mask = np.zeros_like(reward_grid, dtype=bool)
    obstacle_mask[1, 1] = True

    gw = GridWorldMDP(reward_grid=reward_grid,
                      obstacle_mask=obstacle_mask,
                      terminal_mask=terminal_mask,
                      action_probabilities=[
                          (-1, 0.1),
                          (0, 0.8),
                          (1, 0.1),
                      ],
                      no_action_probability=0.0)

    mdp_solvers = {'Value Iteration': gw.run_value_iterations,
                   'Policy Iteration': gw.run_policy_iterations}

    for solver_name, solver_fn in mdp_solvers.items():
        print('Final result of {}:'.format(solver_name))
        policy_grids, utility_grids = solver_fn(iterations=25, discount=0.5)
        print(policy_grids[:, :, -1])
        print(utility_grids[:, :, -1])
        plt.figure()
        gw.plot_policy(utility_grids[:, :, -1])
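Example #2 asks for 25 sweeps of value iteration and policy iteration at discount 0.5, but the GridWorldMDP internals are not shown. The sketch below reimplements the value-iteration backup that `run_value_iterations` presumably performs, reading the (-1, 0.1), (0, 0.8), (1, 0.1) pairs as "veer left / go straight / veer right" relative to the intended direction; that reading, and the bounce-off-walls transition rule, are assumptions about the library's conventions.

import numpy as np

def value_iteration(reward_grid, obstacle_mask, terminal_mask,
                    action_probabilities, iterations=25, discount=0.5):
    # Actions indexed 0..3 = up, right, down, left, as (dy, dx) offsets.
    offsets = [(-1, 0), (0, 1), (1, 0), (0, -1)]
    h, w = reward_grid.shape
    utility = np.zeros((h, w))

    def next_state(y, x, a):
        dy, dx = offsets[a]
        ny, nx = y + dy, x + dx
        # Assumed convention: moves into walls or obstacles leave the agent in place.
        if not (0 <= ny < h and 0 <= nx < w) or obstacle_mask[ny, nx]:
            return y, x
        return ny, nx

    for _ in range(iterations):
        new_utility = np.empty_like(utility)
        for y in range(h):
            for x in range(w):
                if terminal_mask[y, x] or obstacle_mask[y, x]:
                    new_utility[y, x] = reward_grid[y, x]
                    continue
                # Bellman backup: best action under the noisy action model.
                best = -np.inf
                for a in range(4):
                    q = sum(p * utility[next_state(y, x, (a + rel) % 4)]
                            for rel, p in action_probabilities)
                    best = max(best, q)
                new_utility[y, x] = reward_grid[y, x] + discount * best
        utility = new_utility
    return utility

Running this on Example #2's reward_grid, obstacle_mask, and terminal_mask with the same action probabilities should give a utility surface comparable to utility_grids[:, :, -1], up to the library's exact terminal handling and tie-breaking.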
Example #3
    def solve(self):

        reward_grid = np.zeros(self.shape) + self.default_reward
        reward_grid[self.goal] = self.goal_reward

        coords = list(zip(*self.traps))  # list() so coords can be indexed under Python 3
        trap_mask = sparse.coo_matrix((np.ones(len(coords[0])), coords),
                                      shape=self.shape,
                                      dtype=bool).toarray()
        reward_grid[trap_mask] = self.trap_reward

        coords = list(zip(*self.obstacles))
        obstacle_mask = sparse.coo_matrix((np.ones(len(coords[0])), coords),
                                          shape=self.shape,
                                          dtype=bool).toarray()
        reward_grid[obstacle_mask] = 0

        terminal_mask = np.zeros_like(reward_grid, dtype=bool)
        terminal_mask[self.goal] = True
        terminal_mask[trap_mask] = True

        gw = GridWorldMDP(start=self.start,
                          reward_grid=reward_grid,
                          obstacle_mask=obstacle_mask,
                          terminal_mask=terminal_mask,
                          action_probabilities=[
                              (-1, 0.1),
                              (0, 0.8),
                              (1, 0.1),
                          ],
                          no_action_probability=0.0)

        utility_grid = np.zeros(self.shape)
        gw.plot_policy(
            utility_grid, None,
            str(self.shape[0]) + 'x' + str(self.shape[1]) + ' Gridworld')

        mdp_solvers = {
            'Value Iteration': gw.run_value_iterations,
            'Policy Iteration': gw.run_policy_iterations
        }

        time_results = []
        steps_results = []
        reward_results = []

        for solver_name, solver_fn in mdp_solvers.items():
            print('Solving {}:'.format(solver_name))

            title = str(self.shape[0]) + 'x' + str(
                self.shape[1]) + ' Gridworld - ' + solver_name
            policy_grids, utility_grids, time_stamps, num_steps, total_reward = solver_fn(
                iterations=self.iterations[0], discount=0.5, title=title)

            a = np.empty(self.iterations[1] - self.iterations[0])

            a.fill(time_stamps[-1])
            time_stamps = np.concatenate((time_stamps, a))
            time_results.append(time_stamps)

            a.fill(num_steps[-1])
            num_steps = np.concatenate((num_steps, a))
            steps_results.append(num_steps)

            a.fill(total_reward[-1])
            total_reward = np.concatenate((total_reward, a))
            reward_results.append(total_reward)

            #print(policy_grids[:, :, -1])
            #print(utility_grids[:, :, -1])

            gw.plot_policy(utility_grids[:, :, -1], None, title)
            plot_convergence(utility_grids, policy_grids, title)
        """for lr in [0.7, 0.8, 0.9]:
          for ra in [0.2, 0.5, 0.8]:
            for e in [.79, .89, .99]:"""

        ql = QLearner(num_states=(self.shape[0] * self.shape[1]),
                      num_actions=4,
                      obstacle_mask=obstacle_mask,
                      terminal_mask=terminal_mask,
                      learning_rate=0.8,
                      discount_rate=0.975,
                      random_action_prob=0.5,
                      random_action_decay_rate=0.89,
                      dyna_iterations=0)

        print('Solving QLearning:')
        start_state = gw.grid_coordinates_to_indices(self.start)

        #title = str(self.shape[0]) + 'x' + str(self.shape[1]) + ' Gridworld - Q Learning - ' + str(lr).replace('.', '') + str(ra).replace('.', '') + str(e).replace('.', '')

        title = str(self.shape[0]) + 'x' + str(
            self.shape[1]) + ' Gridworld - Q Learning'

        iterations = self.iterations[1]
        flat_policies, flat_utilities, time_stamps, num_steps, total_reward = ql.learn(
            start_state,
            gw,
            iterations=iterations,
            title=str(self.shape[0]) + 'x' + str(self.shape[1]) + '/QL/' +
            title)

        new_shape = (gw.shape[0], gw.shape[1], iterations)
        ql_utility_grids = flat_utilities.reshape(new_shape)
        ql_policy_grids = flat_policies.reshape(new_shape)

        time_results.append(time_stamps)
        steps_results.append(num_steps)
        reward_results.append(total_reward)

        #print(ql_policy_grids[:, :, -1])
        #print(ql_utility_grids[:, :, -1])

        gw.plot_policy(ql_utility_grids[:, :, -1], ql_policy_grids[:, :, -1],
                       title)
        plot_convergence(ql_utility_grids[:, :, 0:-2],
                         ql_policy_grids[:, :, 0:-2], title)

        plot_time(
            np.array(time_results),
            str(self.shape[0]) + 'x' + str(self.shape[1]) +
            ' Gridworld - Time')
        plot_num_steps(
            np.array(steps_results),
            str(self.shape[0]) + 'x' + str(self.shape[1]) +
            ' Gridworld - # Steps')
        plot_reward(
            np.array(reward_results),
            str(self.shape[0]) + 'x' + str(self.shape[1]) +
            ' Gridworld - Reward')
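The QLearner implementation is not included in the snippet either. For reference, here is a minimal tabular Q-learning loop using the same hyperparameters as the call above (learning_rate=0.8, discount_rate=0.975, random_action_prob=0.5 decayed by 0.89 per episode). The `step(state, action) -> (next_state, reward, done)` callback is a hypothetical stand-in for the environment interface, so this illustrates only the update rule, not the class's actual `learn` method.

import numpy as np

def q_learn(num_states, num_actions, step, start_state, episodes,
            learning_rate=0.8, discount_rate=0.975,
            random_action_prob=0.5, random_action_decay_rate=0.89,
            max_steps=1000, seed=None):
    """Tabular Q-learning with epsilon-greedy exploration and per-episode decay."""
    rng = np.random.default_rng(seed)
    q = np.zeros((num_states, num_actions))
    eps = random_action_prob
    for _ in range(episodes):
        s = start_state
        for _ in range(max_steps):
            # Epsilon-greedy action selection.
            if rng.random() < eps:
                a = int(rng.integers(num_actions))
            else:
                a = int(np.argmax(q[s]))
            s_next, reward, done = step(s, a)
            # Standard Q-learning target: r + gamma * max_a' Q(s', a').
            target = reward if done else reward + discount_rate * np.max(q[s_next])
            q[s, a] += learning_rate * (target - q[s, a])
            s = s_next
            if done:
                break
        eps *= random_action_decay_rate  # decay exploration after each episode
    return q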