Example #1
    if first:
      first = False
    else:
      states_actions_returns.append((s, a, G))
    G = r + GAMMA*G
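    # (the return is accumulated backwards: G_t = r_{t+1} + GAMMA * G_{t+1})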
  states_actions_returns.reverse() # we want it to be in the order the states were visited
  return states_actions_returns


if __name__ == '__main__':
  # use the standard grid again (0 for every step) so that we can compare
  # to iterative policy evaluation
  # grid = standard_grid()
  # try the negative grid too, to see if agent will learn to go past the "bad spot"
  # in order to minimize number of steps
  grid = negative_grid(step_cost=-0.1)

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  # state -> action
  # initialize a random policy
  policy = {}
  for s in grid.actions.keys():
    policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

  # initialize Q(s,a) and returns
  Q = {}
  returns = {} # dictionary of (s, a) -> list of returns we've received
  states = grid.all_states()
Example #2
def random_action(a, eps=0.1):
    # epsilon-greedy: return the given action with probability 1 - eps,
    # otherwise return a uniformly random action
    p = np.random.random()
    if p > eps:
        return a
    else:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)
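
# typical usage during an episode (a sketch, not in the original snippet):
#   a = random_action(policy[s], eps=0.1)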


if __name__ == '__main__':
    grid_type = input('\nchoose grid type (standard/negative):\n').strip()

    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # display rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)

    # initialize the action-value function and the counts for s and (s, a) pairs:
    Q = {}
    N = {}
    alpha_divisor = {}
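    # (N presumably counts visits to s and (s, a), and alpha_divisor is
    #  presumably used to decay the learning rate for frequently-visited pairs)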
    for s in grid.all_states:
        Q[s] = {}
Example #3
import numpy as np
import matplotlib.pyplot as plt 
from gridworld import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy


SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'R', 'L')

if __name__ == '__main__':

	grid = negative_grid()

	print('rewards:')
	print_values(grid.rewards, grid)

	policy = {}
	for s in grid.actions.keys():
		policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

	# print('Initial policy:')
	# print_policy(policy, grid)

	V = {}
	states = grid.all_states()
	for s in states:
		if s in grid.actions:
			V[s] = np.random.random()
		else:
			V[s] = 0
Example #4
import numpy as np
from gridworld import standard_grid, negative_grid
from iter_policy_eval_moving_penalty import print_values, print_policy

SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# next state and reward will now have some randomness
# you'll go in your desired direction with probability 0.5
# you'll go in a random direction a' != a with probability 0.5/3
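
# a minimal sketch (not part of the original example) of how such a "windy"
# transition could be sampled; `sample_actual_action` is a hypothetical helper,
# not something provided by the gridworld module:
def sample_actual_action(a):
  # execute the intended action with probability 0.5,
  # otherwise pick one of the other three directions uniformly at random
  if np.random.random() < 0.5:
    return a
  return np.random.choice([other for other in ALL_POSSIBLE_ACTIONS if other != a])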

if __name__ == '__main__':
  # this grid gives you a negative reward (step_cost) for every non-terminal step
  # we want to see if this will encourage finding a shorter path to the goal
  grid = negative_grid(step_cost=-1.0)
  # grid = negative_grid(step_cost=-0.1)
  # grid = standard_grid()

  # print rewards
  print("rewards:")
  print_values(grid.rewards, grid)

  # state -> action
  # we'll randomly choose an action and update as we learn
  policy = {}
  for s in grid.actions.keys():
    policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

  # initial policy
  print("initial policy:")
def main(grid_type='negative'):
    # NOTE: every p(s',r|s,a) is deterministic (1 or 0)
    if grid_type == 'negative':
        # get the grid:
        grid = negative_grid()

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)  # prints any dict with
    # a tuple of numbers as the key
    # and a number as the value

    # STEP 1: initialize V(s) and randomly initialize the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        V[s] = 0
        # or perform a random initialization:
        # if s in grid.actions: # if not a terminal state
        # 	V[s] = np.random.random()
        # else:
        # 	# terminal
        # 	V[s] = 0
    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement:
    # repeat until convergence:
    i = 0
    while True:

        # STEP 2A: iterative policy evaluation
        while True:
            # NOTE: all of the actions, next states and rewards
            #       are considered deterministic

            max_change = 0
            for s in states:
                old_v = V[s]  # save the old value of the state

                # check if not a terminal state:
                if s in grid.actions:
                    grid.set_state(s)

                    # take an action according to the policy and get the reward:
                    a = policy[s]
                    r = grid.move(a)

                    # the "look-ahead" - get the value of the next state, s_prime:
                    s_prime = grid.current_state
                    # s_prime is needed in order to calculate
                    # the value of the current state - the Bellman equation:
                    V[s] = r + GAMMA * V[s_prime]
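                    # (with a deterministic policy and deterministic transitions,
                    #  the Bellman equation reduces to V(s) = r + GAMMA * V(s'))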

                    # update max_change:
                    max_change = max(max_change, np.abs(V[s] - old_v))

            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values;
        # we choose the action that results in the max value of the state.
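        # in other words, the greedy improvement:
        # pi'(s) = argmax_a [ r(s, a) + GAMMA * V(s') ]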
        policy_improved = False
        for s in states:

            # check if not a terminal-state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!

                # save the old policy:
                old_a = policy[s]

                max_v = float('-inf')  # nothing can be worse than this

                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    grid.set_state(s)

                    # take an action, receive your keto-chocolate bar:
                    r = grid.move(a)

                    s_prime = grid.current_state
                    new_v = r + GAMMA * V[s_prime]

                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                        # change the policy:
                        policy[s] = better_a

                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done!
        # and since the policy's not changed, the values remain the same:
        if not policy_improved:
            break

        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
def main(grid_type='negative'):
    # NOTE: the state-transitions are now stochastic, i.e. p(s',r|s,a) lies in [0,1],
    #       but the policy is deterministic!
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)  # prints any dict with
    # a tuple of numbers as the key
    # and a number as the value

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        # V[s] = 0
        # or perform a random initialization:
        if s in grid.actions:  # if not a terminal state
            V[s] = np.random.random()
        else:
            # terminal
            V[s] = 0
    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement
    #         with random state-transitions:
    # repeat until convergence:
    i = 0
    while True:

        # STEP 2A: iterative policy evaluation
        while True:
            max_change = 0

            for s in states:
                old_v = V[s]  # save the old value of the state
                new_v = 0

                # check if not a terminal state:
                if s in grid.actions:

                    for a in ALL_POSSIBLE_ACTIONS:

                        grid.set_state(s)

                        # possible_actions = list(grid.actions[s])
                        # print('\npossible actions from the state (%d, %d):' % grid.current_state)
                        # print(possible_actions)

                        if a == policy[s]:
                            # the action prescribed by the policy is actually executed with probability P_A:
                            p_s_prime_and_r = P_A

                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)
                            # same as: p(s',r|s,!policy[s])

                        # move in the chosen direction:
                        r = grid.move(a)

                        # the "look-ahead" - get the value of the next state, s_prime:
                        s_prime = grid.current_state
                        # s_prime is needed in order to calculate
                        # the value of the current state - the Bellman equation:
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])
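                        # (this accumulates the expectation:
                        #  V(s) = sum over executed actions a of
                        #         p(a | intended action policy[s]) * (r + GAMMA * V(s')))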

                V[s] = new_v

                # update max_change:
                max_change = max(max_change, np.abs(V[s] - old_v))

            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values, but now we also
        # take into account that our state-transitions are random!!!
        # we then choose the action that results in the max value of the state.
        policy_improved = False
        for s in states:

            # check if not a terminal-state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!

                # save the old policy:
                old_a = policy[s]

                max_v = float('-inf')  # nothing can be worse than this

                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    new_v = 0  # we'll accumulate the expected value here

                    for another_a in ALL_POSSIBLE_ACTIONS:
                        grid.set_state(s)

                        # since the state-transitions are random,
                        # we check if the action is desired:
                        if another_a == a:
                            # the intended action a is actually executed with probability P_A:
                            p_s_prime_and_r = P_A

                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)

                        # take an action, receive your keto-chocolate bar:
                        r = grid.move(another_a)

                        s_prime = grid.current_state
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])

                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                        # change the policy:
                        policy[s] = better_a

                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done!
        # and since the policy's not changed, the values remain the same:
        if not policy_improved:
            break

        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
def main(grid_type='negative'):
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # display rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)

    states = grid.all_states

    # STEP 1: randomly initialize the value function, V(s):
    V = {}  # the values
    for s in states:
        # as an option, initialize to 0:
        # V[s] = 0

        # check if not a terminal state:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    # STEP 2: value iteration
    while True:
        max_change = 0

        for s in states:
            old_v = V[s]

            # if we're not in a terminal state:
            if s in grid.actions:
                # choose an action that results in the maximum value
                # for this state:
                best_v = float('-inf')
                # best_a = np.random.choice(ALL_POSSIBLE_ACTIONS)

                for a in ALL_POSSIBLE_ACTIONS:
                    # arrive in the state:
                    grid.set_state(s)

                    # take the action and receive the reward:
                    r = grid.move(a)

                    # calculate the Bellman equation:
                    v = r + GAMMA * V[grid.current_state]
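                    # (the Bellman optimality backup: V(s) <- max_a [ r + GAMMA * V(s') ])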

                    if v > best_v:
                        best_v = v
                        # p[s] = a      # we'll do it in another loop later

                # update the value of this state:
                V[s] = best_v

                # update the maximum change:
                max_change = max(max_change, np.abs(old_v - V[s]))

        # check if converged:
        if max_change < THRESHOLD:
            break

    # STEP 3: take our optimal value function
    #         and find our optimal policy
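    #         (acting greedily with respect to the optimal value function
    #          yields an optimal policy)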
    p = {}  # the policy
    for s in states:
        best_a = None
        best_v = float('-inf')

        # if not a terminal state:
        if s in grid.actions:
            # find the best action:
            for a in ALL_POSSIBLE_ACTIONS:
                grid.set_state(s)
                r = grid.move(a)
                v = r + GAMMA * V[grid.current_state]

                if v > best_v:
                    best_v = v
                    best_a = a

            p[s] = best_a

    # optimal values:
    print('\noptimal values:')
    print_values(V, grid)

    # optimal policy:
    print('\noptimal policy:')
    print_policy(p, grid)
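
# hypothetical entry point (not part of the original snippet):
# if __name__ == '__main__':
#     main(grid_type='negative')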