Example #1
def play_game(grid, policy):
    # returns a list of states and corresponding returns
    print("\n Playing Game with Policy: ")
    print_policy(policy, grid)
    # start every episode from the fixed state (2, 0);
    # exploration comes from random_action() below - our deterministic policy
    # on its own would never reach certain states whose values we want to measure
    s = (2, 0)
    grid.set_state(s)

    # play the game
    print("\nStarting State for the Game is: {}".format(s))

    # each tuple is (s(t), r(t)),
    # where r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_and_rewards = [(s, 0)]
    num_steps = 0

    while not grid.game_over():
        # play the game
        print("\nState at move {} : {}".format(num_steps + 1, s))
        # play until the game finishes
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        print("Action at move {} : {}".format(num_steps + 1, a))
        num_steps += 1
        s = grid.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards
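# Both play_game variants in this example (above and below) call a
# random_action() helper that is not shown here. A minimal sketch, assuming an
# epsilon-soft choice over an ALL_POSSIBLE_ACTIONS tuple (the action set and
# the 0.5 split are assumptions, not taken from the snippet):
import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')   # assumed grid-world action set

def random_action(a, eps=0.5):
    # with probability 1 - eps follow the policy's action a,
    # otherwise pick one of the remaining actions uniformly at random
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice([other for other in ALL_POSSIBLE_ACTIONS if other != a])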
def play_game(grid, policy):
    # returns a list of states and corresponding returns
    print("\n Playing Game with Policy: ")
    print_policy(policy, grid)
    # start every episode from the fixed state (2, 0);
    # exploration comes from random_action() below - our deterministic policy
    # on its own would never reach certain states whose values we want to measure
    s = (2, 0)
    grid.set_state(s)

    # play the game
    print("\nStarting State for the Game is: {}".format(s))
    a = random_action(policy[s])
    print("Starting Action for the Game is {}".format(a))

    # each triple is s(t), a(t), r(t)
    # but r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_actions_rewards = [(s, a, 0)]
    num_steps = 0

    while True:
        # play the game
        print("\nState at move {} : {}".format(num_steps+1, s))
        print("Action at move {} : {}".format(num_steps+1, a))
        # play until the game finishes
        r = grid.move(a)
        num_steps += 1
        s = grid.current_state()
        if grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            a = random_action(policy[s])
            states_actions_rewards.append((s, a, r))


    # calculate the returns by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        # the value of the terminal state is 0 by definition
        # we should ignore the first state we encounter
        # and ignore the last G, which is meaningless since it doesn't correspond to any move
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()  # we want it to be in order of state visited
    print("\nState Action Return (G): {}".format(states_actions_returns))
    return states_actions_returns
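# The backward pass above implements the return recursion
# G(t) = r(t+1) + GAMMA * G(t+1). A standalone illustration with made-up
# rewards (the 0.9 discount is an assumption for this illustration only):
demo_gamma = 0.9
demo_rewards = [0, 0, 1]    # rewards received after each of three moves
demo_G = 0
demo_returns = []
for demo_r in reversed(demo_rewards):
    demo_G = demo_r + demo_gamma * demo_G
    demo_returns.append(demo_G)
demo_returns.reverse()
print(demo_returns)         # approximately [0.81, 0.9, 1.0]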
        # ... then initialise a return list for the state
        if s in grid.actions:
            returns[s] = []
        else:
            # terminal state or state we can't otherwise get to
            V[s] = 0

    # repeat
    for t in range(100):
        print('\n')
        print(t)
        # generate an episode using pi
        states_and_returns = play_game(grid, policy)
        # get a list of states and their associated returns, G (the expected future reward from each state)
        seen_states = set()  # get unique states
        for s, G in states_and_returns:
            # for all states and expected future rewards,
            # check if we have already seen s
            # called "first-visit" MC policy evaluation
            if s not in seen_states:
                returns[s].append(G)  # add the G to the returns for the chosen state
                print("returns:{}".format(returns))
                V[s] = np.mean(returns[s])  # update the mean value for the state
                print("v(s):{}".format(V))
                seen_states.add(s)

    print("values:")
    print_values(V, grid)
    print("policy:")
    print_policy(policy, grid)
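# Standalone illustration of the "first-visit" filter used above: only the
# first occurrence of each state in an episode contributes a return
# (the episode below is made up):
episode = [((2, 0), 0.81), ((1, 0), 0.9), ((2, 0), 1.0)]   # (state, G) pairs
seen = set()
first_visit_returns = []
for s, G in episode:
    if s not in seen:
        first_visit_returns.append((s, G))
        seen.add(s)
print(first_visit_returns)   # [((2, 0), 0.81), ((1, 0), 0.9)]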
        # we will update Q(s,a) AS we experience the episode
        model.theta += alpha*(r + GAMMA*model.predict(s2, a2) - model.predict(s, a))*model.grad(s, a)
        
        # next state becomes current state
        s = s2
        a = a2

      biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
    deltas.append(biggest_change)

  plt.plot(deltas)
  plt.show()

  # determine the policy from Q*
  # find V* from Q*
  policy = {}
  V = {}
  Q = {}
  for s in grid.actions.keys():
    Qs = getQs(model, s)
    Q[s] = Qs
    a, max_q = max_dict(Qs)
    policy[s] = a
    V[s] = max_q

  print "values:"
  print_values(V, grid)
  print "policy:"
  print_policy(policy, grid)
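# The loop above relies on getQs() and max_dict() helpers that are not shown
# in this fragment; plausible sketches for illustration, assuming the
# ALL_POSSIBLE_ACTIONS tuple used elsewhere in these examples (the real
# implementations may differ):
def getQs(model, s):
  # approximate Q(s, a) for every action available from state s
  return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}

def max_dict(d):
  # return the (key, value) pair with the largest value
  best_key = max(d, key=d.get)
  return best_key, d[best_key]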
def main(grid_type='negative'):
    # NOTE: every p(s',r|s,a) is deterministic (1 or 0)
    if grid_type == 'negative':
        # get the grid:
        grid = negative_grid()

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)  # prints any dict with
    # a tuple of numbers as the key
    # and a number as the value

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        V[s] = 0
        # or perform a random initialization:
        # if s in grid.actions: # if not a terminal state
        # 	V[s] = np.random.random()
        # else:
        # 	# terminal
        # 	V[s] = 0
    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement:
    # repeat until convergence:
    i = 0
    while True:

        # STEP 2A: iterative policy evaluation
        while True:
            # NOTE: all of the actions, next states and rewards
            #       are considered deterministic

            max_change = 0
            for s in states:
                old_v = V[s]  # save the old value of the state

                # check if not a terminal state:
                if s in grid.actions:
                    grid.set_state(s)

                    # take an action according to the policy and get the reward:
                    a = policy[s]
                    r = grid.move(a)

                    # the "look-ahead" - get the value of the next state, s_prime:
                    s_prime = grid.current_state
                    # s_prime is needed in order to calculate
                    # the value of the current state - the Bellman equation:
                    V[s] = r + GAMMA * V[s_prime]

                    # update max_change:
                    max_change = max(max_change, np.abs(V[s] - old_v))

            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values;
        # we choose the action that results in the max value of the state.
        policy_improved = False
        for s in states:

            # check if not a terminal-state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!

                # save the old policy:
                old_a = policy[s]

                max_v = float('-inf')  # nothing can be worse than -inf

                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    grid.set_state(s)

                    # take an action, receive your keto-chocolate bar:
                    r = grid.move(a)

                    s_prime = grid.current_state
                    new_v = r + GAMMA * V[s_prime]

                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                        # change the policy:
                        policy[s] = better_a

                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done!
        # and since the policy's not changed, the values remain the same:
        if not policy_improved:
            break

        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
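# A minimal way to run the routine above, assuming the grid helpers
# (standard_grid, negative_grid, print_values, print_policy) and the
# GAMMA / THRESHOLD / ALL_POSSIBLE_ACTIONS constants it references are
# defined in the same module:
if __name__ == '__main__':
    main(grid_type='negative')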
Example #6
    states = g.all_states()
    V = {}
    policy = {}
    for s in states:
        if s in g.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0

    #print(g.actions.keys())
    #Initialize random policy
    for s in g.actions.keys():
        policy[s] = np.random.choice(all_actions)

    print_policy(policy, g)

    Iter = 0
    while True:
        #Iterative Policy Evaluation
        while Iter < 1000:
            Iter += 1
            print("Iteration %d" % Iter)
            biggest_change = 0
            #Back up the old value of each state before updating it
            for s in states:
                old_v = V[s]
                #V[s] is only updated if s is not a terminal state
                if s in policy:
                    a = policy[s]
                    g.set_state(s)
def main(grid_type='negative'):
    # NOTE: the state-transitions p(s',r|s,a) are now stochastic (probabilities in [0,1]),
    #       but the policy is still deterministic!
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # print the rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)  # prints any dict with
    # a tuple of numbers as the key
    # and a number as the value

    # STEP 1: randomly initialize V(s) and the policy, pi(s):
    V = {}
    states = grid.all_states
    for s in states:
        # we can simply initialize all to zero:
        # V[s] = 0
        # or perform a random initialization:
        if s in grid.actions:  # if not a terminal state
            V[s] = np.random.random()
        else:
            # terminal
            V[s] = 0
    print('\ninitial values:')
    print_values(V, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print('\ninitial policy:')
    print_policy(policy, grid)

    # STEP 2: alternate between policy evaluation and policy improvement
    #         with random state-transitions:
    # repeat until convergence:
    i = 0
    while True:

        # STEP 2A: iterative policy evaluation
        while True:
            max_change = 0

            for s in states:
                old_v = V[s]  # save the old value of the state
                new_v = 0

                # check if not a terminal state:
                if s in grid.actions:

                    for a in ALL_POSSIBLE_ACTIONS:

                        grid.set_state(s)

                        # possible_actions = list(grid.actions[s])
                        # print('\npossible actions from the state (%d, %d):' % grid.current_state)
                        # print(possible_actions)

                        if a == policy[s]:
                            # take this action with the probability p(a|s)=P_A:
                            p_s_prime_and_r = P_A

                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)
                            # same as: p(s',r|s,!policy[s])

                        # move in the chosen direction:
                        r = grid.move(a)

                        # the "look-ahead" - get the value of the next state, s_prime:
                        s_prime = grid.current_state
                        # s_prime is needed in order to calculate
                        # the value of the current state - the Bellman equation:
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])

                V[s] = new_v

                # update max_change:
                max_change = max(max_change, np.abs(V[s] - old_v))

            # check if converged:
            if max_change < THRESHOLD:
                break

        # STEP 2B: policy improvement
        # for each state we take an action according to the policy
        # and check whether there is a better action - take all possible
        # actions from that state and calculate the values, but now we also
        # take into account that our state-transitions are random!!!
        # we then choose the action that results in the max value of the state.
        policy_improved = False
        for s in states:

            # check if not a terminal-state:
            if s in grid.actions:
                grid.set_state(s)  # yep, don't forget to set the position!

                # save the old policy:
                old_a = policy[s]

                max_v = float('-inf')  # nothing can be worse than -inf

                # choose the best action among all the possible ones:
                for a in ALL_POSSIBLE_ACTIONS:
                    # print('reached here!')
                    new_v = 0  # accumulate the expected value over all possible moves

                    for another_a in ALL_POSSIBLE_ACTIONS:
                        grid.set_state(s)

                        # since the state-transitions are random,
                        # we check if the action is desired:
                        if another_a == a:
                            # take this action with the probability p(a|s)=P_A (0.5 here):
                            p_s_prime_and_r = P_A

                        else:
                            p_s_prime_and_r = (1 - P_A) / (
                                len(ALL_POSSIBLE_ACTIONS) - 1)

                        # take an action, receive your keto-chocolate bar:
                        r = grid.move(another_a)

                        s_prime = grid.current_state
                        new_v += p_s_prime_and_r * (r + GAMMA * V[s_prime])

                    # compare the values:
                    if new_v > max_v:
                        max_v = new_v
                        better_a = a
                        # change the policy:
                        policy[s] = better_a

                if old_a != better_a:
                    # print('policy_improved')
                    policy_improved = True

        # if policy has changed, we need to recalculate the values of all states -
        # get back to STEP 2A;
        # else - we're done!
        # and since the policy's not changed, the values remain the same:
        if not policy_improved:
            break

        i += 1

    print('\niterations to converge:', i)

    # print the values:
    print('\nvalues:')
    print_values(V, grid)

    # print the policy:
    print('\nthe improved policy:')
    print_policy(policy, grid)
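# For concreteness: with the four grid-world actions and P_A = 0.5 (the value
# the inline comment above uses), the expected-value accumulation weights the
# intended action by 0.5 and each of the other three actions equally:
P_A = 0.5                                      # assumed value of P_A
p_other = (1 - P_A) / 3                        # ~0.1667 for each other action
assert abs(P_A + 3 * p_other - 1.0) < 1e-12    # the weights form a distribution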
Example #8
    # display rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)

    states = grid.all_states

    # initialize value function and number of visits per state:
    V = {}
    N = {}
    for s in states:
        V[s] = 0
        N[s] = 0

    ############################# First-Visit Monte Carlo: #############################
    for i in range(10000):
        states_and_returns = play_game(grid, POLICY)
        visited_s = set()
        for s, G in states_and_returns:
            if s not in visited_s:
                N[s] += 1
                V[s] = (1 - 1 / N[s]) * V[s] + (1 / N[s]) * G
                visited_s.add(s)

    # print values:
    print('\nvalues:')
    print_values(V, grid)

    # print policy:
    print('\npolicy:')
    print_policy(POLICY, grid)
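# Quick numeric check that the running-mean update above reproduces the plain
# sample mean (the returns below are made up for illustration):
import numpy as np

Gs = [1.0, 0.5, 0.25]
V_s, N_s = 0.0, 0
for G in Gs:
    N_s += 1
    V_s = (1 - 1 / N_s) * V_s + (1 / N_s) * G
print(V_s, np.mean(Gs))   # both print 0.5833...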
            v = V[s]
            max_val = float("-inf")
            for action in grid.actions[s]:
                grid.set_state(s)
                r = grid.move(action)
                val = r + gamma * V[grid.current_state()]
                if val > max_val:
                    max_val = val
            V[s] = max_val
            delta = max(delta, abs(v - V[s]))

        if delta < theta:
            print_values(V, grid)
            break

    # Output a deterministic policy (which is optimal)
    pi = {}
    for s in grid.actions:
        max_val = float("-inf")
        for action in grid.actions[s]:
            grid.set_state(s)
            r = grid.move(action)
            val = r + gamma * V[grid.current_state()]
            if val > max_val:
                max_val = val
                max_action = action

        pi[s] = max_action

    print_policy(pi, grid)
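# In equation form, the sweep above performs the value-iteration backup
#     V(s) <- max_a [ r(s, a) + gamma * V(s') ]
# and the final loop extracts the greedy (optimal) policy
#     pi(s) = argmax_a [ r(s, a) + gamma * V(s') ]
# where s' is the state reached by taking action a from s
# (transitions are deterministic in this grid).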
Example #10
    plt.plot(reward_per_episode)
    plt.title("Reward per episode")
    plt.show()

    # obtain V* and pi*
    V = {}
    greedy_policy = {}
    states = grid.all_states()
    for s in states:
        if s in grid.actions:
            values = model.predict_all_actions(s)
            V[s] = np.max(values)
            greedy_policy[s] = ALL_POSSIBLE_ACTIONS[np.argmax(values)]
        else:
            # terminal state or state we can't otherwise get to
            V[s] = 0

    print("values:")
    print_values(V, grid)
    print("policy:")
    print_policy(greedy_policy, grid)

    print("state_visit_count:")
    state_sample_count_arr = np.zeros((grid.rows, grid.cols))
    for i in range(grid.rows):
        for j in range(grid.cols):
            if (i, j) in state_visit_count:
                state_sample_count_arr[i, j] = state_visit_count[(i, j)]
    df = pd.DataFrame(state_sample_count_arr)
    print(df)
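# The snippet above assumes the model exposes predict_all_actions(s); a
# plausible sketch of what that method could compute, written here as a
# standalone helper for illustration (it assumes a per-action predict(s, a)
# like the one in the earlier approximation fragment; the real implementation
# may differ):
import numpy as np

def predict_all_actions(model, s):
    # approximate Q(s, a) for every action, in the same order as
    # ALL_POSSIBLE_ACTIONS so that np.argmax(values) lines up with it
    return np.array([model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS])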
Example #11
def play_game(grid, policy):
    # returns a list of states and corresponding returns
    print("\n Playing Game with Policy: ")
    print_policy(policy, grid)
    # reset game to start at random position
    # we need to do this, because our current deterministic policy would
    # ... never end up at certain states, but we want to measure their reward
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    # play the game
    s = grid.current_state()
    print("\nStarting State for the Game is: {}".format(s))
    a = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print("Starting Action for the Game is {}".format(a))

    # each triple is s(t), a(t), r(t)
    # but r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_actions_rewards = [(s, a, 0)]
    seen_states = set()
    seen_states.add(grid.current_state())
    num_steps = 0

    while True:
        # play the game
        print("\nState at move {} : {}".format(num_steps + 1, s))
        print("Action at move {} : {}".format(num_steps + 1, a))
        # play until the game finishes
        r = grid.move(a)
        num_steps += 1
        s = grid.current_state()

        if s in seen_states:
            # hack so that we don't end up in an infinitely long episode
            # bumping into the wall repeatedly:
            # if num_steps == 1, we bumped into a wall and haven't moved anywhere,
            # so the penalty is the full -10; otherwise the penalty decays
            # as -10 / num_steps, so revisits later in the episode are punished less
            reward = -10. / num_steps
            states_actions_rewards.append((s, None, reward))
            break
        elif grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            # THE FIRST MOVE IS RANDOM, BUT PAST THIS ITS ACCORDING TO THE POLICY
            # THIS NEEDS TO BE THE CASE OTHERWISE WE WOULD NEVER REACH CERTAIN STATES USING
            # OUR DETERMINISTIC POLICY
            a = policy[s]
            states_actions_rewards.append((s, a, r))
        seen_states.add(s)
    print("\nState Action Reward: {}".format(states_actions_rewards))

    # calculate the returns by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        # the value of the terminal state is 0 by definition
        # we should ignore the first state we encounter
        # and ignore the last G, which is meaningless since it doesn't correspond to any move
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()  # we want it to be in order of state visited
    print("\nState Action Return (G): {}".format(states_actions_returns))
    return states_actions_returns
def main(grid_type='negative'):
    if grid_type == 'negative':
        step_cost = float(
            input('\nenter step_cost (e.g. \'-1\' or \'-0.1\'):\n').strip())
        # get the grid:
        grid = negative_grid(step_cost=step_cost)

    else:
        # assuming the standard grid:
        grid = standard_grid()

    # display rewards:
    print('\nrewards:')
    print_values(grid.rewards, grid)

    states = grid.all_states

    # STEP 1: randomly initialize the value function, V(s):
    V = {}  # the values
    for s in states:
        # as an option, initialize to 0:
        # V[s] = 0

        # check if not a terminal state:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0

    print('\ninitial values:')
    print_values(V, grid)

    # STEP 2: value iteration
    while True:
        max_change = 0

        for s in states:
            old_v = V[s]

            # if we're not in a terminal state:
            if s in grid.actions:
                # choose an action that results in the maximum value
                # for this state:
                best_v = float('-inf')
                # best_a = np.random.choice(ALL_POSSIBLE_ACTIONS)

                for a in ALL_POSSIBLE_ACTIONS:
                    # arrive in the state:
                    grid.set_state(s)

                    # take the action and receive the reward:
                    r = grid.move(a)

                    # calculate the Bellman equation:
                    v = r + GAMMA * V[grid.current_state]

                    if v > best_v:
                        best_v = v
                        # p[s] = a      # we'll do it in another loop later

                # update the value of this state:
                V[s] = best_v

                # update the maximum change:
                max_change = max(max_change, np.abs(old_v - V[s]))

        # check if converged:
        if max_change < THRESHOLD:
            break

    # STEP 3: take our optimal value function
    #         and find our optimal policy
    p = {}  # the policy
    for s in states:
        best_a = None
        best_v = float('-inf')

        # if not a terminal state:
        if s in grid.actions:
            # find the best action:
            for a in ALL_POSSIBLE_ACTIONS:
                grid.set_state(s)
                r = grid.move(a)
                v = r + GAMMA * V[grid.current_state]

                if v > best_v:
                    best_v = v
                    best_a = a

            p[s] = best_a

    # optimal values:
    print('\noptimal values:')
    print_values(V, grid)

    # optimal policy:
    print('\noptimal policy:')
    print_policy(p, grid)