def play_game(grid, policy):
    '''
    reset game to start at random position
    we need to do this because given our current deterministic policy
    we would never end up at certain states, but we still want to measure them
    :param grid: the grid class object
    :param policy: dictionary containing policies
    :return: a list of states and corresponding returns
    '''

    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    s = grid.current_state()
    states_and_rewards = [(s, 0)]  # list of tuples of (state,reward)
    while not grid.game_over():
        a = policy[s]
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    #calculate returns by working backwards from terminal state
    G = 0
    states_and_returns = []
    first = True
    for s, r in reversed(states_and_rewards):
        #the value of the terminal state is 0 by definition
        #we should ignore the first state we encounter
        if first:
            first = False
        else:
            states_and_returns.append((s, G))
        G = r + GAMMA * G
    states_and_returns.reverse()  # we want it to be in order of state visited
    return states_and_returns
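A typical consumer of the (state, return) list above is first-visit Monte Carlo prediction, which simply averages the sampled returns into V(s). A minimal sketch, assuming the grid exposes an all_states() helper and that np and play_game are already in scope (none of that wiring appears in the excerpt):

V = {}
returns = {s: [] for s in grid.all_states()}  # assumed helper; not shown in the excerpt

for _ in range(5000):
    states_and_returns = play_game(grid, policy)
    seen = set()
    for s, G in states_and_returns:
        if s not in seen:  # first-visit: use each state at most once per episode
            returns[s].append(G)
            V[s] = np.mean(returns[s])
            seen.add(s)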
Example no. 2
def play_game(grid, policy):
    '''
    returns a list of (state, action, return) triples; we don't use exploring starts here,
    since the start state is fixed and exploration comes from the epsilon-soft random_action() calls
    '''
    s = (2, 0)
    grid.set_state(s)
    a = random_action(policy[s])

    # be aware of timing: each triple is s(t), a(t), r(t)
    # but r(t) results from taking action a(t-1) from s(t-1) to land in s(t)
    states_actions_rewards = [(s, a, 0)]
    while True:
        r = grid.move(a)
        s = grid.current_state()
        if grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            a = random_action(policy[s])
            states_actions_rewards.append((s, a, r))

    #calculate returns by working back from terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()
    return states_actions_returns
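These (state, action, return) triples would then feed a Monte Carlo control loop that averages them into Q(s, a) and improves the policy greedily. A rough sketch, assuming Q and returns are pre-initialised dicts and that max_dict() returns the (argmax key, max value) pair of a dict (none of this appears in the excerpt):

for _ in range(10000):
    states_actions_returns = play_game(grid, policy)
    seen = set()
    for s, a, G in states_actions_returns:
        if (s, a) not in seen:  # first-visit check on the (state, action) pair
            returns[(s, a)].append(G)
            Q[s][a] = np.mean(returns[(s, a)])
            seen.add((s, a))
    # greedy policy improvement with respect to the updated Q
    for s in policy.keys():
        policy[s] = max_dict(Q[s])[0]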
Example no. 3
def play_game(grid, policy):
    s = (2, 0)
    grid.set_state(s)
    states_and_rewards = [(s, 0)]
    while not grid.game_over():
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards
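Since this version returns raw (state, reward) pairs instead of computed returns, it pairs naturally with a TD(0) prediction update. A minimal sketch, assuming ALPHA and GAMMA constants and a value table V that already contains every state (terminal states at 0):

for _ in range(10000):
    states_and_rewards = play_game(grid, policy)
    for t in range(len(states_and_rewards) - 1):
        s, _ = states_and_rewards[t]
        s2, r = states_and_rewards[t + 1]  # r is the reward for landing in s2
        V[s] = V[s] + ALPHA * (r + GAMMA * V[s2] - V[s])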
        grid.set_state(s)

        #get Q(s) to choose first action
        Qs = getQs(model, s)

        # the first (s, r) tuple is the state we start in and 0
        # (since we don't get a reward) for simply starting the game
        # the last (s, r) tuple is the terminal state and the final reward
        # the value for the terminal state is by definition 0, so we don't
        # care about updating it.
        a = max_dict(Qs)[0]
        a = random_action(a, eps=0.5 / t)
        biggest_change = 0
        while not grid.game_over():
            r = grid.move(a)
            s2 = grid.current_state()

            #need next action since Q(s,a) depends on Q(s',a')
            old_theta = model.theta.copy()
            if grid.is_terminal(s2):
                model.theta += alpha * (r - model.predict(s, a)) * model.grad(
                    s, a)
            else:
                #not terminal
                Qs2 = getQs(model, s2)
                a2 = max_dict(Qs2)[0]
                a2 = random_action(a2, eps=0.5 / t)  #epsilon greedy

                model.theta += alpha * (r + GAMMA * model.predict(s2, a2) -
                                        model.predict(s, a)) * model.grad(
                                            s, a)
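The loop above leans on getQs() and max_dict() helpers that are not shown; judging only from the call sites, they presumably look something like this (a guess, with ALL_POSSIBLE_ACTIONS assumed):

def getQs(model, s):
    # build a dict of action -> approximated Q(s, a) so we can take an argmax over it
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}

def max_dict(d):
    # return the (key, value) pair with the largest value
    best_key = max(d, key=d.get)
    return best_key, d[best_key]

The excerpt is also cut off before old_theta is used; presumably the loop goes on to track biggest_change from the change in model.theta and then advances s, a to s2, a2.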
Example no. 5
    #repeat until convergence
    #V[s] = max[a]{sum[s',r] {p(s',r|s,a)[r + GAMMA * V[s']]}}
    while True:
        max_change = 0
        for s in states:
            old_vs = V[s]

            # V(s) is only updated if s is not a terminal state (only non-terminal states appear in the policy)
            if s in policy:
                new_v = float('-inf')

                #find max[a]
                for a in ACTIONS:
                    grid.set_state(s)
                    r = grid.move(a)
                    v = r + GAMMA * V[grid.current_state()]
                    if v > new_v:
                        new_v = v
                V[s] = new_v
                max_change = max(max_change, np.abs(old_vs - V[s]))

        #when the value function converges break out of the loop
        if max_change < thresh:
            break
    #find a policy that leads to optimal value function
    for s in policy.keys():
        best_act = None
        best_value = float('-inf')
        for a in ACTIONS:
            grid.set_state(s)
            r = grid.move(a)
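            # the excerpt is truncated here; a sketch of the likely continuation,
            # assuming the same V, GAMMA and ACTIONS used above
            v = r + GAMMA * V[grid.current_state()]
            if v > best_value:
                best_value = v
                best_act = a
        policy[s] = best_act  # act greedily with respect to the converged value function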
Example no. 6
def play_game(grid, policy):
    '''
    reset game to start at random position
    we need to do this because given our current deterministic policy
    we would never end up at certain states, but we still want to measure them
    :param grid: the grid class object
    :param policy: dictionary containing policies
    :return: a list of states and corresponding returns
    '''

    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    s = grid.current_state()
    a = np.random.choice(ALL_POSSIBLE_ACTIONS) #first action is uniformly random

    #be aware of timing
    #each triple is s(t), a(t), r(t)
    #but r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_actions_rewards = [(s, a, 0)]
    seen_states = set()
    seen_states.add(grid.current_state())
    num_steps = 0
    while True:
        r = grid.move(a)
        num_steps += 1
        s = grid.current_state()

        if s in seen_states:
            # hack so that we don't end up in an infinitely long episode
            # bumping into the wall repeatedly
            # if num_steps == 1 -> we bumped into a wall and haven't moved anywhere
            #   reward = -10
            # else:
            #   the penalty falls off as -10 / num_steps
            reward = -10. / num_steps
            states_actions_rewards.append((s, None, reward))
            break
        elif grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            a = policy[s]
            states_actions_rewards.append((s, a, r))
        seen_states.add(s)

    # calculate the returns by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        # the value of the terminal state is 0, so we ignore the first (s, a, r) we encounter
        # and we ignore the last G, which is meaningless since it doesn't correspond to any action taken
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse() # we want it to be in order of state visited
    return states_actions_returns
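For Monte Carlo control with exploring starts, this play_game() is typically driven by the same first-visit averaging shown after Example no. 2; the extra wiring is just the initialisation of Q, the returns lists, and a random starting policy. A sketch of that setup, with ALL_POSSIBLE_ACTIONS assumed:

policy = {}
for s in grid.actions.keys():
    policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)  # arbitrary deterministic starting policy

Q = {}
returns = {}
for s in grid.actions.keys():
    Q[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
        Q[s][a] = 0
        returns[(s, a)] = []
# each episode then updates Q by first-visit averaging of the returned triples
# and sets policy[s] = argmax_a Q[s][a], as in the sketch after Example no. 2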
	#repeat until convergence
	while True:
		biggest_change = 0
		for s in states:
			old_v = V[s]


			# V(s) only has a value if it's not a terminal state
			if s in grid.actions:

				new_v = 0 # we will accumulate the answer
				p_a = 1.0/len(grid.actions[s])  # each action has equal probability under the uniform policy
				for a in grid.actions[s]:
					grid.set_state(s)
					r = grid.move(a)
					new_v += p_a * (r + gamma*V[grid.current_state()])
				V[s] = new_v
				biggest_change = max(biggest_change, np.abs(old_v - V[s]))
		if biggest_change < SMALL_ENOUGH:
			break
	print("Values for uniformly random actions:")
	print_values(V,grid)
	print('\n\n')


	### fixed policy ###
	policy = {
			(2,0): 'U',
			(1,0): 'U',
			(0,0): 'R',
			(0,1): 'R',