def play_game(grid, policy):
    '''
    plays one episode and returns a list of (state, action, return) triples
    this is the version without exploring starts: the start state is fixed,
    so exploration comes from the epsilon-soft random_action helper instead
    :param grid: the grid class object
    :param policy: dictionary containing policies
    :return: a list of (state, action, return) triples
    '''
    s = (2, 0)
    grid.set_state(s)
    a = random_action(policy[s])

    # be aware of timing: each triple is s(t), a(t), r(t)
    # but r(t) results from taking action a(t-1) from s(t-1) to land in s(t)
    states_actions_rewards = [(s, a, 0)]
    while True:
        r = grid.move(a)
        s = grid.current_state()
        if grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            a = random_action(policy[s])
            states_actions_rewards.append((s, a, r))

    # calculate the returns by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        # the value of the terminal state is 0 by definition,
        # so we skip the first (s, a) pair we encounter
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()  # we want it in the order the states were visited
    return states_actions_returns
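# --- sketch: a first-visit Monte Carlo control loop that could consume the
# (s, a, G) triples returned above. This is an illustrative assumption about the
# surrounding script, not necessarily the author's exact code; it assumes numpy
# is imported as np, Q is a dict of dicts Q[s][a], returns maps (s, a) to a list
# of sampled returns, and max_dict(d) returns the (argmax key, max value) of d.
def monte_carlo_control(grid, policy, Q, returns, n_episodes=2000):
    for _ in range(n_episodes):
        # generate one episode under the current epsilon-soft policy
        states_actions_returns = play_game(grid, policy)
        seen_state_action_pairs = set()
        for s, a, G in states_actions_returns:
            sa = (s, a)
            if sa not in seen_state_action_pairs:  # first-visit check
                returns[sa].append(G)
                Q[s][a] = np.mean(returns[sa])  # sample mean of the returns
                seen_state_action_pairs.add(sa)
        # policy improvement: act greedily with respect to the current Q
        for s in policy.keys():
            policy[s] = max_dict(Q[s])[0]
    return Q, policy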
def play_game(grid, policy):
    '''
    reset the game to start at a random position
    we need to do this because, given our current deterministic policy,
    we would never end up at certain states, but we still want to measure them
    :param grid: the grid class object
    :param policy: dictionary containing policies
    :return: a list of states and corresponding returns
    '''
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    s = grid.current_state()
    states_and_rewards = [(s, 0)]  # list of tuples of (state, reward)
    while not grid.game_over():
        a = policy[s]
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))

    # calculate the returns by working backwards from the terminal state
    G = 0
    states_and_returns = []
    first = True
    for s, r in reversed(states_and_rewards):
        # the value of the terminal state is 0 by definition,
        # so we should ignore the first state we encounter
        if first:
            first = False
        else:
            states_and_returns.append((s, G))
        G = r + GAMMA * G
    states_and_returns.reverse()  # we want it in the order the states were visited
    return states_and_returns
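# --- sketch: first-visit Monte Carlo prediction using the (state, return) pairs
# produced above. An illustrative assumption, not necessarily the author's exact
# loop; grid.all_states() and the episode count are assumed, and numpy is assumed
# imported as np as in the snippet above.
def monte_carlo_prediction(grid, policy, n_episodes=100):
    V = {}
    returns = {s: [] for s in grid.all_states()}
    for _ in range(n_episodes):
        states_and_returns = play_game(grid, policy)
        seen_states = set()
        for s, G in states_and_returns:
            if s not in seen_states:  # first-visit check
                returns[s].append(G)
                V[s] = np.mean(returns[s])  # value estimate = mean sampled return
                seen_states.add(s)
    return V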
def play_game(grid, policy):
    # start at the fixed start state and play until the game is over,
    # recording the (state, reward) sequence along the way
    s = (2, 0)
    grid.set_state(s)
    states_and_rewards = [(s, 0)]
    while not grid.game_over():
        a = policy[s]
        a = random_action(a)  # the action actually taken is epsilon-soft
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards
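# --- sketch: random_action is used above (and in the other snippets) but not
# defined in this section. A plausible epsilon-soft version is shown here as an
# assumption; the exact probabilities and the ALL_POSSIBLE_ACTIONS constant come
# from the surrounding script, not from this section.
def random_action(a, eps=0.1):
    # with probability 1 - eps keep the policy's action, otherwise explore
    p = np.random.random()
    if p < (1 - eps):
        return a
    else:
        return np.random.choice(ALL_POSSIBLE_ACTIONS)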
s = (2, 0)
grid.set_state(s)

# get Q(s) so we can choose the first action
Qs = getQs(model, s)

# the first (s, r) tuple is the state we start in and 0
# (since we don't get a reward for simply starting the game)
# the last (s, r) tuple is the terminal state and the final reward
# the value for the terminal state is by definition 0, so we don't
# care about updating it
a = max_dict(Qs)[0]
a = random_action(a, eps=0.5 / t)
biggest_change = 0
while not grid.game_over():
    r = grid.move(a)
    s2 = grid.current_state()

    # we need the next action as well since Q(s,a) depends on Q(s',a')
    old_theta = model.theta.copy()
    if grid.is_terminal(s2):
        model.theta += alpha * (r - model.predict(s, a)) * model.grad(s, a)
    else:
        # not terminal
        Qs2 = getQs(model, s2)
        a2 = max_dict(Qs2)[0]
        a2 = random_action(a2, eps=0.5 / t)  # epsilon-greedy
        model.theta += alpha * (r + GAMMA * model.predict(s2, a2) - model.predict(s, a)) * model.grad(s, a)

        # the next state and action become the current ones
        s = s2
        a = a2
    # track how much the parameters moved, for convergence monitoring
    biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
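# --- sketch: the model, getQs, and max_dict used in the loop above are not shown
# in this snippet. Below is a minimal linear approximator consistent with how they
# are called (predict, grad, theta); the one-hot (state, action) feature encoding
# is an illustrative assumption, and the real script may use richer features.
class Model:
    def __init__(self, state_action_pairs):
        # one-hot feature per (state, action) pair -- effectively tabular,
        # but it exercises the same semi-gradient update interface
        self.sa2idx = {sa: i for i, sa in enumerate(state_action_pairs)}
        self.theta = np.random.randn(len(self.sa2idx)) / 2

    def sa2x(self, s, a):
        x = np.zeros(len(self.theta))
        x[self.sa2idx[(s, a)]] = 1.0
        return x

    def predict(self, s, a):
        return self.theta.dot(self.sa2x(s, a))

    def grad(self, s, a):
        # for a linear model the gradient w.r.t. theta is just the feature vector
        return self.sa2x(s, a)

def getQs(model, s):
    # Q(s, a) for every action in state s, so max_dict can pick the greedy action
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}

def max_dict(d):
    # returns the (key, value) pair with the largest value
    best_key, best_val = None, float('-inf')
    for k, v in d.items():
        if v > best_val:
            best_key, best_val = k, v
    return best_key, best_val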
def play_game(grid, policy):
    '''
    reset the game to start at a random position
    we need to do this because, given our current deterministic policy,
    we would never end up at certain states, but we still want to measure them
    :param grid: the grid class object
    :param policy: dictionary containing policies
    :return: a list of (state, action, return) triples
    '''
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    s = grid.current_state()
    a = np.random.choice(ALL_POSSIBLE_ACTIONS)  # the first action is uniformly random

    # be aware of timing: each triple is s(t), a(t), r(t)
    # but r(t) results from taking action a(t-1) from s(t-1) and landing in s(t)
    states_actions_rewards = [(s, a, 0)]
    seen_states = set()
    seen_states.add(grid.current_state())
    num_steps = 0
    while True:
        r = grid.move(a)
        num_steps += 1
        s = grid.current_state()

        if s in seen_states:
            # hack so that we don't end up in an infinitely long episode
            # bumping into the wall repeatedly
            # if num_steps == 1 -> we bumped into a wall and haven't moved anywhere
            #   reward = -10
            # else:
            #   the penalty falls off by 1 / num_steps
            reward = -10. / num_steps
            states_actions_rewards.append((s, None, reward))
            break
        elif grid.game_over():
            states_actions_rewards.append((s, None, r))
            break
        else:
            a = policy[s]
            states_actions_rewards.append((s, a, r))
        seen_states.add(s)

    # calculate the returns by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        # the value of the terminal state is 0, so we ignore the first triple,
        # and we ignore the final G, which is meaningless since it doesn't
        # correspond to any action
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA * G
    states_actions_returns.reverse()  # we want it in the order the states were visited
    return states_actions_returns
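# --- sketch: the bookkeeping that typically follows episodes like the one above,
# folding sampled returns into Q with an incremental mean and reading the greedy
# policy and state values back out. The Q[s][a] layout and the max_dict helper
# are assumptions carried over from the sketches above, not the author's exact code.
def update_from_episode(states_actions_returns, Q, sample_counts):
    seen_state_action_pairs = set()
    for s, a, G in states_actions_returns:
        sa = (s, a)
        if sa not in seen_state_action_pairs:  # first-visit
            sample_counts[sa] = sample_counts.get(sa, 0) + 1
            # incremental sample mean, so we don't have to store every return
            Q[s][a] = Q[s][a] + (G - Q[s][a]) / sample_counts[sa]
            seen_state_action_pairs.add(sa)

def greedy_policy_and_values(Q):
    # the greedy policy and V(s) are just the argmax / max of Q(s, .)
    policy, V = {}, {}
    for s in Q:
        policy[s], V[s] = max_dict(Q[s])
    return policy, V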