import numpy as np
import matplotlib.pyplot as plt

## The grid-world helpers used below (StandardGrid, GridWorldMove, print_values,
## print_policy_beautifly, play_game, playGame, max_dict, eps_random_action,
## generateRandomlyDeterministicPolicy, LinearState, LinearSarsaState) are assumed
## to be defined elsewhere in the project.


def test_td0_method(alpha=0.1, gamma=0.9, eps=0.1):
    game = StandardGrid()
    print_values(game.rewards, game)
    ## Fixed deterministic policy, one row per state:
    ## [GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left]
    policy = {
        (0, 0): [0, 0, 1, 0],
        (0, 1): [1, 0, 0, 0],
        (0, 2): [1, 0, 0, 0],
        (1, 0): [0, 0, 1, 0],
        (1, 2): [0, 0, 1, 0],
        (2, 0): [0, 0, 1, 0],
        (2, 1): [0, 0, 1, 0],
        (2, 2): [0, 0, 1, 0],
        (3, 2): [1, 0, 0, 0]
    }
    print("Policy:")
    print_policy_beautifly(policy, game)

    # Initialise the value table to zero for every state.
    V = {}
    states = game.allStates()
    for s in states:
        V[s] = 0

    for t in range(100000):
        # play_game returns a list of (state, reward) pairs for one episode.
        s_and_rs = play_game(game, policy, False, eps)
        for i in range(len(s_and_rs) - 1):
            state, _ = s_and_rs[i]
            s_plus_one, reward = s_and_rs[i + 1]
            # Tabular TD(0) update: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
            V[state] = V[state] + alpha * (reward + gamma * V[s_plus_one] - V[state])

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
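## NOTE: play_game is defined elsewhere in the project and is not shown in this
## listing. The function below is only a hedged sketch of the (state, reward)
## trajectory format the TD(0) loop above expects; it assumes game.start,
## game.position, game.move and game.gameOver behave as they do in
## test_td0_sarsa_approximated, and that actions index the policy lists.
def play_game_sketch(game, policy, eps=0.1):
    game.position = game.start
    # The first entry is the start state with a reward of 0.
    states_and_rewards = [(game.start, 0)]
    while not game.gameOver():
        probs = policy[game.position]
        # Epsilon-greedy: mostly follow the policy, occasionally act at random.
        if np.random.random() < eps:
            action = np.random.choice(len(probs))
        else:
            action = int(np.argmax(probs))
        reward = game.move(action)
        states_and_rewards.append((game.position, reward))
    return states_and_rewards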
def testMonteCarloValuesEvaluation(windy=False):
    gamma = 0.9
    eps = 0.2
    game = StandardGrid()  # NegativeGrid(-0.25)  ## swap in to make things more realistic
    # generateRandomPolicy()
    policy = generateRandomlyDeterministicPolicy(game)

    V = {}
    returns = {}
    states = game.allStates()
    seenStates = {}
    for s in states:
        if s in game.actions:
            returns[s] = []
            seenStates[s] = 0
        else:
            # Terminal states collect no return samples; their value stays 0.
            V[s] = 0

    for t in range(10000):
        # Reset the first-visit flags at the start of each episode.
        for s in seenStates:
            seenStates[s] = 0
        # playGame returns a list of (state, return) pairs for one episode.
        sAr = playGame(game, policy, gamma, windy, eps)
        for s, g in sAr:
            if s in game.actions and seenStates[s] == 0:
                # First-visit Monte Carlo: record the return only the first time
                # the state appears in this episode, then average over episodes.
                returns[s].append(g)
                V[s] = np.mean(returns[s])
                seenStates[s] = 1

    print_values(V, game)
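## NOTE: playGame (the Monte Carlo variant used above) returns (state, return)
## pairs rather than (state, reward) pairs, but it is defined elsewhere in the
## project. The helper below is only a hedged sketch of how those returns could
## be accumulated backwards from a finished (state, reward) trajectory.
def returns_from_trajectory_sketch(states_and_rewards, gamma=0.9):
    G = 0
    states_and_returns = []
    # Walk the episode backwards: G_t = r_{t+1} + gamma * G_{t+1},
    # so the terminal state gets a return of 0.
    for s, r in reversed(states_and_rewards):
        states_and_returns.append((s, G))
        G = r + gamma * G
    states_and_returns.reverse()
    return states_and_returns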
def test_td0_approximation_method(alpha=0.1, gamma=0.9, eps=0.1, itterations=10000):
    game = StandardGrid()
    print_values(game.rewards, game)
    ## Same fixed policy as in test_td0_method, one row per state:
    ## [GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left]
    policy = {
        (0, 0): [0, 0, 1, 0],
        (0, 1): [1, 0, 0, 0],
        (0, 2): [1, 0, 0, 0],
        (1, 0): [0, 0, 1, 0],
        (1, 2): [0, 0, 1, 0],
        (2, 0): [0, 0, 1, 0],
        (2, 1): [0, 0, 1, 0],
        (2, 2): [0, 0, 1, 0],
        (3, 2): [1, 0, 0, 0]
    }
    print("Policy:")
    print_policy_beautifly(policy, game)

    states = game.allStates()
    # Linear model that approximates V(s) instead of keeping a value table.
    model = LinearState()
    deltas = []  ## Debug only: track how far the weights move per episode

    for t in range(itterations):
        s_and_rs = play_game(game, policy, False, eps)
        biggest_change = 0
        # Slowly decay the learning rate.
        alpha = alpha / 1.0001
        for i in range(len(s_and_rs) - 1):
            state, _ = s_and_rs[i]
            s_plus_one, reward = s_and_rs[i + 1]
            if game.isTerminal(s_plus_one):
                target = reward
            else:
                target = reward + gamma * model.predict(s_plus_one)

            # Debug only: remember the old weights to measure the update size.
            old_weights = model.weights.copy()
            # Semi-gradient TD(0) step, the approximate analogue of
            # V[state] = V[state] + alpha * (reward + gamma * V[s_plus_one] - V[state])
            model.weights += alpha * (target - model.predict(state)) * model.gradient(state)
            biggest_change = max(biggest_change, np.abs(old_weights - model.weights).sum())
        deltas.append(biggest_change)

    # Rebuild the value table V from the learned model.
    V = {}
    for s in states:
        if s in game.actions:
            V[s] = model.predict(s)
        else:
            # Terminal state or state we can't otherwise get to.
            V[s] = 0

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
    plt.plot(deltas)
    plt.show()
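## NOTE: LinearState is defined elsewhere in the project and is not shown here.
## The class below is only a hedged sketch of one plausible linear value model
## exposing the interface the loop above relies on (weights, predict, gradient);
## the particular feature vector is an assumption, not necessarily the real one.
class LinearStateSketch:
    def __init__(self):
        # One weight per feature, randomly initialised.
        self.weights = np.random.randn(4) / 2

    def features(self, s):
        # Hand-crafted features of a grid position (row, col), plus a bias term.
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

    def predict(self, s):
        # Linear value approximation: V_hat(s) = w . x(s)
        return self.weights.dot(self.features(s))

    def gradient(self, s):
        # For a linear model the gradient w.r.t. the weights is the feature vector.
        return self.features(s)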
def test_td0_sarsa_approximated(alpha=0.1, gamma=0.9, eps=0.1, itterations=50000):
    game = StandardGrid()  # NegativeGrid(-0.1)
    print_values(game.rewards, game)
    allActions = (GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left)
    states = game.allStates()

    t = 1.0  # Exploration schedule: epsilon is 0.5 / t and t grows over time.
    deltas = []
    model = LinearSarsaState()

    for j in range(itterations):
        if j % 1000 == 0:
            ## Slowly reduce exploration and show progress.
            t += 10e-3
            print(".")
        max_change = 0

        # Start a new episode from the grid's start state.
        state = game.start
        game.position = state
        game_over = False

        # Choose the first action epsilon-greedily from the current Q estimates.
        Qs = model.getQs(state)
        action = max_dict(Qs)[0]
        action = eps_random_action(action, eps=0.5 / t)
        # Slowly decay the learning rate.
        alpha = alpha / 1.00005

        while not game_over:
            r = game.move(action)
            state2 = game.position
            game_over = game.gameOver()

            ## Pick the next action (epsilon-greedy) before updating, as SARSA requires.
            Qs2 = model.getQs(state2)
            action2 = max_dict(Qs2)[0]
            action2 = eps_random_action(action2, eps=0.5 / t)

            ## We now have the full (s, a, r, s', a') tuple, so take a semi-gradient Q step.
            old_weights = model.weights.copy()
            model.weights += alpha * (r + gamma * model.predict(state2, action2)
                                      - model.predict(state, action)) * model.gradient(state, action)
            max_change = max(max_change, np.abs(model.weights - old_weights).sum())

            action = action2
            state = state2
        deltas.append(max_change)

    # Extract the greedy policy and state values from the learned Q function.
    policy = {}
    V = {}
    Q = {}
    for s in game.actions.keys():
        Qs = model.getQs(s)
        Q[s] = Qs
        a, max_q = max_dict(Qs)
        policy[s] = [0, 0, 0, 0]
        policy[s][a] = 1
        V[s] = max_q

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
    plt.plot(deltas)
    plt.show()
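## NOTE: max_dict and eps_random_action are project helpers used above but not
## shown in this listing. The sketches below are hedged guesses at their
## behaviour, inferred only from how they are called: max_dict returns the
## (key, value) pair with the largest value, and eps_random_action keeps the
## greedy action with probability 1 - eps.
def max_dict_sketch(d):
    # Pick the entry with the highest value, e.g. the greedy action of a Q dict.
    best_key = max(d, key=d.get)
    return best_key, d[best_key]


def eps_random_action_sketch(action, eps=0.1):
    # Epsilon-greedy exploration: keep the given action most of the time,
    # otherwise pick one of the four grid moves uniformly at random.
    moves = (GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left)
    if np.random.random() < eps:
        return moves[np.random.randint(len(moves))]
    return action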