def test_td0_method(alpha=0.1, gamma=0.9, eps=0.1):
    game = StandardGrid()
    print_values(game.rewards, game)

    # Action order: GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left
    policy = {
        (0, 0): [0, 0, 1, 0],
        (0, 1): [1, 0, 0, 0],
        (0, 2): [1, 0, 0, 0],
        (1, 0): [0, 0, 1, 0],
        (1, 2): [0, 0, 1, 0],
        (2, 0): [0, 0, 1, 0],
        (2, 1): [0, 0, 1, 0],
        (2, 2): [0, 0, 1, 0],
        (3, 2): [1, 0, 0, 0]
    }

    print("Policy:")
    print_policy_beautifly(policy, game)

    # Initialize V(s) = 0 for all states
    V = {}
    states = game.allStates()
    for s in states:
        V[s] = 0

    # TD(0) update: V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s))
    for t in range(100000):
        s_and_rs = play_game(game, policy, False, eps)
        for i in range(len(s_and_rs) - 1):
            state, _ = s_and_rs[i]
            s_plus_one, reward = s_and_rs[i + 1]
            V[state] = V[state] + alpha * (reward + gamma * V[s_plus_one] - V[state])

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
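
# The TD(0) routines here rely on a `play_game` helper that rolls out one episode and
# returns a list of (state, reward) pairs; it is not defined in this section. The function
# below is only a sketch of how such a helper might look, assuming the Grid API used above
# (start, position, move, gameOver) and a policy stored as per-state probability lists in
# the order up/down/right/left. It is a hypothetical stand-in, not the repository's play_game.
def play_game_sketch(game, policy, deterministic, eps):
    allActions = [GridWorldMove.up, GridWorldMove.down,
                  GridWorldMove.right, GridWorldMove.left]
    game.position = game.start
    state = game.position
    states_and_rewards = [(state, 0)]  # No reward for arriving at the start state
    while not game.gameOver():
        probs = policy[state]
        if deterministic:
            action = allActions[int(np.argmax(probs))]
        else:
            action = allActions[np.random.choice(len(allActions), p=probs)]
        # Epsilon-greedy exploration: occasionally replace the chosen action
        if np.random.random() < eps:
            action = allActions[np.random.randint(len(allActions))]
        reward = game.move(action)
        state = game.position
        states_and_rewards.append((state, reward))
    return states_and_rewards
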
def test_td0_approximation_method(alpha=0.1, gamma=0.9, eps=0.1, itterations=10000):
    game = StandardGrid()
    print_values(game.rewards, game)

    # Action order: GridWorldMove.up, GridWorldMove.down, GridWorldMove.right, GridWorldMove.left
    policy = {
        (0, 0): [0, 0, 1, 0],
        (0, 1): [1, 0, 0, 0],
        (0, 2): [1, 0, 0, 0],
        (1, 0): [0, 0, 1, 0],
        (1, 2): [0, 0, 1, 0],
        (2, 0): [0, 0, 1, 0],
        (2, 1): [0, 0, 1, 0],
        (2, 2): [0, 0, 1, 0],
        (3, 2): [1, 0, 0, 0]
    }

    print("Policy:")
    print_policy_beautifly(policy, game)

    states = game.allStates()

    # Linear function approximator for V(s)
    model = LinearState()

    deltas = []  # Debug only: track the largest weight change per episode
    for t in range(itterations):
        s_and_rs = play_game(game, policy, False, eps)
        biggest_change = 0
        alpha = alpha / 1.0001  # Decay the learning rate
        for i in range(len(s_and_rs) - 1):
            state, _ = s_and_rs[i]
            s_plus_one, reward = s_and_rs[i + 1]
            if game.isTerminal(s_plus_one):
                target = reward
            else:
                target = reward + gamma * model.predict(s_plus_one)

            old_weights = model.weights.copy()  # Debug only
            # Semi-gradient step, analogous to the tabular update:
            # V[state] = V[state] + alpha * (reward + gamma * V[s_plus_one] - V[state])
            model.weights += alpha * (target - model.predict(state)) * model.gradient(state)
            biggest_change = max(biggest_change, np.abs(old_weights - model.weights).sum())
        deltas.append(biggest_change)

    # Rebuild V from the fitted model
    V = {}
    for s in states:
        if s in game.actions:
            V[s] = model.predict(s)
        else:
            # Terminal state, or a state we can't otherwise get to
            V[s] = 0

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
    plt.plot(deltas)
    plt.show()
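
# `test_td0_approximation_method` assumes a `LinearState` model exposing `weights`,
# `predict(s)` and `gradient(s)`; it is not defined in this section. The class below is
# a minimal sketch of one plausible implementation: V(s) is approximated as the dot
# product of a hand-crafted feature vector for the (row, col) state with a weight vector,
# so the gradient with respect to the weights is just the feature vector. The feature
# choice here is an assumption made for illustration.
class LinearStateSketch(object):
    # Hypothetical stand-in for LinearState, for illustration only.
    def __init__(self):
        self.weights = np.random.randn(4) / 2  # One weight per feature

    def features(self, s):
        # Simple features of the grid position: centered row, centered col, row*col, bias
        return np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])

    def predict(self, s):
        return self.features(s).dot(self.weights)

    def gradient(self, s):
        # For a linear model, d/dw (x . w) = x
        return self.features(s)
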
def testForRandomPolicyEvaluationNegativeGrid():
    print("Negative Grid Random Policy Testing!")
    gamma = 0.7
    g = NegativeGrid(-0.50)
    p = RandomPolicy.createRandomPolicy(g.actions, g.width, g.height)

    print("Before iterative improvement:")
    print("========================================")
    print_policy_beautifly(p, g)
    print_policy(p, g)

    # Alternate policy evaluation and policy improvement
    for i in range(0, 1000):
        v = evaluateValueFunction(g, p, gamma=gamma)
        p = itteratePolicy(g, v, p, gamma=gamma)

    print("After improvement:")
    print_policy_beautifly(p, g)
    print_policy(p, g)
    print_values(v, g)
def testForDetermenisticPolicyEvaluation():
    print("Standard Grid Deterministic Policy Testing!")
    gamma = 0.5
    g = StandardGrid()
    p2 = RandomPolicy.createRandomDeterministicPolicy(g.actions, g.width, g.height)

    print("Before iterative improvement:")
    print("========================================")
    print_policy_beautifly(p2, g)
    print_policy(p2, g)
    print("========================================")

    # Alternate policy evaluation and policy improvement
    for i in range(0, 100):
        V2 = evaluateValueFunction(g, p2, gamma=gamma)
        p2 = itterateDetermenisticPolicy(g, V2, p2, gamma=gamma)

    print("After improvement:")
    print_policy_beautifly(p2, g)
    print_policy(p2, g)
    print_values(V2, g)
def testToEvaluateValue():
    gamma = 1

    # Evaluate a random (stochastic) policy
    g = StandardGrid()
    p1 = generateRandomPolicy()
    V1 = evaluateValueFunction(g, p1, gamma)
    print("========================================")
    print_policy(p1, g)
    print_policy_beautifly(p1, g)
    print("========================================")
    print_values(V1, g)

    # Evaluate a deterministic policy
    g = StandardGrid()
    p2 = generateDeterministicPolicyTwo()
    V2 = evaluateValueFunction(g, p2, gamma)
    print("========================================")
    print_policy(p2, g)
    print_policy_beautifly(p2, g)
    print("========================================")
    print_values(V2, g)
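
# The policy-evaluation tests above all call `evaluateValueFunction(g, p, gamma)`, which
# is defined elsewhere in the repository. The sketch below shows one way iterative policy
# evaluation could be implemented against this Grid API, assuming the policy maps each
# state to a list of action probabilities in the order up/down/right/left (the convention
# used by the TD tests above). The tolerance parameter is a hypothetical addition.
def evaluate_value_function_sketch(g, p, gamma=0.9, tolerance=1e-4):
    # Hypothetical reference implementation, not the repository's evaluateValueFunction.
    allActions = [GridWorldMove.up, GridWorldMove.down,
                  GridWorldMove.right, GridWorldMove.left]
    V = {s: 0 for s in g.allStates()}
    while True:
        biggest_change = 0
        for s in g.allStates():
            if s not in p:
                continue  # Terminal or unreachable state: keep V(s) = 0
            old_v = V[s]
            new_v = 0
            # Expected one-step return under the stochastic policy
            for prob, a in zip(p[s], allActions):
                g.setPosition(s)
                r = g.move(a)
                new_v += prob * (r + gamma * V[g.position])
            V[s] = new_v
            biggest_change = max(biggest_change, abs(old_v - V[s]))
        if biggest_change < tolerance:
            return V
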
def testForValueItteration():
    # Last algorithm of the chapter, but the most efficient one:
    # iterate the values directly and derive the policy afterwards.
    print("Negative Grid Value Iteration Testing!")
    gamma = 0.9
    bigNegative = -10000
    allActions = [
        GridWorldMove.up, GridWorldMove.down,
        GridWorldMove.right, GridWorldMove.left
    ]

    g = NegativeGrid(-0.50)
    p = generateDeterministicPolicyRandomly(g)

    print("Before iterative improvement:")
    print("========================================")
    print_policy_beautifly(p, g)
    print_policy(p, g)

    # Initialize V(s) randomly for non-terminal states
    V = {}
    states = g.allStates()
    for s in states:
        if s in g.actions:
            V[s] = np.random.random()
        else:
            V[s] = 0

    # Value iteration: V(s) <- max_a [ r + gamma * V(s') ]
    convergenceMet = False
    while not convergenceMet:
        biggest_change = 0
        for s in states:
            old_v = V[s]
            # The policy already covers all non-terminal states
            if s in p:
                new_v = bigNegative
                for a in allActions:
                    g.setPosition(s)
                    r = g.move(a)
                    v = r + gamma * V[g.position]
                    if v > new_v:
                        new_v = v
                V[s] = new_v
                biggest_change = max(biggest_change, np.abs(old_v - V[s]))
        if biggest_change < SMALL_ENOUGH:
            convergenceMet = True

    # Build the optimal policy from the converged values
    for s in p.keys():
        best_a = None
        best_value = bigNegative
        # Loop through all possible actions
        # to find the best action for the state
        for a in allActions:
            g.setPosition(s)
            r = g.move(a)
            v = r + gamma * V[g.position]
            if v > best_value:
                best_value = v
                best_a = a
            p[s][a] = 0.0
        # The best action gets all the probability
        p[s][best_a] = 1.0

    print("After value iteration improvement:")
    print("========================================")
    print_policy_beautifly(p, g)
    print_policy(p, g)
def test_td0_sarsa(alpha=0.1, gamma=0.9, eps=0.1):
    game = NegativeGrid()
    print_values(game.rewards, game)

    allActions = (GridWorldMove.up, GridWorldMove.down,
                  GridWorldMove.right, GridWorldMove.left)
    states = game.allStates()

    # Initialize Q(s, a) and per-pair visit counts (used for the adaptive learning rate)
    Q = {}
    Q_visits = {}
    Q_visits_debug = {}
    for s in states:
        Q[s] = {}
        Q_visits[s] = {}
        Q_visits_debug[s] = 0
        for a in allActions:
            Q[s][a] = 0
            Q_visits[s][a] = 1.0

    t = 1.0
    deltas = []
    for j in xrange(100000):
        # Decay the exploration rate over time
        if j % 100 == 0:
            t += 10e-3
        if j % 1000 == 0:
            print(".")

        max_change = 0
        state = game.start
        game.position = state
        game_over = False

        action, _ = max_dict(Q[state])
        action = eps_random_action(action, eps=0.5 / t)
        while not game_over:
            r = game.move(action)
            state2 = game.position
            game_over = game.gameOver()

            # Choose the next action epsilon-greedily
            action2, _ = max_dict(Q[state2])
            action2 = eps_random_action(action2, eps=0.5 / t)

            # We now have the full SARSA tuple (s, a, r, s', a'); update Q
            beta = alpha / Q_visits[state][action]
            Q_visits[state][action] += 0.001
            old_qsa = Q[state][action]
            Q[state][action] = Q[state][action] + beta * (
                r + gamma * Q[state2][action2] - Q[state][action])
            max_change = max(max_change, abs(Q[state][action] - old_qsa))

            Q_visits_debug[state] = Q_visits_debug.get(state, 0) + 1
            action = action2
            state = state2
        deltas.append(max_change)

    plt.plot(deltas)
    plt.show()

    # Extract the greedy policy and V(s) = max_a Q(s, a)
    policy = {}
    V = {}
    for s in game.actions.keys():
        a, max_q = max_dict(Q[s])
        policy[s] = [0, 0, 0, 0]
        policy[s][a] = 1.0
        V[s] = max_q

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
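
# The SARSA and Q-learning tests depend on two helpers not shown in this section:
# `max_dict`, which returns the (key, value) pair with the largest value from a dict,
# and `eps_random_action`, which keeps the given action with probability 1 - eps and
# otherwise picks a random action. The versions below are sketches of plausible
# implementations matching how they are called above; they are not the repository's own.
def max_dict_sketch(d):
    # Hypothetical stand-in for max_dict: argmax over a dict, returning (key, value).
    best_key = max(d, key=d.get)
    return best_key, d[best_key]


def eps_random_action_sketch(action, eps=0.1):
    # Hypothetical stand-in for eps_random_action: epsilon-greedy action selection.
    allActions = [GridWorldMove.up, GridWorldMove.down,
                  GridWorldMove.right, GridWorldMove.left]
    if np.random.random() < eps:
        return allActions[np.random.randint(len(allActions))]
    return action
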
def test_td0_q_learning(itterations=1000, alpha=0.1, gamma=0.9, eps=0.5,
                        delta_t=10e-3, learning_decay=0.0001):
    game = NegativeGrid(-0.2)  # StandardGrid()
    print_values(game.rewards, game)

    allActions = (GridWorldMove.up, GridWorldMove.down,
                  GridWorldMove.right, GridWorldMove.left)
    states = game.allStates()

    # Initialize Q(s, a) and per-pair visit counts (used for the adaptive learning rate)
    Q = {}
    Q_visits = {}
    Q_visits_debug = {}
    for s in states:
        Q[s] = {}
        Q_visits[s] = {}
        Q_visits_debug[s] = 0
        for a in allActions:
            Q[s][a] = 0
            Q_visits[s][a] = 1.0

    t = 1.0
    deltas = []
    for j in range(itterations):
        # Decay the exploration rate over time
        if j % 100 == 0:
            t += delta_t
        if j % 1000 == 0:
            print(".")

        max_change = 0
        state = game.start
        game.position = state
        game_over = False

        action = max_dict(Q[state])[0]
        while not game_over:
            # Acting epsilon-greedily means the action we take is not always the greedy
            # one, yet the update below bootstraps from max_a Q(s', a). That mismatch
            # between behavior and target is what makes Q-learning an OFF-policy algorithm.
            action = eps_random_action(action, eps=eps / t)
            r = game.move(action)
            state2 = game.position
            game_over = game.gameOver()

            # Greedy next action and its value
            action2, q_max_a2 = max_dict(Q[state2])

            # Q-learning update: Q(s,a) <- Q(s,a) + beta * (r + gamma * max_a Q(s',a) - Q(s,a))
            beta = alpha / Q_visits[state][action]
            Q_visits[state][action] += learning_decay
            old_qsa = Q[state][action]
            # Note: we use the greedy value q_max_a2 here even though the next move
            # taken may not be the greedy one.
            Q[state][action] = Q[state][action] + beta * (
                r + gamma * q_max_a2 - Q[state][action])
            max_change = max(max_change, abs(Q[state][action] - old_qsa))

            Q_visits_debug[state] = Q_visits_debug.get(state, 0) + 1
            action = action2
            state = state2
        deltas.append(max_change)

    plt.plot(deltas)
    plt.show()

    # Extract the greedy policy and V(s) = max_a Q(s, a)
    policy = {}
    V = {}
    for s in game.actions.keys():
        a, max_q = max_dict(Q[s])
        policy[s] = [0, 0, 0, 0]
        policy[s][a] = 1.0
        V[s] = max_q

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
def test_td0_sarsa_approximated(alpha=0.1, gamma=0.9, eps=0.1, itterations=50000):
    game = StandardGrid()  # NegativeGrid(-0.1)
    print_values(game.rewards, game)

    allActions = (GridWorldMove.up, GridWorldMove.down,
                  GridWorldMove.right, GridWorldMove.left)
    states = game.allStates()

    t = 1.0
    deltas = []
    # Linear function approximator for Q(s, a)
    model = LinearSarsaState()
    for j in range(itterations):
        # Decay the exploration rate over time
        if j % 1000 == 0:
            t += 10e-3
            print(".")

        max_change = 0
        state = game.start
        game.position = state
        game_over = False

        Qs = model.getQs(state)
        action = max_dict(Qs)[0]
        action = eps_random_action(action, eps=0.5 / t)
        alpha = alpha / 1.00005  # Decay the learning rate
        while not game_over:
            r = game.move(action)
            state2 = game.position
            game_over = game.gameOver()

            # Choose the next action epsilon-greedily
            Qs2 = model.getQs(state2)
            action2 = max_dict(Qs2)[0]
            action2 = eps_random_action(action2, eps=0.5 / t)

            # We now have the full SARSA tuple; take a semi-gradient step on the weights
            old_weights = model.weights.copy()
            model.weights += alpha * (
                r + gamma * model.predict(state2, action2) -
                model.predict(state, action)) * model.gradient(state, action)
            max_change = max(max_change, np.abs(model.weights - old_weights).sum())

            action = action2
            state = state2
        deltas.append(max_change)

    # Extract the greedy policy and V(s) = max_a Q(s, a) from the fitted model
    policy = {}
    V = {}
    Q = {}
    for s in game.actions.keys():
        Qs = model.getQs(s)
        Q[s] = Qs
        a, max_q = max_dict(Qs)
        policy[s] = [0, 0, 0, 0]
        policy[s][a] = 1
        V[s] = max_q

    print("Policy:")
    print_policy_beautifly(policy, game)
    print("Values:")
    print_values(V, game)
    plt.plot(deltas)
    plt.show()
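
# `test_td0_sarsa_approximated` assumes a `LinearSarsaState` model with `weights`,
# `predict(s, a)`, `gradient(s, a)` and `getQs(s)`; it is not defined in this section.
# The class below sketches one plausible version: Q(s, a) is a linear function of state
# features crossed with a one-hot encoding of the action, so getQs simply evaluates the
# model for every action. The feature layout is an assumption made for illustration.
class LinearSarsaStateSketch(object):
    # Hypothetical stand-in for LinearSarsaState, for illustration only.
    ACTIONS = (GridWorldMove.up, GridWorldMove.down,
               GridWorldMove.right, GridWorldMove.left)

    def __init__(self):
        # 4 state features per action, plus a shared bias term
        self.weights = np.random.randn(4 * len(self.ACTIONS) + 1) / 2

    def features(self, s, a):
        # State features placed in the slot belonging to action `a`, zeros elsewhere
        state_feats = np.array([s[0] - 1, s[1] - 1.5, s[0] * s[1] - 3, 1])
        x = np.zeros(len(self.weights))
        idx = self.ACTIONS.index(a)
        x[idx * 4:(idx + 1) * 4] = state_feats
        x[-1] = 1  # Bias
        return x

    def predict(self, s, a):
        return self.features(s, a).dot(self.weights)

    def gradient(self, s, a):
        # For a linear model, the gradient w.r.t. the weights is the feature vector
        return self.features(s, a)

    def getQs(self, s):
        return {a: self.predict(s, a) for a in self.ACTIONS}
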