def main():
    env = standard_grid()
    qs = monte_carlo_control(standard_grid, epsilon_soft_greedy)
    render_qs_policy(env, qs)

    print()

    env = negative_grid()
    qs = monte_carlo_control(negative_grid, epsilon_soft_greedy)
    render_qs_policy(env, qs)
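# A minimal sketch of what the epsilon_soft_greedy helper passed to monte_carlo_control
# above might look like; the signature (a dict of Q-values for one state plus an
# exploration rate) is an assumption, since the real helper is not shown here.
import numpy as np

def epsilon_soft_greedy(state_qs, eps=0.1):
    # with probability eps pick a uniformly random action, otherwise act greedily
    actions = list(state_qs.keys())
    if np.random.random() < eps:
        return np.random.choice(actions)
    return max(actions, key=lambda a: state_qs[a])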
def main():
    print('Standard Grid')
    env = standard_grid()
    v_star = value_iteration(env)
    render_vs(env, v_star)
    render_policy(env, policy_from_v(env, v_star))

    print('Negative Grid:')
    env = negative_grid()
    v_star = value_iteration(env)
    render_vs(env, v_star)
    render_policy(env, policy_from_v(env, v_star))
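# Hedged sketch of the policy_from_v helper used above: one-step greedy lookahead on
# V*. The environment API assumed here (env.actions, env.set_state, env.move returning
# a reward, env.current_state) follows the grid_world conventions used elsewhere in
# these snippets and may not match the actual implementation.
GAMMA = 0.9  # assumed discount factor

def policy_from_v(env, v):
    policy = {}
    for s in env.actions.keys():
        best_a, best_value = None, float('-inf')
        for a in env.actions[s]:
            env.set_state(s)
            r = env.move(a)
            value = r + GAMMA * v[env.current_state()]
            if value > best_value:
                best_value, best_a = value, a
        policy[s] = best_a
    return policy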
def main():
    print('Standard Grid')
    env = standard_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)

    print('Negative Grid:')
    env = negative_grid()
    policy, vs = policy_iteration(env)
    render_vs(env, vs)
    render_policy(env, policy)
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# this is deterministic
# all p(s',r|s,a) = 1 or 0

if __name__ == '__main__':
    # this grid gives you a reward of -0.1 for every non-terminal state
    # we want to see if this will encourage finding a shorter path to the goal
    grid = negative_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    # we'll randomly choose an action and update as we learn
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    # initial policy
    print("initial policy:")
    print_policy(policy, grid)
        if env.is_terminal(env.current_state()):
            target = reward
        else:
            target = reward + discount * next_est

        # Update current state
        theta = theta + alpha * (target - c_est) * x

    return theta


def semi_gradient_td(create_env, policy, episodes=100000):
    theta = np.random.randn(4) / 2
    for _ in range(episodes):
        theta = td_episode(create_env, policy, theta)
    return theta


if __name__ == '__main__':
    env = standard_grid()
    theta = semi_gradient_td(standard_grid, eps_win_policy)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)

    print()

    env = negative_grid()
    theta = semi_gradient_td(negative_grid, eps_win_policy)
    vs = get_value(env, theta, preprocess_features)
    render_vs(env, vs)
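# theta above has 4 components, so preprocess_features presumably maps a (row, col)
# state to a 4-dimensional feature vector. A plausible sketch; the exact features and
# offsets are assumptions, not taken from this snippet.
import numpy as np

def preprocess_features(s):
    row, col = s
    # [row, col, row*col, bias], roughly centered so the features stay small
    return np.array([row - 1, col - 1.5, row * col - 3, 1.0])

def get_value(env, theta, featurize):
    # V(s) = theta . x(s) for non-terminal states, 0 at terminal states
    vs = {}
    for s in env.all_states():
        vs[s] = 0.0 if env.is_terminal(s) else float(theta.dot(featurize(s)))
    return vs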
def random_action(a, eps=0.1):
    p = np.random.random()  # returns a random float from [0, 1)
    if p < (1 - eps):
        return a
    else:
        return np.random.choice(all_possible_actions)


# We then start the main code. There is no separate play_game function, because playing
# the game and performing the updates cannot be separated: the updates have to happen
# while the game is being played.
if __name__ == '__main__':
    grid = negative_grid(step_cost=-0.1)  # we want to penalize each agent movement

    # We then initialize Q(s,a)
    Q = {}
    states = grid.all_states()
    for s in states:
        Q[s] = {}  # we initialize as a dictionary, as this also acts as the policy
        for a in all_possible_actions:
            Q[s][a] = 0

    # We also keep track of how often Q[s] has been updated.
    # This is needed for updating the learning rate.
    update_counts = {}
    update_counts_sa = {}
    for s in states:
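# Hedged usage sketch for random_action: inside the learning loop it wraps the greedy
# choice so the agent explores with probability eps. max_dict is assumed to be the
# course-style helper returning the (argmax key, max value) of a dict; it is not
# defined in this snippet.
s = grid.current_state()
a, _ = max_dict(Q[s])          # greedy action according to the current Q
a = random_action(a, eps=0.1)  # epsilon-greedy: occasionally explore instead
r = grid.move(a)
s2 = grid.current_state()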
ALPHA = 2.0
ALL_POSSIBLE_ACTIONS = ['U', 'D', 'L', 'R']


def greedy_from(Qs):
    best_action = None
    best_value = float('-inf')
    for action in Qs:
        value = Qs[action]
        if value > best_value:
            best_action = action
            best_value = value
    return best_action, best_value


if __name__ == '__main__':
    grid = negative_grid(-0.5)
    print('rewards')
    print_values(grid.rewards, grid)

    S = grid.all_states()
    Q = {}
    num_seen_sa = {}
    for s in S:
        Q[s] = {}
        for a in ALL_POSSIBLE_ACTIONS:
            num_seen_sa[(s, a)] = 0
            if grid.is_terminal(s):
                Q[s][a] = 0
            else:
                Q[s][a] = np.random.rand()

    N = 10000
    deltas = []
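# Hedged usage sketch for greedy_from: given the Q-values of a single state it returns
# the best (action, value) pair; (2, 0) is just an example state, assumed to be the
# start state of this grid.
best_a, best_q = greedy_from(Q[(2, 0)])
print('greedy action at (2, 0):', best_a, 'with value', best_q)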
# suboptimal policies
# e.g.
# ---------------------------
#   R  |   R  |   R  |      |
# ---------------------------
#   R* |      |   U  |      |
# ---------------------------
#   U  |   R  |   U  |   L  |
#
# since going R at (1,0) (shown with a *) incurs no cost, it's OK to keep doing that.
# we'll either end up staying in the same spot, or back at the start (2,0), at which
# point we should then just go back up, or at (0,0), at which point we can continue
# on right.
# instead, let's penalize each movement so the agent will find a shorter route.
#
# grid = standard_grid()
grid = negative_grid(step_cost=-0.1)

# print rewards
print("rewards:")
print_values(grid.rewards, grid)

# no policy initialization, we will derive our policy from the most recent Q

# enumerate all (s,a) pairs, each will have its own weight in our "dumb" model
# essentially each weight will be a measure of Q(s,a) itself
states = grid.all_states()
for s in states:
    SA2IDX[s] = {}
    for a in ALL_POSSIBLE_ACTIONS:
        SA2IDX[s][a] = IDX
        IDX += 1
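# With SA2IDX built as above, every (s, a) pair owns one index, so a one-hot encoding
# makes the linear model equivalent to a plain Q-table. A minimal sketch, assuming
# numpy is imported as np and IDX now equals the total number of (s, a) pairs.
def sa2x(s, a):
    x = np.zeros(IDX)
    x[SA2IDX[s][a]] = 1.0
    return x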
import numpy as np from grid_world import standard_grid, negative_grid from iterative_policy_evaluation import print_values, print_policy SMALL_ENOUGH = 10e-4 GAMMA = 0.9 ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R') if __name__ == "__main__": grid = negative_grid(-.1) print("rewards: ") print_values(grid.rewards, grid) states = grid.all_state() print("\ninitial initalization: ") policy = {} for s in grid.actions.keys(): policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS) print_policy(policy, grid) # Value Initialization V = {} for s in states: V[s] = 0 while True: delta = 0 for s in states: old_Vs = V[s]
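            # The sweep above is cut off here; a hedged sketch of the Bellman optimality
            # update that would typically follow, using the grid_world conventions
            # (grid.set_state, grid.move returning a reward) seen in these snippets.
            if s in grid.actions:
                best_value = float('-inf')
                for a in ALL_POSSIBLE_ACTIONS:
                    grid.set_state(s)
                    r = grid.move(a)
                    v = r + GAMMA * V[grid.current_state()]
                    best_value = max(best_value, v)
                V[s] = best_value
                delta = max(delta, np.abs(old_Vs - V[s]))
        if delta < SMALL_ENOUGH:
            break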
                    action_value = reward + self.discount * self.state_values[next_state]
                    if action_value > best_action_value:
                        best_action_value = action_value
                        best_action = a
                    self.state_values[s] += p_a * action_value
                    self.env.undo_move(a)
                self.policy[s] = best_action
                deltas[i] = np.abs(self.state_values[s] - old_value)
            t += 1
        print("Number of iterations: ", t)
        grid_world.print_policy(self.policy, self.env)
        grid_world.print_values(self.state_values, self.env)


if __name__ == "__main__":
    optimizer = PolicyOptimizer(environment=grid_world.negative_grid())
    grid_world.print_policy(optimizer.policy, optimizer.env)
    grid_world.print_values(optimizer.state_values, optimizer.env)
    optimizer.perform_value_iteration(wind=None)

    # Windy Gridworld: each action has a 50% chance to fail, in which case another
    # action (chosen at random) is performed instead
    optimizer = PolicyOptimizer(environment=grid_world.negative_grid())
    grid_world.print_policy(optimizer.policy, optimizer.env)
    grid_world.print_values(optimizer.state_values, optimizer.env)
    optimizer.perform_value_iteration(wind='right', wind_force=0.26)  # 0.25 is the threshold at which the optimal policy switches
import numpy as np
from grid_world import standard_grid, negative_grid
from visualize import print_values, print_policies
from q_study_functional import action_eps_greedy, get_max_Q, get_Qs
from model import Model

GAMMA = 0.9
ALPHA = 0.1
ALL_POSSIBLE_ACTIONS = ('R', 'L', 'U', 'D')

if __name__ == "__main__":
    grid = negative_grid(step_cost=-50)
    print("rewards:")
    print_values(grid.rewards, grid.width, grid.height)

    model = Model()
    t = 1.0
    for it in range(30000):
        if it % 100 == 0:
            t += 1e-2
        if it % 1000 == 0:
            print("it: {}".format(it))

        # 0. initialize condition
        alpha = ALPHA / t
        current_grid = (3, 0)
        grid.set_initial_grid(current_grid)
        start_flag = True
        Qs = get_Qs(model, current_grid, ALL_POSSIBLE_ACTIONS)

        while not grid.game_over():
            if start_flag is True:
                # 1. choose random action
                action = np.random.choice(ALL_POSSIBLE_ACTIONS)
import numpy as np
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# next state and reward will now have some randomness
# you'll go in your desired direction with probability 0.5
# you'll go in a random direction a' != a with probability 0.5/3

if __name__ == '__main__':
    # this grid gives you a negative reward for every non-terminal state
    # we want to see if this will encourage finding a shorter path to the goal
    grid = negative_grid(step_cost=-1.0)
    # grid = negative_grid(step_cost=-0.1)
    # grid = standard_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    # we'll randomly choose an action and update as we learn
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    # initial policy
    print("initial policy:")
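# Hedged sketch of how the stochastic transition described in the comments above could
# be sampled when simulating the environment: the desired action with probability 0.5,
# each other action with probability 0.5/3. The helper name and signature are
# illustrative assumptions only.
def sample_windy_action(a, all_actions=ALL_POSSIBLE_ACTIONS):
    if np.random.random() < 0.5:
        return a
    other = [x for x in all_actions if x != a]
    return np.random.choice(other)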
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from utils import print_policy, print_values

ALL_POSSIBLE_ACTIONS = ['U', 'D', 'L', 'R']
SMALL_ENOUGH = 1e-3
GAMMA = 0.9

if __name__ == '__main__':
    # randomly initialize V and pi
    V = {}
    grid = negative_grid(-1.901)
    print('Rewards:')
    print_values(grid.rewards, grid)

    S = grid.all_states()
    for state in S:
        V[state] = 0

    pi = {}
    for state in grid.actions.keys():
        pi[state] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    print('Initial Policy')
    print_policy(pi, grid)
    print('Initial Vs')
    print_values(V, grid)

    iteration = 0
    while True:
    def grad(self, s, a):
        return self.sa2x(s, a)


def getQs(model, s):
    # we need Q(s,a) to choose an action
    Qs = {}
    for a in ACTIONS:
        q_sa = model.predict(s, a)
        Qs[a] = q_sa
    return Qs


if __name__ == '__main__':
    grid = negative_grid(step_cost=-0.1)

    # rewards
    print('rewards')
    print_values(grid.rewards, grid)

    # no policy initialization
    states = grid.all_states()
    for s in states:
        SA2IDX[s] = {}
        for a in ACTIONS:
            SA2IDX[s][a] = IDX
            IDX += 1

    # initialize model
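# Hedged usage sketch: during an episode, getQs supplies the model's Q(s, a) estimates
# for the current state so an epsilon-greedy action can be chosen. max_dict and
# random_action are assumed course-style helpers, not defined in this snippet.
s = grid.current_state()
Qs = getQs(model, s)
a, _ = max_dict(Qs)            # greedy action under the current model
a = random_action(a, eps=0.1)  # occasionally explore instead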
                r = grid.move(other_action)
                mean_v += p * (r + gamma * steps_values[grid.current_state()])
            if mean_v > best_val:
                best_val = mean_v
                new_action = candidate_action
        if new_action != current_action:
            improved_policy[s] = new_action
            is_converged = False
    return is_converged, improved_policy


if __name__ == "__main__":
    # grid = standard_grid()
    grid = negative_grid(-1.0)
    print_values(grid.rewards, grid)

    # Step 1: initialize policy and state values
    policy = dict(
        zip(grid.actions.keys(),
            np.random.choice(ALL_POSSIBLE_ACTIONS, len(grid.actions.keys()))))
    V = defaultdict(int)
    for s in grid.actions:
        V[s] = np.random.random()

    print("Initialized values:")
    print_values(V, grid)
    print("Initialized policy:")
    print_policy(policy, grid)
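    # Hedged sketch of the policy-evaluation sweep (Step 2) that would typically run
    # before the improvement function above is called again; grid.set_state and
    # grid.move follow the grid_world conventions used in these snippets, and the
    # discount 0.9 and threshold 1e-3 are assumptions.
    while True:
        biggest_change = 0
        for s in grid.actions:
            old_v = V[s]
            grid.set_state(s)
            r = grid.move(policy[s])
            V[s] = r + 0.9 * V[grid.current_state()]
            biggest_change = max(biggest_change, abs(old_v - V[s]))
        if biggest_change < 1e-3:
            break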
import numpy as np
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

threshold = 1e-3
gamma = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# Add some randomness

if __name__ == "__main__":
    grid = negative_grid(step_cost=-1.0)
    print('Rewards:')
    print_values(grid.rewards, grid)
    print('')

    policy = {}  # Randomly choose an action and update as we learn
    for state in grid.actions:
        policy[state] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print('Initial Policy:')
    print_policy(policy, grid)

    V = {}
    states = grid.all_states()
    for state in states:
        # V(s) = 0
        if state in grid.actions:
            V[state] = np.random.random()
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
import numpy as np
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# this is deterministic
# all p(s',r|s,a) = 1 or 0

if __name__ == '__main__':
    # this grid gives you a reward of -0.1 for every non-terminal state
    # we want to see if this will encourage finding a shorter path to the goal
    grid = negative_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    # we'll randomly choose an action and update as we learn
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)

    # initial policy
    print("initial policy:")
    print_policy(policy, grid)
# Glenn Thomas
# 2020-06-01
# TD (temporal difference) examples

import grid_world as gw
import numpy as np
import td
import dynamic_programming_functions as dp
import matplotlib.pyplot as plt

# Set up
g = gw.negative_grid(step_reward=-0.1)
g.windy = 0.5

# Set up policies
fixed_policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
}

actions = g.actions_array
for key, value in fixed_policy.items():
    probs = np.zeros(len(actions))
    probs[np.isin(actions, value)] = 1 / len(value)
    fixed_policy[key] = probs
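# Hedged sketch: once fixed_policy maps each state to a probability vector over
# g.actions_array, an action can be sampled like this (assumes the ordering of
# `actions` matches the probability vectors built above).
def sample_action(state):
    return np.random.choice(actions, p=fixed_policy[state])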
import numpy as np
import matplotlib.pyplot as plt
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy

SMALL_ENOUGH = 1e-3
GAMMA = 0.9
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

if __name__ == '__main__':
    grid = negative_grid(step_cost=-1.0)  # try standard_grid() as well
    print("rewards:")
    print_values(grid.rewards, grid)

    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    print("initial policy:")
    print_policy(policy, grid)

    V = {}
    states = grid.all_states()
    for s in states:
        if s in grid.actions:
            V[s] = np.random.random()
        else:
            # terminal state
            V[s] = 0