def one_random_step(i, j, sim):
    # pick one of the allowed actions at state (i, j) uniformly at random
    A = return_pointwise_A(i, j, sim)
    nA = np.size(A, 0)
    chosen_action_id = np.random.randint(low=0, high=nA)
    action = A[chosen_action_id]
    iprime, jprime = np.array([i, j]) + action
    return iprime, jprime
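# For context, a minimal sketch of what return_pointwise_A might look like.
# The real helper is defined elsewhere in this repository; the version below
# is an assumption only: it enumerates single-site displacements of -1, 0, +1
# for each particle on a chain of n sites, keeps both indices inside
# [0, n-1], and (when sim is False) forbids both particles from moving in
# the same step. The hole jumps of +/-7 used in one_directed_step are
# constructed there by hand and are not produced by this sketch.
import numpy as np

def return_pointwise_A_sketch(i, j, sim, n=10):
    # n = 10 is assumed from the hard-coded sites 0..9 used elsewhere
    A = []
    for di in (-1, 0, 1):
        for dj in (-1, 0, 1):
            if not sim and di != 0 and dj != 0:
                continue  # simultaneous moves are disallowed
            if 0 <= i + di < n and 0 <= j + dj < n:
                A.append([di, dj])
    return np.array(A)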
def one_step(i, j, pi, sim):
    A = return_pointwise_A(i, j, sim)
    chosen_action_id = int(pi[i, j])  # pi stores action ids as floats
    action = A[chosen_action_id]
    iprime, jprime = np.array([i, j]) + action

    # checking for overlaps
    if iprime == jprime:  # if the action is allowed; this part can be modified
        overlap = True
    else:
        overlap = False

    # checking for swaps
    # swap is in any case always False when particles can move
    # simultaneously, i.e. when sim == True
    swap = False
    if (i == j - 1 and jprime == iprime - 1):
        swap = True
    if (i == j + 1 and jprime == iprime + 1):
        swap = True
    # wrap-around swaps through the hole connecting sites 2 and 9
    if (i == 2 and j == 9 and iprime == 9 and jprime == 2):
        swap = True
    if (i == 9 and j == 2 and iprime == 2 and jprime == 9):
        swap = True

    if swap or overlap:
        # the move is rejected: both particles stay where they are
        iprime = i
        jprime = j

    return iprime, jprime
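# Hypothetical usage of one_step: roll out a short trajectory under a given
# policy. The start state (4, 5), sim=True, and the helper name demo_rollout
# are illustrative choices, not part of the original code.
def demo_rollout(pi, nr_steps=10):
    i, j = 4, 5
    for t in range(nr_steps):
        i, j = one_step(i, j, pi, sim=True)
        print("t = " + str(t) + ": i = " + str(i) + ", j = " + str(j))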
def one_directed_step(i, j, sim):
    # site 9 is the hole; from there a particle can jump to site 2
    # (a displacement of -7)
    i_in_the_hole = (i == 9)
    j_in_the_hole = (j == 9)

    A = return_pointwise_A(i, j, sim)
    nA = np.size(A, 0)
    rand = np.random.rand()

    # check which of the "progressive" actions (i up and/or j down)
    # are currently allowed
    exist_p_m = exist_p_0 = exist_0_m = False
    for k in range(0, nA):
        if np.array_equal(A[k], np.array([1, -1])):
            exist_p_m = True
        if np.array_equal(A[k], np.array([1, 0])):
            exist_p_0 = True
        if np.array_equal(A[k], np.array([0, -1])):
            exist_0_m = True

    # the order of the if statements is important here:
    # each later branch overwrites the action chosen by an earlier one.
    if exist_p_0:
        action = np.array([1, 0])
        if j_in_the_hole:
            action = np.array([1, -7])
    if exist_0_m:
        action = np.array([0, -1])
        if i_in_the_hole:
            action = np.array([-7, -1])
    if exist_p_m:
        action = np.array([1, -1])

    if exist_p_0 or exist_0_m or exist_p_m:
        # with probability 0.2, explore with a random action instead of
        # the directed one chosen above
        if rand < 0.2:
            chosen_action_id = np.random.randint(low=0, high=nA)
            action = A[chosen_action_id]
    else:
        # no progressive action is available: fall back to a random move
        chosen_action_id = np.random.randint(low=0, high=nA)
        action = A[chosen_action_id]

    print("action")
    print(action)
    iprime, jprime = np.array([i, j]) + action
    if iprime == jprime:
        # reject overlapping moves
        iprime = i
        jprime = j
    print("iprime = " + str(iprime) + " jprime = " + str(jprime))
    return iprime, jprime
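# Design note: one_directed_step is an epsilon-greedy heuristic with
# epsilon = 0.2. With probability 0.2 it explores with a uniformly random
# allowed action; otherwise it takes the "progressive" action that moves i
# up and/or j down, using the hole at site 9 as a shortcut back to site 2.
# The priority order of the exist_* branches matters, since each later
# branch overwrites the previously chosen action.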
v = np.zeros(shape=(n, n))

# 3.3 the main iterative loop
# Eq. (4.7) of the book: each step is the Bellman operation for policy
# evaluation, followed by a policy-improvement step.
step = 0
while step < 500:
    new_pi = np.zeros(shape=(n, n))

    # policy evaluation
    v = Bellmann_iteration(pi, r, v, gamma, sim)

    # policy improvement
    for i in range(0, n):
        for j in range(0, n):
            Q_max = -1000  # sentinel lower bound for the maximization
            A = return_pointwise_A(i, j, sim)
            nr_actions = np.size(A, 0)
            # iterate over all candidate actions to find the largest Q
            for candidate_action_id in range(0, nr_actions):
                Q = Q_estimation_for_state_s(i, j, gamma, r, v,
                                             candidate_action_id, sim)
                if Q >= Q_max:
                    Q_max = Q
                    new_pi[i, j] = candidate_action_id

    pi = new_pi + 0.0  # copy, so pi and new_pi do not share memory
    step += 1
    if step % 100 == 1:
        print("#iteration: " + str(step - 1))

#### 4. printing and saving ####
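# For reference, a minimal sketch of the one-step lookahead that
# Q_estimation_for_state_s is assumed to perform (the real function is
# defined elsewhere): Q(s, a) = r(s') + gamma * v(s'), where s' is the state
# reached from (i, j) by the candidate action. The overlap/swap rejection
# applied in one_step is omitted here for brevity.
def Q_estimation_sketch(i, j, gamma, r, v, candidate_action_id, sim):
    A = return_pointwise_A(i, j, sim)
    iprime, jprime = np.array([i, j]) + A[candidate_action_id]
    return r[iprime, jprime] + gamma * v[iprime, jprime]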
def one_step(i, j, pi, sim):
    # take the action prescribed by the policy pi at state (i, j)
    A = return_pointwise_A(i, j, sim)
    chosen_action_id = int(pi[i, j])  # pi stores action ids as floats
    action = A[chosen_action_id]
    iprime, jprime = np.array([i, j]) + action
    return iprime, jprime
# 3.3 the main iterative loop
# Eq. (4.7) of the book: each step is the Bellman operation for policy
# evaluation, followed by a policy-improvement step.
step = 0
while step < nr_iterations:
    new_pi = np.zeros(shape=(n, n))

    # policy evaluation
    v = Bellmann_iteration(pi, v, gamma)

    # policy improvement
    for i in range(0, n):
        for j in range(0, n):
            Q_max = -1000  # sentinel lower bound for the maximization
            A = return_pointwise_A(i, j)
            nr_actions = np.size(A, 0)
            # iterate over all candidate actions to find the largest Q
            for action_id in range(0, nr_actions):
                Q = Q_estimate(i, j, action_id, gamma, v)
                if Q >= Q_max:
                    Q_max = Q
                    new_pi[i, j] = action_id

    pi = new_pi + 0.0  # copy, so pi and new_pi do not share memory
    plotter(ax, v)
    step += 1
    if step % 100 == 1:
        print("#iteration: " + str(step - 1))

simulate(4, 5, pi)
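# A minimal sketch of what simulate(4, 5, pi) might do, assuming it rolls
# out the greedy policy from the start state (4, 5) and prints the visited
# states. The real simulate is defined elsewhere and may also animate or
# plot; the sim=True argument below is an assumption.
def simulate_sketch(i, j, pi, nr_steps=20):
    for t in range(nr_steps):
        i, j = one_step(i, j, pi, sim=True)
        print("t = " + str(t) + ": (" + str(i) + ", " + str(j) + ")")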