def policy_iteration(T, r, tot_states=12, gamma=0.999, epsilon=0.01):
    iteration = 0
    # initialize a random policy
    p = np.random.randint(0, 4, size=tot_states).astype(np.float32)
    # state 5 is the obstacle; the agent can never enter it
    p[5] = np.nan
    # do nothing in the terminal states
    p[3] = p[7] = -1
    # initial state-value vector
    u = np.zeros(tot_states)

    while True:
        iteration += 1
        # 1 - policy evaluation: estimate the state values of the current policy
        u_0 = u.copy()
        u = return_policy_evaluation(p, u, r, T, gamma)
        # check whether it is time to stop
        delta = np.absolute(u - u_0).max()
        if delta < epsilon * (1 - gamma) / gamma:
            break
        for s in range(tot_states):
            if not np.isnan(p[s]) and not p[s] == -1:
                v = np.zeros((1, tot_states))
                v[0, s] = 1.0
                # 2 - policy improvement: take the greedy action under the current value estimates
                p[s] = return_expected_action(u, T, v)
        print_policy(p, shape=(3, 4))

    print_result(u, iteration, delta, gamma, epsilon)
    return p, u
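policy_iteration leans on two helpers that are not shown here, return_policy_evaluation and return_expected_action. A minimal sketch of what they might look like, assuming T has shape (tot_states, tot_states, 4) with T[s, s', a] = p(s' | s, a) as in the main() below; the bodies are an illustration, not the original implementations:

import numpy as np

def return_policy_evaluation(p, u, r, T, gamma):
    # One sweep of iterative policy evaluation for the fixed policy p (sketch).
    for s in range(len(u)):
        if np.isnan(p[s]):
            continue  # unreachable obstacle state
        if p[s] == -1:
            u[s] = r[s]  # terminal states keep their reward
        else:
            v = np.zeros((1, len(u)))
            v[0, s] = 1.0
            a = int(p[s])
            # Bellman expectation backup for the action prescribed by the policy
            u[s] = r[s] + gamma * np.sum(np.multiply(u, np.dot(v, T[:, :, a])))
    return u

def return_expected_action(u, T, v):
    # Greedy action with respect to the current value estimates u (sketch).
    # v is a one-hot row vector selecting the state of interest.
    actions_array = np.zeros(4)
    for a in range(4):
        actions_array[a] = np.sum(np.multiply(u, np.dot(v, T[:, :, a])))
    return np.argmax(actions_array)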
def main():
    grid = standard_grid(obey_prob=1.0, step_cost=None)
    print_values(grid.rewards, grid)
    V, policy, deltas = monte_carlo(grid)
    print_values(V, grid)
    print_policy(policy, grid)
    plt.plot(deltas)
    plt.show()
def visit(inst, s, solved, values):
    # TODO: add your code here.
    # Make use of compute_greedy_action_and_value, sample_successor, and
    # check_solved.
    # Return updated labeling solved and updated value function values.
    pass


def lrtdp(inst, values):
    """Run the LRTDP algorithm until it converges."""
    solved = {s: False for s in inst.states}
    iteration = 1
    while not solved[inst.init]:
        wait_for_input("Press enter for iteration {} of LRTDP...".format(iteration))
        solved, values = visit(inst, inst.init, solved, values)
        print("Values after iteration {}:".format(iteration))
        print_values(inst, values)
        print("Solved after iteration {}:".format(iteration))
        print_solved(inst, solved)
        iteration += 1
    return values


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'algorithm', choices=['rtdp', 'lrtdp'], help="Choose the algorithm."
    )
    args = parser.parse_args()

    inst = instance.get_example_instance()
    print(inst)

    values = {s: heuristic(inst, s) for s in inst.states}
    print("")
    print("Initial state-values:")
    print_values(inst, values)

    if args.algorithm == 'rtdp':
        values = rtdp(inst, values)
    elif args.algorithm == 'lrtdp':
        values = lrtdp(inst, values)
    else:
        sys.exit("Unknown algorithm")

    print("")
    print("Final values:")
    print_values(inst, values)

    policy = get_greedy_policy(inst, values)
    print("Corresponding final policy:")
    print_policy(inst, policy)
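The TODO in visit is the LRTDP trial plus the solved-labeling pass. One possible sketch is below; the exact signatures of compute_greedy_action_and_value, sample_successor, and check_solved, as well as the inst.is_goal test, are assumptions here rather than part of the exercise code:

def visit(inst, s, solved, values):
    # Sketch only: helper signatures and inst.is_goal are assumed, not given.
    visited = []
    # Greedy trial: follow greedy actions until a solved (or goal) state is reached.
    while not solved[s]:
        visited.append(s)
        if inst.is_goal(s):  # assumed goal test on the instance
            break
        # Bellman backup at s and greedy action selection.
        action, value = compute_greedy_action_and_value(inst, s, values)
        values[s] = value
        # Sample a successor of s under the greedy action.
        s = sample_successor(inst, s, action)
    # Label states on the trial as solved, starting from the end of the trial.
    while visited:
        s = visited.pop()
        # check_solved is assumed to update the labeling (and possibly values)
        # in place and to return True once the value of s has converged.
        if not check_solved(inst, s, solved, values):
            break
    return solved, values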
def main():
    grid = standard_grid(obey_prob=1.0, step_cost=None)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    V, policy, deltas = monte_carlo(grid)

    print("final values:")
    print_values(V, grid)
    print("final policy:")
    print_policy(policy, grid)

    plt.plot(deltas)
    plt.show()
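Several of these snippets call print_values and print_policy on the gridworld. A rough sketch of what such helpers typically look like, assuming the grid object exposes rows and cols and that V and P are dicts keyed by (row, col); this is an illustration, not the project's own implementation:

def print_values(V, grid):
    # Print the value of each cell, row by row (sketch).
    for i in range(grid.rows):
        print("---------------------------")
        for j in range(grid.cols):
            v = V.get((i, j), 0.0)
            # pad positive values so columns line up with negative ones
            print((" %.2f|" if v >= 0 else "%.2f|") % v, end="")
        print("")

def print_policy(P, grid):
    # Print the chosen action for each cell, row by row (sketch).
    for i in range(grid.rows):
        print("---------------------------")
        for j in range(grid.cols):
            a = P.get((i, j), ' ')
            print("  %s  |" % a, end="")
        print("")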
def main():
    # Transition-probability matrix for s -> s' under action a: p(s, s', a) = T[s, s', a]
    T = np.load("T.npy")
    # Reward received by the agent in each state
    r = np.array([-0.04, -0.04, -0.04, +1.0,
                  -0.04,   0.0, -0.04, -1.0,
                  -0.04, -0.04, -0.04, -0.04])
    # run the Value Iteration algorithm to obtain the state-value vector
    u = value_iteration(T, r)
    # run the Policy Iteration algorithm to obtain the policy and state-value vector
    p, u = policy_iteration(T, r)
    print_policy(p, shape=(3, 4))
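value_iteration is referenced but not shown. A minimal sketch under the same conventions as policy_iteration above (T of shape (12, 12, 4), four actions, reward collected in the current state); treat it as an illustration rather than the original code:

import numpy as np

def value_iteration(T, r, tot_states=12, gamma=0.999, epsilon=0.01):
    u = np.zeros(tot_states)
    while True:
        u_0 = u.copy()
        for s in range(tot_states):
            v = np.zeros((1, tot_states))
            v[0, s] = 1.0
            # Bellman optimality backup: best expected utility over the four actions
            u[s] = r[s] + gamma * max(
                np.sum(np.multiply(u_0, np.dot(v, T[:, :, a]))) for a in range(4)
            )
        delta = np.absolute(u - u_0).max()
        if delta < epsilon * (1 - gamma) / gamma:
            return u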
                Q[s][a] = np.mean(returns[sa])
                biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
                seen_state_action_pairs.add(sa)
        deltas.append(biggest_change)

        # policy improvement: pi(s) = argmax_a Q(s, a)
        for s in policy.keys():
            a, _ = max_dict(Q[s])
            policy[s] = a

    # state values for reporting: V(s) = max_a Q(s, a)
    V = {}
    for s in policy.keys():
        V[s] = max_dict(Q[s])[1]

    return V, policy, deltas


if __name__ == '__main__':
    grid = standard_grid(obey_prob=1.0, step_cost=None)
    print("rewards:")
    print_values(grid.rewards, grid)

    V, policy, deltas = monte_carlo(grid)

    print("final values:")
    print_values(V, grid)
    print("final policy:")
    print_policy(policy, grid)

    plt.plot(deltas)
    plt.show()
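max_dict is called as `a, _ = max_dict(Q[s])` and `max_dict(Q[s])[1]`, so it has to return the (argmax action, max value) pair of an action-value dictionary. A minimal sketch consistent with that usage:

def max_dict(d):
    # Return the (key, value) pair with the largest value (sketch).
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val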
    # append the terminal state
    memory.append((observation_, action, reward))

    # work backwards through the episode to compute the return G_t for each step
    returns = 0
    last = True  # start at t = T - 1
    for state, action, reward in reversed(memory):
        if last:
            last = False
        else:
            states_actions_returns.append((state, action, returns))
        returns = DISCOUNT * returns + reward
    states_actions_returns.reverse()

    # first-visit Monte Carlo update of the action-value estimates
    states_and_actions = []
    for state, action, returns in states_actions_returns:
        if (state, action) not in states_and_actions:
            PAIRS_VISITED[(state, action)] += 1
            RETURNS[(state, action)] += ((1 / PAIRS_VISITED[(state, action)])
                                         * (returns - RETURNS[(state, action)]))
            ESTIMATES[(state, action)] = RETURNS[(state, action)]
            states_and_actions.append((state, action))

            # greedy policy improvement at the visited state
            values = np.array(
                [ESTIMATES[(state, a)] for a in GRID.possible_actions])
            best = np.argmax(values)
            POLICY[state] = GRID.possible_actions[best]

print_estimates(ESTIMATES, GRID)
print_policy(POLICY, GRID)
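The RETURNS update above is the incremental form of the sample mean, so each estimate equals the plain average of all first-visit returns observed so far. A quick self-contained check of that identity (the numbers are arbitrary):

import numpy as np

sampled_returns = [1.0, 0.5, -0.25, 2.0]  # arbitrary example returns

running_mean = 0.0
for n, g in enumerate(sampled_returns, start=1):
    # same update as RETURNS[(s, a)] += (1 / N) * (G - RETURNS[(s, a)])
    running_mean += (1.0 / n) * (g - running_mean)

assert np.isclose(running_mean, np.mean(sampled_returns))
print(running_mean)  # 0.8125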
for t in range(1, N):
    if t % 1000 == 0:
        print(t)

    # start every episode from the same initial state
    s = (2, 0)
    grid.set_state(s)
    a, _ = greedy_from(Q[s])
    a = random_action(a, eps=0.1)

    while not grid.game_over():
        # epsilon-greedy exploration with a decaying epsilon
        a = random_action(a, eps=(1.0 / t))
        r = grid.move(a)
        s_prime = grid.current_state()
        a_prime, _ = greedy_from(Q[s_prime])

        q_sa = Q[s][a]
        num_seen_sa[(s, a)] += 1
        # print('Learning Rate: ', ALPHA / num_seen_sa[(s, a)])
        Q[s][a] = q_sa + (ALPHA / num_seen_sa[(s, a)]) * (r + GAMMA * Q[s_prime][a_prime] - q_sa)
        delta = np.abs(Q[s][a] - q_sa)
        deltas.append(delta)

        s = s_prime
        a = a_prime

# extract the greedy policy from the learned Q
pi = {}
for s in S:
    if not grid.is_terminal(s):
        pi[s], _ = greedy_from(Q[s])

print('Results:')
print_policy(pi, grid)
print(len(deltas))

plt.plot(deltas)
plt.show()
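greedy_from and random_action are not defined in this excerpt. Given how they are used, a plausible sketch is the usual argmax-over-a-dict plus epsilon-greedy exploration; ALL_POSSIBLE_ACTIONS and its action labels are assumptions here:

import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')  # assumed action set

def greedy_from(q_s):
    # Greedy (action, value) pair from a dict of action-values (sketch).
    best_a, best_q = None, float('-inf')
    for a, q in q_s.items():
        if q > best_q:
            best_a, best_q = a, q
    return best_a, best_q

def random_action(a, eps=0.1):
    # Epsilon-greedy: keep a with probability 1 - eps, otherwise act uniformly at random (sketch).
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)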
        observation = observation_
    memory.append((observation, action, reward))

    # work backwards through the episode, maintaining the importance-sampling weight
    returns = 0
    relative_probability = 1
    last = True
    for (state, action, reward) in reversed(memory):
        if last:
            last = False
        else:
            # weighted importance sampling update of the action-value estimate
            C_ESTIMATES[(state, action)] += relative_probability
            ESTIMATES[(state, action)] += (
                (relative_probability / C_ESTIMATES[(state, action)])
                * (returns - ESTIMATES[(state, action)])
            )

            # greedy improvement of the target policy
            vals = np.array([
                ESTIMATES[(state, a)] for a in GRID.possible_actions
            ])
            argmax = np.argmax(vals)
            TARGET_POLICY[state] = GRID.possible_actions[argmax]

            # once the behavior action disagrees with the target policy, the
            # importance weight of the remaining prefix is zero, so stop early
            if action != TARGET_POLICY[state]:
                break

            # divide by the behavior policy's probability of the action taken
            if len(behavior_policy[state]) == 1:
                prob = 1 - EPSILON
            else:
                prob = EPSILON / len(behavior_policy[state])
            relative_probability *= 1 / prob

        returns = DISCOUNT * returns + reward

print_policy(TARGET_POLICY, GRID)
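The pair of updates `C += W` and `Q += (W / C) * (G - Q)` is the incremental form of weighted importance sampling, so the estimate equals sum(W_i * G_i) / sum(W_i) over the observed returns. A small numeric check of that identity (weights and returns are arbitrary):

import numpy as np

weights = [1.0, 2.0, 0.5, 4.0]           # arbitrary importance weights W_i
sampled_returns = [1.0, -1.0, 0.5, 2.0]  # arbitrary returns G_i

c, q = 0.0, 0.0
for w, g in zip(weights, sampled_returns):
    # same updates as C_ESTIMATES += W and ESTIMATES += (W / C) * (G - ESTIMATES)
    c += w
    q += (w / c) * (g - q)

assert np.isclose(q, np.dot(weights, sampled_returns) / np.sum(weights))
print(q)  # 0.9666...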
def calculate_greedy_policy(grid, V):
    P = dict()
    # start from an arbitrary action in every non-terminal state
    for state in grid.non_terminal_states():
        P[state] = np.random.choice(ALL_POSSIBLE_ACTIONS)
    # then overwrite each entry with the greedy action under V
    for state in grid.non_terminal_states():
        best_action, _ = best_action_value(grid, V, state)
        P[state] = best_action
    return P


if __name__ == '__main__':
    grid = standard_grid(obey_prob=0.8, step_cost=None)

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # calculate accurate values for each square
    V = calculate_values(grid)

    # calculate the optimum policy based on our values
    P = calculate_greedy_policy(grid, V)

    # our goal here is to verify that we get the same answer as with policy iteration
    print("values:")
    print_values(V, grid)
    print("policy:")
    print_policy(P, grid)
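best_action_value and calculate_values are not shown. A sketch of the one-step greedy lookahead consistent with a stochastic grid (obey_prob=0.8); the transition-query method grid.get_transition_probs and the GAMMA constant are assumptions about the surrounding code:

def best_action_value(grid, V, s):
    # One-step lookahead: return the action maximizing expected reward plus
    # discounted next-state value (sketch).
    best_a, best_value = None, float('-inf')
    grid.set_state(s)
    for a in ALL_POSSIBLE_ACTIONS:
        # assumed to return a list of (probability, reward, next_state) triples
        transitions = grid.get_transition_probs(a)
        expected_r = 0.0
        expected_v = 0.0
        for (prob, r, state_prime) in transitions:
            expected_r += prob * r
            expected_v += prob * V[state_prime]
        v = expected_r + GAMMA * expected_v
        if v > best_value:
            best_value = v
            best_a = a
    return best_a, best_value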
    # Convert state path:
    path_1d = gw.convert_state_log_2d_to_1d(path_2d)
    dem_paths.append(path_1d)

utils.print_value(v_states)

# Create a grid to show the optimal policy:
# 0: stay, 1: north, 2: east, 3: south, 4: west
grid_pol = copy.copy(gw.grid)
for s_i in range(grid_pol.shape[0]):
    for s_j in range(grid_pol.shape[1]):
        s_1d = gw.convert_state_2d_to_1d((s_i, s_j))
        a_opt = policy_opt[s_1d]
        grid_pol[s_i, s_j] = a_opt
utils.print_policy(grid_pol, gw.actions_2d)

# for path_1d in dem_paths:
#     grid_path = copy.copy(gw.grid)
#     steps = 0
#     for state_1d in path_1d:
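The loop above relies on gw.convert_state_2d_to_1d. Assuming the usual row-major flattening of the grid (a guess about gw's convention, not taken from its code), that mapping is simply:

def convert_state_2d_to_1d(state_2d, n_cols):
    # Row-major flattening: (row, col) -> row * n_cols + col (assumed convention).
    i, j = state_2d
    return i * n_cols + j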
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('algorithm', choices=['rtdp', 'lrtdp'], help="Choose the algorithm.") args = parser.parse_args() inst = instance.get_example_instance() print(inst) values = {s: heuristic(inst, s) for s in inst.states} print("") print("Initial state-values:") print_values(inst, values) if args.algorithm == 'rtdp': values = rtdp(inst, values) elif args.algorithm == 'lrtdp': values = lrtdp(inst, values) else: sys.exit("Unknown algorithm") print("") print("Final values:") print_values(inst, values) policy = get_greedy_policy(inst, values) print("Corresponding final policy:") print_policy(inst, policy)