def main():
    # Check for the image output directory and create it if needed
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
    print()
    draw_grid_world_image(values, 'images/first_visit_mc_state_values.png', GRID_HEIGHT, GRID_WIDTH)
    print()

    values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
    print()
    draw_grid_world_image(values, 'images/every_visit_mc_state_values.png', GRID_HEIGHT, GRID_WIDTH)
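# The first_visit_mc_prediction call above is defined elsewhere in the repository.
# As a rough reference only, a minimal first-visit Monte Carlo prediction loop could
# look like the sketch below. The env interface assumed here (reset() -> state,
# step(action) -> (next_state, reward, done, info), action_space.num_actions) and the
# uniformly random behavior policy are assumptions, not the actual GridWorld API.
import random
from collections import defaultdict

def first_visit_mc_prediction_sketch(env, gamma, num_episodes):
    values = defaultdict(float)
    returns = defaultdict(list)

    for _ in range(num_episodes):
        # Generate one episode under a uniformly random behavior policy (assumption).
        episode = []
        state = env.reset()
        done = False
        while not done:
            action = random.randrange(env.action_space.num_actions)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, reward))
            state = next_state

        # Walk the episode backwards; update only the first visit of each state.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            s, r = episode[t]
            G = gamma * G + r
            if s not in [x[0] for x in episode[:t]]:
                returns[s].append(G)
                values[s] = sum(returns[s]) / len(returns[s])

    return values, returns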
def main():
    # Check for the image output directory and create it if needed
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    # Number of steps (n)
    n = 3
    # Step size
    alpha = 0.2
    # Number of episodes (one episode per iteration)
    episodes = 1000

    values = dict()
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            values[(i, j)] = 0.0

    for ep in range(episodes):
        temporal_difference(env, values, n, alpha)

    draw_grid_world_image(values, 'images/grid_world_fixed_params.png', GRID_HEIGHT, GRID_WIDTH)
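# For orientation, one episode of n-step TD prediction (Sutton & Barto, section 7.1)
# under a uniformly random behavior policy could be sketched as below. The env
# interface (reset/step returning (next_state, reward, done, info),
# action_space.num_actions) is an assumption; the repository's temporal_difference
# may be organised differently.
import random

def n_step_td_episode_sketch(env, values, n, alpha, gamma=1.0):
    state = env.reset()
    states = [state]
    rewards = [0.0]                      # dummy entry so rewards[t + 1] follows states[t]
    T, t = float('inf'), 0
    while True:
        if t < T:
            action = random.randrange(env.action_space.num_actions)
            next_state, reward, done, _ = env.step(action)
            states.append(next_state)
            rewards.append(reward)
            if done:
                T = t + 1
        tau = t - n + 1                  # the time whose state estimate is updated now
        if tau >= 0:
            G = sum(gamma ** (i - tau - 1) * rewards[i]
                    for i in range(tau + 1, min(tau + n, T) + 1))
            if tau + n < T:
                G += gamma ** n * values[states[tau + n]]
            values[states[tau]] += alpha * (G - values[states[tau]])
        if tau == T - 1:
            break
        t += 1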
def make_environment(environment_params):
    if environment_params['env_name'] == 'gridworld':
        from environments.gridworld import GridWorld
        env = GridWorld(environment_params['grid_size'])
    else:
        env = Environment(
            environment_params['env_name'],
            grid_size=environment_params['grid_size'],
            last_n=environment_params['last_n'],
            delta_preprocessing=environment_params['delta_preprocessing']
        )
    return env
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )

    runs = 10
    step_n = [1, 2, 4, 8, 16]
    data = np.zeros(shape=(len(step_n), MAX_EPISODES))

    for run in range(runs):
        print("RUNS: {0}".format(run))
        for idx_n, n in enumerate(step_n):
            Q = state_action_value(env)
            policy = generate_e_greedy_policy(env, EPSILON, Q)
            print("n={0} ".format(n), end=" ")
            _, episode_reward_list = n_step_sarsa(env, Q, policy, n)

            avg_episode_reward_list = []
            for episode in range(MAX_EPISODES):
                avg_episode_reward_list.append(
                    episode_reward_list[max(0, episode - 10):(episode + 1)].mean()
                )

            for idx in range(MAX_EPISODES):
                data[idx_n, idx] += avg_episode_reward_list[idx]
        print()

    data[:, :] /= runs

    marker = ['o', 'x', '.', 's', '*', '+', '|', '^', 'D', ' ']
    for idx_n, n in enumerate(step_n):
        plt.plot(
            range(0, MAX_EPISODES, 5), data[idx_n, ::5],
            marker=marker[idx_n], label='n = {0}'.format(step_n[idx_n])
        )
    plt.xlabel('Episode')
    plt.ylabel('Average reward per episode')
    plt.legend()
    plt.savefig('images/n_step_sarsa.png')
    plt.close()
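# generate_e_greedy_policy is defined elsewhere in the repository. A rough sketch of
# what such a helper typically builds is shown below. The layout
# (state -> (action list, probability array)) and the assumption that Q maps each
# state to an array of action values are guesses, not the repository's actual contract.
import numpy as np

def generate_e_greedy_policy_sketch(env, epsilon, Q):
    num_actions = env.action_space.num_actions  # assumed attribute
    policy = dict()
    for state, action_values in Q.items():
        # Spread epsilon uniformly, then give the greedy action the remaining mass.
        probs = np.full(num_actions, epsilon / num_actions)
        probs[int(np.argmax(action_values))] += 1.0 - epsilon
        policy[state] = (list(range(num_actions)), probs)
    return policy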
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    MC = OffPolicyMonteCarloControl(env)
    MC.off_policy_control()

    print_grid_world_policy(env, MC.target_policy)
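# OffPolicyMonteCarloControl encapsulates off-policy Monte Carlo control. The
# per-episode weighted importance-sampling update it presumably performs is roughly
# the incremental form below (Sutton & Barto, section 5.7). The data layout here
# (episode as a list of (state, action, reward); Q and C as dicts keyed by
# (state, action); behavior_prob(state, action) -> probability) is an assumption
# about the class internals, not its API.
def off_policy_mc_update_sketch(episode, Q, C, target_policy, behavior_prob, gamma=1.0):
    G, W = 0.0, 1.0
    for state, action, reward in reversed(episode):
        G = gamma * G + reward
        old_q = Q.get((state, action), 0.0)
        C[(state, action)] = C.get((state, action), 0.0) + W
        Q[(state, action)] = old_q + (W / C[(state, action)]) * (G - old_q)
        # Make the target policy greedy over the actions recorded for this state.
        candidate_actions = [a for (s, a) in Q if s == state]
        target_policy[state] = max(candidate_actions, key=lambda a: Q[(state, a)])
        if action != target_policy[state]:
            break  # the remaining importance weight would be zero
        W *= 1.0 / behavior_prob(state, action)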
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )

    Q = state_action_value(env)
    policy = generate_e_greedy_policy(env, EPSILON, Q)

    n_step_sarsa(env, Q, policy, 4)

    # Print the learned policy
    print_grid_world_policy(env, policy)
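# For reference, one episode of n-step Sarsa (Sutton & Barto, section 7.2) could be
# sketched as below. Q is assumed to map each state to a numpy array of action
# values, and the env interface (reset/step, action_space.num_actions) is an
# assumption; the repository's n_step_sarsa wraps a loop like this over many
# episodes and also returns per-episode rewards.
import random
import numpy as np

def n_step_sarsa_episode_sketch(env, Q, epsilon, n, alpha=0.5, gamma=1.0):
    def select_action(state):
        # Epsilon-greedy action selection with respect to the current Q.
        if random.random() < epsilon:
            return random.randrange(env.action_space.num_actions)
        return int(np.argmax(Q[state]))

    state = env.reset()
    action = select_action(state)
    states, actions, rewards = [state], [action], [0.0]
    T, t = float('inf'), 0
    while True:
        if t < T:
            next_state, reward, done, _ = env.step(actions[t])
            states.append(next_state)
            rewards.append(reward)
            if done:
                T = t + 1
            else:
                actions.append(select_action(next_state))
        tau = t - n + 1
        if tau >= 0:
            G = sum(gamma ** (i - tau - 1) * rewards[i]
                    for i in range(tau + 1, min(tau + n, T) + 1))
            if tau + n < T:
                G += gamma ** n * Q[states[tau + n]][actions[tau + n]]
            Q[states[tau]][actions[tau]] += alpha * (G - Q[states[tau]][actions[tau]])
        if tau == T - 1:
            break
        t += 1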
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )

    MC = MonteCarloControl(env)
    MC.exploring_start_control()

    with np.printoptions(precision=2, suppress=True):
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                print(i, j, ": UP, DOWN, LEFT, RIGHT", MC.policy[(i, j)][1])
            print()
def grid_world_policy_evaluation():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    # Run policy evaluation until the state values converge; the converged values are
    # saved as an image and the number of sweeps is returned
    state_values, iteration = compute_state_value(env)
    print('Policy evaluation --> state values converged after {} iterations'.format(iteration))
    print(state_values)

    draw_grid_world_image(np.round(state_values, decimals=2), 'images/state_values.png', GRID_HEIGHT, GRID_WIDTH)
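# compute_state_value is defined elsewhere in the repository. A minimal sketch of
# iterative policy evaluation for the equiprobable random policy is given below.
# The env helpers used here (env.height, env.width, env.terminal_states,
# env.get_next_state_and_reward(state, action), env.action_space.num_actions) are
# assumptions standing in for whatever interface the repository actually exposes.
import numpy as np

def compute_state_value_sketch(env, theta=1e-4, gamma=1.0):
    values = np.zeros((env.height, env.width))
    iteration = 0
    while True:
        new_values = np.zeros_like(values)
        for i in range(env.height):
            for j in range(env.width):
                if (i, j) in env.terminal_states:
                    continue
                # Expected one-step backup under the uniformly random policy.
                for action in range(env.action_space.num_actions):
                    (ni, nj), reward = env.get_next_state_and_reward((i, j), action)
                    new_values[i, j] += (1.0 / env.action_space.num_actions) * \
                                        (reward + gamma * values[ni, nj])
        iteration += 1
        if np.max(np.abs(new_values - values)) < theta:
            return new_values, iteration
        values = new_values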
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )

    Q = state_action_value(env)

    # Create the EPSILON-greedy target policy
    policy = generate_e_greedy_policy(env, EPSILON, Q)
    # Create the random behavior policy
    b = generate_random_policy(env)

    policy, episode_reward_list = n_step_off_policy_td(env, Q, policy, b, 2)

    # Print the learned policy
    print_grid_world_policy(env, policy)
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    MC = SoftPolicyMonteCarloControl(env)
    MC.soft_policy_control()

    with np.printoptions(precision=2, suppress=True):
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                print(
                    i, j, ": UP, DOWN, LEFT, RIGHT",
                    MC.policy[(i, j)][1],
                    env.action_space.ACTION_SYMBOLS[np.argmax(MC.target_policy[(i, j)][1])]
                )
            print()
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    state_action_values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print(" Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    state_action_values[((i, j), action)]
                ))
            print()
    print()

    state_action_values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print(" Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    state_action_values[((i, j), action)]
                ))
            print()
    print()
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    MC = OffPolicyMonteCarloPrediction(env)
    MC.off_policy_prediction()

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print(" Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    MC.state_action_values[((i, j), action)]
                ))
            print()
    print()
        iter_num_policy_evaluation = self.value_evaluation()
        print("*** Value evaluation [cumulative iterations until convergence: {0}] ***".format(iter_num_policy_evaluation))

        self.policy_setup()
        print("*** Policy setup complete ***")

        return self.state_values, self.policy


if __name__ == '__main__':
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    VI = ValueIteration(env)
    VI.start_iteration()

    print(VI.state_values, end="\n\n")

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print(i, j, VI.policy[i][j])  # UP, DOWN, LEFT, RIGHT
        print()
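# The core of ValueIteration.value_evaluation is a repeated max-backup sweep. A
# sketch of one such sweep is given below; env.get_next_state_and_reward,
# env.terminal_states, env.height/width and env.action_space.num_actions are assumed
# helpers, standing in for whatever the class actually uses, and values is a 2-D
# numpy array of state-value estimates updated in place.
import numpy as np

def value_iteration_sweep_sketch(env, values, gamma=1.0):
    delta = 0.0
    for i in range(env.height):
        for j in range(env.width):
            if (i, j) in env.terminal_states:
                continue
            # Back up this state with the maximum one-step return over actions.
            backups = []
            for action in range(env.action_space.num_actions):
                (ni, nj), reward = env.get_next_state_and_reward((i, j), action)
                backups.append(reward + gamma * values[ni, nj])
            best = max(backups)
            delta = max(delta, abs(best - values[i, j]))
            values[i, j] = best
    return delta

# Repeating this sweep until delta falls below a small threshold, then acting greedily
# with respect to values, yields the policy that the script above prints.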
# MAIN
if __name__ == '__main__':
    if not os.path.exists('images/'):
        os.makedirs('images/')

    value_function = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    draw_grid_world_image(np.round(value_function, decimals=0), 'images/empty_grid_world.png', GRID_HEIGHT, GRID_WIDTH)

    # Create the 5x5 grid world map
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=[],
        transition_reward=0,
        outward_reward=-1.0,
        warm_hole_states=[(A_POSITION, A_PRIME_POSITION, 10.0), (B_POSITION, B_PRIME_POSITION, 5.0)]
    )
    env.reset()

    state_values = grid_world_state_values(env)
    print(state_values)

    draw_grid_world_image(
        np.round(state_values, decimals=2),
        'images/grid_world_state_values.png',
        GRID_HEIGHT, GRID_WIDTH
    )

    print()
    env.reset()
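# The warm_hole_states argument above encodes the classic 5x5 example (Sutton &
# Barto, example 3.5): every action taken in A jumps to A' with reward +10, every
# action in B jumps to B' with reward +5, moves off the grid leave the state
# unchanged and pay outward_reward, and all other moves pay transition_reward (0).
# The function below is a standalone illustration of those dynamics, not the
# repository GridWorld's actual step logic; the action ordering in moves is assumed.
def warm_hole_step_sketch(state, action, warm_hole_states, height, width,
                          moves=((-1, 0), (1, 0), (0, -1), (0, 1))):
    for source, target, special_reward in warm_hole_states:
        if state == source:
            return target, special_reward       # warm hole: jump and collect the bonus
    i, j = state
    di, dj = moves[action]                      # assumed order: UP, DOWN, LEFT, RIGHT
    ni, nj = i + di, j + dj
    if 0 <= ni < height and 0 <= nj < width:
        return (ni, nj), 0.0                    # ordinary move: transition_reward
    return (i, j), -1.0                         # off the grid: stay put, outward_reward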
            marker = '+'
            color = 'green'

        plt.plot(step_sizes, performace[method, :],
                 linestyle=linestyle, color=color, marker=marker, label=label)

    plt.xlabel('Step size (alpha)')
    plt.ylabel('Cumulative reward per episode')
    plt.legend()
    plt.savefig('images/cumulative_rewards_for_step_size.png')
    plt.close()


if __name__ == '__main__':
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=START_STATE,
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0,
        warm_hole_states=[(s, START_STATE, -100.0) for s in CLIFF_STATES]
    )

    cumulative_rewards_for_episodes(env)
    cumulative_rewards_for_step_size(env)
from environments.gridworld import GridWorld
from agents.agent import Agent
from controls.control import Sarsa, QLearning, ExpectedSarsa
from utils.train import Train
from utils.comparator import Comparator
from policies.action_value import TabularActionValue

# Train an agent in an episodic gridworld with windy tiles using the Sarsa,
# QLearning and ExpectedSarsa control algorithms
trainings = []
controls = [Sarsa(), QLearning(), ExpectedSarsa()]
for control in controls:
    game = GridWorld(level=1)
    action_value = TabularActionValue(game.get_states(), game.get_actions())
    agent = Agent(game, action_value)
    train = Train(agent, game, control)
    train.train()
    trainings.append(train)

# Compare the episodic rewards obtained by the agent during training for the
# different control algorithms
comp = Comparator(*trainings, smoothing=30)
comp.compare_rewards()
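# The three control algorithms compared above differ only in the bootstrap target
# computed from the next state's action values. The function below illustrates that
# difference on a plain numpy array; it is an explanatory sketch, not the
# Sarsa/QLearning/ExpectedSarsa API of this repository.
import numpy as np

def td_targets_sketch(q_next, next_action, epsilon):
    # q_next: 1-D array of action values for the next state; next_action: the action
    # actually sampled by the epsilon-greedy behavior policy.
    n = len(q_next)
    probs = np.full(n, epsilon / n)
    probs[int(np.argmax(q_next))] += 1.0 - epsilon
    return {
        'sarsa': float(q_next[next_action]),          # on-policy sampled action value
        'q_learning': float(q_next.max()),            # greedy maximum
        'expected_sarsa': float(np.dot(probs, q_next)),  # expectation under e-greedy policy
    }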
from environments.gridworld import GridWorld, State
from lonr.local_no_regret import LocalNoRegret

grid = GridWorld()
lonr = LocalNoRegret(
    time_limit=15000,
    num_agents=1,
    state_space=grid.state_space,
    actions=grid.actions,
    next_states=grid.next_states,
    transitions=grid.transitions,
    rewards=grid.rewards,
    gamma=0.99
)
lonr.lonr_v()
import os

print(os.getcwd())

# Set parameters
n_rows = 10
n_cols = 10
discount_rate = 0.4
random_shift = 0.3
goal_state_index = [12, 45]
n_steps = 10
n_demos = 100

# Initialise framework
env = GridWorld(n_rows, n_cols, discount_rate, random_shift=0, goal_state_index=goal_state_index)
mdp_solver = LinearMDP(env)

# Solve for optimal and get demonstrations
demonstrations = run_optimal(env, n_rows, n_cols, n_steps, n_demos, mdp_solver, show=False)

algo = DeepIRL(env, mdp_solver)
state_values, q_values, policy, log_policy, rewards_vector = algo.run(