def value_iteration_main():
    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT, width=GRID_WIDTH, start_state=(0, 0),
                    terminal_states=TERMINAL_STATES, transition_reward=-1.0,
                    terminal_reward=-1.0, outward_reward=-1.0)
    env.reset()

    VI = ValueIteration(env)
    VI.start_iteration()
    print(VI.state_values)

    draw_grid_world_state_values_image(
        VI.state_values, 'images/grid_world_vi_optimal_state_values.png',
        GRID_HEIGHT, GRID_WIDTH)

    draw_grid_world_action_values_image(
        VI.calculate_grid_world_optimal_action_values(),
        'images/grid_world_vi_optimal_action_values.png',
        GRID_HEIGHT, GRID_WIDTH, env.NUM_ACTIONS, env.ACTION_SYMBOLS)

    draw_grid_world_optimal_policy_image(
        VI.calculate_optimal_policy(),
        "images/grid_world_vi_optimal_policy.png",
        GRID_HEIGHT, GRID_WIDTH, env.ACTION_SYMBOLS)
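# Illustrative sketch only: the ValueIteration class used above is defined elsewhere
# in this code base. Assuming a deterministic grid world where each (state, action)
# pair maps to one reward and one next state, its core loop is the Bellman optimality
# backup V(s) <- max_a [ r(s, a) + gamma * V(s') ]. All names below are hypothetical
# and do not come from this repository.
def value_iteration_sweep_sketch(state_values, reward, next_state, num_actions, gamma=1.0):
    # state_values: dict {state: float}; reward/next_state: dicts keyed by (state, action)
    max_delta = 0.0
    for s in state_values:
        backups = [reward[(s, a)] + gamma * state_values[next_state[(s, a)]]
                   for a in range(num_actions)]
        new_value = max(backups)
        max_delta = max(max_delta, abs(new_value - state_values[s]))
        state_values[s] = new_value
    return max_delta  # repeat sweeps until this falls below a small threshold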
def action_value_prediction_main():
    # Check for the image output directory and create it if needed
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT, width=GRID_WIDTH, start_state=(0, 0),
                    terminal_states=TERMINAL_STATES, transition_reward=-1.0,
                    terminal_reward=-1.0, outward_reward=-1.0)
    env.reset()

    state_action_values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    draw_grid_world_action_values_image(
        state_action_values, 'images/grid_world_mc_action_values_first_visit.png',
        GRID_HEIGHT, GRID_WIDTH, env.NUM_ACTIONS, env.ACTION_SYMBOLS)

    state_action_values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    draw_grid_world_action_values_image(
        state_action_values, 'images/grid_world_mc_action_values_every_visit.png',
        GRID_HEIGHT, GRID_WIDTH, env.NUM_ACTIONS, env.ACTION_SYMBOLS)
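# Illustrative sketch only: first_visit_mc_prediction / every_visit_mc_prediction are
# defined elsewhere. The first-visit Monte Carlo update they rely on averages the
# return G that follows the first occurrence of each (state, action) pair in an
# episode. The helper below is a hypothetical, self-contained version of that idea.
from collections import defaultdict

def first_visit_mc_update_sketch(episode, gamma, action_values, returns):
    # episode: list of (state, action, reward) steps
    # returns: defaultdict(list) of observed returns per (state, action)
    # action_values: dict of running averages per (state, action)
    first_visit_index = {}
    for t, (state, action, _) in enumerate(episode):
        first_visit_index.setdefault((state, action), t)

    G = 0.0
    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        G = gamma * G + reward
        if first_visit_index[(state, action)] == t:  # update on first visits only
            returns[(state, action)].append(G)
            action_values[(state, action)] = (
                sum(returns[(state, action)]) / len(returns[(state, action)]))

# Hypothetical usage:
#   returns = defaultdict(list); action_values = defaultdict(float)
#   first_visit_mc_update_sketch(episode, 1.0, action_values, returns)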
def td_learning_main():
    env = GridWorld(height=GRID_HEIGHT, width=GRID_WIDTH, start_state=(2, 2),
                    terminal_states=TERMINAL_STATES, transition_reward=-1.0,
                    terminal_reward=-1.0, outward_reward=-1.0)
    compute_state_values(env, alpha=0.05)
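# Illustrative sketch only: compute_state_values is defined elsewhere. Given its
# alpha argument, it presumably performs tabular TD(0) prediction,
# V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)). A hypothetical single-step
# update of that shape:
def td0_update_sketch(state_values, state, reward, next_state, alpha, gamma=1.0):
    td_target = reward + gamma * state_values[next_state]
    td_error = td_target - state_values[state]
    state_values[state] += alpha * td_error
    return td_error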
def td_batch_comparison_main():
    env = GridWorld(height=GRID_HEIGHT, width=GRID_WIDTH, start_state=(2, 2),
                    terminal_states=TERMINAL_STATES, transition_reward=-1.0,
                    terminal_reward=-1.0, outward_reward=-1.0)

    print("alpha={0}".format(0.01))
    mc_td_batch_comparison(env, alpha=0.01)

    print("alpha={0}".format(0.005))
    mc_td_batch_comparison(env, alpha=0.005)
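# Illustrative sketch only: mc_td_batch_comparison is defined elsewhere. In batch
# updating, all episodes observed so far are replayed repeatedly and the accumulated
# TD(0) increments are applied only after each full pass, until the value estimates
# stop changing. A hypothetical single pass (names and data layout are assumptions):
def batch_td_pass_sketch(state_values, episodes, alpha, gamma=1.0):
    # episodes: list of trajectories, each a list of (state, reward, next_state) steps;
    # assumes every state appearing in the trajectories is a key of state_values,
    # with terminal states fixed at 0.0
    increments = {s: 0.0 for s in state_values}
    for trajectory in episodes:
        for state, reward, next_state in trajectory:
            increments[state] += alpha * (
                reward + gamma * state_values[next_state] - state_values[state])
    for s in increments:
        state_values[s] += increments[s]
    return max(abs(v) for v in increments.values())  # repeat passes until this is tiny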
def state_prediction_main():
    # Check for the image output directory and create it if needed
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT, width=GRID_WIDTH, start_state=(0, 0),
                    terminal_states=TERMINAL_STATES, transition_reward=-1.0,
                    terminal_reward=-1.0, outward_reward=-1.0)
    env.reset()

    state_values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, state_values[i, j]))
        print()
    draw_grid_world_state_values_image(
        state_values, 'images/grid_world_mc_state_values_first_visit.png',
        GRID_HEIGHT, GRID_WIDTH)
    print()

    state_values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, state_values[i, j]))
        print()
    draw_grid_world_state_values_image(
        state_values, 'images/grid_world_mc_state_values_every_visit.png',
        GRID_HEIGHT, GRID_WIDTH)
def main():
    # Create the 5x5 map
    env = GridWorld(height=GRID_HEIGHT, width=GRID_WIDTH, start_state=None,
                    terminal_states=[], transition_reward=0.0, outward_reward=-1.0,
                    warm_hole_states=[(A_POSITION, A_PRIME_POSITION, 10.0),
                                      (B_POSITION, B_PRIME_POSITION, 5.0)])

    state_values = calculate_grid_world_state_values(env)
    draw_grid_world_state_values_image(state_values,
                                       'images/grid_world_state_values.png',
                                       GRID_HEIGHT, GRID_WIDTH)

    with np.printoptions(precision=2, suppress=True):
        print(state_values)
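# Illustrative sketch only: calculate_grid_world_state_values is defined elsewhere.
# For an equiprobable random policy it presumably solves the Bellman expectation
# equation V(s) = sum_a pi(a|s) [ r(s, a) + gamma * V(s') ] by repeated sweeps.
# A hypothetical sweep over a deterministic grid (names and the discount value are
# assumptions, not taken from this repository):
def policy_evaluation_sweep_sketch(state_values, reward, next_state, num_actions, gamma=0.9):
    # state_values: dict {state: float}; reward/next_state: dicts keyed by (state, action)
    max_delta = 0.0
    for s in state_values:
        new_value = sum(reward[(s, a)] + gamma * state_values[next_state[(s, a)]]
                        for a in range(num_actions)) / num_actions
        max_delta = max(max_delta, abs(new_value - state_values[s]))
        state_values[s] = new_value
    return max_delta  # sweep until this drops below a small threshold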
def exploring_start_control_main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT, width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )

    # Create the empty state-action value function and returns table, initialized to zero
    state_action_values, returns = generate_initial_q_value_and_return(env)

    # Create the initial random policy
    policy = generate_initial_random_policy(env)

    iter_num = 0
    print("[[[ MC control loop started! ]]]")
    while iter_num < MAX_EPISODES:
        iter_num += 1

        episode, visited_state_actions = generate_random_episode_for_trajectory(env, policy)
        print("*** Episode generation complete ***")

        first_visit_mc_prediction(state_action_values, returns, episode, visited_state_actions)
        print("*** MC prediction complete ***")

        policy, error = generate_greedy_policy(env, state_action_values, policy)
        print("*** Policy improvement [error: {0:9.7f}], total iterations: {1} ***\n".format(error, iter_num))

    print("[[[ MC control loop finished! ]]]\n\n")

    draw_grid_world_policy_image(
        policy, "images/grid_world_mc_exploring_start_policy.png",
        GRID_HEIGHT, GRID_WIDTH, env.ACTION_SYMBOLS
    )
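# Illustrative sketch only: generate_greedy_policy is defined elsewhere. The policy
# improvement step it performs makes each state's policy greedy with respect to the
# current action-value estimates, and the returned error can measure how far the
# action probabilities moved. All names below are hypothetical.
import numpy as np

def greedy_policy_improvement_sketch(action_values_for_state, old_action_probs):
    # action_values_for_state: 1-D float array of Q(s, a) for one state
    # old_action_probs: 1-D float array pi(a|s) for the same state
    best_actions = np.flatnonzero(action_values_for_state == action_values_for_state.max())
    new_action_probs = np.zeros_like(old_action_probs, dtype=float)
    new_action_probs[best_actions] = 1.0 / len(best_actions)  # split ties evenly
    error = float(np.abs(new_action_probs - old_action_probs).sum())
    return new_action_probs, error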
def main():
    # Create the 5x5 map
    env = GridWorld(height=GRID_HEIGHT, width=GRID_WIDTH, start_state=None,
                    terminal_states=[], transition_reward=0.0, outward_reward=-1.0,
                    warm_hole_states=[(A_POSITION, A_PRIME_POSITION, 10.0),
                                      (B_POSITION, B_PRIME_POSITION, 5.0)])

    optimal_action_values = calculate_grid_world_optimal_action_values(env)
    draw_grid_world_action_values_image(
        optimal_action_values, 'images/grid_world_optimal_action_values.png',
        GRID_HEIGHT, GRID_WIDTH, env.NUM_ACTIONS, env.ACTION_SYMBOLS)

    print()

    optimal_policy = calculate_optimal_policy(optimal_action_values)
    draw_grid_world_optimal_policy_image(
        optimal_policy, "images/grid_world_optimal_policy.png",
        GRID_HEIGHT, GRID_WIDTH, env.ACTION_SYMBOLS)