def main():
    """Run n-step TD prediction on the grid world and save the learned state values as an image."""
    # Make sure the image output directory exists before saving anything.
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Build the grid-world environment. start_state=None means exploring
    # starts (episodes may begin in any state).
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    # n-step
    n = 3
    # step size
    alpha = 0.2
    # one TD update pass per episode
    episodes = 1000

    # Zero-initialize the value of every grid cell.
    values = {(i, j): 0.0 for i in range(GRID_HEIGHT) for j in range(GRID_WIDTH)}

    for _ in range(episodes):
        temporal_difference(env, values, n, alpha)

    draw_grid_world_image(values, 'images/grid_world_fixed_params.png', GRID_HEIGHT, GRID_WIDTH)
def main():
    """Run first-visit and every-visit Monte Carlo prediction on the grid
    world, printing each state's value and saving a value image per method.
    """
    # Make sure the image output directory exists before saving anything.
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Build the grid-world environment with a fixed start state at (0, 0).
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    # gamma=1.0, 10000 episodes for both prediction variants.
    values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    _report_values("First Visit", values, 'images/first_visit_mc_state_values.png')
    print()

    values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    _report_values("Every Visit", values, 'images/every_visit_mc_state_values.png')


def _report_values(title, values, image_path):
    """Print *title* and every state's value, then save the values as an image.

    NOTE(review): the blank line is emitted after each grid row — reconstructed
    from whitespace-mangled source; confirm against the original layout.
    """
    print(title)
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
        print()
    draw_grid_world_image(values, image_path, GRID_HEIGHT, GRID_WIDTH)
def main():
    """Compute, render, and print the optimal state values for the 5x5 warm-hole grid world."""
    # Build the 5x5 map. Warm holes teleport A -> A' (reward 10) and
    # B -> B' (reward 5); there are no terminal states.
    warm_holes = [
        (A_POSITION, A_PRIME_POSITION, 10.0),
        (B_POSITION, B_PRIME_POSITION, 5.0),
    ]
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,
        terminal_states=[],
        transition_reward=0,
        outward_reward=-1.0,
        warm_hole_states=warm_holes)

    optimal_state_values = calculate_grid_world_optimal_state_values(env)

    rounded = np.round(optimal_state_values, decimals=2)
    draw_grid_world_image(
        rounded,
        'images/grid_world_optimal_state_values.png',
        GRID_HEIGHT,
        GRID_WIDTH)

    with np.printoptions(precision=2, suppress=True):
        print(optimal_state_values)
def grid_world_policy_evaluation():
    """Run iterative policy evaluation on the grid world and save the converged state values."""
    # Build the grid-world environment with a fixed start state at (0, 0).
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    # Evaluate until the state values converge; also returns how many
    # sweeps convergence took.
    state_values, iteration = compute_state_value(env)
    print('정책 평가 --> 상태 가치 수렴: {} 회 반복'.format(iteration))
    print(state_values)

    draw_grid_world_image(
        np.round(state_values, decimals=2),
        'images/state_values.png',
        GRID_HEIGHT,
        GRID_WIDTH)
def compute_state_values(env):
    """Run TD(0) prediction under a random policy for several episode
    budgets, saving a state-value image after each budget.

    The value table is re-initialized to zero before each budget, so the
    300/3000/10000-episode runs are independent (matching the original,
    which repeated the same init/run/draw stanza three times verbatim).

    :param env: grid-world environment the TD updates act on
    """
    policy = generate_initial_random_policy(env)

    for num_episodes in (300, 3000, 10000):
        # Fresh zero-initialized value for every grid cell.
        state_values = {
            (i, j): 0.0
            for i in range(GRID_HEIGHT)
            for j in range(GRID_WIDTH)
        }
        for _ in range(num_episodes):
            temporal_difference(env, policy, state_values)
        # Same output paths as before: ..._300.png, ..._3000.png, ..._10000.png
        draw_grid_world_image(
            state_values,
            'images/grid_world_td_prediction_{}.png'.format(num_episodes),
            GRID_HEIGHT,
            GRID_WIDTH)
# 가치 함수 수렴 여부 판단 if np.sum(np.abs(new_value_function - value_function)) < 1e-4: break value_function = new_value_function return new_value_function # MAIN if __name__ == '__main__': if not os.path.exists('images/'): os.makedirs('images/') value_function = np.zeros((GRID_HEIGHT, GRID_WIDTH)) draw_grid_world_image(np.round(value_function, decimals=0), 'images/empty_grid_world.png', GRID_HEIGHT, GRID_WIDTH) # 5x5 맵 생성 env = GridWorld( height=GRID_HEIGHT, width=GRID_WIDTH, start_state=(0, 0), terminal_states=[], transition_reward=0, outward_reward=-1.0, warm_hole_states=[(A_POSITION, A_PRIME_POSITION, 10.0), (B_POSITION, B_PRIME_POSITION, 5.0)] ) env.reset() state_values = grid_world_state_values(env) print(state_values)