def main():
    # Create the image output directory if it does not exist
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    # Number of steps for n-step TD
    n = 3

    # Step size
    alpha = 0.2

    # Number of episodes; each call below runs exactly one episode
    episodes = 1000

    values = dict()
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            values[(i, j)] = 0.0

    for ep in range(episodes):
        temporal_difference(env, values, n, alpha)

    draw_grid_world_image(values, 'images/grid_world_fixed_params.png', GRID_HEIGHT, GRID_WIDTH)
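temporal_difference is defined elsewhere in the repository and is not shown here. As a reading aid, the following is a minimal, self-contained sketch of the same n-step TD prediction idea on a hypothetical 5-state random walk; the walk, its reward of -1 per move, and the name n_step_td_prediction are illustration-only assumptions, not the repository's API.

import random

# Hypothetical 5-state random walk used only for illustration:
# states 0..4, both ends terminal, reward -1 per move.
N_STATES = 5
TERMINALS = {0, N_STATES - 1}

def walk_step(state):
    # Move left or right with equal probability
    return state + random.choice([-1, 1]), -1.0

def n_step_td_prediction(values, n, alpha, gamma=1.0):
    # One episode of n-step TD prediction (Sutton & Barto, Chapter 7)
    state = N_STATES // 2
    states, rewards = [state], [0.0]
    T, t = float('inf'), 0
    while True:
        if t < T:
            next_state, reward = walk_step(states[t])
            states.append(next_state)
            rewards.append(reward)
            if next_state in TERMINALS:
                T = t + 1
        tau = t - n + 1  # the time step whose value estimate is updated now
        if tau >= 0:
            # n-step return: discounted rewards plus a bootstrapped tail value
            G = sum(gamma ** (i - tau - 1) * rewards[i]
                    for i in range(tau + 1, min(tau + n, T) + 1))
            if tau + n < T:
                G += gamma ** n * values[states[tau + n]]
            if states[tau] not in TERMINALS:
                values[states[tau]] += alpha * (G - values[states[tau]])
        if tau == T - 1:
            break
        t += 1

values = {s: 0.0 for s in range(N_STATES)}
for _ in range(1000):
    n_step_td_prediction(values, n=3, alpha=0.2)
print(values)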
def main():
    # Create the image output directory if it does not exist
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    values, returns = first_visit_mc_prediction(env, 1.0, 10000)

    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
    print()

    draw_grid_world_image(values, 'images/first_visit_mc_state_values.png', GRID_HEIGHT, GRID_WIDTH)

    print()

    values, returns = every_visit_mc_prediction(env, 1.0, 10000)

    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
    print()

    draw_grid_world_image(values, 'images/every_visit_mc_state_values.png', GRID_HEIGHT, GRID_WIDTH)
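first_visit_mc_prediction and every_visit_mc_prediction live in another module. The sketch below shows the first-visit variant on a hypothetical 5-state random walk (the environment, its reward of -1 per move, and the helper names are assumptions made only for illustration, not the repository's implementation); the every-visit variant simply drops the first-visit check.

import random
from collections import defaultdict

# Hypothetical 5-state random walk: states 0..4, both ends terminal, reward -1 per move.
N_STATES = 5
TERMINALS = {0, N_STATES - 1}

def generate_episode(start=N_STATES // 2):
    # Roll out one episode as a list of (state, reward) pairs
    state, episode = start, []
    while state not in TERMINALS:
        episode.append((state, -1.0))
        state = state + random.choice([-1, 1])
    return episode

def first_visit_mc_prediction(gamma, num_episodes):
    values = defaultdict(float)
    returns = defaultdict(list)
    for _ in range(num_episodes):
        episode = generate_episode()
        # Index of the first occurrence of each state in this episode
        first_visit = {}
        for t, (state, _) in enumerate(episode):
            first_visit.setdefault(state, t)
        G = 0.0
        for t in reversed(range(len(episode))):
            state, reward = episode[t]
            G = gamma * G + reward
            if t == first_visit[state]:  # first-visit check; every-visit would always update
                returns[state].append(G)
                values[state] = sum(returns[state]) / len(returns[state])
    return values, returns

values, returns = first_visit_mc_prediction(gamma=1.0, num_episodes=5000)
print({s: round(v, 2) for s, v in sorted(values.items())})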
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    MC = OffPolicyMonteCarloControl(env)
    MC.off_policy_control()

    print_grid_world_policy(env, MC.target_policy)
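The OffPolicyMonteCarloControl class is defined elsewhere in the repository. Below is a minimal sketch of the underlying algorithm, off-policy Monte Carlo control with weighted importance sampling and a greedy target policy, demonstrated on a hypothetical 4-state corridor rather than the GridWorld class; the corridor, its rewards, and all names here are illustration-only assumptions.

import random
from collections import defaultdict

# Hypothetical 4-state corridor: states 0..3, state 3 terminal,
# actions 0=LEFT / 1=RIGHT, reward -1 per move.
TERMINAL = 3
ACTIONS = [0, 1]

def corridor_step(state, action):
    next_state = max(0, state - 1) if action == 0 else min(TERMINAL, state + 1)
    return next_state, -1.0

def off_policy_mc_control(num_episodes, gamma=1.0):
    Q = defaultdict(float)
    C = defaultdict(float)  # cumulative importance-sampling weights
    target_policy = {s: random.choice(ACTIONS) for s in range(TERMINAL)}
    for _ in range(num_episodes):
        # Generate an episode with the behavior policy b (uniformly random)
        state, episode = 0, []
        while state != TERMINAL:
            action = random.choice(ACTIONS)
            next_state, reward = corridor_step(state, action)
            episode.append((state, action, reward))
            state = next_state
        # Walk the episode backwards with incremental weighted-importance-sampling updates
        G, W = 0.0, 1.0
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            C[(state, action)] += W
            Q[(state, action)] += (W / C[(state, action)]) * (G - Q[(state, action)])
            target_policy[state] = max(ACTIONS, key=lambda a: Q[(state, a)])
            if action != target_policy[state]:
                break  # the importance-sampling weight would become zero from here on
            W *= 1.0 / 0.5  # pi(a|s) = 1 for the greedy action, b(a|s) = 0.5
    return Q, target_policy

Q, target_policy = off_policy_mc_control(num_episodes=3000)
print(target_policy)  # expected to be RIGHT (1) in every non-terminal state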
def grid_world_policy_evaluation():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    # Compute the state values until convergence, get back the number of iterations,
    # and save the converged values as an image
    state_values, iteration = compute_state_value(env)
    print('Policy evaluation --> state values converged after {} iterations'.format(iteration))
    print(state_values)

    draw_grid_world_image(np.round(state_values, decimals=2), 'images/state_values.png', GRID_HEIGHT, GRID_WIDTH)
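compute_state_value is implemented in another module. For reference, here is a minimal, self-contained sketch of iterative policy evaluation under an equiprobable random policy; the 4x4 grid, the terminal states at (0, 0) and (3, 3), and the convergence threshold are assumptions made only for this illustration and may differ from the repository's GRID_HEIGHT, GRID_WIDTH, and TERMINAL_STATES.

import numpy as np

# Illustration-only assumptions: 4x4 grid, terminals at (0, 0) and (3, 3),
# reward -1 per move, moves that leave the grid keep the agent in place.
HEIGHT, WIDTH = 4, 4
TERMINALS = {(0, 0), (3, 3)}
MOVES = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # UP, DOWN, LEFT, RIGHT

def compute_state_value(theta=1e-4, gamma=1.0):
    values = np.zeros((HEIGHT, WIDTH))
    iteration = 0
    while True:
        new_values = np.zeros_like(values)
        for i in range(HEIGHT):
            for j in range(WIDTH):
                if (i, j) in TERMINALS:
                    continue
                for di, dj in MOVES:
                    ni, nj = i + di, j + dj
                    if not (0 <= ni < HEIGHT and 0 <= nj < WIDTH):
                        ni, nj = i, j  # bounced back off the wall
                    # Bellman expectation backup under the equiprobable policy
                    new_values[i, j] += 0.25 * (-1.0 + gamma * values[ni, nj])
        iteration += 1
        if np.max(np.abs(new_values - values)) < theta:
            return new_values, iteration
        values = new_values

state_values, iteration = compute_state_value()
print('Converged after {} iterations'.format(iteration))
print(np.round(state_values, decimals=2))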
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    MC = SoftPolicyMonteCarloControl(env)
    MC.soft_policy_control()

    with np.printoptions(precision=2, suppress=True):
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                print(
                    i, j, ": UP, DOWN, LEFT, RIGHT",
                    MC.policy[(i, j)][1],
                    env.action_space.ACTION_SYMBOLS[np.argmax(MC.target_policy[(i, j)][1])]
                )
            print()
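SoftPolicyMonteCarloControl is defined elsewhere. The sketch below shows the core of on-policy Monte Carlo control with an epsilon-soft policy on a hypothetical 4-state corridor; the corridor, the epsilon value, and every name here are illustration-only assumptions rather than the repository's implementation.

import random
from collections import defaultdict

# Hypothetical 4-state corridor: states 0..3, state 3 terminal,
# actions 0=LEFT / 1=RIGHT, reward -1 per move.
TERMINAL = 3
ACTIONS = [0, 1]
EPSILON = 0.1

def corridor_step(state, action):
    next_state = max(0, state - 1) if action == 0 else min(TERMINAL, state + 1)
    return next_state, -1.0

def soft_policy_mc_control(num_episodes, gamma=1.0):
    Q = defaultdict(float)
    returns = defaultdict(list)
    # policy[s] holds a probability per action; it starts uniform and stays epsilon-soft
    policy = {s: [1.0 / len(ACTIONS)] * len(ACTIONS) for s in range(TERMINAL)}
    for _ in range(num_episodes):
        state, episode = 0, []
        while state != TERMINAL:
            action = random.choices(ACTIONS, weights=policy[state])[0]
            next_state, reward = corridor_step(state, action)
            episode.append((state, action, reward))
            state = next_state
        G = 0.0
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = gamma * G + reward
            if (state, action) in [(s, a) for s, a, _ in episode[:t]]:
                continue  # first-visit check
            returns[(state, action)].append(G)
            Q[(state, action)] = sum(returns[(state, action)]) / len(returns[(state, action)])
            # Epsilon-greedy policy improvement at the visited state
            best = max(ACTIONS, key=lambda a: Q[(state, a)])
            for a in ACTIONS:
                policy[state][a] = (1.0 - EPSILON + EPSILON / len(ACTIONS)) if a == best \
                    else EPSILON / len(ACTIONS)
    return Q, policy

Q, policy = soft_policy_mc_control(num_episodes=3000)
print({s: [round(p, 2) for p in probs] for s, probs in policy.items()})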
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    MC = OffPolicyMonteCarloPrediction(env)
    MC.off_policy_prediction()

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print(" Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    MC.state_action_values[((i, j), action)]
                ))
            print()
        print()
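OffPolicyMonteCarloPrediction is another repository class not shown here. As a rough guide, the sketch below evaluates a fixed deterministic target policy from episodes generated by a random behavior policy, using weighted importance sampling on a hypothetical 4-state corridor; everything in it is an illustration-only assumption.

import random
from collections import defaultdict

# Hypothetical 4-state corridor: states 0..3, state 3 terminal,
# actions 0=LEFT / 1=RIGHT, reward -1 per move.
# Behavior policy b: uniformly random.  Target policy pi: always RIGHT.
TERMINAL = 3
ACTIONS = [0, 1]

def corridor_step(state, action):
    next_state = max(0, state - 1) if action == 0 else min(TERMINAL, state + 1)
    return next_state, -1.0

def off_policy_mc_prediction(num_episodes, gamma=1.0):
    state_action_values = defaultdict(float)
    C = defaultdict(float)  # cumulative importance-sampling weights
    for _ in range(num_episodes):
        state, episode = 0, []
        while state != TERMINAL:
            action = random.choice(ACTIONS)  # behavior policy
            next_state, reward = corridor_step(state, action)
            episode.append((state, action, reward))
            state = next_state
        G, W = 0.0, 1.0
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            C[(state, action)] += W
            state_action_values[(state, action)] += \
                (W / C[(state, action)]) * (G - state_action_values[(state, action)])
            if action != 1:
                break  # pi(LEFT|s) = 0, so the weight becomes zero from here on
            W *= 2.0  # pi(RIGHT|s) / b(RIGHT|s) = 1 / 0.5
    return state_action_values

state_action_values = off_policy_mc_prediction(num_episodes=3000)
print({k: round(v, 2) for k, v in sorted(state_action_values.items())})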
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    state_action_values, returns = first_visit_mc_prediction(env, 1.0, 10000)

    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print(" Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    state_action_values[((i, j), action)]
                ))
            print()
        print()

    state_action_values, returns = every_visit_mc_prediction(env, 1.0, 10000)

    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print(" Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    state_action_values[((i, j), action)]
                ))
            print()
        print()
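Here first_visit_mc_prediction and every_visit_mc_prediction return state-action values rather than state values. To complement the first-visit sketch shown earlier, the following is a minimal every-visit sketch for Q(s, a) under a uniformly random policy on a hypothetical 4-state corridor; the environment and names are illustration-only assumptions.

import random
from collections import defaultdict

# Hypothetical 4-state corridor: states 0..3, state 3 terminal,
# actions 0=LEFT / 1=RIGHT, reward -1 per move, uniformly random policy.
TERMINAL = 3
ACTIONS = [0, 1]

def generate_episode(start=0):
    # Roll out one episode as a list of (state, action, reward) triples
    state, episode = start, []
    while state != TERMINAL:
        action = random.choice(ACTIONS)
        next_state = max(0, state - 1) if action == 0 else min(TERMINAL, state + 1)
        episode.append((state, action, -1.0))
        state = next_state
    return episode

def every_visit_mc_prediction(gamma, num_episodes):
    state_action_values = defaultdict(float)
    returns = defaultdict(list)
    for _ in range(num_episodes):
        G = 0.0
        for state, action, reward in reversed(generate_episode()):
            G = gamma * G + reward
            # Every-visit: record a return for every occurrence of (state, action)
            returns[(state, action)].append(G)
            state_action_values[(state, action)] = \
                sum(returns[(state, action)]) / len(returns[(state, action)])
    return state_action_values, returns

state_action_values, returns = every_visit_mc_prediction(1.0, 3000)
print({k: round(v, 2) for k, v in sorted(state_action_values.items())})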
        iter_num_policy_evaluation = self.value_evaluation()
        print("*** Value evaluation [cumulative iterations until convergence: {0}] ***".format(iter_num_policy_evaluation))

        self.policy_setup()
        print("*** Policy setup complete ***")

        return self.state_values, self.policy


if __name__ == '__main__':
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    VI = ValueIteration(env)
    VI.start_iteration()

    print(VI.state_values, end="\n\n")

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print(i, j, VI.policy[i][j])  # UP, DOWN, LEFT, RIGHT
        print()
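Only the tail of the ValueIteration class (with value_evaluation, policy_setup, and start_iteration) appears above. For reference, the sketch below performs value iteration with a Bellman optimality backup followed by greedy policy extraction; the 4x4 grid, the terminals at (0, 0) and (3, 3), and the convergence threshold are illustration-only assumptions that may not match GRID_HEIGHT, GRID_WIDTH, or TERMINAL_STATES.

import numpy as np

# Illustration-only assumptions: 4x4 grid, terminals at (0, 0) and (3, 3),
# reward -1 per move, moves that leave the grid keep the agent in place.
HEIGHT, WIDTH = 4, 4
TERMINALS = {(0, 0), (3, 3)}
MOVES = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # UP, DOWN, LEFT, RIGHT

def move(i, j, di, dj):
    ni, nj = i + di, j + dj
    if 0 <= ni < HEIGHT and 0 <= nj < WIDTH:
        return ni, nj
    return i, j  # bounced back off the wall

def value_iteration(theta=1e-4, gamma=1.0):
    values = np.zeros((HEIGHT, WIDTH))
    iteration = 0
    while True:
        new_values = np.zeros_like(values)
        for i in range(HEIGHT):
            for j in range(WIDTH):
                if (i, j) in TERMINALS:
                    continue
                # Bellman optimality backup: back up the best action, not the average
                new_values[i, j] = max(
                    -1.0 + gamma * values[move(i, j, di, dj)] for di, dj in MOVES
                )
        iteration += 1
        delta = np.max(np.abs(new_values - values))
        values = new_values
        if delta < theta:
            break
    # Greedy policy extraction from the converged state values
    policy = {}
    for i in range(HEIGHT):
        for j in range(WIDTH):
            if (i, j) in TERMINALS:
                continue
            policy[(i, j)] = int(np.argmax(
                [-1.0 + gamma * values[move(i, j, di, dj)] for di, dj in MOVES]
            ))
    return values, policy, iteration

state_values, policy, iteration = value_iteration()
print('Converged after {} iterations'.format(iteration))
print(np.round(state_values, decimals=2))
print(policy)  # 0=UP, 1=DOWN, 2=LEFT, 3=RIGHT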