Example #1
def main():
    # Check for and create the image output directory
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    # n-step
    n = 3

    # step size
    alpha = 0.2

    # number of episodes (one episode per iteration of the loop below)
    episodes = 1000

    values = dict()
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            values[(i, j)] = 0.0

    for ep in range(episodes):
        temporal_difference(env, values, n, alpha)

    draw_grid_world_image(values, 'images/grid_world_fixed_params.png',
                          GRID_HEIGHT, GRID_WIDTH)
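The temporal_difference helper called above is defined elsewhere in the repository. Below is a minimal sketch of an n-step TD prediction update for the equiprobable random policy; it assumes a gym-style interface (env.reset() returning a state tuple, env.step(action) returning (next_state, reward, done, info)), an env.action_space.ACTIONS list, and an undiscounted return, none of which are confirmed by the snippet above.

import random

def temporal_difference(env, values, n, alpha, gamma=1.0):
    # One episode of n-step TD prediction under the random policy (sketch).
    state = env.reset()
    states, rewards = [state], [0.0]       # rewards[t] holds R_t; index 0 is a placeholder

    T = float('inf')                       # episode length, unknown until termination
    t = 0
    while True:
        if t < T:
            action = random.choice(env.action_space.ACTIONS)   # assumed attribute
            next_state, reward, done, _ = env.step(action)     # assumed gym-style API
            states.append(next_state)
            rewards.append(reward)
            if done:
                T = t + 1
        tau = t - n + 1                    # time step whose value estimate is updated now
        if tau >= 0:
            # n-step return: rewards R_{tau+1} .. R_{min(tau+n, T)} plus a bootstrapped tail
            G = sum(gamma ** (i - tau - 1) * rewards[i]
                    for i in range(tau + 1, min(tau + n, T) + 1))
            if tau + n < T:
                G += gamma ** n * values[states[tau + n]]
            values[states[tau]] += alpha * (G - values[states[tau]])
        if tau == T - 1:
            break
        t += 1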
Example #2
def main():
    # Check for and create the image output directory
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)
    env.reset()

    values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
        print()

    draw_grid_world_image(values, 'images/first_visit_mc_state_values.png',
                          GRID_HEIGHT, GRID_WIDTH)
    print()

    values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
        print()

    draw_grid_world_image(values, 'images/every_visit_mc_state_values.png',
                          GRID_HEIGHT, GRID_WIDTH)
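first_visit_mc_prediction and every_visit_mc_prediction are not shown in this example. The following is a minimal sketch of the first-visit variant for the equiprobable random policy, under the same assumed gym-style environment interface; the every-visit variant simply drops the first-visit check.

import random
from collections import defaultdict

def first_visit_mc_prediction(env, gamma, num_episodes):
    # First-visit Monte Carlo prediction for the random policy (sketch).
    values = defaultdict(float)            # state -> estimated value
    returns = defaultdict(list)            # state -> list of observed returns

    for _ in range(num_episodes):
        # Generate one episode under the random behaviour policy.
        episode = []
        state = env.reset()
        done = False
        while not done:
            action = random.choice(env.action_space.ACTIONS)   # assumed attribute
            next_state, reward, done, _ = env.step(action)     # assumed gym-style API
            episode.append((state, reward))
            state = next_state

        # Walk the episode backwards, accumulating the return G.
        G = 0.0
        for t in reversed(range(len(episode))):
            s, r = episode[t]
            G = gamma * G + r
            # first-visit: only the earliest occurrence of s in the episode contributes
            if s not in (step[0] for step in episode[:t]):
                returns[s].append(G)
                values[s] = sum(returns[s]) / len(returns[s])

    return values, returns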
Example #3
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    MC = OffPolicyMonteCarloControl(env)
    MC.off_policy_control()

    print_grid_world_policy(env, MC.target_policy)
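OffPolicyMonteCarloControl is defined elsewhere in the repository. The sketch below outlines off-policy Monte Carlo control with weighted importance sampling, assuming an equiprobable random behaviour policy and the same hypothetical environment interface as in the earlier sketches; the class and attribute names mirror the call sites in main() but are otherwise assumptions.

import random
from collections import defaultdict

class OffPolicyMonteCarloControl:
    # Off-policy Monte Carlo control via weighted importance sampling (sketch).

    def __init__(self, env, gamma=1.0, num_episodes=10000):
        self.env = env
        self.gamma = gamma
        self.num_episodes = num_episodes
        self.Q = defaultdict(float)         # (state, action) -> action-value estimate
        self.C = defaultdict(float)         # (state, action) -> cumulative importance weight
        self.target_policy = {}             # state -> greedy action

    def off_policy_control(self):
        actions = self.env.action_space.ACTIONS                  # assumed attribute
        for _ in range(self.num_episodes):
            # Behaviour policy: equiprobable random action selection.
            episode = []
            state = self.env.reset()
            done = False
            while not done:
                action = random.choice(actions)
                next_state, reward, done, _ = self.env.step(action)   # assumed API
                episode.append((state, action, reward))
                state = next_state

            # Weighted importance sampling, processed from the tail of the episode.
            G, W = 0.0, 1.0
            for s, a, r in reversed(episode):
                G = self.gamma * G + r
                self.C[(s, a)] += W
                self.Q[(s, a)] += (W / self.C[(s, a)]) * (G - self.Q[(s, a)])
                # The target policy is greedy with respect to the current Q estimate.
                self.target_policy[s] = max(actions, key=lambda x: self.Q[(s, x)])
                if a != self.target_policy[s]:
                    break                   # pi(a|s) = 0, so the importance weight vanishes
                W *= len(actions)           # pi(a|s) = 1, b(a|s) = 1 / |A|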
Example #4
def grid_world_policy_evaluation():
    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)

    env.reset()

    # Compute the converged state values and get back the iteration count (the image is saved below)
    state_values, iteration = compute_state_value(env)

    print('Policy evaluation --> state values converged after {} iterations'.format(iteration))
    print(state_values)

    draw_grid_world_image(np.round(state_values, decimals=2),
                          'images/state_values.png', GRID_HEIGHT, GRID_WIDTH)
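compute_state_value is not shown; a minimal sketch of iterative policy evaluation for the equiprobable random policy follows. The one-step model lookup env.get_state_action_forward_info is a hypothetical helper standing in for whatever the real GridWorld exposes.

import numpy as np

def compute_state_value(env, theta=1e-4, gamma=1.0):
    # Iterative policy evaluation of the equiprobable random policy (sketch).
    actions = env.action_space.ACTIONS     # assumed attribute
    values = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    iteration = 0

    while True:
        new_values = np.zeros_like(values)
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                if (i, j) in TERMINAL_STATES:
                    continue
                for action in actions:
                    # hypothetical one-step model lookup: (successor state, reward)
                    next_state, reward = env.get_state_action_forward_info((i, j), action)
                    new_values[i, j] += (1.0 / len(actions)) * (
                        reward + gamma * values[next_state])
        iteration += 1
        if np.max(np.abs(new_values - values)) < theta:
            return new_values, iteration
        values = new_values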
Example #5
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    MC = SoftPolicyMonteCarloControl(env)
    MC.soft_policy_control()

    with np.printoptions(precision=2, suppress=True):
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                print(
                    i, j, ": UP, DOWN, LEFT, RIGHT", MC.policy[(i, j)][1],
                    env.action_space.ACTION_SYMBOLS[np.argmax(
                        MC.target_policy[(i, j)][1])])
            print()
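SoftPolicyMonteCarloControl.soft_policy_control is defined elsewhere; the sketch below shows only the epsilon-soft policy improvement step that such a control loop would apply after folding an episode's returns into Q. The (actions, probabilities) pair layout mirrors the MC.policy[(i, j)][1] indexing used in main(), but the helper name and the epsilon value are assumptions.

import numpy as np

EPSILON = 0.1   # assumed exploration rate

def improve_epsilon_soft_policy(policy, Q, state, actions):
    # Make policy(state) epsilon-soft and greedy w.r.t. the current Q estimate (sketch).
    # policy[state] is stored as (actions, probabilities), so policy[state][1] is the
    # probability vector printed in main() above.
    q_values = np.array([Q[(state, a)] for a in actions])
    greedy = int(np.argmax(q_values))

    probs = np.full(len(actions), EPSILON / len(actions))
    probs[greedy] += 1.0 - EPSILON          # the greedy action gets the remaining mass
    policy[state] = (list(actions), probs)  # every action keeps probability >= EPSILON/|A|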
Example #6
def main():
    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)
    env.reset()

    MC = OffPolicyMonteCarloPrediction(env)
    MC.off_policy_prediction()

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print("  Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    MC.state_action_values[((i, j), action)]))
        print()

    print()
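off_policy_prediction differs from the control variant sketched after Example #3 in that the target policy is fixed rather than made greedy. The sketch below shows a weighted-importance-sampling estimate of the state-action values, assuming the target policy is given as per-state action probabilities; all names here are illustrative.

from collections import defaultdict

def off_policy_mc_q_prediction(episodes, target_policy, num_actions, gamma=1.0):
    # Weighted-importance-sampling estimate of Q under a fixed target policy (sketch).
    # episodes: trajectories [(state, action, reward), ...] generated by the random
    # behaviour policy; target_policy[state][action] is pi(action | state).
    Q = defaultdict(float)                  # (state, action) -> value estimate
    C = defaultdict(float)                  # (state, action) -> cumulative weight

    for episode in episodes:
        G, W = 0.0, 1.0
        for state, action, reward in reversed(episode):
            G = gamma * G + reward
            C[(state, action)] += W
            Q[(state, action)] += (W / C[(state, action)]) * (G - Q[(state, action)])
            # importance ratio pi(a|s) / b(a|s), with b(a|s) = 1 / num_actions
            W *= target_policy[state][action] * num_actions
            if W == 0.0:
                break                       # the rest of the trajectory has zero weight
    return Q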
Example #7
def main():
    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)
    env.reset()

    state_action_values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print("  Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    state_action_values[((i, j), action)]))
        print()

    print()

    state_action_values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print("  Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    state_action_values[((i, j), action)]))
        print()

    print()
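The only difference between the two routines above is whether a (state, action) pair may contribute more than one return per episode. A sketch of the shared backward pass, with a flag selecting the first-visit guard, is given below; the names are illustrative, and returns is expected to be a dict of lists (e.g. collections.defaultdict(list)).

def mc_q_backward_pass(episode, returns, state_action_values, gamma=1.0, first_visit=True):
    # Fold one episode's returns into the Q estimates (sketch).
    # episode: list of (state, action, reward) triples.
    # first_visit=True reproduces the first-visit variant, False the every-visit one.
    G = 0.0
    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        G = gamma * G + reward
        if first_visit and any((s, a) == (state, action) for s, a, _ in episode[:t]):
            continue                        # an earlier visit exists in this episode
        returns[(state, action)].append(G)
        state_action_values[(state, action)] = (
            sum(returns[(state, action)]) / len(returns[(state, action)]))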
Example #8
        iter_num_policy_evaluation = self.value_evaluation()
        print("*** Value evaluation [cumulative iterations to convergence: {0}] ***".format(iter_num_policy_evaluation))

        self.policy_setup()
        print("*** Policy setup complete ***")

        return self.state_values, self.policy


if __name__ == '__main__':
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    VI = ValueIteration(env)
    VI.start_iteration()
    print(VI.state_values, end="\n\n")

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print(i, j, VI.policy[i][j])  # UP, DOWN, LEFT, RIGHT
        print()
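ValueIteration.value_evaluation and policy_setup are defined elsewhere; the sketch below shows the Bellman optimality sweeps and the greedy policy read-off they are assumed to perform, again using the hypothetical env.get_state_action_forward_info one-step lookup.

import numpy as np

def value_iteration(env, theta=1e-4, gamma=1.0):
    # Value iteration sketch: Bellman optimality sweeps until convergence,
    # then a greedy policy read-off.
    actions = env.action_space.ACTIONS      # assumed attribute
    state_values = np.zeros((GRID_HEIGHT, GRID_WIDTH))

    def q_values(i, j):
        # hypothetical one-step model lookup: (successor state, reward) per action
        return [reward + gamma * state_values[next_state]
                for next_state, reward in
                (env.get_state_action_forward_info((i, j), a) for a in actions)]

    iteration = 0
    while True:
        delta = 0.0
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                if (i, j) in TERMINAL_STATES:
                    continue
                best = max(q_values(i, j))
                delta = max(delta, abs(best - state_values[i, j]))
                state_values[i, j] = best
        iteration += 1
        if delta < theta:
            break

    # Greedy policy: a one-hot indicator over UP, DOWN, LEFT, RIGHT for each state,
    # so policy[i][j] matches the printout in the __main__ block above.
    policy = [[np.eye(len(actions))[int(np.argmax(q_values(i, j)))]
               for j in range(GRID_WIDTH)]
              for i in range(GRID_HEIGHT)]
    return state_values, policy, iteration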