def main():
    # Create the image output directory if it does not exist
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)
    env.reset()

    values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
        print()

    draw_grid_world_image(values, 'images/first_visit_mc_state_values.png',
                          GRID_HEIGHT, GRID_WIDTH)
    print()

    values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}): {2:5.2f}".format(i, j, values[i, j]))
        print()

    draw_grid_world_image(values, 'images/every_visit_mc_state_values.png',
                          GRID_HEIGHT, GRID_WIDTH)
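
first_visit_mc_prediction and every_visit_mc_prediction are defined elsewhere in this project. As a rough illustration, a minimal sketch of the first-visit variant, assuming a gym-style env.reset()/env.step() interface, a uniform random behavior policy, and a NUM_ACTIONS constant (all assumptions, not the project's actual API), could look like:

import numpy as np

def first_visit_mc_prediction(env, gamma, num_episodes):
    # Hypothetical sketch of tabular first-visit Monte Carlo prediction.
    # Assumes env.reset() returns a (row, col) state tuple and
    # env.step(action) returns (next_state, reward, done, info).
    values = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    returns = {(i, j): [] for i in range(GRID_HEIGHT) for j in range(GRID_WIDTH)}

    for _ in range(num_episodes):
        # Generate one episode under a uniform random behavior policy.
        episode = []
        state = env.reset()
        done = False
        while not done:
            action = np.random.randint(NUM_ACTIONS)
            next_state, reward, done, _ = env.step(action)
            episode.append((state, reward))
            state = next_state

        # Walk the episode backwards; G accumulates the discounted return.
        # Only the first visit to each state contributes to its average.
        G = 0.0
        for t in reversed(range(len(episode))):
            state_t, reward_t = episode[t]
            G = gamma * G + reward_t
            if state_t not in (x[0] for x in episode[:t]):
                returns[state_t].append(G)
                values[state_t] = np.mean(returns[state_t])

    return values, returns

The every-visit variant would simply drop the first-visit membership test and record G on every occurrence of a state.
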
Example #2
def main():
    # Create the image output directory if it does not exist
    if not os.path.exists('images/'):
        os.makedirs('images/')

    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    # n-step horizon
    n = 3

    # Step size
    alpha = 0.2

    # Number of episodes to run (one call to temporal_difference per episode)
    episodes = 1000

    values = dict()
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            values[(i, j)] = 0.0

    for ep in range(episodes):
        temporal_difference(env, values, n, alpha)

    draw_grid_world_image(values, 'images/grid_world_fixed_params.png',
                          GRID_HEIGHT, GRID_WIDTH)
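
temporal_difference above runs one episode of n-step TD prediction against the shared values dict. A hedged sketch of the standard n-step TD algorithm (Sutton and Barto, Chapter 7), under the same gym-style interface and random-policy assumptions as the Monte Carlo sketch above, could be:

import numpy as np

def temporal_difference(env, values, n, alpha, gamma=1.0):
    # Hypothetical sketch: one episode of n-step TD prediction.
    # `values` is the dict keyed by (row, col) built in main() above.
    state = env.reset()
    states = [state]
    rewards = [0.0]  # placeholder so that rewards[t] lines up with R_t
    T = float('inf')
    t = 0
    while True:
        if t < T:
            action = np.random.randint(NUM_ACTIONS)  # random behavior policy
            next_state, reward, done, _ = env.step(action)
            states.append(next_state)
            rewards.append(reward)
            if done:
                T = t + 1
        tau = t - n + 1  # index of the state whose estimate is updated
        if tau >= 0:
            # n-step return: discounted rewards plus a bootstrapped tail.
            G = sum(gamma ** (i - tau - 1) * rewards[i]
                    for i in range(tau + 1, min(tau + n, T) + 1))
            if tau + n < T:
                G += gamma ** n * values[states[tau + n]]
            values[states[tau]] += alpha * (G - values[states[tau]])
        if tau == T - 1:
            break
        t += 1
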
Example #3
def make_environment(environment_params):
    if environment_params['env_name'] == 'gridworld':
        from environments.gridworld import GridWorld
        env = GridWorld(environment_params['grid_size'])
    else:
        env = Environment(
            environment_params['env_name'],
            grid_size=environment_params['grid_size'],
            last_n=environment_params['last_n'],
            delta_preprocessing=environment_params['delta_preprocessing'])
    return env
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)

    runs = 10
    step_n = [1, 2, 4, 8, 16]
    data = np.zeros(shape=(len(step_n), MAX_EPISODES))

    for run in range(runs):
        print("RUNS: {0}".format(run))
        for idx_n, n in enumerate(step_n):
            Q = state_action_value(env)
            policy = generate_e_greedy_policy(env, EPSILON, Q)

            print("n={0} ".format(n), end=" ")

            _, episode_reward_list = n_step_sarsa(env, Q, policy, n)

            avg_episode_reward_list = []
            for episode in range(MAX_EPISODES):
                avg_episode_reward_list.append(
                    episode_reward_list[max(0, episode - 10):(episode +
                                                              1)].mean())

            for idx in range(MAX_EPISODES):
                data[idx_n, idx] += avg_episode_reward_list[idx]

        print()

    data[:, :] /= runs

    marker = ['o', 'x', '.', 's', '*', '+', '|', '^', 'D', ' ']
    for idx_n, n in enumerate(step_n):
        plt.plot(range(0, MAX_EPISODES, 5),
                 data[idx_n, ::5],
                 marker=marker[idx_n],
                 label='n = {0}'.format(step_n[idx_n]))

    plt.xlabel('Episode')
    plt.ylabel('Average reward per episode')
    plt.legend()

    plt.savefig('images/n_step_sarsa.png')
    plt.close()
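
state_action_value, generate_e_greedy_policy and n_step_sarsa are project helpers. The quantity that distinguishes the curves above is the n-step Sarsa target; a hypothetical helper computing it (the list-based indexing and the Q dict-of-arrays layout are assumptions) might be:

def n_step_sarsa_target(rewards, states, actions, Q, tau, n, T, gamma=1.0):
    # Hypothetical sketch of the n-step Sarsa target:
    #   G = R_{tau+1} + gamma * R_{tau+2} + ... + gamma^{n-1} * R_{tau+n}
    #       + gamma^n * Q(S_{tau+n}, A_{tau+n})  (if the episode is still running)
    G = sum(gamma ** (i - tau - 1) * rewards[i]
            for i in range(tau + 1, min(tau + n, T) + 1))
    if tau + n < T:
        G += gamma ** n * Q[states[tau + n]][actions[tau + n]]
    return G

Q[states[tau]][actions[tau]] is then nudged toward G by the step size, which is the usual bias-variance trade-off behind comparing n = 1, 2, 4, 8, 16.
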
Example #5
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    MC = OffPolicyMonteCarloControl(env)
    MC.off_policy_control()

    print_grid_world_policy(env, MC.target_policy)
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)

    Q = state_action_value(env)
    policy = generate_e_greedy_policy(env, EPSILON, Q)
    n_step_sarsa(env, Q, policy, 4)

    # print policy
    print_grid_world_policy(env, policy)
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)

    MC = MonteCarloControl(env)
    MC.exploring_start_control()

    with np.printoptions(precision=2, suppress=True):
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                print(i, j, ": UP, DOWN, LEFT, RIGHT", MC.policy[(i, j)][1])
            print()
def grid_world_policy_evaluation():
    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)

    env.reset()

    # Converge the state values (saved as an image below) and get back the iteration count
    state_values, iteration = compute_state_value(env)

    print('Policy evaluation --> state values converged after {} iterations'.format(iteration))
    print(state_values)

    draw_grid_world_image(np.round(state_values, decimals=2),
                          'images/state_values.png', GRID_HEIGHT, GRID_WIDTH)
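
compute_state_value is the project's iterative policy evaluation routine. A sketch of the textbook algorithm for the equiprobable random policy, where step_from(env, state, action) is a purely hypothetical stand-in for the environment's one-step model returning (next_state, reward), could be:

import numpy as np

def compute_state_value(env, theta=1e-4, gamma=1.0):
    # Hypothetical sketch: full-sweep iterative policy evaluation under
    # the uniform random policy, iterated until the largest per-state
    # change falls below the threshold theta.
    values = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    iteration = 0
    while True:
        new_values = np.zeros_like(values)
        delta = 0.0
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                if (i, j) in TERMINAL_STATES:
                    continue
                v = 0.0
                for action in range(NUM_ACTIONS):
                    (ni, nj), reward = step_from(env, (i, j), action)
                    v += (reward + gamma * values[ni, nj]) / NUM_ACTIONS
                new_values[i, j] = v
                delta = max(delta, abs(v - values[i, j]))
        values = new_values
        iteration += 1
        if delta < theta:
            return values, iteration
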
Example #9
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)

    Q = state_action_value(env)

    # Create an EPSILON-greedy policy
    policy = generate_e_greedy_policy(env, EPSILON, Q)

    # Create a uniform random behavior policy
    b = generate_random_policy(env)

    policy, episode_reward_list = n_step_off_policy_td(env, Q, policy, b, 2)

    # print policy
    print_grid_world_policy(env, policy)
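
n_step_off_policy_td learns about the epsilon-greedy target policy from episodes generated by the random behavior policy b. The ingredient that makes this work is the importance-sampling ratio applied to each n-step update; a hedged sketch (indexing follows Sutton and Barto, Section 7.3; the dict-of-probability-vectors layout for pi and b is an assumption) is:

def importance_sampling_ratio(pi, b, states, actions, tau, n, T):
    # Hypothetical sketch: rho is the product of pi(A_i|S_i) / b(A_i|S_i)
    # over the actions taken after the one being updated. The update
    # Q(S_tau, A_tau) += alpha * rho * (G - Q(S_tau, A_tau)) then
    # down-weights experience the target policy would rarely generate.
    rho = 1.0
    for i in range(tau + 1, min(tau + n, T - 1) + 1):
        rho *= pi[states[i]][actions[i]] / b[states[i]][actions[i]]
    return rho
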
def main():
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=None,  # exploring start
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0)
    env.reset()

    MC = SoftPolicyMonteCarloControl(env)
    MC.soft_policy_control()

    with np.printoptions(precision=2, suppress=True):
        for i in range(GRID_HEIGHT):
            for j in range(GRID_WIDTH):
                print(
                    i, j, ": UP, DOWN, LEFT, RIGHT", MC.policy[(i, j)][1],
                    env.action_space.ACTION_SYMBOLS[np.argmax(
                        MC.target_policy[(i, j)][1])])
            print()
Example #11
def main():
    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)
    env.reset()

    state_action_values, returns = first_visit_mc_prediction(env, 1.0, 10000)
    print("First Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print("  Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    state_action_values[((i, j), action)]))
        print()

    print()

    state_action_values, returns = every_visit_mc_prediction(env, 1.0, 10000)
    print("Every Visit")
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print("  Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    state_action_values[((i, j), action)]))
        print()

    print()
Example #12
def main():
    # Create the grid world environment object
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=(0, 0),
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0)
    env.reset()

    MC = OffPolicyMonteCarloPrediction(env)
    MC.off_policy_prediction()

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print("({0}, {1}):".format(i, j))
            for action in range(NUM_ACTIONS):
                print("  Action {0}: {1:5.2f}".format(
                    env.action_space.ACTION_SYMBOLS[action],
                    MC.state_action_values[((i, j), action)]))
        print()

    print()
Example #13
        iter_num_policy_evaluation = self.value_evaluation()
        print("*** 가치 평가 [수렴까지 누적 반복 횟수: {0}] ***".format(iter_num_policy_evaluation))

        self.policy_setup()
        print("*** 정책 셋업 완료 ***")

        return self.state_values, self.policy


if __name__ == '__main__':
    # Create the grid world environment object
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=TERMINAL_STATES,
        transition_reward=-1.0,
        terminal_reward=-1.0,
        outward_reward=-1.0
    )
    env.reset()

    VI = ValueIteration(env)
    VI.start_iteration()
    print(VI.state_values, end="\n\n")

    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            print(i, j, VI.policy[i][j])  # UP, DOWN, LEFT, RIGHT
        print()
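
ValueIteration is a project class. One sweep of the underlying Bellman optimality backup, reusing the hypothetical step_from() one-step model from the policy-evaluation sketch above, might look like:

def value_iteration_sweep(env, values, gamma=1.0):
    # Hypothetical sketch: every state takes the max over actions of the
    # one-step Bellman optimality backup; the sweep also returns the
    # largest change so the caller can test for convergence.
    new_values = np.zeros_like(values)
    delta = 0.0
    for i in range(GRID_HEIGHT):
        for j in range(GRID_WIDTH):
            if (i, j) in TERMINAL_STATES:
                continue
            backups = []
            for action in range(NUM_ACTIONS):
                (ni, nj), reward = step_from(env, (i, j), action)
                backups.append(reward + gamma * values[ni, nj])
            new_values[i, j] = max(backups)
            delta = max(delta, abs(new_values[i, j] - values[i, j]))
    return new_values, delta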

# MAIN
if __name__ == '__main__':
    if not os.path.exists('images/'):
        os.makedirs('images/')

    value_function = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    draw_grid_world_image(np.round(value_function, decimals=0), 'images/empty_grid_world.png', GRID_HEIGHT, GRID_WIDTH)

    # Create the 5x5 map
    env = GridWorld(
        height=GRID_HEIGHT,
        width=GRID_WIDTH,
        start_state=(0, 0),
        terminal_states=[],
        transition_reward=0,
        outward_reward=-1.0,
        warm_hole_states=[(A_POSITION, A_PRIME_POSITION, 10.0), (B_POSITION, B_PRIME_POSITION, 5.0)]
    )

    env.reset()
    state_values = grid_world_state_values(env)
    print(state_values)
    draw_grid_world_image(
        np.round(state_values, decimals=2), 'images/grid_world_state_values.png', GRID_HEIGHT, GRID_WIDTH
    )

    print()

    env.reset()
            marker = '+'
            color = 'green'

        plt.plot(step_sizes,
                 performance[method, :],
                 linestyle=linestyle,
                 color=color,
                 marker=marker,
                 label=label)

    plt.xlabel('Step size (alpha)')
    plt.ylabel('Cumulative reward per episode')
    plt.legend()

    plt.savefig('images/cumulative_rewards_for_step_size.png')
    plt.close()


if __name__ == '__main__':
    env = GridWorld(height=GRID_HEIGHT,
                    width=GRID_WIDTH,
                    start_state=START_STATE,
                    terminal_states=TERMINAL_STATES,
                    transition_reward=-1.0,
                    terminal_reward=-1.0,
                    outward_reward=-1.0,
                    warm_hole_states=[(s, START_STATE, -100.0)
                                      for s in CLIFF_STATES])
    cumulative_rewards_for_episodes(env)
    cumulative_rewards_for_step_size(env)
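
This setup recreates the classic cliff-walking task: every cell in CLIFF_STATES warps the agent back to START_STATE with a -100 reward. The one-step updates being compared inside the two helpers reduce to the following pair (a hedged sketch; Q is assumed to map a state to an array of action values):

import numpy as np

def sarsa_update(Q, s, a, r, s_next, a_next, alpha, gamma=1.0):
    # On-policy: bootstrap from the action the epsilon-greedy policy actually took.
    Q[s][a] += alpha * (r + gamma * Q[s_next][a_next] - Q[s][a])

def q_learning_update(Q, s, a, r, s_next, alpha, gamma=1.0):
    # Off-policy: bootstrap from the greedy action, whatever was actually taken.
    Q[s][a] += alpha * (r + gamma * np.max(Q[s_next]) - Q[s][a])

Because Sarsa's target accounts for exploratory missteps near the cliff, it typically collects more online reward than Q-learning on this task, even though Q-learning learns the shorter cliff-edge path.
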
Example #16
from environments.gridworld import GridWorld
from agents.agent import Agent
from controls.control import Sarsa, QLearning, ExpectedSarsa
from utils.train import Train
from utils.comparator import Comparator
from policies.action_value import TabularActionValue

# Train an agent in an episodic gridworld with windy tiles using the Sarsa,
# QLearning and ExpectedSarsa control algorithms
trainings = []
controls = [Sarsa(), QLearning(), ExpectedSarsa()]
for control in controls:
    game = GridWorld(level=1)
    action_value = TabularActionValue(game.get_states(), game.get_actions())
    agent = Agent(game, action_value)
    train = Train(agent, game, control)
    train.train()
    trainings.append(train)

# Compare the episodic rewards obtained by the agent during training for the
# different control algorithms
comp = Comparator(*trainings, smoothing=30)
comp.compare_rewards()
Example #17
from environments.gridworld import GridWorld, State
from lonr.local_no_regret import LocalNoRegret

grid = GridWorld()
lonr = LocalNoRegret(time_limit=15000,
                     num_agents=1,
                     state_space=grid.state_space,
                     actions=grid.actions,
                     next_states=grid.next_states,
                     transitions=grid.transitions,
                     rewards=grid.rewards,
                     gamma=0.99)
lonr.lonr_v()
Example #18
import os
print(os.getcwd())

# Set parameters
n_rows = 10
n_cols = 10
discount_rate = 0.4
random_shift = 0.3
goal_state_index = [12, 45]
n_steps = 10
n_demos = 100

# Initialise framework
env = GridWorld(n_rows,
                n_cols,
                discount_rate,
                random_shift=0,  # note: overrides the random_shift defined above
                goal_state_index=goal_state_index)
mdp_solver = LinearMDP(env)

# Solve for optimal and get demonstrations
demonstrations = run_optimal(env,
                             n_rows,
                             n_cols,
                             n_steps,
                             n_demos,
                             mdp_solver,
                             show=False)

algo = DeepIRL(env, mdp_solver)
state_values, q_values, policy, log_policy, rewards_vector = algo.run(