# GridWorld, q_learning, value_iteration, the policy classes, the reward-construction
# and plotting helpers used below come from the surrounding package; only the
# standard imports are added here.
import time

import numpy as np
import matplotlib.pyplot as plt


def test_gridworld_q_learning():
    np.random.seed(0)

    N = 5
    goal_pos = np.array([[N-1, N-1]])
    human_pos = np.array([[N-1, 0]])
    human_radius = 2

    grid = np.ones((N, N), dtype=float) * -1
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=0.8,
        render=True,
    )

    mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    mdp_algo.run()
    policy = StochasticGreedyPolicy(
        env.action_space(), mdp_algo, env.transition)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T

    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plt.show()

    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)

    env.close()


def test_gridworld_value_iteration():
    np.random.seed(0)

    N = 10
    goal_pos = np.array([[N-1, N-1], [N-1, N-2]])
    human_pos = np.array([[N//2, N//2], [N-1, 0]])
    human_radius = 3

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=True,
    )

    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    mdp_algo.run()  # compute V before it is read below, mirroring the Q-learning test
    policy = EpsGreedyPolicy(env.action_space(), mdp_algo)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T

    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plot_policy(policy, (N, N), "Policy", values=V, cmap=plt.cm.Blues)
    plt.show()

    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)

    env.close()
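For reference, the Bellman backup that value_iteration above is expected to iterate is sketched below as standalone code; the transition layout T[s, a, s'] and the reward shape R[s] are assumptions made for illustration, not the library's actual interface.

import numpy as np

def value_iteration_sketch(T, R, gamma=0.99, tol=1e-6):
    """Minimal value iteration. T: (S, A, S) transition probabilities, R: (S,) state rewards."""
    V = np.zeros(R.shape[0])
    while True:
        # Q[s, a] = sum_s' T[s, a, s'] * (R[s'] + gamma * V[s'])
        Q = np.einsum("sat,t->sa", T, R + gamma * V)
        V_new = Q.max(axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new
        V = V_new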
Example #3
def main(cfg):
    pygame.init()

    # create the font used for on-screen text
    sysfont = pygame.font.SysFont(None, 40)
    screen = pygame.display.set_mode(WINDOW_SIZE)
    pygame.display.set_caption("Grid World")

    done = False

    clock = pygame.time.Clock()

    # initialize the grid world
    grid_env = GridWorld()  # grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    agent = QLearningAgent(
        epsilon=cfg["agent"]["epsilon"],
        epsilon_decay_rate=cfg["agent"]["epsilon_decay_rate"],
        actions=np.arange(4),
        observation=ini_state)  # Q-learning agent

    nb_episode = cfg["nb_episode"]  # number of episodes
    save_interval = cfg["save_interval"]
    result_dir = cfg["result_dir"]
    max_step = 1
    rewards = []  # rewards stored for evaluation
    is_end_episode = False  # has the agent reached the goal?

    step = 0
    # time.sleep(30)

    for episode in range(nb_episode):
        print("episode:", episode)
        episode_reward = []  # cumulative reward for one episode
        step = 0
        while (is_end_episode is False and step < max_step):  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)

            screen.fill(BLACK)
            # draw the grid world
            draw_grid_world(grid_env.map, screen)
            # create a Surface with the rendered text
            step_str = sysfont.render("step:{}".format(step), False, WHITE)
            # draw the text
            screen.blit(step_str, (500, 50))
            clock.tick(1)
            step += 1

            # redraw the display
            pygame.display.flip()

        rewards.append(np.sum(episode_reward))  # store this episode's total reward
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_end_episode = False
        print("step:", step)
        agents = [agent]

        if episode % save_interval == 0:
            save_result(agents, episode, result_dir)

    pygame.quit()
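The agent.observe calls above are where the tabular update happens. As a rough standalone sketch (the function and argument names here are illustrative, not taken from the repository's QLearningAgent), the update rule being applied is:

import numpy as np

def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99):
    # one tabular Q-learning backup on a table Q[s] -> array of action values
    td_target = r + gamma * np.max(Q[s_next])
    Q[s][a] += alpha * (td_target - Q[s][a])
    return Q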
Example #4
        learning_rate=0.0025,
        name='dqn_' + str(i),
        gamma=0.9,
        buffer_size=500, batch_size=10, target_change_step=10, min_buffer_size=100,
        load_path=None,
        save_path=None
    )

    for episode in range(EPISODES):

        observation = env.state
        observation_oh = env.one_hot(env.state)
        episode_reward = 0
        while True:
            action = Agent.act(observation_oh, env.A[observation])
            state, reward, done = env.step(action)
            state_oh = env.one_hot(state)
            Agent.record(observation_oh, action, reward, state_oh, done)
            observation = state
            observation_oh = state_oh
            episode_reward += reward

            if done:
                reward_list.append(episode_reward)
                env.reset()
                Agent.learn()
                if episode % 100 == 0:
                    print('Reward for agent %d, episode %d, is %f' % (i, episode, np.mean(reward_list[-10::])))
                    f_list = []
                    for test_state in range(env.state_space):
                        if test_state in env.w:
Example #5
from grid_world import GridWorld

gw = GridWorld(num_rows=4, num_columns=4)
gw.current_state = 14
print(gw.step('RIGHT'))
Example #6
for i in range(num_episodes + 1):
    episode_buffer = ExperienceBuffer()
    observation = env.reset()
    observation = np.reshape(observation, [21168])
    done = False
    total_reward_in_episode = 0
    steps_in_episode = 0
    while steps_in_episode < max_episode_length:
        # collect a sample
        steps_in_episode += 1
        steps += 1
        if np.random.rand(1) < random_threshold or steps < pre_train_steps:
            action = np.random.randint(0, 4)
        else:
            action = sess.run(main_DQN.action, feed_dict={main_DQN.scalar_input: [observation]})[0]
        new_observation, reward, done = env.step(action)
        new_observation = np.reshape(new_observation, [21168])
        # save the experience to the episode buffer
        episode_buffer.add([[observation, action, reward, new_observation, done]])

        total_reward_in_episode += reward
        observation = new_observation
        # Only after pre_train_steps samples (about pre_train_steps / max_episode_length
        # episodes of experience) does the random threshold start to decay and the first
        # training step take place.
        if steps >= pre_train_steps:
            if random_threshold > random_lower_bound:
                random_threshold -= drop_step
            if steps % (update_freq) == 0:
                train_batch = global_experience_buffer.sample(batch_size)
                # Below we perform the Double-DQN update to the target Q-values
                observations = np.vstack([record[0] for record in train_batch])
                actions = np.array([record[1] for record in train_batch])
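                # Illustrative sketch only, not part of the original excerpt: the usual
                # Double-DQN target computation. "target_DQN", its "q_values" output and
                # "gamma" are assumed names here.
                next_observations = np.vstack([record[3] for record in train_batch])
                rewards = np.array([record[2] for record in train_batch])
                dones = np.array([record[4] for record in train_batch], dtype=float)
                # the main network picks the greedy next action...
                next_actions = sess.run(main_DQN.action,
                                        feed_dict={main_DQN.scalar_input: next_observations})
                # ...and the target network evaluates it
                next_q = sess.run(target_DQN.q_values,
                                  feed_dict={target_DQN.scalar_input: next_observations})
                double_q = next_q[np.arange(batch_size), next_actions]
                targets = rewards + gamma * double_q * (1.0 - dones)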
Example #7
import numpy as np

from policy import EpsGreedyQPolicy
from qlearning_agent import QLearningAgent  # same module as in Example #8
from grid_world import GridWorld

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    policy = EpsGreedyQPolicy(epsilon=.01)  # initialize the policy, here epsilon-greedy
    agent = QLearningAgent(actions=np.arange(4), observation=ini_state, policy=policy)  # initialize the Q-learning agent
    nb_episode = 100  # number of episodes
    rewards = []  # rewards stored for evaluation
    is_goal = False  # has the agent reached the goal?
    for episode in range(nb_episode):
        episode_reward = []  # cumulative reward for one episode
        while(is_goal == False):  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_goal = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # store this episode's total reward
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_goal = False

    # test run (greedy actions)
    agent.training = False  # switch off exploration for the greedy test run
    while(is_goal == False):  # continue until the goal is reached
        print("(y, x):{}".format(state))
        action = agent.act()  # select an action
        print(action)
        state, reward, is_goal = grid_env.step(action)
        agent.observe(state, reward)  # observe the state and reward
Example #8
import numpy as np
import matplotlib.pyplot as plt

from qlearning_agent import QLearningAgent
from grid_world import GridWorld

if __name__ == '__main__':
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    agent = QLearningAgent(epsilon=.1,
                           actions=np.arange(4),
                           observation=ini_state)  # Q-learning agent
    nb_episode = 1000  # number of episodes
    rewards = []  # rewards stored for evaluation
    is_end_episode = False  # has the agent reached the goal?
    for episode in range(nb_episode):
        episode_reward = []  # cumulative reward for one episode
        while (is_end_episode == False):  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # store this episode's total reward
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the start position
        is_end_episode = False

    # plot the results
    plt.plot(np.arange(nb_episode), rewards)
    plt.xlabel("episode")
    plt.ylabel("reward")
    plt.savefig("result.jpg")
    plt.show()