Example 1
def test_gridworld_q_learning():
    np.random.seed(0)

    N = 5
    goal_pos = np.array([[N-1, N-1]])
    human_pos = np.array([[N-1, 0]])
    human_radius = 2

    grid = np.ones((N, N), dtype=float) * -1
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=0.8,
        render=True,
    )

    mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    mdp_algo.run()
    policy = StochasticGreedyPolicy(
        env.action_space(), mdp_algo, env.transition)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T

    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plt.show()

    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)

    env.close()
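
For context, here is a minimal sketch of what a tabular Q-learning routine over the known transition and reward arrays could look like. The name q_learning_sketch, the assumed shapes (transition of shape (S, A, S), reward of shape (S,)), and all hyperparameters are illustrative assumptions, not the q_learning implementation called above.

import numpy as np

def q_learning_sketch(transition, reward, gamma=0.99, alpha=0.1,
                      n_episodes=500, max_steps=200, epsilon=0.1, seed=0):
    """Tabular Q-learning, sampling transitions from a known model."""
    rng = np.random.default_rng(seed)
    n_states, n_actions, _ = transition.shape
    Q = np.zeros((n_states, n_actions))
    for _ in range(n_episodes):
        s = 0  # start state; the examples above start the agent at (0, 0)
        for _ in range(max_steps):
            # epsilon-greedy exploration
            a = int(rng.integers(n_actions)) if rng.random() < epsilon else int(np.argmax(Q[s]))
            # sample the next state from the model and collect its reward
            s_next = int(rng.choice(n_states, p=transition[s, a]))
            r = reward[s_next]
            # one-step Q-learning update
            Q[s, a] += alpha * (r + gamma * Q[s_next].max() - Q[s, a])
            s = s_next
    return Q, Q.max(axis=1)  # action values and greedy state values (cf. mdp_algo.V)
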
Example 2
def test_gridworld_value_iteration():
    np.random.seed(0)

    N = 10
    goal_pos = np.array([[N-1, N-1], [N-1, N-2]])
    human_pos = np.array([[N//2, N//2], [N-1, 0]])
    human_radius = 3

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=True,
    )

    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    policy = EpsGreedyPolicy(env.action_space(), mdp_algo)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T

    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plot_policy(policy, (N, N), "Policy", values=V, cmap=plt.cm.Blues)
    plt.show()

    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)

    env.close()
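
For reference, a minimal sketch of tabular value iteration under the same assumed array layout (transition of shape (S, A, S), reward of shape (S,)); the function name and tolerance are illustrative, and this is not the value_iteration routine used above.

import numpy as np

def value_iteration_sketch(transition, reward, gamma=0.99, tol=1e-6):
    """Plain tabular value iteration on a known model."""
    n_states, n_actions, _ = transition.shape
    V = np.zeros(n_states)
    while True:
        # Q(s, a) = sum_s' T(s, a, s') * (R(s') + gamma * V(s'))
        Q = transition @ (reward + gamma * V)
        V_new = Q.max(axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            break
        V = V_new
    return V_new, Q.argmax(axis=1)  # converged values and a greedy policy
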
Example 3
def main(cfg):
    pygame.init()

    # create the font
    sysfont = pygame.font.SysFont(None, 40)
    screen = pygame.display.set_mode(WINDOW_SIZE)
    pygame.display.set_caption("Grid World")

    done = False

    clock = pygame.time.Clock()

    # initialize the grid world
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    agent = QLearningAgent(
        epsilon=cfg["agent"]["epsilon"],
        epsilon_decay_rate=cfg["agent"]["epsilon_decay_rate"],
        actions=np.arange(4),
        observation=ini_state)  # Q-learning agent

    nb_episode = cfg["nb_episode"]  # number of episodes
    save_interval = cfg["save_interval"]
    result_dir = cfg["result_dir"]
    max_step = 1
    rewards = []  # store rewards for evaluation
    is_end_episode = False  # whether the agent has reached the goal

    step = 0
    # time.sleep(30)

    for episode in range(nb_episode):
        print("episode:", episode)
        episode_reward = []  # cumulative reward of one episode
        step = 0
        while not is_end_episode and step < max_step:  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_end_episode = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)

            screen.fill(BLACK)
            # draw the grid world
            draw_grid_world(grid_env.map, screen)
            # create a Surface with the rendered text
            step_str = sysfont.render("step:{}".format(step), False, WHITE)
            # blit the text onto the screen
            screen.blit(step_str, (500, 50))
            clock.tick(1)
            step += 1

            # redraw
            pygame.display.flip()

        rewards.append(np.sum(episode_reward))  # store the cumulative reward of this episode
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the initial position
        is_end_episode = False
        print("step:", step)
        agents = [agent]

        if episode % save_interval == 0:
            save_result(agents, episode, result_dir)

    pygame.quit()
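
As a rough illustration of the epsilon and epsilon_decay_rate parameters passed to QLearningAgent above, here is a minimal epsilon-greedy selector with multiplicative decay; the class name, decay schedule, and epsilon_min floor are assumptions rather than the agent's actual internals.

import numpy as np

class EpsGreedySelectorSketch:
    """Epsilon-greedy action selection with multiplicative epsilon decay."""

    def __init__(self, epsilon=1.0, epsilon_decay_rate=0.99, epsilon_min=0.01, seed=0):
        self.epsilon = epsilon
        self.epsilon_decay_rate = epsilon_decay_rate
        self.epsilon_min = epsilon_min
        self.rng = np.random.default_rng(seed)

    def select(self, q_values):
        # explore with probability epsilon, otherwise act greedily
        if self.rng.random() < self.epsilon:
            action = int(self.rng.integers(len(q_values)))
        else:
            action = int(np.argmax(q_values))
        # decay epsilon (e.g. once per episode in the training loop)
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay_rate)
        return action
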
Example 4
        observation = env.state
        observation_oh = env.one_hot(env.state)
        episode_reward = 0
        while True:
            action = Agent.act(observation_oh, env.A[observation])
            state, reward, done = env.step(action)
            state_oh = env.one_hot(state)
            Agent.record(observation_oh, action, reward, state_oh, done)
            observation = state
            observation_oh = state_oh
            episode_reward += reward

            if done:
                reward_list.append(episode_reward)
                env.reset()
                Agent.learn()
                if episode % 100 == 0:
                    print('Reward for agent %d, episode %d, is %f' % (i, episode, np.mean(reward_list[-10::])))
                    f_list = []
                    for test_state in range(env.state_space):
                        if test_state in env.w:
                            continue
                        else:
                            test_state_oh = env.one_hot(test_state)
                            action = Agent.act(test_state_oh, env.A[test_state], test_mode=True)
                            if action in np.where(env.Optimal[test_state])[0]:
                                f_list.append(1)
                            else:
                                f_list.append(0)
                    fidelity = np.mean(f_list)
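
The snippet above relies on env.one_hot(...) to turn a discrete state index into a feature vector; a minimal stand-in (the helper name and dtype are assumptions, not the environment's actual method) could look like this.

import numpy as np

def one_hot_sketch(state, state_space):
    """Encode a discrete state index as a one-hot vector of length state_space."""
    vec = np.zeros(state_space, dtype=np.float32)
    vec[state] = 1.0
    return vec
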
Example 5

sess.run(init_op)  # initialize variables first, so a restored checkpoint is not overwritten below
if load_model:
    print('Loading Model...')
    ckpt = tf.train.get_checkpoint_state(path)
    saver.restore(sess, ckpt.model_checkpoint_path)

# Set the target network to be equal to the main network.
update_target(update_target_ops_2, sess)
global_experience_buffer = ExperienceBuffer()

# create list to contain total rewards per episode
total_reward_list = []
steps = 0
for i in range(num_episodes + 1):
    episode_buffer = ExperienceBuffer()
    observation = env.reset()
    observation = np.reshape(observation, [21168])
    done = False
    total_reward_in_episode = 0
    steps_in_episode = 0
    while steps_in_episode < max_episode_length:
        # accumulate samples (explore randomly until pre-training is done)
        steps_in_episode += 1
        steps += 1
        if np.random.rand(1) < random_threshold or steps < pre_train_steps:
            action = np.random.randint(0, 4)
        else:
            action = sess.run(main_DQN.action, feed_dict={main_DQN.scalar_input: [observation]})[0]
        new_observation, reward, done = env.step(action)
        new_observation = np.reshape(new_observation, [21168])
        # Save the experience to the episode buffer
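
The DQN loop above stores transitions in ExperienceBuffer objects for replay; a minimal uniform-sampling buffer in the same spirit (the capacity, naming, and drop-oldest policy are assumptions, not the class used above) might look like this.

import numpy as np

class ExperienceBufferSketch:
    """Fixed-capacity replay buffer with uniform minibatch sampling."""

    def __init__(self, capacity=50000, seed=0):
        self.capacity = capacity
        self.buffer = []
        self.rng = np.random.default_rng(seed)

    def add(self, experience):
        # drop the oldest entry once the buffer is full
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append(experience)

    def sample(self, batch_size):
        idx = self.rng.choice(len(self.buffer), size=batch_size, replace=False)
        return [self.buffer[i] for i in idx]
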
Example 6
    grid_env = GridWorld()  # initialize the grid world environment
    ini_state = grid_env.start_pos  # initial state (the agent's start position)
    policy = EpsGreedyQPolicy(epsilon=.01)  # initialize the policy; here epsilon-greedy
    agent = QLearningAgent(actions=np.arange(4), observation=ini_state, policy=policy)  # initialize the Q-learning agent
    nb_episode = 100  # number of episodes
    rewards = []  # store rewards for evaluation
    is_goal = False  # whether the agent has reached the goal
    for episode in range(nb_episode):
        episode_reward = []  # cumulative reward of one episode
        while not is_goal:  # continue until the goal is reached
            action = agent.act()  # select an action
            state, reward, is_goal = grid_env.step(action)
            agent.observe(state, reward)  # observe the state and reward
            episode_reward.append(reward)
        rewards.append(np.sum(episode_reward))  # store the cumulative reward of this episode
        state = grid_env.reset()  # reset the environment
        agent.observe(state)  # put the agent back at the initial position
        is_goal = False

    # test run (greedy actions)
    agent.training = False
    while not is_goal:  # continue until the goal is reached
        print("(y, x):{}".format(state))
        action = agent.act()  # select an action
        print(action)
        state, reward, is_goal = grid_env.step(action)
        agent.observe(state, reward)  # observe the state and reward

    # plot the results
    plt.plot(np.arange(nb_episode), rewards)
    plt.xlabel("episode")