Example #1
def main():
    # Initialize the environment
    # FrozenLake environment
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)

    # CliffWalking environment
    env = gym.make("CliffWalking-v0")
    env = CliffWalkingWapper(env)

    # Initialize the agent
    agent = SarsaAgent(obs_n=env.observation_space.n,
                       act_n=env.action_space.n,
                       learning_rate=0.1,
                       gamma=0.9,
                       e_greed=0.1)

    # Start training
    render = False
    for episode in range(500):
        ep_steps, ep_reward = run_episode(env, agent, render)
        print('Episode %s: steps = %s , reward = %.1f' %
              (episode, ep_steps, ep_reward))
        # Render every 20 episodes to check progress
        if episode % 20 == 0:
            render = True
        else:
            render = False

    # Training finished; evaluate the learned policy
    test_episode(env, agent)
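
run_episode is not shown in this snippet; below is a minimal sketch of what a SARSA episode loop typically looks like, assuming the agent exposes sample() and learn() methods and that the (steps, reward) return order matches the unpacking above. This is an illustrative sketch, not the original implementation.

def run_episode(env, agent, render=False):
    # One SARSA training episode (sketch)
    total_steps, total_reward = 0, 0
    obs = env.reset()
    action = agent.sample(obs)  # epsilon-greedy action for the initial state
    while True:
        next_obs, reward, done, _ = env.step(action)
        next_action = agent.sample(next_obs)  # on-policy: pick the next action first
        # SARSA update uses the action that will actually be taken next
        agent.learn(obs, action, reward, next_obs, next_action, done)
        obs, action = next_obs, next_action
        total_reward += reward
        total_steps += 1
        if render:
            env.render()
        if done:
            break
    return total_steps, total_reward
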
Example #2
def main():
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)

    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)

    agent = SarsaAgent(obs_n=env.observation_space.n,
                       act_n=env.action_space.n,
                       learning_rate=0.1,
                       gamma=0.9,
                       e_greed=0.1)

    is_render = False
    for episode in range(500):
        ep_reward, ep_steps = run_episode(env, agent, is_render)
        print('Episode %s: steps = %s , reward = %.1f' %
              (episode, ep_steps, ep_reward))

        # Render every 20 episodes to check progress
        if episode % 20 == 0:
            is_render = True
        else:
            is_render = False
    # Training finished; evaluate the learned policy
    test_episode(env, agent)
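
test_episode is likewise not shown; a minimal sketch of a greedy evaluation rollout, assuming the agent also exposes a predict() method that always takes the arg-max action (as the Q-learning examples below do). The print format is illustrative only.

def test_episode(env, agent):
    # Greedy rollout with rendering (sketch, not the original implementation)
    total_reward = 0
    obs = env.reset()
    while True:
        action = agent.predict(obs)  # always exploit the learned Q-table
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        env.render()
        if done:
            print('test reward = %.1f' % total_reward)
            break
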
Example #3
def train(cfg):
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)
    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    agent = QLearning(obs_dim=env.observation_space.n,
                      action_dim=env.action_space.n,
                      learning_rate=cfg.policy_lr,
                      gamma=cfg.gamma,
                      epsilon_start=cfg.epsilon_start,
                      epsilon_end=cfg.epsilon_end,
                      epsilon_decay=cfg.epsilon_decay)
    render = False  # whether to render the GUI
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # step counts of all episodes
    for i_episode in range(1, cfg.max_episodes + 1):
        ep_reward = 0  # reward accumulated in this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment to start a new episode
        while True:
            action = agent.sample(obs)  # choose an action (epsilon-greedy)
            next_obs, reward, done, _ = env.step(action)  # take one step in the environment
            # Q-learning update
            agent.learn(obs, action, reward, next_obs, done)  # off-policy: the next action is not needed

            obs = next_obs  # move on to the next observation
            ep_reward += reward
            ep_steps += 1  # count the steps
            if render:
                env.render()  # render a new frame
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # Moving average of the episode rewards
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' %
              (i_episode, ep_steps, ep_reward, agent.epsilon))
        # Render every 20 episodes to check progress
        if i_episode % 20 == 0:
            render = True
        else:
            render = False
    agent.save()  # training finished; save the model

    output_path = os.path.dirname(__file__) + "/result/"
    # Create the result folder if it does not exist
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    np.save(output_path + "rewards_train.npy", rewards)
    np.save(output_path + "MA_rewards_train.npy", MA_rewards)
    np.save(output_path + "steps_train.npy", steps)
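
train(cfg) and test(cfg) expect a config object carrying the hyperparameters read above. A minimal sketch is shown below; only the attribute names come from the code, the default values are illustrative assumptions.

class Config:
    # Hyperparameters read by train()/test(); values are illustrative only
    policy_lr = 0.1        # Q-table learning rate
    gamma = 0.9            # discount factor
    epsilon_start = 0.95   # initial exploration rate
    epsilon_end = 0.01     # final exploration rate
    epsilon_decay = 200    # speed of the epsilon decay
    max_episodes = 500     # number of training episodes

cfg = Config()
train(cfg)
test(cfg)
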
Example #4
def test(cfg):

    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    agent = QLearning(obs_dim=env.observation_space.n,
                      action_dim=env.action_space.n,
                      learning_rate=cfg.policy_lr,
                      gamma=cfg.gamma,
                      epsilon_start=cfg.epsilon_start,
                      epsilon_end=cfg.epsilon_end,
                      epsilon_decay=cfg.epsilon_decay)
    agent.load()  # load the saved model
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # step counts of all episodes
    for i_episode in range(1, 10 + 1):
        ep_reward = 0  # reward accumulated in this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment to start a new episode
        while True:
            action = agent.predict(obs)  # greedy action from the learned Q-table
            next_obs, reward, done, _ = env.step(action)  # take one step in the environment
            obs = next_obs  # move on to the next observation
            time.sleep(0.5)
            env.render()
            ep_reward += reward
            ep_steps += 1  # count the steps
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # Moving average of the episode rewards
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f' %
              (i_episode, ep_steps, ep_reward))
    plt.plot(MA_rewards)
    plt.show()
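
The QLearning class used in Examples #3 and #4 is not included in the snippets. Below is a minimal sketch of the two action-selection methods it would need: sample() for epsilon-greedy exploration during training and predict() for greedy evaluation. The exponential decay from epsilon_start to epsilon_end is an assumption modeled on the parameter names above, not the original class.

import math
import numpy as np

class QLearningPolicySketch:
    # Illustrative only; not the QLearning class used in the examples above
    def __init__(self, obs_dim, action_dim, epsilon_start, epsilon_end, epsilon_decay):
        self.Q_table = np.zeros((obs_dim, action_dim))
        self.action_dim = action_dim
        self.epsilon = epsilon_start
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.sample_count = 0

    def sample(self, obs):
        # Epsilon-greedy: epsilon decays from epsilon_start towards epsilon_end
        self.sample_count += 1
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-self.sample_count / self.epsilon_decay)
        if np.random.uniform() > self.epsilon:
            return self.predict(obs)                # exploit
        return np.random.randint(self.action_dim)   # explore

    def predict(self, obs):
        # Greedy action used during testing: arg-max over the Q-table row
        return int(np.argmax(self.Q_table[obs]))
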
Example #5
            action = np.random.choice(action_list)  # break ties randomly among actions with the maximum value
        return action

    def learn(self, state, action, reward, next_state, next_action, done):
        # Q-learning update; next_action is accepted but not used (off-policy)
        predict_Q = self.Q_TABLE[state, action]  # current Q estimate from the Q-table
        if done:
            target_Q = reward
        else:
            # Q <- Q + lr * [(R + gamma * max_a' Q(s', a')) - Q]
            target_Q = reward + GAMMA * np.max(self.Q_TABLE[next_state, :])
        self.Q_TABLE[state, action] = self.Q_TABLE[state, action] + LEARNING_RATE * (target_Q - predict_Q)


if __name__ == '__main__':
    env = gym.make("CliffWalking-v0")
    env = CliffWalkingWapper(env)
    dim_state = env.observation_space.n  # 48
    dim_action = env.action_space.n  # 4
    agent = QLearningAgent(dim_state, dim_action)

    for epoch in range(500):
        state = env.reset()  # start a new episode

        total_rewards, total_steps = 0, 0
        action = agent.choose_action(state)
        while True:
            #env.render()
            next_state, reward, done, _ = env.step(action)  # take the action, observe the next state and reward
            next_action = agent.choose_action(next_state)  # epsilon-greedy choice of the next action
            agent.learn(state, action, reward, next_state, next_action, done)
            action = next_action