Example #1
def train(cfg):
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)
    env = env_init_1()
    agent = QLearning(obs_dim=env.observation_space.n,
                      action_dim=env.action_space.n,
                      learning_rate=cfg.policy_lr,
                      gamma=cfg.gamma,
                      epsilon_start=cfg.epsilon_start,
                      epsilon_end=cfg.epsilon_end,
                      epsilon_decay=cfg.epsilon_decay)
    render = False  # whether to render the GUI
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # number of steps in each episode
    for i_episode in range(1, cfg.max_episodes + 1):
        ep_reward = 0  # cumulative reward of this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment to start a new episode
        while True:
            action = agent.sample(obs)  # choose an action (epsilon-greedy)
            next_obs, reward, done, _ = env.step(action)  # take one step in the environment
            # Q-learning update
            agent.learn(obs, action, reward, next_obs, done)  # unlike Sarsa, the next action is not needed

            obs = next_obs  # move on to the next observation
            ep_reward += reward
            ep_steps += 1  # count steps
            if render:
                env.render()  # render a new frame
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving-average reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' %
              (i_episode, ep_steps, ep_reward, agent.epsilon))
        # render every 20 episodes to check progress
        if i_episode % 20 == 0:
            render = True
        else:
            render = False
    print('Complete training!')
    save_model(agent, model_path=SAVED_MODEL_PATH)
    # save rewards and related results
    save_results(rewards, MA_rewards, tag='train', result_path=RESULT_PATH)
    plot(rewards)
    plot(MA_rewards, ylabel='moving_average_rewards_train')
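Examples #1-#4 all assume a QLearning agent class exposing sample, predict, learn and save/load methods, but the class itself is not shown in these snippets. The sketch below is a minimal tabular epsilon-greedy agent with that interface, given only for illustration; the constructor arguments mirror the ones passed above, while the default values and the exponential epsilon-decay schedule are assumptions rather than the projects' actual implementation.

import numpy as np


class QLearning:
    """Minimal tabular Q-learning agent (illustrative sketch, not the original agent.py)."""

    def __init__(self, obs_dim, action_dim, learning_rate=0.1, gamma=0.9,
                 epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=300):
        self.action_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.sample_count = 0
        self.Q = np.zeros((obs_dim, action_dim))  # Q-table: one row per state

    def sample(self, obs):
        """Epsilon-greedy action selection used during training."""
        self.sample_count += 1
        # decay epsilon exponentially from epsilon_start towards epsilon_end (assumed schedule)
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-self.sample_count / self.epsilon_decay)
        if np.random.uniform() < self.epsilon:
            return np.random.randint(self.action_dim)  # explore
        return self.predict(obs)  # exploit

    def predict(self, obs):
        """Greedy action used during evaluation."""
        return int(np.argmax(self.Q[obs]))

    def learn(self, obs, action, reward, next_obs, done):
        """One-step Q-learning update; the next action is not needed."""
        target = reward if done else reward + self.gamma * np.max(self.Q[next_obs])
        self.Q[obs, action] += self.lr * (target - self.Q[obs, action])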
Example #2
def eval(cfg, saved_model_path=SAVED_MODEL_PATH):

    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    agent = QLearning(obs_dim=env.observation_space.n,
                      action_dim=env.action_space.n,
                      learning_rate=cfg.policy_lr,
                      gamma=cfg.gamma,
                      epsilon_start=cfg.epsilon_start,
                      epsilon_end=cfg.epsilon_end,
                      epsilon_decay=cfg.epsilon_decay)
    agent.load_model(saved_model_path + 'checkpoint.npy')  # load the saved model
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # number of steps in each episode
    for i_episode in range(1, 10 + 1):
        ep_reward = 0  # cumulative reward of this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment to start a new episode
        while True:
            action = agent.predict(obs)  # choose the greedy action
            next_obs, reward, done, _ = env.step(action)  # take one step in the environment
            obs = next_obs  # move on to the next observation
            time.sleep(0.5)
            env.render()
            ep_reward += reward
            ep_steps += 1  # count steps
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving-average reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f' %
              (i_episode, ep_steps, ep_reward))
    print('Complete evaluation!')
    # save rewards and related results
    save_results(rewards, MA_rewards, tag='eval', result_path=RESULT_PATH)
    plot(rewards)
    plot(MA_rewards, ylabel='moving_average_rewards_eval')
Example #3
def train(cfg):
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)
    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    agent = QLearning(obs_dim=env.observation_space.n,
                      action_dim=env.action_space.n,
                      learning_rate=cfg.policy_lr,
                      gamma=cfg.gamma,
                      epsilon_start=cfg.epsilon_start,
                      epsilon_end=cfg.epsilon_end,
                      epsilon_decay=cfg.epsilon_decay)
    render = False  # whether to render the GUI
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # number of steps in each episode
    for i_episode in range(1, cfg.max_episodes + 1):
        ep_reward = 0  # cumulative reward of this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment to start a new episode
        while True:
            action = agent.sample(obs)  # choose an action (epsilon-greedy)
            next_obs, reward, done, _ = env.step(action)  # take one step in the environment
            # Q-learning update
            agent.learn(obs, action, reward, next_obs, done)  # unlike Sarsa, the next action is not needed

            obs = next_obs  # move on to the next observation
            ep_reward += reward
            ep_steps += 1  # count steps
            if render:
                env.render()  # render a new frame
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving-average reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' %
              (i_episode, ep_steps, ep_reward, agent.epsilon))
        # render every 20 episodes to check progress
        if i_episode % 20 == 0:
            render = True
        else:
            render = False
    agent.save()  # training finished, save the model

    output_path = os.path.dirname(__file__) + "/result/"
    # create the output folder if it does not exist
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    np.save(output_path + "rewards_train.npy", rewards)
    np.save(output_path + "MA_rewards_train.npy", MA_rewards)
    np.save(output_path + "steps_train.npy", steps)
Example #4
File: main.py Project: pprp/52RL
def test(cfg):

    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    agent = QLearning(
        obs_dim=env.observation_space.n,
        action_dim=env.action_space.n,
        learning_rate=cfg.policy_lr,
        gamma=cfg.gamma,
        epsilon_start=cfg.epsilon_start,
        epsilon_end=cfg.epsilon_end,
        epsilon_decay=cfg.epsilon_decay)
    agent.load()  # load the saved model
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # number of steps in each episode
    for i_episode in range(1, 10+1):
        ep_reward = 0  # cumulative reward of this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment to start a new episode
        while True:
            action = agent.predict(obs)  # choose the greedy action
            next_obs, reward, done, _ = env.step(action)  # take one step in the environment
            obs = next_obs  # move on to the next observation
            time.sleep(0.5)
            env.render()
            ep_reward += reward
            ep_steps += 1  # count steps
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving-average reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(
                0.9*MA_rewards[-1]+0.1*ep_reward)
        print('Episode %s: steps = %s , reward = %.1f' %
              (i_episode, ep_steps, ep_reward))
    plt.plot(MA_rewards)
    plt.show()
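Every example above reads its hyperparameters from a cfg object (policy_lr, gamma, epsilon_start, epsilon_end, epsilon_decay, max_episodes), but the config itself is never shown. A minimal way to drive train() and test() might look like the sketch below; the argument names match the fields read above, while the default values are placeholders, not the ones used by the original projects.

import argparse


def get_cfg():
    # hypothetical config exposing exactly the fields read by train()/eval()/test()
    parser = argparse.ArgumentParser(description="Tabular Q-learning on CliffWalking-v0")
    parser.add_argument("--policy_lr", type=float, default=0.1)       # learning rate
    parser.add_argument("--gamma", type=float, default=0.9)           # discount factor
    parser.add_argument("--epsilon_start", type=float, default=0.95)  # initial exploration rate
    parser.add_argument("--epsilon_end", type=float, default=0.01)    # final exploration rate
    parser.add_argument("--epsilon_decay", type=int, default=300)     # decay speed of epsilon
    parser.add_argument("--max_episodes", type=int, default=300)      # number of training episodes
    return parser.parse_args()


if __name__ == "__main__":
    cfg = get_cfg()
    train(cfg)
    test(cfg)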
Example #5
import numpy as np
from itertools import cycle
import random
import sys

from agent import QLearning
import pygame
from pygame.locals import *

from environment import create_uniform_grid, map_position_tile

bot = QLearning(2)

FPS = 30
SCREENWIDTH = 288
SCREENHEIGHT = 512
game_grid = create_uniform_grid(SCREENHEIGHT, SCREENWIDTH)
PIPEGAPSIZE = 100  # gap between upper and lower part of pipe
BASEY = SCREENHEIGHT * 0.79
# image, sound and hitmask  dicts
IMAGES, SOUNDS, HITMASKS = {}, {}, {}

# list of all possible players (tuple of 3 positions of flap)
PLAYERS_LIST = (
    # red bird
    (
        'assets/sprites/redbird-upflap.png',
        'assets/sprites/redbird-midflap.png',
        'assets/sprites/redbird-downflap.png',
    ),
    # blue bird
Example #6
sarsa.save_q_values()
sarsa_df = pd.DataFrame(data=sarsa_scores,
                        index=n_episodes_range,
                        columns=['score'])

# In[10]:

plot_training_results(sarsa_df, "Sarsa",
                      f"sarsa_training_scores_alpha_{alpha}_gamma_{gamma}")

# ## Q-Learning

# In[11]:

qlearning = QLearning(action_space=env.action_space, alpha=alpha, gamma=gamma)

# In[12]:

q_learning_scores = []

for i_episode in range(1, n_episodes + 1):
    env = FlappyEnvironment()
    if i_episode % 100 == 0:
        print("\rEpisode {}/{} - Max Score {}".format(
            i_episode, n_episodes,
            np.array(q_learning_scores).max()),
              end="")
        sys.stdout.flush()

    qlearning.learn(env)
Example #7
            # render the GUI
            env.render()

            # get an action for the current state from the agent
            action = agent.get_action(str(state))

            # take the action and receive the next state, the reward,
            # and whether the episode is done
            state_, reward, done = env.step(action)

            # pass S, A, R, S_ to the agent's learn function
            agent.learn(str(state), action, reward, str(state_))

            # move on to the next state
            state = state_

            env.print_value_all(agent.q_table)

            # break when the episode ends
            if done:
                break

    # game over once all episodes are finished
    print('game over')
    # env.destroy()


if __name__ == "__main__":
    env = Env()
    agent = QLearning(actions=list(range(env.n_actions)))
    update()
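Example #7 uses a different QLearning agent: it is constructed with a list of actions, keys its q_table by the string form of the state, and its learn() call takes (state, action, reward, next_state) with no done flag. A minimal dictionary-based sketch compatible with those calls could look like the following; the learning rate, discount factor, and epsilon values are assumptions, not the original project's settings.

import random
from collections import defaultdict


class QLearning:
    """Illustrative dict-based agent matching the calls in Example #7 (not the original)."""

    def __init__(self, actions, learning_rate=0.01, gamma=0.9, epsilon=0.1):
        self.actions = actions                  # e.g. list(range(env.n_actions))
        self.lr = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = defaultdict(lambda: [0.0] * len(actions))

    def get_action(self, state):
        # epsilon-greedy over the string-keyed Q-table
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        q_values = self.q_table[state]
        max_q = max(q_values)
        # break ties randomly among the best actions
        return random.choice([a for a, q in enumerate(q_values) if q == max_q])

    def learn(self, state, action, reward, next_state):
        # one-step Q-learning update, with no terminal flag (as in the snippet above)
        q_predict = self.q_table[state][action]
        q_target = reward + self.gamma * max(self.q_table[next_state])
        self.q_table[state][action] += self.lr * (q_target - q_predict)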