Code Example #1
File: train.py  Project: wanghaic/parl-notes
def main():
    env = gym.make('Pong-v0')
    obs_dim = 80 * 80
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the model
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')
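
Note that calc_reward_to_go and run_episode are defined elsewhere in the project. A minimal sketch of calc_reward_to_go, consistent with the normalization tail visible in Code Example #6 (the discount factor GAMMA is an assumption; the original constant is not shown):

import numpy as np

GAMMA = 0.99  # assumed discount factor; not shown in the original project


def calc_reward_to_go(reward_list, gamma=GAMMA):
    """Discounted reward-to-go G(t), normalized to zero mean and unit variance."""
    reward_arr = np.array(reward_list, dtype='float32')
    # accumulate backwards: G(t) = r(t) + gamma * G(t+1)
    for t in range(len(reward_arr) - 2, -1, -1):
        reward_arr[t] += gamma * reward_arr[t + 1]
    # normalization tail, as seen in Code Example #6
    reward_arr -= np.mean(reward_arr)
    reward_arr /= np.std(reward_arr)
    return reward_arr
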
Code Example #2
def main():
    # Create the environment
    env = gym.make('Pong-v0')
    obs_dim = 80 * 80
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the model
    if os.path.exists('model.ckpt'):
        agent.restore('model.ckpt')
        print("restore_succeed")

    eval_reward = evaluate(env, agent, render=True)
    return eval_reward
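
evaluate is also assumed by these scripts. A plausible minimal version that averages the total reward of a few greedy episodes (the episode count and the preprocess call are assumptions):

import numpy as np


def evaluate(env, agent, render=False, episodes=5):
    """Run a few greedy episodes and return the mean total reward."""
    eval_rewards = []
    for _ in range(episodes):
        obs = env.reset()
        episode_reward = 0
        while True:
            obs = preprocess(obs)        # same preprocessing as in training
            action = agent.predict(obs)  # greedy action from the policy
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if done:
                break
        eval_rewards.append(episode_reward)
    return np.mean(eval_rewards)
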
Code Example #3
def main():
    env = gym.make('CarRacing-v0')
    obs_dim = 28 * 32
    act_dim = 3
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the model
    # model_name = './model_episode_100.ckpt'
    model_name = MODEL_NAME
    if os.path.exists(model_name):
        agent.restore(model_name)

    step_cnt = 0
    for i in range(1):
        obs = env.reset()
        warm_up(env)
        episode_reward = 0
        while True:
            obs = preprocess(obs)  # flatten the raw frame to a 1-D feature vector (obs_dim = 28 * 32)
            action = agent.predict(obs)
            action = convert_action(action)
            obs, reward, isOver, _ = env.step(action)
            add_to_frames(obs)
            if ENABLE_RENDER:
                env.render()
            episode_reward += reward
            if isOver:
                print(isOver, episode_reward, MODEL_NAME)
                # if ENABLE_RENDER:
                #     input()
                break

    env.close()

    if ENABLE_GIF_OUTPUT:
        print("gif writing")
        outfilename = MODEL_NAME + ".gif"
        imageio.mimsave(outfilename, frames, 'GIF', duration=0.02)
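
convert_action, preprocess, and the frame buffer used for the GIF are not shown. CarRacing-v0 expects a continuous action [steering, gas, brake], so a hypothetical mapping from the three discrete policy outputs, plus a downsampling step matching obs_dim = 28 * 32, might look like this (all values here are illustrative assumptions):

import numpy as np

frames = []  # global frame buffer consumed by imageio.mimsave

ACTION_TABLE = {
    0: np.array([0.0, 0.3, 0.0]),   # straight: mild throttle
    1: np.array([-1.0, 0.1, 0.0]),  # steer left
    2: np.array([1.0, 0.1, 0.0]),   # steer right
}


def convert_action(action):
    """Map a discrete policy output to a continuous CarRacing action."""
    return ACTION_TABLE[int(action)]


def add_to_frames(obs):
    """Keep a copy of the raw frame for the optional GIF output."""
    frames.append(obs.copy())


def preprocess(frame):
    """Grayscale, crop and downsample a 96x96x3 frame to a 28*32 vector."""
    gray = frame.mean(axis=2)   # (96, 96) grayscale
    cropped = gray[:84, :]      # drop the indicator bar at the bottom
    small = cropped[::3, ::3]   # coarse downsample to (28, 32)
    return (small.ravel() / 255.0).astype('float32')
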
Code Example #4
def main():
    env = JumpGame()
    np.random.seed(0)

    action_dim = 2
    obs_shape = 13

    model = Model(act_dim=action_dim)
    algorithm = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_shape, act_dim=action_dim)

    # Load the model
    if os.path.exists('./model.ckpt'):
        save_path = './model.ckpt'
        agent.restore(save_path)
        print("模型加载成功")

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(
                env, agent,
                render=False)  # render=True shows the rendering; run locally, since AI Studio cannot display it
            logger.info('Test reward: {}'.format(total_reward))
            save_path = './model/dqn_model_{}_{}.ckpt'.format(i, total_reward)
            agent.save(save_path)

    # Save the model to ./model.ckpt
    agent.save('./model.ckpt')
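
run_episode is the sampling loop shared by the training scripts. A minimal sketch, assuming the agent exposes a sample() method for stochastic action selection as in the PARL policy-gradient lesson (the preprocess call is carried over from the Pong/CarRacing snippets and would be dropped for JumpGame, whose observation is already a 13-dimensional vector):

def run_episode(env, agent):
    """Sample one episode with the current stochastic policy."""
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs = preprocess(obs)        # flatten/featurize the raw observation
        obs_list.append(obs)
        action = agent.sample(obs)   # stochastic action, for exploration
        action_list.append(action)
        obs, reward, done, _ = env.step(action)
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list
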
Code Example #5
def main():
    env = gym.make('CarRacing-v0')
    obs_dim = 28 * 32
    act_dim = 3  # three discrete actions: straight, left, and right
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    alg = PolicyGradient(model, lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load the model
    # if os.path.exists('./model.ckpt'):
    #     agent.restore('./model.ckpt')

    for i in range(1):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Train Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=False)
            logger.info('Episode {}, Test reward: {}'.format(
                i + 1, total_reward))
            ckpt = './model_episode_{}.ckpt'.format(i + 1)
            agent.save(ckpt)

    # save the parameters to ckpt
    agent.save('./final.ckpt')

    env.close()
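
Every snippet instantiates Model(act_dim=act_dim) without showing it. A sketch of a small softmax policy network in the style of the PARL 1.x policy-gradient lesson these scripts appear to follow (layer sizes and activations are assumptions):

import parl
from parl import layers  # PARL 1.x wraps the PaddlePaddle fluid layers


class Model(parl.Model):
    def __init__(self, act_dim):
        hid1_size = act_dim * 10
        self.fc1 = layers.fc(size=hid1_size, act='tanh')
        self.fc2 = layers.fc(size=act_dim, act='softmax')

    def forward(self, obs):
        out = self.fc1(obs)
        out = self.fc2(out)
        return out  # action probabilities
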
Code Example #6
    reward_arr -= np.mean(reward_arr)
    reward_arr /= np.std(reward_arr)
    return reward_arr


# Create the environment
env = gym.make('Pong-v0')
obs_dim = 80 * 80
act_dim = env.action_space.n
logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

# Build the agent with the PARL framework
# 4. Following the in-class demo, build the agent by composing Model, PolicyGradient, and Agent
#
model = Model(act_dim=act_dim)
alg = PolicyGradient(model, lr=LEARNING_RATE)
agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

# Load the model
# if os.path.exists('./model.ckpt'):
#     agent.restore('./model.ckpt')

for i in range(20000):
    obs_list, action_list, reward_list = run_episode(env, agent)
    # if i % 10 == 0:
    #     logger.info("Train Episode {}, Reward Sum {}.".format(i,
    #                                         sum(reward_list)))

    batch_obs = np.array(obs_list)
    batch_action = np.array(action_list)
    batch_reward = calc_reward_to_go(reward_list)
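
Both Pong scripts feed 80 * 80 = 6400-dimensional observations to the agent, so the raw 210x160x3 Atari frame must be reduced first. A common preprocessing sketch (assumed here; the project's own version is not shown):

def preprocess(frame):
    """Reduce a 210x160x3 Pong frame to an 80*80 binary vector."""
    img = frame[35:195]      # crop the playing field
    img = img[::2, ::2, 0]   # downsample by 2, keep one channel -> 80x80
    img[img == 144] = 0      # erase background (type 1)
    img[img == 109] = 0      # erase background (type 2)
    img[img != 0] = 1        # paddles and ball become 1
    return img.astype('float32').ravel()
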
Code Example #7
    reward_arr /= np.std(reward_arr)
    return reward_arr


# Create the environment
env = gym.make('Pong-v0')
obs_dim = 80 * 80
act_dim = env.action_space.n
logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

# Build the agent with the PARL framework
######################################################################
######################################################################
# Build the agent with the PARL framework
model = Model(act_dim=act_dim)
algorithm = PolicyGradient(model, lr=LEARNING_RATE)
agent = Agent(
    algorithm,
    obs_dim=obs_dim,
    act_dim=act_dim)
######################################################################
######################################################################


# Load the model
if os.path.exists('./model_11.ckpt'):
    agent.restore('./model_11.ckpt')
init_level = -7
np.random.seed(1024)
for i in range(1000):
    obs_list, action_list, reward_list = run_episode(env, agent)
Code Example #8
File: TrainMaze.py  Project: worksking/PARL-Sample
def trainTest():
    env = Maze()
    model = MazeModel(act_dim=ACT_DIM)
    alg = PolicyGradient(model, hyperparas={'lr': LEARNING_RATE})
    agent = MazeAgent(alg, obs_dim=OBS_DIM, act_dim=ACT_DIM)

    beforeTrainReward = []
    global MeanReward
    print('BeforeTrain:')
    for i in range(1, 129):
        obs_list, action_list, reward_list = run_train_episode(env, agent)
        MeanReward = MeanReward + (sum(reward_list) - MeanReward) / i
        beforeTrainReward.append(MeanReward)
        logger.info('Episode:{},nowReward: {:.2f},avgReward:{:.2f}'.format(
            i, np.sum(reward_list), MeanReward))
    global ErrorCount
    ErrorCountBeforeTrain = ErrorCount
    print('TrainStart!')
    trainReward = []
    MeanReward = 0

    # Monte Carlo policy gradient (MCPG)
    # run up to 100,000 episodes
    for i in range(1, 100001):
        # sample one episode
        obs_list, action_list, reward_list = run_train_episode(env, agent)
        # track the expected reward with a running average
        MeanReward = MeanReward + (sum(reward_list) - MeanReward) / i
        if i % 10 == 0:
            trainReward.append(MeanReward)
            logger.info(
                "Episode:{}, nowReward:{:.2f},avgReward:{:.2f}.".format(
                    i, sum(reward_list), MeanReward))
        if MeanReward >= 0 and i >= 256:
            break
        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        # back up rewards to compute G(t) and normalize it
        batch_reward = calc_discount_norm_reward(reward_list, GAMMA)
        # update the policy
        agent.learn(batch_obs, batch_action, batch_reward)

    print('TrainEnd!')
    input()
    MeanReward = 0
    testReward = []
    ErrorCount = 0
    for i in range(1, 129):
        all_reward = run_evaluate_episode(env, agent)
        logger.info('Test reward: {:.3f}'.format(all_reward))
        MeanReward = MeanReward + (all_reward - MeanReward) / i
        testReward.append(MeanReward)
        logger.info('Episode:{},nowReward: {:.2f},avgReward:{:.2f}'.format(
            i, all_reward, MeanReward))
    ErrorCountAfterTrain = ErrorCount

    print()
    print('ErrorCountBeforeTrain:{},ErrorCountAfterTrain:{}'.format(
        ErrorCountBeforeTrain, ErrorCountAfterTrain))
    plt.title('BeforeTrain')
    plt.xlabel('episode')
    plt.ylabel('AvgReward')
    plt.plot(beforeTrainReward)

    plt.figure()
    X = np.arange(0, len(trainReward))
    X *= 10
    plt.title('Training')
    plt.xlabel('episode')
    plt.ylabel('AvgReward')
    plt.plot(X, trainReward)

    plt.figure()
    plt.title('AfterTrain')
    plt.xlabel('episode')
    plt.ylabel('AvgReward')
    plt.plot(testReward)

    plt.show()
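
calc_discount_norm_reward plays the same role here as calc_reward_to_go in the earlier examples: back up the rewards into G(t), then normalize. A minimal sketch (the helper body is an assumption):

import numpy as np


def calc_discount_norm_reward(reward_list, gamma):
    """Backed-up return G(t), normalized to zero mean and unit variance."""
    G = np.zeros(len(reward_list), dtype='float32')
    running = 0.0
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running  # G(t) = r(t) + gamma * G(t+1)
        G[t] = running
    G -= np.mean(G)
    G /= np.std(G)
    return G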