Example 1
import os

from ple import PLE  # PyGame Learning Environment

# GameEnv, ReplayMemory, Model, RL_Alg, Agent, run_episode, evaluate, logger,
# the hyperparameters (MEMORY_SIZE, GAMMA, LEARNING_RATE, MEMORY_WARMUP_SIZE)
# and the globals render_bool, dummy_mode, e_greed and rate_num are assumed
# to be defined elsewhere in this project.


def main():
    global render_bool
    # parl.connect('localhost:8037')
    if dummy_mode:
        render_bool = False
    if not render_bool:
        os.environ["SDL_VIDEODRIVER"] = "dummy"
    # else:
    #     pygame.display.set_mode((800, 600 + 60))
    # Create the environment
    game = GameEnv()
    p = PLE(game, display_screen=render_bool, fps=1, force_fps=True)
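    # force_fps=True advances the game by a fixed timestep per frame rather
    # than in real time, so training is not throttled to the display rate.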

    p.init()

    # Build the agent with the PARL framework
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    width, height = p.getScreenDims()
    rpm = ReplayMemory(MEMORY_SIZE)  # experience replay buffer for DQN
    obs_dim = (2, 1, 13)  # observation shape expected by the model
    model = Model(act_dim=act_dim)
    alg = RL_Alg(model,
                 gamma=GAMMA,
                 tau=0.001,
                 actor_lr=LEARNING_RATE,
                 critic_lr=LEARNING_RATE)
    agent = Agent(alg, obs_dim=obs_dim,
                  act_dim=act_dim)  # e_greed: chance of taking a random action (exploration)

    # Load a previously saved model, if any
    best_eval_reward = -1000

    if os.path.exists('./model_dqn.ckpt'):
        print("loaded model:", './model_dqn.ckpt')
        agent.restore('./model_dqn.ckpt')
        best_eval_reward = evaluate(p, agent, render=render_bool)
        # run_episode(env, agent, train_or_test='test', render=True)
        # exit()
    # Pre-fill the replay buffer so early training has enough sample diversity
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 200000
    # Start training
    episode = 0

    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train part
        for _ in range(5):
            total_reward = run_episode(p, agent, rpm)
            episode += 1
        # test part
        eval_reward = evaluate(p, agent,
                               render=render_bool)  # render=True to watch the game
        logger.info('episode:{}    e_greed:{}   test_reward:{}'.format(
            episode, e_greed, eval_reward))

        # Save the model to a checkpoint file
        agent.save('./model_dqn_%d.ckpt' % rate_num)
        if best_eval_reward < eval_reward:
            best_eval_reward = eval_reward
            agent.save('./model_dqn.ckpt')
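
Example 1 depends on a ReplayMemory class that is not shown. A minimal sketch of such a fixed-capacity FIFO buffer follows; the class name matches the snippet, but the method names and the (obs, action, reward, next_obs, done) tuple layout are assumptions, not the project's actual API.

import random
from collections import deque


class ReplayMemory:
    """Fixed-capacity FIFO buffer of transitions for off-policy training."""

    def __init__(self, max_size):
        # A deque with maxlen discards the oldest transition once full.
        self.buffer = deque(maxlen=max_size)

    def append(self, obs, action, reward, next_obs, done):
        self.buffer.append((obs, action, reward, next_obs, done))

    def sample(self, batch_size):
        # Uniform random sampling breaks the temporal correlation
        # between consecutive steps of an episode.
        batch = random.sample(self.buffer, batch_size)
        return list(zip(*batch))  # columns: obs, actions, rewards, next_obs, dones

    def __len__(self):
        return len(self.buffer)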
Example 2
            # print(next_state, reward, done, info)
            score += times  # 'times' comes from the env step above (not shown in this excerpt)
            # print(times)
            if next_state is not None:
                next_state = np.reshape(next_state, state_shape)
            # if next_state[0] > 0.95:
            #     if 0 < next_state[1] < 0.312:
            #         reward = min(-action[0] * 4 - 4, 0)
            #     elif -0.312 < next_state[1] <= 0:
            #         reward = min(action[0] * 4 - 4, 0)
            if episode == first_ep:
                env.start.append(copy.copy(state))  # record the states visited during the first episode
            env.memorize([
                copy.copy(x)
                for x in [state, action, reward, next_state, done]
            ])
            # print(score)
            if done:
                print("episode: {}, score: {}\n".format(episode, score))
                break
            state = next_state
            if len(env.memory) >= BATCH_SIZE:  # and st % (MAX_STEPS/20) == 0:
                samples = env.get_samples(BATCH_SIZE)
                agent.train(samples)
                agent.update_target_net()
        if (episode + 1) % 10 == 0:  # every 10 episodes: sync the target network and checkpoint
            agent.network_copy()
            agent.save(path)
        # if episode == first_ep+2:
        #     break
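
Example 2 maintains a target network through agent.update_target_net() (called every training step) and agent.network_copy() (called every 10 episodes). Neither method is shown; below is a minimal sketch of the two usual update styles, assuming PyTorch modules (the framework, function names and tau value are assumptions, not the project's actual code).

import torch


def soft_update(target_net, online_net, tau=0.001):
    # Polyak averaging: target <- tau * online + (1 - tau) * target.
    # Applied every training step so the TD targets drift slowly.
    with torch.no_grad():
        for t, o in zip(target_net.parameters(), online_net.parameters()):
            t.mul_(1.0 - tau).add_(o, alpha=tau)


def hard_update(target_net, online_net):
    # Full parameter copy, applied at a coarser interval
    # (e.g. every 10 episodes, like network_copy in the snippet).
    target_net.load_state_dict(online_net.state_dict())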