import os

from ple import PLE
from parl.utils import logger

# GameEnv, Model, RL_Alg, Agent, ReplayMemory, run_episode, evaluate, the
# MEMORY_SIZE / MEMORY_WARMUP_SIZE / GAMMA / LEARNING_RATE / BATCH_SIZE
# constants, and the render_bool / dummy_mode flags are assumed to be defined
# elsewhere in this module.


def main():
    global render_bool
    # parl.connect('localhost:8037')  # optional: attach to a PARL cluster
    if dummy_mode:
        render_bool = False
    if not render_bool:
        # Headless mode: SDL's dummy video driver lets pygame run with no display.
        os.environ["SDL_VIDEODRIVER"] = "dummy"
    # else:
    #     pygame.display.set_mode((800, 600 + 60))

    # Create the environment.
    game = GameEnv()
    p = PLE(game, display_screen=render_bool, fps=1, force_fps=True)
    p.init()

    # Build the agent with the PARL framework.
    print(p.getActionSet())
    act_dim = len(p.getActionSet())
    width, height = p.getScreenDims()

    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay buffer
    obs_dim = (2, 1, 13)             # observation shape expected by the agent

    model = Model(act_dim=act_dim)
    alg = RL_Alg(model, gamma=GAMMA, tau=0.001,
                 actor_lr=LEARNING_RATE, critic_lr=LEARNING_RATE)
    # e_greed: explore by taking a random action with some probability.
    agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

    # Load a saved model if one exists.
    best_eval_reward = -1000
    if os.path.exists('./model_dqn.ckpt'):
        print("loaded model:", './model_dqn.ckpt')
        agent.restore('./model_dqn.ckpt')
        best_eval_reward = evaluate(p, agent, render=render_bool)

    # Pre-fill the replay buffer so early training has enough sample diversity.
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(p, agent, rpm)

    max_episode = 200000

    # Start training: max_episode training episodes; test runs don't count.
    episode = 0
    while episode < max_episode:
        # Train part.
        for i in range(0, 5):
            total_reward = run_episode(p, agent, rpm)
            episode += 1

        # Test part (render=True to watch the run).
        eval_reward = evaluate(p, agent, render=render_bool)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))

        # Checkpoint, and keep the best model in ./model_dqn.ckpt.
        agent.save('./model_dqn_%d.ckpt' % episode)
        if best_eval_reward < eval_reward:
            best_eval_reward = eval_reward
            agent.save('./model_dqn.ckpt')
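
# --- A minimal sketch, under assumptions, of the run_episode / evaluate
# helpers that main() calls. It assumes the conventional PARL DQN loop over a
# PLE environment: agent.sample / agent.predict / agent.learn and
# rpm.append / rpm.sample_batch are hypothetical signatures, and flattening
# p.getGameState() into the observation is an assumption; only p's PLE
# methods (reset_game, game_over, act, getActionSet) are the library's real API.
import numpy as np


def run_episode_sketch(p, agent, rpm):
    # One training episode: epsilon-greedy actions, store transitions, learn.
    total_reward = 0.0
    p.reset_game()
    obs = np.array(list(p.getGameState().values()), dtype=np.float32)
    while not p.game_over():
        act_idx = agent.sample(obs)                # epsilon-greedy action index
        reward = p.act(p.getActionSet()[act_idx])  # PLE returns the step reward
        next_obs = np.array(list(p.getGameState().values()), dtype=np.float32)
        rpm.append((obs, act_idx, reward, next_obs, p.game_over()))
        if len(rpm) > MEMORY_WARMUP_SIZE:
            agent.learn(*rpm.sample_batch(BATCH_SIZE))  # one DQN update per step
        total_reward += reward
        obs = next_obs
    return total_reward


def evaluate_sketch(p, agent, render=False):
    # Greedy rollout: argmax-Q actions, no exploration and no learning.
    p.reset_game()
    obs = np.array(list(p.getGameState().values()), dtype=np.float32)
    episode_reward = 0.0
    while not p.game_over():
        act_idx = agent.predict(obs)
        episode_reward += p.act(p.getActionSet()[act_idx])
        obs = np.array(list(p.getGameState().values()), dtype=np.float32)
    return episode_reward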
# --- Fragment: per-step bookkeeping inside a training episode loop ---
        score += times  # accumulate the per-step score counter
        if next_state is not None:
            next_state = np.reshape(next_state, state_shape)
        # Disabled experiment: shape the reward from the state/action values.
        # if next_state[0] > 0.95:
        #     if 0 < next_state[1] < 0.312:
        #         reward = min(-action[0] * 4 - 4, 0)
        #     elif -0.312 < next_state[1] <= 0:
        #         reward = min(action[0] * 4 - 4, 0)
        if episode == first_ep:
            env.start.append(copy.copy(state))  # remember the episode's start state
        # Store the transition in the replay buffer.
        env.memorize([
            copy.copy(x) for x in [state, action, reward, next_state, done]
        ])
        if done:
            print("episode: {}, score: {}\n".format(episode, score))
            break
        state = next_state
        # Train once the buffer holds at least one batch of transitions.
        if len(env.memory) >= BATCH_SIZE:  # and st % (MAX_STEPS / 20) == 0:
            samples = env.get_samples(BATCH_SIZE)
            agent.train(samples)
            agent.update_target_net()
    # Checkpoint every 10 episodes (runs after the step loop above).
    if (episode + 1) % 10 == 0:
        agent.network_copy()
        agent.save(path)
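
# --- A minimal sketch of the replay-buffer interface the fragment above
# assumes (env.memorize / env.memory / env.get_samples / env.start). The
# deque capacity and the uniform random sampling are assumptions; only the
# attribute and method names come from the fragment.
import random
from collections import deque


class ReplayBufferSketch:
    def __init__(self, capacity=50000):       # capacity: assumed value
        self.memory = deque(maxlen=capacity)  # oldest transitions drop out when full
        self.start = []                       # start states, as appended via env.start

    def memorize(self, transition):
        # transition = [state, action, reward, next_state, done]
        self.memory.append(transition)

    def get_samples(self, batch_size):
        # Uniform sampling without replacement; one minibatch per call.
        return random.sample(self.memory, batch_size)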