# One training-loop iteration of a DQN agent.
# NOTE(review): this chunk starts mid-block — in the reference training loop this
# reset runs under an `if done:` guard, and the whole body sits inside a
# `for step in ...` loop (`step` is referenced below). Confirm upstream context.
for obs in observations:
    obs_queue.append(obs)

# Warm-up: only start training once the replay memory holds more than
# WARM_STEPS transitions, so early learning isn't dominated by a tiny buffer.
training = len(memory) > WARM_STEPS

# Build the current stacked state, pick an action (epsilon-greedy while
# training), apply it, and record the resulting transition.
state = env.make_state(obs_queue).to(device).float()
action = agent.run(state, training)
obs, reward, done = env.step(action)
obs_queue.append(obs)
# Store the folded transition (state stack, action, reward, done) in replay memory.
memory.push(env.make_folded_state(obs_queue), action, reward, done)

# Every POLICY_UPDATE steps: learn from a sampled minibatch of past transitions.
if step % POLICY_UPDATE == 0 and training:
    agent.learn(memory, BATCH_SIZE)

# Every TARGET_UPDATE steps: copy the policy network weights into the target network.
if step % TARGET_UPDATE == 0:
    agent.sync()

# Every EVALUATE_FREQ steps: evaluate the agent, log the average reward,
# optionally dump rendered frames, and checkpoint the model.
if step % EVALUATE_FREQ == 0:
    avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
    with open("rewards.txt", "a") as fp:
        fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
    if RENDER:
        prefix = f"eval_{step//EVALUATE_FREQ:03d}"
        os.mkdir(prefix)
        for ind, frame in enumerate(frames):
            with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp:
                frame.save(fp, format="png")
    # Save the current model checkpoint.
    # NOTE(review): flattened source is ambiguous about nesting — the reference
    # loop saves at the EVALUATE_FREQ level (not only when RENDER); confirm.
    agent.save(
        os.path.join(SAVE_PREFIX, f"model_{step//EVALUATE_FREQ:03d}"))
    # Force an environment reset on the next iteration after evaluation.
    done = True
# One training-loop iteration of a DQN agent, including the episode-reset branch.
# NOTE(review): the reference loop wraps this body in `for step in ...`
# (`step` is referenced below); confirm the enclosing context.
if done:  # episode finished — start a fresh environment
    observations, _, _ = env.reset()
    for obs in observations:
        obs_queue.append(obs)

# Warm-up: collect WARM_STEPS transitions before any learning happens,
# so training starts from a reasonably populated replay memory.
training = len(memory) > WARM_STEPS

# Build the stacked state from the observation queue (the oldest frame is
# effectively dropped), choose action a (epsilon-greedy: random with
# probability eps, otherwise argmax of the Q-network), and execute it to
# obtain the reward and the next observation.
state = env.make_state(obs_queue).to(device).float()
action = agent.run(state, training)
obs, reward, done = env.step(action)
obs_queue.append(obs)
# Store the transition (s, a, r, done) in the experience replay pool.
memory.push(env.make_folded_state(obs_queue), action, reward, done)

# Every POLICY_UPDATE steps: train the policy network on a replay minibatch.
if step % POLICY_UPDATE == 0 and training:
    agent.learn(memory, BATCH_SIZE)

# Every TARGET_UPDATE steps: sync policy-network weights into the target network.
if step % TARGET_UPDATE == 0:
    agent.sync()

# Every EVALUATE_FREQ steps: run evaluation episodes, append the average
# reward to rewards.txt (used for plotting later), optionally dump frames,
# and save a model checkpoint.
if step % EVALUATE_FREQ == 0:
    avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
    with open("rewards.txt", "a") as fp:
        fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
    if RENDER:
        prefix = f"eval_{step//EVALUATE_FREQ:03d}"
        os.mkdir(prefix)
        for ind, frame in enumerate(frames):
            with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp:
                frame.save(fp, format="png")
    # Save the current model checkpoint.
    # NOTE(review): flattened source is ambiguous about nesting — the reference
    # loop saves at the EVALUATE_FREQ level (not only when RENDER); confirm.
    agent.save(
        os.path.join(SAVE_PREFIX, f"model_{step//EVALUATE_FREQ:03d}"))
    # Force an environment reset on the next iteration after evaluation.
    done = True