for step in progressive:
    if done:
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)

    training = len(memory) > WARM_STEPS  # the first WARM_STEPS steps are a warm-up phase
    state = env.make_state(obs_queue).to(device).float()
    action = agent.run(state, training)
    obs, reward, done = env.step(action)
    obs_queue.append(obs)
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # add to the replay memory

    if step % POLICY_UPDATE == 0 and training:
        agent.learn(memory, BATCH_SIZE)  # learn from previously stored experience

    if step % TARGET_UPDATE == 0:
        agent.sync()  # copy the policy network weights into the target network

    if step % EVALUATE_FREQ == 0:
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
        if RENDER:
            prefix = f"eval_{step//EVALUATE_FREQ:03d}"
            os.mkdir(prefix)
            for ind, frame in enumerate(frames):
                with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp:
                    frame.save(fp, format="png")
        agent.save(
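# For reference, a minimal sketch of what a DQN-style agent.learn(memory, BATCH_SIZE)
# might do. The memory.sample() signature, the policy_net/target_net names and the
# hyperparameters below are assumptions for illustration, not the actual API used here.
import torch
import torch.nn.functional as F


def dqn_learn_step(policy_net, target_net, optimizer, memory, batch_size, gamma=0.99):
    """One DQN update: sample a batch and regress Q(s, a) onto the TD target."""
    # Assumed batch layout: tensors of shape [B, ...]; actions [B, 1] long, dones [B, 1] float.
    states, actions, rewards, next_states, dones = memory.sample(batch_size)

    q_sa = policy_net(states).gather(1, actions)            # Q(s, a) for the taken actions
    with torch.no_grad():                                   # the target network stays frozen here
        next_q = target_net(next_states).max(1, keepdim=True).values
        target = rewards + gamma * next_q * (1.0 - dones)   # no bootstrap past episode end

    loss = F.smooth_l1_loss(q_sa, target)                   # Huber loss, as in the DQN paper
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()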
unit="b") for step in progressive: if done: observations, _, _ = env.reset() for obs in observations: obs_queue.append(obs) training = len(memory) > WARM_STEPS state = env.make_state(obs_queue).to(device).float() action = agent.run(state, training) obs, reward, done = env.step(action) obs_queue.append(obs) memory.push(env.make_folded_state(obs_queue), action, reward, done) if step % POLICY_UPDATE == 0 and training: agent.learn(memory, BATCH_SIZE) if step % TARGET_UPDATE == 0: agent.sync() if step % EVALUATE_FREQ == 0: avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER) with open(REWARD_PATH, "a") as fp: fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n") if RENDER: prefix = f"eval_{step//EVALUATE_FREQ:03d}" os.mkdir(prefix) for ind, frame in enumerate(frames): with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp: frame.save(fp, format="png") agent.save(
# pynvml.nvmlInit()
# handle = pynvml.nvmlDeviceGetHandleByIndex(0)
# meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
# print(meminfo.used)
if c == 1:  # with PER, a new transition's priority is set from its TD error when pushed
    state_next = env.make_state(obs_queue).to(device).float()
    value_next = agent.get_target_value(state_next)
    td_error = abs(GAMMA * value_next + reward - value_this)  # the absolute TD error is used as the priority
    memory.push(env.make_folded_state(obs_queue), action, reward, done, td_error)
else:
    memory.push(env.make_folded_state(obs_queue), action, reward, done)

if step % POLICY_UPDATE == 0 and training:
    agent.learn(memory, BATCH_SIZE, c)  # with PER, the training loss is used to refresh priorities

if step % TARGET_UPDATE == 0:
    agent.sync()

if step % EVALUATE_FREQ == 0:
    avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
    with open("rewards.txt", "a") as fp:
        fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")
    if RENDER:
        prefix = f"eval_{step//EVALUATE_FREQ:03d}"
        os.mkdir(prefix)
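# The branch above stores each new transition with priority |TD error| and lets
# agent.learn() refresh priorities from the training loss. A minimal sketch of such a
# proportional prioritized replay buffer; the class and method names are illustrative,
# not the repository's actual implementation (which would typically use a sum tree and
# importance-sampling weights):
import numpy as np


class PrioritizedReplay:
    """Proportional PER sketch: P(i) is proportional to priority_i ** alpha."""

    def __init__(self, capacity, alpha=0.6, eps=1e-6):
        self.capacity, self.alpha, self.eps = capacity, alpha, eps
        self.data, self.priorities, self.pos = [], [], 0

    def push(self, transition, td_error):
        priority = (abs(td_error) + self.eps) ** self.alpha
        if len(self.data) < self.capacity:
            self.data.append(transition)
            self.priorities.append(priority)
        else:  # overwrite the oldest slot once the buffer is full
            self.data[self.pos] = transition
            self.priorities[self.pos] = priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size):
        probs = np.asarray(self.priorities)
        probs = probs / probs.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        return idx, [self.data[i] for i in idx]

    def update_priorities(self, idx, td_errors):
        # Called after a learning step so priorities track the latest TD errors.
        for i, err in zip(idx, td_errors):
            self.priorities[i] = (abs(err) + self.eps) ** self.alpha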
observations, _, _ = env.reset()
for obs in observations:
    obs_queue.append(obs)

training = len(memory) > WARM_STEPS
state = env.make_state(obs_queue).to(device).float()  # build the state from the length-5 obs queue (only the last 4 obs are used)
action = agent.run(state, training)  # choose the current action with the policy network
obs, reward, done = env.step(action)  # take one environment step
obs_queue.append(obs)  # the oldest obs is popped; the queue keeps the last 4 obs plus the new one
memory.store(env.make_folded_state(obs_queue), action, reward, done)  # folded_state: [:4] is the state, [1:] is the next state

if step % POLICY_UPDATE == 0 and training:  # when training, update the policy network every POLICY_UPDATE steps
    agent.learn(memory, step)
if step % TARGET_UPDATE == 0:  # update the target network every TARGET_UPDATE steps
    agent.sync()
if step % EVALUATE_FREQ == 0:  # evaluate every EVALUATE_FREQ steps
    avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
    with open("rewards.txt", "a") as fp:
        fp.write(f"{step // EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")  # the learning curve can be plotted from rewards.txt
    if RENDER:  # if RENDER, save the evaluation frames as images
        prefix = f"eval_{step // EVALUATE_FREQ:03d}"
        os.mkdir(prefix)
        for ind, frame in enumerate(frames):
            with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp:
                frame.save(fp, format="png")
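# As the comments note, the state is built from the newest 4 frames of the length-5
# observation queue, and the folded state is a 5-frame stack whose slices [:4] and [1:]
# are the state and the next state. A sketch of what those helpers might look like,
# assuming each observation is an 84x84 array; the function names, shapes and dtype are
# assumptions for illustration, not the repository's actual implementation:
import torch


def make_state(obs_queue):
    """Stack the newest 4 frames of the length-5 queue into a [1, 4, 84, 84] state."""
    frames = [torch.as_tensor(f, dtype=torch.uint8) for f in list(obs_queue)[-4:]]
    return torch.stack(frames).unsqueeze(0)


def make_folded_state(obs_queue):
    """Stack all 5 frames into [5, 84, 84]; [:4] is the state, [1:] the next state."""
    frames = [torch.as_tensor(f, dtype=torch.uint8) for f in obs_queue]
    return torch.stack(frames)

# Storing the folded 5-frame stack instead of separate state/next_state tensors avoids
# storing the 3 overlapping frames twice (5 frames per transition instead of 8).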
for step in progressive:  # step: int
    if done:  # start of a new episode?
        observations, _, _ = env.reset()
        for obs in observations:
            obs_queue.append(obs)  # push the initial observations into obs_queue
        # print(len(obs_queue))

    training = len(memory) > WARM_STEPS  # check whether the memory has grown past WARM_STEPS
    state = env.make_state(obs_queue).to(device).float()  # turn the observations into a state
    action = agent.run(state, training)  # epsilon-greedy action selection
    obs, reward, done = env.step(action)  # take the action, get the reward and the next observation
    # print(len(obs))  # this shows that obs has length 1
    obs_queue.append(obs)  # append the new observation to obs_queue
    memory.push(env.make_folded_state(obs_queue), action, reward, done)  # store the experience

    if step % POLICY_UPDATE == 0 and training:  # train every POLICY_UPDATE = 4 steps, once memory holds more than WARM_STEPS = 50_000 transitions
        agent.learn(memory, BATCH_SIZE)  # sample BATCH_SIZE = 32 transitions at random from memory for the update

    if step % TARGET_UPDATE == 0:  # every TARGET_UPDATE = 10_000 steps, copy the policy network weights into the target network
        agent.sync()

    if step % EVALUATE_FREQ == 0:  # evaluate and record the current performance
        avg_reward, frames = env.evaluate(obs_queue, agent, render=RENDER)
        with open("rewards.txt", "a") as fp:
            fp.write(f"{step//EVALUATE_FREQ:3d} {step:8d} {avg_reward:.1f}\n")  # :3d pads the index to a width-3 integer field
        if RENDER:
            prefix = f"eval_{step//EVALUATE_FREQ:03d}"
            os.mkdir(prefix)
            for ind, frame in enumerate(frames):
                with open(os.path.join(prefix, f"{ind:06d}.png"), "wb") as fp:
                    frame.save(fp, format="png")
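# agent.run(state, training) does epsilon-greedy action selection: act uniformly at
# random while the replay memory is still warming up, otherwise explore with probability
# epsilon (decayed over time) and exploit the policy network's argmax action the rest of
# the time. A sketch under those assumptions; the attribute names and the linear epsilon
# decay schedule are illustrative:
import random

import torch


def run(self, state, training):
    """Epsilon-greedy action selection (sketch)."""
    if not training:  # warm-up phase: fill the replay memory with random actions
        return random.randrange(self.num_actions)
    self.eps = max(self.eps_end, self.eps - self.eps_decay)  # linear epsilon decay
    if random.random() < self.eps:
        return random.randrange(self.num_actions)  # explore
    with torch.no_grad():
        return int(self.policy_net(state).argmax(dim=1).item())  # exploit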