memory.store_effect(idx, action, np.sign(reward), done)  # store transition with the reward clipped to its sign

if done:
    next_obs = env.reset()
    episode_count += 1
    reward_history.append(episode_reward)
    pb.set_description(
        f"episode: {episode_count}, reward: {episode_reward}, "
        f"eps: {eps_schedule.get(i)*100:.2f}%"
    )
    plotter.plot('episode reward', 'episode return', "Episode Return",
                 episode_count, episode_reward)
    plotter.plot('episode reward', 'average return', "Episode Return",
                 episode_count, sum(reward_history) / len(reward_history))
    episode_reward = 0

    # Periodically checkpoint the agent.
    if episode_count > 0 and episode_count % params.save_frequency == 0:
        agent.save(chk_dir / f"checkpoint-episode-{episode_count}.pt")
        # torch.save(memory, chk_dir / f"memory.pt")

obs = next_obs

# Negative steps are the warm-up phase: only fill the replay buffer, no training yet.
if i < 0:
    continue

# Periodically copy the online network's weights into the target network.
if i % params.target_sync == 0:
    agent.sync_target()

if i % params.train_frequency == 0:
    opt.zero_grad()
    *batch, _ = memory.sample(params.batch_size)
    loss = agent.calculate_loss(*batch, double=True)  # Double DQN variant of the TD loss
    loss.mean().backward()
    opt.step()

pb.update(1)
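# For context, a minimal sketch of the step loop the fragment above sits inside.
# The names used here (`params.warmup_steps`, `params.total_steps`, `memory.store_frame`,
# `memory.encode_recent_observation`, `agent.act`) are assumptions inferred from the
# calls above, not confirmed by the source:
#
# for i in range(-params.warmup_steps, params.total_steps):
#     idx = memory.store_frame(obs)                    # add the raw frame, get its buffer index
#     if np.random.random() < eps_schedule.get(i):     # epsilon-greedy exploration
#         action = env.action_space.sample()
#     else:
#         action = agent.act(memory.encode_recent_observation())
#     next_obs, reward, done, _ = env.step(action)
#     episode_reward += reward
#     ...  # the fragment above continues from here, starting at memory.store_effect(...)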