Example #1
            # an episode just finished: bump the counter and record its return
            episode_count += 1
            reward_history.append(episode_reward)

            # refresh the progress-bar caption and the live return plots
            pb.set_description(
                f"episode: {episode_count}, reward: {episode_reward}, eps: {eps_schedule.get(i)*100:.2f}%"
            )
            plotter.plot('episode reward', 'episode return', "Episode Return",
                         episode_count, episode_reward)
            plotter.plot('episode reward', 'average return', "Episode Return",
                         episode_count,
                         sum(reward_history) / len(reward_history))
            # reset the per-episode return accumulator
            episode_reward = 0
            # save a checkpoint every `save_frequency` episodes
            if episode_count > 0 and episode_count % params.save_frequency == 0:
                agent.save(chk_dir / f"checkpoint-episode-{episode_count}.pt")

        obs = next_obs

        # Skip target syncing and training while the step index is still
        # negative (e.g. during an initial warm-up / buffer-filling phase).
        if i < 0:
            continue

        # every `target_sync` steps, refresh the target network from the online network
        if i % params.target_sync == 0:
            agent.sync_target()

        # every `train_frequency` steps, sample a minibatch from replay memory
        # and take one gradient step on the agent's loss
        if i % params.train_frequency == 0:
            opt.zero_grad()
            *batch, _ = memory.sample(params.batch_size)  # final element of the sample is discarded
            loss = agent.calculate_loss(*batch)
            loss.mean().backward()
            opt.step()

        pb.update(1)
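
The fragment reads the current exploration rate with `eps_schedule.get(i)` and reports it as a percentage, but the schedule object itself is not part of the snippet. For reference only, a minimal linearly annealed schedule with that `get(step)` interface could look like the sketch below; the class name, argument names, and values are assumptions, not the original implementation.

# A sketch of the kind of schedule the fragment assumes: linear annealing
# from a start value to an end value over a fixed number of steps. All names
# and defaults here are assumptions; the real eps_schedule is not shown above.
class LinearSchedule:
    def __init__(self, start=1.0, end=0.05, decay_steps=1_000_000):
        self.start = start
        self.end = end
        self.decay_steps = decay_steps

    def get(self, step):
        # Clamp the interpolation factor to [0, 1] so the value holds at
        # `end` after decay_steps and at `start` for step <= 0.
        frac = min(max(step / self.decay_steps, 0.0), 1.0)
        return self.start + frac * (self.end - self.start)

eps_schedule = LinearSchedule(start=1.0, end=0.05, decay_steps=500_000)
print(f"eps at step 0:      {eps_schedule.get(0) * 100:.2f}%")        # 100.00%
print(f"eps at step 250000: {eps_schedule.get(250_000) * 100:.2f}%")  # 52.50%
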
Example #2
        # record the step's outcome; np.sign reduces the reward to -1, 0, or +1 (reward clipping)
        memory.store_effect(idx, action, np.sign(reward), done)

        if done:
            # episode finished: reset the environment and record the return
            next_obs = env.reset()
            episode_count += 1
            reward_history.append(episode_reward)

            pb.set_description(
                f"episode: {episode_count}, reward: {episode_reward}, eps: {eps_schedule.get(i)*100:.2f}%"
            )
            plotter.plot('episode reward', 'episode return', "Episode Return",
                         episode_count, episode_reward)
            plotter.plot('episode reward', 'average return', "Episode Return",
                         episode_count,
                         sum(reward_history) / len(reward_history))
            episode_reward = 0
            # save a checkpoint every `save_frequency` episodes
            if episode_count > 0 and episode_count % params.save_frequency == 0:
                agent.save(chk_dir / f"checkpoint-episode-{episode_count}.pt")
                # torch.save(memory, chk_dir / f"memory.pt")

        obs = next_obs

        # warm-up steps (negative i): only collect experience, no syncing or training yet
        if i < 0:
            continue

        if i % params.target_sync == 0:
            agent.sync_target()

        if i % params.train_frequency == 0:
            opt.zero_grad()
            *batch, _ = memory.sample(params.batch_size)
            # double=True presumably requests the double-DQN form of the loss (sketched below)
            loss = agent.calculate_loss(*batch, double=True)
            loss.mean().backward()
            opt.step()

        pb.update(1)
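
Example #2 passes `double=True` to `agent.calculate_loss`, which points at a double-DQN style target. The agent class is not part of the snippet; as a sketch only, a per-sample double-DQN loss that would fit the `loss.mean().backward()` call above might look like this (PyTorch is assumed, and the `q_net` / `target_net` names are placeholders, not the original API):

import torch
import torch.nn.functional as F

def double_dqn_loss(q_net, target_net, obs, actions, rewards, next_obs, dones, gamma=0.99):
    # Q(s, a) for the actions that were actually taken.
    q_values = q_net(obs).gather(1, actions.long().unsqueeze(1)).squeeze(1)

    with torch.no_grad():
        # The online network picks the greedy next action ...
        next_actions = q_net(next_obs).argmax(dim=1, keepdim=True)
        # ... and the target network evaluates it; decoupling selection from
        # evaluation is what makes the estimate "double" DQN.
        next_q = target_net(next_obs).gather(1, next_actions).squeeze(1)
        targets = rewards + gamma * next_q * (1.0 - dones.float())

    # Return a per-sample loss so the caller can reduce it with .mean().
    return F.smooth_l1_loss(q_values, targets, reduction="none")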