def test_a2c_doesnt_store_invalid_transitions_in_td_setting(self):
    STEPS = 35
    env = DummyEnv()
    agent = A2C.from_environment(env, discount_gamma=0.)
    rollout = Rolling(agent, env)

    rollout.roll(STEPS, verbose=0, push_experience=True)
    data = agent.memory_sampler.sample(-1)

    self.assertEqual(agent.episodes, 3)
    np.testing.assert_array_less(data["state"], 10)
    self.assertEqual(len(data["state"]), STEPS - 4)
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
    episode_actor_entropy = []
    episode_critic_loss = []

    # 32 updates per episode: each one rolls 2 environment steps into memory,
    # then fits the agent on a batch of 32 transitions.
    for update in range(32):
        rollout.roll(steps=2, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)

        episode_actor_loss.append(agent_history["actor_loss"])
        episode_actor_utility.append(agent_history["actor_utility"])
        episode_actor_entropy.append(agent_history["actor_entropy"])
        episode_critic_loss.append(agent_history["critic_loss"])

    # Evaluate on a full test episode without pushing transitions to memory.
    test_history = test_rollout.rollout(verbose=0, push_experience=False)

    rewards.append(test_history["reward_sum"])
    actor_loss.append(sum(episode_actor_loss) / len(episode_actor_loss))
    actor_utility.append(sum(episode_actor_utility) / len(episode_actor_utility))
    actor_entropy.append(sum(episode_actor_entropy) / len(episode_actor_entropy))
    critic_loss.append(sum(episode_critic_loss) / len(episode_critic_loss))
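# Illustrative sketch, not part of the original example: one way the metric lists
# collected above could be plotted after training, assuming matplotlib is available.
# The moving-average window and subplot layout are arbitrary choices.
import matplotlib.pyplot as plt
import numpy as np

def smooth(series, window=10):
    # Simple moving average over `window` episodes.
    kernel = np.ones(window) / window
    return np.convolve(series, kernel, mode="valid")

curves = [("reward_sum", rewards),
          ("actor_loss", actor_loss),
          ("actor_utility", actor_utility),
          ("actor_entropy", actor_entropy),
          ("critic_loss", critic_loss)]

fig, axes = plt.subplots(nrows=len(curves), ncols=1, sharex=True, figsize=(8, 10))
for ax, (name, series) in zip(axes, curves):
    ax.plot(smooth(series))
    ax.set_title(name)
    ax.grid(True)
axes[-1].set_xlabel("episode")
plt.tight_layout()
plt.show()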
# ...tail of the agent's constructor call (the start of this statement is not
# part of this excerpt).
            action_space=2,
            memory=Experience(max_length=10000),
            epsilon=1.,
            discount_factor_gamma=0.98)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

rewards = []
losses = []

for episode in range(1, 501):
    episode_losses = []

    # 32 updates per episode: roll 4 environment steps into memory, then fit
    # the agent on a batch of 32 transitions.
    for update in range(32):
        roll_history = rollout.roll(steps=4, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_losses.append(agent_history["loss"])

    # Evaluate on a full test episode without pushing transitions to memory.
    test_history = test_rollout.rollout(verbose=0, push_experience=False, render=False)

    rewards.append(test_history["reward_sum"])
    losses.append(np.mean(episode_losses))

    print("\rEpisode {:>4} RWD {:>3.0f} LOSS {:.4f} EPS {:>6.2%}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:]), agent.epsilon), end="")

    # Multiplicative epsilon decay, applied once per episode.
    agent.epsilon *= 0.992
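# For reference only, not part of the original script: the exploration schedule
# implied by multiplying epsilon by 0.992 once per episode, computed standalone.
# Any lower bound the agent may enforce on epsilon internally is ignored here.
epsilon = 1.0
schedule = []
for episode in range(1, 501):
    schedule.append(epsilon)
    epsilon *= 0.992

print(f"epsilon at episode 100: {schedule[99]:.3f}")   # ~0.452
print(f"epsilon at episode 500: {schedule[499]:.3f}")  # ~0.018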
experience = Experience(10000)
agent = DoubleDQN(ann, env.action_space, experience,
                  epsilon=1., epsilon_decay=1., epsilon_min=0.1)

rcfg = RolloutConfig(max_steps=1024, skipframes=2)
training_rollout = Rolling(agent, env, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

# Pre-fill the replay buffer with 10000 transitions before training starts.
print("Filling experience...")
while experience.N < 10000:
    training_rollout.roll(steps=32, verbose=0, push_experience=True)
    print(f"\r{experience.N/10000:.2%} {experience.N}/10000", end="")
print()

# Epsilon was held constant (epsilon_decay=1.) while filling; enable decay now.
agent.epsilon_decay = 0.99995

logger = history.History("reward_sum", *agent.history_keys, "epsilon")

for episode in range(1, 501):

    for update in range(32):
        training_rollout.roll(steps=32, verbose=0, push_experience=True)
        # polyak_tau: soft target-network update coefficient
        # (Polyak averaging of the target weights towards the online network).
        agent_history = agent.fit(batch_size=1024, verbose=0, polyak_tau=0.1)
        logger.buffer(**agent_history)

    for _ in range(3):
        test_history = testing_rollout.rollout(verbose=0,