policy = Sequential([Dense(16, activation="relu", input_shape=input_shape),
                     Dense(16, activation="relu"),
                     Dense(num_actions, activation="softmax")])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)
rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    # Collect a full episode of experience, then update the policy on it.
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["reward_sum"])
    losses.append(agent_history["loss"])

    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])), end="")
    if episode % 10 == 0:
        print()

visual.plot_vectors([rewards, losses], ["Rewards", "Utility"], smoothing_window_size=10)
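The REINFORCE loop above assumes that the environment, the Keras building blocks and the trickster classes are already in scope. Below is a minimal setup sketch for it; the import path for REINFORCE, the use of plain keras rather than tensorflow.keras, and the choice of CartPole-v1 are assumptions, mirrored from the MultiTrajectory example later in this section.

import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Assumed trickster import paths; the MultiTrajectory example below imports
# RolloutConfig and visual from these packages.
from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import visual

# CartPole-v1 is assumed here, matching the parallelised example below.
env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n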
# Per-episode body of the actor-critic training loop. The enclosing
# "for episode in ..." loop, the per-episode lists (episode_actor_loss,
# episode_actor_utility, episode_actor_entropy, episode_critic_loss) and the
# running logs (rewards, actor_loss, actor_utility, actor_entropy, critic_loss)
# are set up earlier in the example; the roll/fit/append block below runs
# repeatedly within each episode before the evaluation rollout.

rollout.roll(steps=2, verbose=0, push_experience=True)
agent_history = agent.fit(batch_size=32, verbose=0)

episode_actor_loss.append(agent_history["actor_loss"])
episode_actor_utility.append(agent_history["actor_utility"])
episode_actor_entropy.append(agent_history["actor_entropy"])
episode_critic_loss.append(agent_history["critic_loss"])

# Evaluate the current policy without pushing experience into memory.
test_history = test_rollout.rollout(verbose=0, push_experience=False)
rewards.append(test_history["reward_sum"])

actor_loss.append(sum(episode_actor_loss) / len(episode_actor_loss))
actor_utility.append(sum(episode_actor_utility) / len(episode_actor_utility))
actor_entropy.append(sum(episode_actor_entropy) / len(episode_actor_entropy))
critic_loss.append(sum(episode_critic_loss) / len(episode_critic_loss))

print("\rEpisode {:>4} RWD {:>5.2f} ACTR {:>7.4f} UTIL {:>7.4f} ENTR {:>7.4f} CRIT {:>7.4f}".format(
    episode, np.mean(rewards[-10:]), np.mean(actor_loss[-10:]),
    np.mean(actor_utility[-10:]), np.mean(actor_entropy[-10:]),
    np.mean(critic_loss[-10:])), end="")
if episode % 10 == 0:
    print()

# After training, plot the logged curves.
visual.plot_vectors(
    [rewards, actor_loss, actor_utility, actor_entropy, critic_loss],
    ["Reward", "Actor Loss", "Actor Utility", "Actor Entropy", "Critic Loss"],
    smoothing_window_size=10)
rollout = Trajectory(agent, env, RolloutConfig(max_steps=200))

rewards = []
losses = []

# Warm-up: collect experience into the replay memory without learning.
for warmup in range(1, 33):
    rollout.rollout(verbose=0, learning_batch_size=0)

for episode in range(1, 501):
    rollout._reset()
    episode_rewards = []
    episode_losses = []
    while not rollout.finished:
        # Step the environment and learn on minibatches of 64 transitions.
        roll_history = rollout.roll(steps=2, verbose=0, learning_batch_size=64)
        episode_rewards.append(roll_history["reward_sum"])
        episode_losses.append(roll_history["loss"])

    rewards.append(sum(episode_rewards))
    losses.append(sum(episode_losses) / len(episode_losses))

    print("\rEpisode {:>4} RWD {:>3.0f} LOSS {:.4f} EPS {:>6.2%}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:]), agent.epsilon), end="")

    # Anneal epsilon-greedy exploration, keeping a small floor.
    agent.epsilon *= 0.995
    agent.epsilon = max(agent.epsilon, 0.01)

    if episode % 10 == 0:
        print()
        agent.push_weights()  # periodically sync weights (e.g. online -> target network)

visual.plot_vectors([losses, rewards], ["Loss", "Reward"], smoothing_window_size=10)
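A quick side note on the exploration schedule above: with the 0.995 multiplicative decay and the 0.01 floor, epsilon falls off fairly slowly. Assuming it starts at 1.0 (the agent's initial epsilon is set outside this snippet), it only reaches roughly 0.08 by episode 500, so the floor is never hit during this run. A minimal sketch:

eps = 1.0  # assumed starting value; the agent's actual initial epsilon is set elsewhere
for episode in range(500):
    eps = max(eps * 0.995, 0.01)
print("epsilon after 500 episodes: {:.3f}".format(eps))  # ~0.082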
from trickster.rollout import MultiTrajectory, RolloutConfig
from trickster.utility import visual

envs = [gym.make("CartPole-v1") for _ in range(8)]
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

policy = Sequential([Dense(16, activation="relu", input_shape=input_shape),
                     Dense(16, activation="relu"),
                     Dense(num_actions, activation="softmax")])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)
rollout = MultiTrajectory(agent, envs, rollout_configs=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["rewards"])
    losses.append(agent_history["loss"])

    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])), end="")
    if episode % 10 == 0:
        print()

visual.plot_vectors([rewards, losses], ["Reward", "Loss"], smoothing_window_size=10)
rewards = []
actor_losses = []
critic_losses = []

for episode in range(1, 1001):
    rollout.reset()
    episode_rewards = []
    episode_a_losses = []
    episode_a_entropy = []
    episode_c_losses = []

    while True:
        roll_history = rollout.roll(steps=2, verbose=0, learning_batch_size=0)
        if rollout.finished:
            break
        agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)
        episode_rewards.append(roll_history["reward_sum"])
        episode_a_losses.append(agent_history["actor_utility"])
        episode_c_losses.append(agent_history["critic_loss"])

    # Clear the agent's memory before the next episode.
    agent.memory._reset()

    rewards.append(sum(episode_rewards))
    actor_losses.append(sum(episode_a_losses) / len(episode_a_losses))
    critic_losses.append(sum(episode_c_losses) / len(episode_c_losses))

    print("\rEpisode {:>4} RWD {:>3.0f} ACTR {:.4f} CRIT {:.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(actor_losses[-10:]),
        np.mean(critic_losses[-10:])), end="")
    if episode % 10 == 0:
        print()

visual.plot_vectors([rewards, actor_losses, critic_losses],
                    ["Reward", "Actor Utility", "Critic Loss"],
                    smoothing_window_size=10)