# Assumed setup (the excerpt itself omits it): CartPole-v1 and the usual
# keras / trickster import paths are guesses so the example runs stand-alone.
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import visual

env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)

rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rewards = []
losses = []

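# One policy-gradient update per episode: roll a full trajectory into the
# agent's experience buffer, fit on the whole episode (batch_size=-1 presumably
# means "use everything stored"), then clear the memory for the next rollout.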
for episode in range(1, 501):
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["reward_sum"])
    losses.append(agent_history["loss"])

    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])),
          end="")

    if episode % 10 == 0:
        print()

visual.plot_vectors([rewards, losses], ["Rewards", "Utility"],
                    smoothing_window_size=10)
Example #2
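# Fragment of an actor-critic training script (A2C-style, judging by the
# actor/critic metrics below): the agent, env, rollout and test_rollout setup,
# the enclosing "for episode ..." loop and the inner step-loop header are not
# part of this excerpt. Each inner iteration rolls 2 environment steps, fits
# the agent on a minibatch of 32 transitions and records the diagnostics
# returned by fit().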
        rollout.roll(steps=2, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_actor_loss.append(agent_history["actor_loss"])
        episode_actor_utility.append(agent_history["actor_utility"])
        episode_actor_entropy.append(agent_history["actor_entropy"])
        episode_critic_loss.append(agent_history["critic_loss"])

    test_history = test_rollout.rollout(verbose=0, push_experience=False)

    rewards.append(test_history["reward_sum"])
    actor_loss.append(sum(episode_actor_loss) / len(episode_actor_loss))
    actor_utility.append(
        sum(episode_actor_utility) / len(episode_actor_utility))
    actor_entropy.append(
        sum(episode_actor_entropy) / len(episode_actor_entropy))
    critic_loss.append(sum(episode_critic_loss) / len(episode_critic_loss))

    print(
        "\rEpisode {:>4} RWD {:>5.2f} ACTR {:>7.4f} UTIL {:>7.4f} ENTR {:>7.4f} CRIT {:>7.4f}"
        .format(episode, np.mean(rewards[-10:]), np.mean(actor_loss[-10:]),
                np.mean(actor_utility[-10:]), np.mean(actor_entropy[-10:]),
                np.mean(critic_loss[-10:])),
        end="")
    if episode % 10 == 0:
        print()

visual.plot_vectors(
    [rewards, actor_loss, actor_utility, actor_entropy, critic_loss],
    ["Reward", "Actor Loss", "Actor Utility", "Actor Entropy", "Critic Loss"],
    smoothing_window_size=10)
Example #3
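# Fragment: the imports and the construction of `agent` and `env` are omitted.
# The epsilon-greedy decay and the periodic agent.push_weights() call below
# suggest a value-based (DQN-style) agent trained from a replay memory.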
rollout = Trajectory(agent, env, RolloutConfig(max_steps=200))

rewards = []
losses = []

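# Warm-up phase: 32 rollouts collect experience without any learning
# (learning_batch_size=0 presumably disables the fitting step).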
for warmup in range(1, 33):
    rollout.rollout(verbose=0, learning_batch_size=0)

for episode in range(1, 501):
    rollout._reset()
    episode_rewards = []
    episode_losses = []
    while not rollout.finished:
        roll_history = rollout.roll(steps=2, verbose=0, learning_batch_size=64)
        episode_rewards.append(roll_history["reward_sum"])
        episode_losses.append(roll_history["loss"])
    rewards.append(sum(episode_rewards))
    losses.append(sum(episode_losses) / len(episode_losses))
    print("\rEpisode {:>4} RWD {:>3.0f} LOSS {:.4f} EPS {:>6.2%}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:]), agent.epsilon),
          end="")
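    # Decay the epsilon-greedy exploration rate after each episode, floored at 0.01.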
    agent.epsilon *= 0.995
    agent.epsilon = max(agent.epsilon, 0.01)
    if episode % 10 == 0:
        print()
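        # push_weights() presumably syncs the online network into the
        # target network every 10 episodes.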
        agent.push_weights()

visual.plot_vectors([losses, rewards], ["Loss", "Reward"],
                    smoothing_window_size=10)
Example #4
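# Same REINFORCE setup as the first example, but experience is gathered from
# 8 CartPole-v1 environments in parallel through MultiTrajectory.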
# Assumed imports (the excerpt only shows the trickster rollout/utility ones):
# the gym, numpy, Keras and REINFORCE import paths below are guesses.
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from trickster.agent import REINFORCE
from trickster.rollout import MultiTrajectory, RolloutConfig
from trickster.utility import visual

envs = [gym.make("CartPole-v1") for _ in range(8)]
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

policy = Sequential([Dense(16, activation="relu", input_shape=input_shape),
                     Dense(16, activation="relu"),
                     Dense(num_actions, activation="softmax")])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)

rollout = MultiTrajectory(agent, envs, rollout_configs=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)
    rewards.append(rollout_history["rewards"])
    losses.append(agent_history["loss"])
    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])), end="")
    if episode % 10 == 0:
        print()

visual.plot_vectors([rewards, losses], ["Reward", "Loss"], smoothing_window_size=10)
Example #5
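# Fragment of an on-policy actor-critic training loop: the imports, the agent
# and rollout construction, and the rewards / actor_losses / critic_losses
# lists are defined earlier in the original script and are not shown here.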
for episode in range(1, 1001):
    rollout.reset()
    episode_rewards = []
    episode_a_losses = []
    episode_a_entropy = []
    episode_c_losses = []
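    # Roll two environment steps at a time; after each roll (unless the episode
    # just ended) fit the agent on everything currently in memory and clear it,
    # so each update effectively uses only the freshest transitions.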
    while True:
        roll_history = rollout.roll(steps=2, verbose=0, learning_batch_size=0)
        if rollout.finished:
            break
        agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)
        episode_rewards.append(roll_history["reward_sum"])
        episode_a_losses.append(agent_history["actor_utility"])
        episode_c_losses.append(agent_history["critic_loss"])
        agent.memory._reset()

    rewards.append(sum(episode_rewards))
    actor_losses.append(sum(episode_a_losses) / len(episode_a_losses))
    critic_losses.append(sum(episode_c_losses) / len(episode_c_losses))
    print("\rEpisode {:>4} RWD {:>3.0f} ACTR {:.4f} CRIT {:.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(actor_losses[-10:]),
        np.mean(critic_losses[-10:])),
          end="")
    if episode % 10 == 0:
        print()

visual.plot_vectors([rewards, actor_losses, critic_losses],
                    ["Reward", "Actor Utility", "Critic Loss"],
                    smoothing_window_size=10)