# Imports: Gym/NumPy for the environment, Keras for the policy network, and the
# trickster RL library (import paths follow its example layout; use plain
# `keras` imports instead if your trickster version builds on standalone Keras).
import gym
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import visual

# Assumed setup (not in the original snippet): a discrete-action environment
# such as CartPole.
env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

# Two hidden layers with a softmax head over the discrete actions.
policy = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)

rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
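    # Fit on everything gathered this episode (batch_size=-1) and clear the
    # experience memory afterwards (reset_memory=True); REINFORCE is on-policy.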
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["reward_sum"])
    losses.append(agent_history["loss"])

    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])),
          end="")

    if episode % 10 == 0:
        print()

visual.plot_vectors([rewards, losses], ["Rewards", "Utility"],
                    smoothing_window_size=10)
Example #2
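The loop below references envs, test_env and an already constructed agent. A
minimal setup sketch, assuming ten parallel CartPole environments and an
actor-critic agent from trickster (the A2C constructor arguments are an
assumption and may differ between library versions):

import gym
from collections import deque

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

from trickster.agent import A2C
from trickster.rollout import Trajectory, MultiTrajectory

# Ten training environments stepped in parallel by MultiTrajectory, plus one
# held-out environment for evaluation.
envs = [gym.make("CartPole-v1") for _ in range(10)]
test_env = gym.make("CartPole-v1")

input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

# Small actor and critic networks; the agent is assumed to report "loss" and
# "entropy" in the history returned by fit(), as used in the loop below.
actor = Sequential([Dense(16, activation="relu", input_shape=input_shape),
                    Dense(num_actions, activation="softmax")])
actor.compile(loss="categorical_crossentropy", optimizer=Adam(1e-3))

critic = Sequential([Dense(16, activation="relu", input_shape=input_shape),
                     Dense(1, activation="linear")])
critic.compile(loss="mse", optimizer=Adam(1e-3))

agent = A2C(actor, critic, action_space=num_actions)  # assumed signature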
rollout = MultiTrajectory([agent for _ in range(10)], envs)
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=100)
actor_loss = deque(maxlen=100)
actor_utility = deque(maxlen=100)
actor_entropy = deque(maxlen=100)
critic_loss = deque(maxlen=100)

episode = 0

while True:

    episode += 1

    history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0)

    rewards.append(history["mean_reward"])
    actor_loss.append(agent_history["loss"])
    actor_entropy.append(agent_history["entropy"])

    print("\rEpisode {:>4} RWD {:>5.2f} LOSS {:>7.4f} ENTR {:>7.4}".format(
        episode, np.mean(rewards), np.mean(actor_loss),
        np.mean(actor_entropy)),
          end="")

    if episode % 100 == 0:
        print()
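
test_rollout is constructed above but never exercised in this loop. A periodic
evaluation pass could reuse the same rollout interface, for example (a sketch,
placed inside the while loop; not part of the original):

    if episode % 1000 == 0:
        test_history = test_rollout.rollout(verbose=0, push_experience=False)
        print(" | TEST RWD {:>6.1f}".format(test_history["reward_sum"]))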