import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Import paths below are assumed from the trickster library's layout;
# input_shape, num_actions and env are expected to be defined beforehand.
from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import visual

# Small policy network mapping observations to action probabilities.
policy = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)
rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    # Collect one episode and push the transitions into the agent's memory.
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    # Fit on the whole episode at once (batch_size=-1), then clear the memory.
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["reward_sum"])
    losses.append(agent_history["loss"])

    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])), end="")
    if episode % 10 == 0:
        print()

visual.plot_vectors([rewards, losses], ["Rewards", "Utility"], smoothing_window_size=10)
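The next snippet switches from a single rollout to parallel data collection: the same agent instance is passed ten times to a MultiTrajectory, one per environment copy, while a separate Trajectory on a held-out environment is kept aside for testing. Here agent, envs and test_env are assumed to be constructed earlier; since the fit history also reports an entropy term, the agent is presumably a policy-gradient variant with an entropy bonus (an actor-critic style learner) rather than the plain REINFORCE agent above.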
from collections import deque

import numpy as np

# MultiTrajectory import path assumed from the trickster library's layout.
from trickster.rollout import MultiTrajectory, Trajectory

# The same agent collects experience from 10 parallel environment copies,
# with a separate single-environment Trajectory held back for evaluation.
rollout = MultiTrajectory([agent for _ in range(10)], envs)
test_rollout = Trajectory(agent, test_env)

# Sliding windows over the last 100 episodes for logging.
rewards = deque(maxlen=100)
actor_loss = deque(maxlen=100)
actor_utility = deque(maxlen=100)
actor_entropy = deque(maxlen=100)
critic_loss = deque(maxlen=100)

episode = 0
while True:
    episode += 1

    # Gather experience from every parallel environment, then fit on it.
    history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0)

    rewards.append(history["mean_reward"])
    actor_loss.append(agent_history["loss"])
    actor_entropy.append(agent_history["entropy"])

    print("\rEpisode {:>4} RWD {:>5.2f} LOSS {:>7.4f} ENTR {:>7.4f}".format(
        episode, np.mean(rewards), np.mean(actor_loss), np.mean(actor_entropy)), end="")
    if episode % 100 == 0:
        print()
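The test_rollout above is created but never exercised in this excerpt. Below is a minimal sketch of how it could be used for periodic evaluation, assuming Trajectory.rollout accepts push_experience=False (so evaluation transitions stay out of the agent's memory) and returns a "reward_sum" entry as in the REINFORCE example; both assumptions are carried over from the calls shown earlier, not confirmed against the library.

# Hypothetical periodic evaluation, e.g. appended to the end of the training loop:
if episode % 100 == 0:
    test_history = test_rollout.rollout(verbose=0, push_experience=False)
    print("Test episode reward: {:.1f}".format(test_history["reward_sum"]))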