def test_reinforce_doesnt_store_invalid_transitions(self):
    STEPS = 35
    env = DummyEnv()
    agent = REINFORCE.from_environment(env, discount_gamma=0.)
    rollout = Rolling(agent, env)

    rollout.roll(STEPS, verbose=0, push_experience=True)
    data = agent.memory_sampler.sample(-1)

    self.assertEqual(agent.episodes, 3)
    np.testing.assert_array_less(data["state"], 10)
    # One terminal transition per finished episode plus the dangling final step
    # must be dropped as invalid, hence STEPS - 4 stored states.
    self.assertEqual(len(data["state"]), STEPS - 4)
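# DummyEnv is not defined in this snippet. A minimal stand-in consistent with
# the assertions above (hypothetical sketch, not the repo's actual test
# fixture): the observation counts up from 0 and the episode terminates after
# 10 steps, so no stored state ever reaches 10.
import numpy as np

class DummyEnv:

    def __init__(self, episode_length=10):
        self.episode_length = episode_length
        self.state = 0

    def reset(self):
        self.state = 0
        return np.array([self.state])

    def step(self, action):
        self.state += 1
        done = self.state >= self.episode_length
        return np.array([self.state]), float(done), done, {}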
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import visual

env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)
rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    # Collect one full episode, then fit on all of the episode's experience.
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)
    rewards.append(rollout_history["reward_sum"])
    losses.append(agent_history["loss"])
    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])), end="")
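# The visual utility imported above is not exercised in this snippet and its
# exact API isn't shown, so as a fallback this sketch plots the collected
# curves with plain matplotlib.
import matplotlib.pyplot as plt

fig, (ax0, ax1) = plt.subplots(2, sharex=True)
ax0.plot(rewards)
ax0.set_ylabel("Episode reward")
ax1.plot(losses)
ax1.set_ylabel("Policy utility")
ax1.set_xlabel("Episode")
plt.show()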
# Fragment: the shared input tensor (common_input), the environment list
# (envs), the constants (NUM_MOVES, MOVES, WARMUP), the preprocess function
# and CV2Screen are defined earlier in the file.
from collections import deque

from keras.models import Model
from keras.layers import Conv2D, Dense, Flatten, LeakyReLU
from keras.optimizers import Adam

from trickster.agent import REINFORCE
from trickster.experience import Experience
from trickster.rollout import MultiTrajectory, RolloutConfig

actor_stream = LeakyReLU()(actor_stream)
actor_stream = Conv2D(64, (3, 3), strides=(2, 2), padding="same")(actor_stream)  # 4
actor_stream = LeakyReLU()(actor_stream)
actor_stream = Conv2D(128, (4, 4))(actor_stream)
actor_stream = LeakyReLU()(actor_stream)
actor_stream = Flatten()(actor_stream)  # assumed: flatten so the head emits one distribution per state
actor_stream = Dense(32, activation="relu")(actor_stream)
action_probs = Dense(NUM_MOVES, activation="softmax")(actor_stream)

actor = Model(common_input, action_probs, name="Actor")
actor.compile(Adam(1e-3), "categorical_crossentropy")

agent = REINFORCE(actor, action_space=MOVES,
                  memory=Experience(max_length=10000),
                  discount_factor_gamma=0.99,
                  state_preprocessor=preprocess)

screen = CV2Screen(scale=2)
episode = 0
reward_memory = deque(maxlen=100)
critic_losses = deque(maxlen=50)
actor_losses = deque(maxlen=100)

rollout = MultiTrajectory(agent, envs, warmup_episodes=WARMUP,
                          rollout_configs=RolloutConfig(max_steps=512, skipframes=4))

history = {"episode": 0}
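# The preprocess function passed as state_preprocessor above is not shown in
# this fragment. For image observations a typical choice (hypothetical sketch,
# not the repo's actual code) is grayscaling plus downsampling:
import numpy as np

def preprocess(state):
    gray = state.mean(axis=-1, keepdims=True)  # collapse RGB to a single channel
    return gray[::2, ::2] / 255.               # halve resolution, scale to [0, 1]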
from collections import deque

from keras.models import Sequential
from keras.layers import Flatten, Dense, BatchNormalization, LeakyReLU
from keras.optimizers import RMSprop

from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, MultiTrajectory
from trickster.experience import Experience

envs = [FakeEnv() for _ in range(10)]
test_env = FakeEnv()

actor = Sequential([  # input frames are 200 x 160
    Flatten(input_shape=test_env.shape),
    Dense(256),
    BatchNormalization(),
    LeakyReLU(),
    Dense(2, activation="softmax")
])
actor.compile(RMSprop(1e-4, rho=0.99), "categorical_crossentropy")

agent = REINFORCE(actor, 2, Experience(), discount_factor_gamma=0.99, state_preprocessor=None)

rollout = MultiTrajectory([agent for _ in range(10)], envs)
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=100)
actor_loss = deque(maxlen=100)
actor_utility = deque(maxlen=100)
actor_entropy = deque(maxlen=100)
critic_loss = deque(maxlen=100)

episode = 0

while True:
    ...  # training loop body not included in this fragment
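# FakeEnv is not defined in this snippet. A stand-in consistent with the
# 200 x 160 input shape above might look like this (hypothetical sketch):
import numpy as np

class FakeEnv:

    shape = (200, 160)

    def reset(self):
        return np.random.random(self.shape)

    def step(self, action):
        state = np.random.random(self.shape)
        reward = float(action == 1)        # arbitrary reward rule for testing
        done = np.random.random() < 0.01   # ~1% chance of terminating per step
        return state, reward, done, {}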
from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import gymic
from trickster.model import mlp

env = gymic.rwd_scaled_env()
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = mlp.wide_mlp_actor_categorical(input_shape, num_actions, adam_lr=1e-4)

agent = REINFORCE(policy, action_space=num_actions)

rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))
rollout.fit(episodes=500, rollouts_per_update=1, update_batch_size=-1)
rollout.render(repeats=10)
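# mlp.wide_mlp_actor_categorical is a trickster model factory whose exact
# architecture isn't shown here. A hand-rolled policy of the same general
# shape (illustrative sketch, assuming a single wide hidden layer; the real
# factory may differ) would be:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

policy = Sequential([
    Dense(64, activation="relu", input_shape=input_shape),
    Dense(num_actions, activation="softmax")
])
policy.compile(optimizer=Adam(1e-4), loss="categorical_crossentropy")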
import keras

from trickster.agent import REINFORCE
from trickster.rollout import Trajectory
from trickster.experience import Experience
from trickster.utility import gymic
from trickster.model import mlp

env = gymic.rwd_scaled_env("LunarLander-v2")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = mlp.wide_mlp_actor_categorical(input_shape, num_actions)
policy.compile(optimizer=keras.optimizers.SGD(lr=2e-4, momentum=0.9),
               loss="categorical_crossentropy")

agent = REINFORCE(policy, action_space=num_actions,
                  memory=Experience(max_length=10000),
                  discount_factor_gamma=0.99)

rollout = Trajectory(agent, env)
rollout.fit(episodes=1000, rollouts_per_update=16, update_batch_size=-1)
rollout.render(repeats=10)