def test_dqn_doesnt_store_invalid_transitions(self):
    STEPS = 55
    env = DummyEnv()
    test_env = DummyEnv()
    model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_dim=2)])
    model.compile(tf.keras.optimizers.SGD(learning_rate=0.), loss="mse")
    agent = DQN.from_environment(env, model, discount_gamma=0., use_target_network=False)
    rollout = Rolling(agent, env)
    test_rollout = Trajectory(agent, test_env)
    rollout.fit(epochs=10, updates_per_epoch=12, steps_per_update=STEPS, update_batch_size=8,
                testing_rollout=test_rollout, buffer_warmup=False)
    data = agent.memory_sampler.sample(-1)
    np.testing.assert_array_less(data["state"], 10)
    np.testing.assert_equal((data["state_next"] - data["state"]).sum(axis=1), 1)
def test_td3_doesnt_store_invalid_transitions(self):
    STEPS = 55
    env = DummyEnv(action_space="continuous")
    test_env = DummyEnv(action_space="continuous")
    actor_out = tf.convert_to_tensor([[0., 0.]])
    critic_out = tf.convert_to_tensor([1.])
    actor = arch.TestingModel(actor_out)
    critic = arch.TestingModel(critic_out)
    critic2 = arch.TestingModel(critic_out)
    actor_target = arch.TestingModel(actor_out)
    critic_target = arch.TestingModel(critic_out)
    critic2_target = arch.TestingModel(critic_out)
    actor.optimizer = tf.keras.optimizers.SGD(0)
    critic.optimizer = tf.keras.optimizers.SGD(0)
    critic2.optimizer = tf.keras.optimizers.SGD(0)
    agent = TD3(actor, actor_target, critic, critic_target, critic2, critic2_target,
                discount_gamma=0., polyak_tau=0., action_minima=-1., action_maxima=1.,
                update_actor_every=1)
    rollout = Rolling(agent, env)
    test_rollout = Trajectory(agent, test_env)
    rollout.fit(epochs=10, updates_per_epoch=12, steps_per_update=STEPS, update_batch_size=8,
                testing_rollout=test_rollout, buffer_warmup=False)
    data = agent.memory_sampler.sample(-1)
    np.testing.assert_array_less(data["state"], 10)
    np.testing.assert_equal((data["state_next"] - data["state"]).sum(axis=1), 1)
def test_a2c_doesnt_store_invalid_transitions_in_td_setting(self):
    STEPS = 35
    env = DummyEnv()
    agent = A2C.from_environment(env, discount_gamma=0.)
    rollout = Rolling(agent, env)
    rollout.roll(STEPS, verbose=0, push_experience=True)
    data = agent.memory_sampler.sample(-1)
    self.assertEqual(agent.episodes, 3)
    np.testing.assert_array_less(data["state"], 10)
    self.assertEqual(len(data["state"]), STEPS - 4)
activation="relu", input_shape=input_shape, kernel_initializer="he_uniform"), Dense(16, activation="relu", kernel_initializer="he_uniform"), Dense(1, activation="linear", kernel_initializer="he_uniform") ]) critic.compile(loss="mse", optimizer=Adam(5e-4)) agent = A2C(actor, critic, action_space=env.action_space, memory=Experience(max_length=10000), discount_factor_gamma=0.98, entropy_penalty_coef=0.01) rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300)) test_rollout = Trajectory(agent, gym.make("CartPole-v1")) rewards = [] actor_loss = [] actor_utility = [] actor_entropy = [] critic_loss = [] for episode in range(1, 1001): episode_actor_loss = [] episode_actor_utility = [] episode_actor_entropy = [] episode_critic_loss = [] for update in range(32):
from trickster.agent import DoubleDQN
from trickster.rollout import Rolling, Trajectory, RolloutConfig
from trickster.experience import Experience
from trickster.model import mlp
from trickster.utility import gymic

env = gymic.rwd_scaled_env("CartPole-v1")
test_env = gymic.rwd_scaled_env("CartPole-v1")

input_shape = env.observation_space.shape
num_actions = env.action_space.n

ann = mlp.wide_mlp_critic_network(input_shape, num_actions, adam_lr=1e-3)

agent = DoubleDQN(ann,
                  action_space=env.action_space,
                  memory=Experience(max_length=10000),
                  epsilon=1.,
                  epsilon_decay=0.99995,
                  epsilon_min=0.1,
                  discount_factor_gamma=0.98)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

rollout.fit(episodes=500, updates_per_episode=32, step_per_update=2, update_batch_size=32,
            testing_rollout=test_rollout, plot_curves=True)
test_rollout.render(repeats=10)
num_actions = env.action_space.shape[0]

actor, critics = mlp.wide_ddpg_actor_critic(input_shape,
                                            output_dim=num_actions,
                                            action_range=2,
                                            num_critics=2,
                                            actor_lr=5e-4,
                                            critic_lr=5e-4)

agent = TD3(actor, critics,
            action_space=spaces.CONTINUOUS,
            memory=Experience(max_length=int(1e4)),
            discount_factor_gamma=0.99,
            action_noise_sigma=0.1,
            action_noise_sigma_decay=1.,
            action_minima=-2,
            action_maxima=2,
            target_noise_sigma=0.2,
            target_noise_clip=0.5)

rollout = Rolling(agent, env)
test_rollout = Trajectory(agent, env, RolloutConfig(testing_rollout=True))

rollout.fit(episodes=1000, updates_per_episode=64, step_per_update=1, update_batch_size=32,
            testing_rollout=test_rollout)
test_rollout.render(repeats=10)
test_env = Match(cfg)

ann = mlp.wide_dueling_q_network(env.observation_space.shape, env.action_space.n, adam_lr=1e-4)
experience = Experience(10000)
agent = DoubleDQN(ann, env.action_space, experience,
                  epsilon=1., epsilon_decay=1., epsilon_min=0.1)

rcfg = RolloutConfig(max_steps=1024, skipframes=2)
training_rollout = Rolling(agent, env, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

print("Filling experience...")
while experience.N < 10000:
    training_rollout.roll(steps=32, verbose=0, push_experience=True)
    print(f"\r{experience.N/10000:.2%} {experience.N}/10000", end="")
print()

agent.epsilon_decay = 0.99995
logger = history.History("reward_sum", *agent.history_keys, "epsilon")

for episode in range(1, 501):
    for update in range(32):
        training_rollout.roll(steps=32, verbose=0, push_experience=True)
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 5
    keras.layers.GlobalAveragePooling2D(),  # 16
    keras.layers.Dense(4, kernel_initializer="he_uniform"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.Dense(2, kernel_initializer="he_uniform")
])
qnet.compile(keras.optimizers.Adam(1e-3), "mse")

agent = DQN(qnet, 2, Experience(max_length=10_000),
            discount_factor_gamma=0.99,
            epsilon=1.0,
            epsilon_decay=0.99999,
            epsilon_min=0.3,
            use_target_network=True,
            state_preprocessor=None)

rollout = Rolling(agent, env, RolloutConfig(skipframes=2))
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=100)
losses = deque(maxlen=100)

episode = 0
while 1:
    episode += 1
    rollout.roll(steps=4, verbose=0, push_experience=True)
    if agent.memory.N < 1000:
        print(f"\rFilling memory... {agent.memory.N}/1000", end="")
        continue
from trickster.agent import A2C
from trickster.rollout import Rolling, Trajectory, RolloutConfig
from trickster.model import mlp
from trickster.utility import gymic

env = gymic.rwd_scaled_env()
input_shape = env.observation_space.shape
num_actions = env.action_space.n

actor, critic = mlp.wide_pg_actor_critic(input_shape, num_actions)

agent = A2C(actor, critic,
            action_space=env.action_space,
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gymic.rwd_scaled_env())

rollout.fit(episodes=1000, updates_per_episode=64, step_per_update=1,
            testing_rollout=test_rollout, plot_curves=True)
test_rollout.render(repeats=10)
critic = Sequential([
    Dense(400, activation="relu", input_shape=input_shape),
    Dense(300, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(1e-4))

agent = A2C(actor, critic,
            action_space=num_actions,
            absolute_memory_limit=10000,
            discount_factor_gamma=0.99,
            entropy_penalty_coef=0.05)

rollout = Rolling(agent.create_workers(1)[0], env)
test_rollout = Trajectory(agent, Lunar())


def train():
    hst = history.History("reward_sum", "actor_loss", "actor_utility", "actor_utility_std",
                          "actor_entropy", "values", "advantages", "critic_loss")

    for episode in range(1, 1001):
        for update in range(1, 4):
            rollout.roll(steps=128, verbose=0, push_experience=True)
            agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)
Dense(24, activation="relu", input_shape=input_shape), Dense(24, activation="relu"), Dense(num_actions, activation="linear") ]) policy.compile(loss="mse", optimizer=Adam(1e-4)) agent = DQN(policy, action_space=num_actions, memory=Experience(max_length=10000), epsilon=1., epsilon_decay=1., epsilon_min=0.1, discount_factor_gamma=0.99, use_target_network=True) rollout = Rolling(agent, env) test_rollout = Trajectory(agent, env) hst = history.History("reward_sum", "loss", "epsilon") for warmup in range(1, 33): rollout.roll(32, verbose=0, push_experience=True) agent.epsilon_decay = 0.99999 for episode in range(1, 1001): rollout.roll(steps=32, verbose=0, push_experience=True) agent_history = agent.fit(updates=10, batch_size=64, verbose=0) test_history = test_rollout.rollout(verbose=0, push_experience=False, render=False) hst.record(reward_sum=test_history["reward_sum"],