actor.compile(loss="categorical_crossentropy", optimizer=Adam(1e-4))

critic = Sequential([
    Dense(16, activation="relu", input_shape=input_shape, kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(1, activation="linear", kernel_initializer="he_uniform")
])
critic.compile(loss="mse", optimizer=Adam(5e-4))

agent = A2C(actor, critic,
            action_space=env.action_space,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
actor.compile(loss="categorical_crossentropy", optimizer=Adam(ACTOR_ADAM_LR))

critic = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(CRITIC_ADAM_LR))

agent = PPO(actor, critic,
            action_space=test_env.action_space,
            memory=Experience(max_length=EXPERIENCE_SIZE),
            reward_discount_factor_gamma=DISCOUNT_FACTOR_GAMMA,
            entropy_penalty_coef=ENTROPY_PENALTY_BETA)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=MAX_TIMESTEPS))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_kld = []
actor_entropy = []
critic_loss = []
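The DISCOUNT_FACTOR_GAMMA / reward_discount_factor_gamma hyperparameter passed to the agents above weights future rewards when returns are computed from a finished trajectory. A minimal NumPy sketch of that computation, independent of trickster (the discount_rewards helper is illustrative, not part of the library's API):

import numpy as np

def discount_rewards(rewards, gamma=0.99):
    """Compute discounted returns R_t = r_t + gamma * R_{t+1} for one episode."""
    returns = np.zeros(len(rewards), dtype=float)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# e.g. discount_rewards([1., 1., 1.], gamma=0.98) -> [2.9404, 1.98, 1.]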
envs = [FakeEnv() for _ in range(10)]
test_env = FakeEnv()

actor = Sequential([  # 200, 160
    Flatten(input_shape=test_env.shape),
    Dense(256),
    BatchNormalization(),
    LeakyReLU(),
    Dense(2, activation="softmax")
])
actor.compile(RMSprop(1e-4, rho=0.99), "categorical_crossentropy")

agent = REINFORCE(actor, 2, Experience(), discount_factor_gamma=0.99, state_preprocessor=None)

rollout = MultiTrajectory([agent for _ in range(10)], envs)
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=100)
actor_loss = deque(maxlen=100)
actor_utility = deque(maxlen=100)
actor_entropy = deque(maxlen=100)
critic_loss = deque(maxlen=100)

episode = 0

while 1:
class TestExperienceSample(unittest.TestCase):

    def setUp(self):
        self.xp = Experience(max_length=100)
        self.sampler = ExperienceSampler(self.xp)
        self.a = np.arange(100)
        self.b = self.a - 100
        self.c = self.a / 10

    def test_sampling_of_next_states(self):
        self.xp.remember(self.a)
        states, next_states = self.sampler.sample(10)
        diff = next_states - states
        self.assertTrue(np.all(diff == 1))

    def test_sampling_when_samples_are_fewer_than_sample_size(self):
        self.xp.remember(self.a)
        states, next_states = self.sampler.sample(200)
        self.assertTrue(len(states) == len(self.a) - 1)
        self.assertTrue(len(next_states) == len(self.a) - 1)

    def test_last_state_doesnt_get_sampled(self):
        self.xp.remember(self.a)
        states, next_states = self.sampler.sample(200)
        self.assertNotIn(self.a[-1], states)

    def test_excluded_state_doesnt_get_sampled(self):
        EXCLUDE = (10, 20, 30)
        self.xp.remember(self.a, exclude=EXCLUDE)
        states, next_states = self.sampler.sample(200)
        for x in EXCLUDE:
            self.assertNotIn(x, states)

    def test_excluding_negative_index_is_correctly_interpreted(self):
        EXCLUDE = (-1, -10, -20)
        TARGET = (99, 90, 80)
        self.xp.remember(self.a, exclude=EXCLUDE)
        states, next_states = self.sampler.sample(200)
        for t in TARGET:
            self.assertNotIn(t, states)

    def test_excluding_works_after_multiple_remembers(self):
        EXCLUDE = (10, 20, 30)
        for _ in range(3):
            self.xp.remember(self.a, exclude=EXCLUDE)
        states, next_states = self.sampler.sample(-1)
        for e in EXCLUDE:
            self.assertNotIn(e, states)
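The assertions above spell out the sampler's contract: sample() returns aligned (state, next_state) pairs, and the last remembered state and any excluded indices never appear among the sampled states. A minimal usage sketch built only from the calls exercised in these tests; the import paths are assumptions and may differ in your trickster version:

import numpy as np

from trickster.experience import Experience, ExperienceSampler  # assumed import paths

xp = Experience(max_length=100)
sampler = ExperienceSampler(xp)

xp.remember(np.arange(100))               # remember a trajectory of consecutive states
states, next_states = sampler.sample(10)

assert np.all(next_states - states == 1)  # each next_state follows its state by one step
assert xp.N == 100                        # the whole trajectory is stored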
from trickster.utility import history, visual
from trickster.model import mlp

cfg = MatchConfig(canvas_size=(128, 128), players_per_side=2,
                  learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
                  observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR)
env = Match(cfg)
test_env = Match(cfg)

ann = mlp.wide_dueling_q_network(env.observation_space.shape, env.action_space.n, adam_lr=1e-4)
experience = Experience(10000)
agent = DoubleDQN(ann, env.action_space, experience,
                  epsilon=1., epsilon_decay=1., epsilon_min=0.1)

rcfg = RolloutConfig(max_steps=1024, skipframes=2)
training_rollout = Rolling(agent, env, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

print("Filling experience...")
while experience.N < 10000:
    training_rollout.roll(steps=32, verbose=0, push_experience=True)
    print(f"\r{experience.N/10000:.2%} {experience.N}/10000", end="")
env = gymic.rwd_scaled_env("LunarLanderContinuous-v2", reward_scale=0.01)
input_shape = env.observation_space.shape
num_actions = env.action_space.shape[0]

actor, critics = mlp.wide_ddpg_actor_critic(input_shape, output_dim=num_actions, action_range=2,
                                            num_critics=2, actor_lr=5e-4, critic_lr=5e-4)

agent = TD3(actor, critics,
            action_space=spaces.CONTINUOUS,
            memory=Experience(max_length=int(1e4)),
            discount_factor_gamma=0.99,
            action_noise_sigma=0.1,
            action_noise_sigma_decay=1.,
            action_minima=-2,
            action_maxima=2,
            target_noise_sigma=0.2,
            target_noise_clip=0.5)

rollout = Rolling(agent, env)
test_rollout = Trajectory(agent, env, RolloutConfig(testing_rollout=True))

rollout.fit(episodes=1000, updates_per_episode=64, step_per_update=1, update_batch_size=32,
    def test_experience_constructor_creates_empty_object(self):
        xp = Experience()
        self.assertIsNone(xp.memoirs)
        self.assertEqual(xp.N, 0)
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 10
    keras.layers.Conv2D(16, 3, kernel_initializer="he_uniform", padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 5
    keras.layers.GlobalAveragePooling2D(),  # 16
    keras.layers.Dense(4, kernel_initializer="he_uniform"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.Dense(2, kernel_initializer="he_uniform")
])
qnet.compile(keras.optimizers.Adam(1e-3), "mse")

agent = DQN(qnet, 2, Experience(max_length=10_000),
            discount_factor_gamma=0.99,
            epsilon=1.0,
            epsilon_decay=0.99999,
            epsilon_min=0.3,
            use_target_network=True,
            state_preprocessor=None)

rollout = Rolling(agent, env, RolloutConfig(skipframes=2))
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=100)
losses = deque(maxlen=100)

episode = 0

while 1:
    episode += 1
    def test_remember_considers_max_size(self):
        xp = Experience(max_length=100)
        xp.remember(np.arange(120))
        self.assertEqual(xp.N, 100)
        self.assertListEqual(xp.memoirs[0].tolist(), list(range(20, 120)))
    def test_experience_remembers_array(self):
        xp = Experience()
        xp.remember(np.arange(100))
        self.assertEqual(xp.N, 100)
        self.assertListEqual(xp.memoirs[0].tolist(), list(range(100)))
    def test_experience_constructor_considers_max_size_argument(self):
        xp = Experience(max_length=3)
        self.assertEqual(xp.max_length, 3)
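Taken together, these tests pin down the buffer's contract: remember() appends an array, N reports how many items are stored, and once more than max_length items accumulate only the most recent max_length survive. A short usage sketch relying only on that behaviour (the import path is an assumption; adjust to your trickster version):

import numpy as np

from trickster.experience import Experience  # assumed import path

xp = Experience(max_length=100)
xp.remember(np.arange(120))   # push 120 entries into a buffer of capacity 100

print(xp.N)                   # -> 100: only max_length items are retained
print(xp.memoirs[0][:3])      # -> [20 21 22]: the oldest 20 entries were dropped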
    def reset(self):
        self.initial_state = self.env.reset()
        return self.empty

envs = [FakeEnv() for _ in range(10)]
test_env = FakeEnv()

actor = Sequential([  # 200, 160
    Flatten(input_shape=test_env.shape),
    Dense(200, activation="relu"),
    Dense(2, activation="softmax")
])
actor.compile(RMSprop(1e-4, rho=0.99), "categorical_crossentropy")

agent = REINFORCE(actor, 2, Experience(), discount_factor_gamma=0.99, state_preprocessor=None)

rollout = MultiTrajectory(agent, envs)
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=10)
actor_loss = deque(maxlen=80)
actor_utility = deque(maxlen=80)
actor_entropy = deque(maxlen=80)
critic_loss = deque(maxlen=80)

episode = 0

while 1:
canvas_shape, action_shape = env.neurons_required

actor_input = Input(shape=[64, 64, 3], name="actor_input")
critic_input = Input(shape=[64, 64, 3], name="critic_input")

critic_stream = Flatten()(critic_input)
critic_stream = Dense(64, activation="tanh")(critic_stream)
critic_stream = BatchNormalization()(critic_stream)
critic_stream = Dense(32, activation="tanh")(critic_stream)
critic_stream = BatchNormalization()(critic_stream)
value_estimate = Dense(NUM_MOVES, activation="linear")(critic_stream)  # linear head: Q-values are unbounded regression targets, so softmax is not appropriate here

critic = Model(critic_input, value_estimate, name="Critic")
critic.compile(Adam(5e-4), "mse")

agent = DQN(critic,
            action_space=MOVES,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.99,
            epsilon=0.7,
            state_preprocessor=lambda state: state / 255. - 0.5)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=512, skipframes=2))

episode = 0
reward_memory = deque(maxlen=10)
losses = deque(maxlen=10)

while 1:
    episode += 1
    episode_losses = []

    for update in range(32):
        rollout.roll(steps=4, verbose=0, push_experience=True)