def test_a2c_doesnt_store_invalid_transitions_in_td_setting(self):
    STEPS = 35
    env = DummyEnv()
    agent = A2C.from_environment(env, discount_gamma=0.)
    rollout = Rolling(agent, env)

    rollout.roll(STEPS, verbose=0, push_experience=True)
    data = agent.memory_sampler.sample(-1)

    # Three full episodes complete within the 35 steps rolled.
    self.assertEqual(agent.episodes, 3)
    # No stored state reaches the terminal marker.
    np.testing.assert_array_less(data["state"], 10)
    # One invalid transition per episode boundary (plus the unfinished last step) is dropped.
    self.assertEqual(len(data["state"]), STEPS - 4)
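# DummyEnv is a fixture defined elsewhere in the test suite. A minimal sketch
# consistent with the assertions above (stored states stay below 10, an
# episode lasts ten steps, so three episodes finish within 35 steps) could
# look like the class below -- a guess at the fixture, not the actual code.
class DummyEnv:
    """Gym-style counter environment: the observation is the step index."""

    def __init__(self):
        self.state = 0

    def reset(self):
        self.state = 0
        return self.state

    def step(self, action):
        self.state += 1
        done = self.state >= 10  # terminate after ten steps
        return self.state, 1., done, {}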
import gym
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

from trickster.agent import A2C
from trickster.experience import Experience  # module path assumed
from trickster.rollout import Rolling, Trajectory, RolloutConfig

env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

# The actor's layer stack is truncated in the source; the head below mirrors
# the critic and is an assumption.
actor = Sequential([
    Dense(16, activation="relu", input_shape=input_shape, kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(num_actions, activation="softmax", kernel_initializer="he_uniform")
])
actor.compile(loss="categorical_crossentropy", optimizer=Adam(1e-4))

critic = Sequential([
    Dense(16, activation="relu", input_shape=input_shape, kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(1, activation="linear", kernel_initializer="he_uniform")
])
critic.compile(loss="mse", optimizer=Adam(5e-4))

agent = A2C(actor, critic,
            action_space=env.action_space,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
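    # Hypothetical continuation of the truncated loop above: roll the agent,
    # fit on the gathered experience and log the losses. agent.fit() and its
    # return format are assumptions, not the source's confirmed API.
    episode_actor_entropy = []
    episode_critic_loss = []

    for update in range(32):
        rollout.roll(1, verbose=0, push_experience=True)  # gather experience
        losses = agent.fit(batch_size=-1, verbose=0)      # hypothetical signature
        episode_actor_loss.append(losses["actor_loss"])
        episode_actor_utility.append(losses["actor_utility"])
        episode_actor_entropy.append(losses["actor_entropy"])
        episode_critic_loss.append(losses["critic_loss"])

    actor_loss.append(sum(episode_actor_loss) / len(episode_actor_loss))
    actor_utility.append(sum(episode_actor_utility) / len(episode_actor_utility))
    actor_entropy.append(sum(episode_actor_entropy) / len(episode_actor_entropy))
    critic_loss.append(sum(episode_critic_loss) / len(episode_critic_loss))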
from trickster.agent import A2C
from trickster.rollout import Rolling, Trajectory, RolloutConfig
from trickster.model import mlp
from trickster.utility import gymic

env = gymic.rwd_scaled_env()
input_shape = env.observation_space.shape
num_actions = env.action_space.n

actor, critic = mlp.wide_pg_actor_critic(input_shape, num_actions)

agent = A2C(actor, critic,
            action_space=env.action_space,
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gymic.rwd_scaled_env())

rollout.fit(episodes=1000, updates_per_episode=64, step_per_update=1,
            testing_rollout=test_rollout, plot_curves=True)
test_rollout.render(repeats=10)
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

from trickster.agent import A2C
from trickster.rollout import Rolling, Trajectory
from trickster.utility import history  # module path assumed

# env, input_shape, num_actions and the Lunar environment class are
# constructed outside this fragment.
actor = Sequential([
    Dense(400, activation="relu", input_shape=input_shape),
    Dense(300, activation="relu"),
    Dense(num_actions, activation="softmax")
])
actor.compile(loss="categorical_crossentropy", optimizer=Adam(1e-4))

critic = Sequential([
    Dense(400, activation="relu", input_shape=input_shape),
    Dense(300, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(1e-4))

agent = A2C(actor, critic,
            action_space=num_actions,
            absolute_memory_limit=10000,
            discount_factor_gamma=0.99,
            entropy_penalty_coef=0.05)

rollout = Rolling(agent.create_workers(1)[0], env)
test_rollout = Trajectory(agent, Lunar())


def train():
    hst = history.History("reward_sum", "actor_loss", "actor_utility", "actor_utility_std",
                          "actor_entropy", "values", "advantages", "critic_loss")
    for episode in range(1, 1001):
        for update in range(1, 4):
            ...  # the update step is truncated in the source
from trickster.agent import A2C
from trickster.model import mlp
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig
from trickster.utility import gymic

NUM_ENVS = 8

envs = [gymic.rwd_scaled_env("LunarLander-v2") for _ in range(NUM_ENVS)]
test_env = gymic.rwd_scaled_env("LunarLander-v2")

input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

actor, critic = mlp.wide_pg_actor_critic(input_shape, num_actions)

agent = A2C(actor, critic,
            action_space=num_actions,
            discount_factor_gamma=0.99,
            entropy_penalty_coef=0.05)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

rollout.fit(episodes=1000, updates_per_episode=16, steps_per_update=1, update_batch_size=-1,
            testing_rollout=test_rollout, plot_curves=True)
test_rollout.render(repeats=10)
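# The next fragment picks up mid-way through a convolutional actor-critic:
# the inputs and the actor's stream are cut off, and envs/test_env are built
# elsewhere. A minimal sketch of the missing head, assuming a 64x64 RGB
# observation and an actor stream mirroring the critic (only actor_input,
# critic_input and action_probs are names the fragment actually uses; the
# rest is an assumption):
from collections import deque

from keras.layers import Input, Conv2D, LeakyReLU, GlobalAveragePooling2D, Softmax
from keras.models import Model
from keras.optimizers import SGD

from trickster.agent import A2C
from trickster.experience import Experience  # module path assumed
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig

MOVES = 4  # hypothetical action count; the original defines MOVES elsewhere

actor_input = Input(shape=(64, 64, 3))   # assumed observation shape
critic_input = Input(shape=(64, 64, 3))

actor_stream = Conv2D(16, (4, 4), strides=(2, 2), padding="same")(actor_input)   # 32
actor_stream = LeakyReLU()(actor_stream)
actor_stream = Conv2D(32, (4, 4), strides=(2, 2), padding="same")(actor_stream)  # 16
actor_stream = LeakyReLU()(actor_stream)
actor_stream = Conv2D(MOVES, (1, 1), padding="valid")(actor_stream)
action_probs = Softmax()(GlobalAveragePooling2D()(actor_stream))

# First critic convolution, producing the 32x32 map the fragment resumes from:
critic_stream = Conv2D(16, (4, 4), strides=(2, 2), padding="same")(critic_input)  # 32
critic_stream = LeakyReLU()(critic_stream)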
critic_stream = Conv2D(32, (4, 4), strides=(2, 2), padding="same")(critic_stream)  # 16
critic_stream = LeakyReLU()(critic_stream)
critic_stream = Conv2D(64, (4, 4), strides=(2, 2), padding="same")(critic_stream)  # 8
critic_stream = LeakyReLU()(critic_stream)
critic_stream = Conv2D(1, (1, 1), padding="valid")(critic_stream)
value_estimate = GlobalAveragePooling2D()(critic_stream)
# value_estimate = Flatten()(critic_stream)

actor = Model(actor_input, action_probs, name="Actor")
actor.compile(SGD(1e-4, momentum=0.9), "categorical_crossentropy")
critic = Model(critic_input, value_estimate, name="Critic")
critic.compile(SGD(5e-4, momentum=0.9), "mse")

agent = A2C(actor, critic,
            action_space=MOVES,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.995,
            entropy_penalty_coef=0.0,
            state_preprocessor=lambda state: state / 255.)

episode = 1
reward_memory = deque(maxlen=10)
step_lengths = deque(maxlen=10)
critic_losses = deque(maxlen=10)
actor_losses = deque(maxlen=10)
actor_utility = deque(maxlen=10)
actor_entropy = deque(maxlen=10)

rollout = MultiRolling(agent, envs,
                       rollout_configs=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, test_env,
                          config=RolloutConfig(max_steps=512, skipframes=2))
from trickster.agent import A2C
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig
from trickster.model import mlp

# Match and MatchConfig come from an external game package whose import is
# not shown in the source fragment.
cfg = MatchConfig(canvas_size=(100, 100), players_per_side=2,
                  learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
                  observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR)

envs = [Match(cfg) for _ in range(8)]
test_env = Match(cfg)

actor, critic = mlp.wide_pg_actor_critic(envs[0].observation_space.shape,
                                         envs[0].action_space.n,
                                         actor_lr=1e-4, critic_lr=1e-4)

agent = A2C(actor, critic, test_env.action_space, entropy_penalty_coef=0.1)

rcfg = RolloutConfig(max_steps=512, skipframes=2)
training_rollout = MultiRolling(agent, envs, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

training_rollout.fit(episodes=1000, updates_per_episode=512, steps_per_update=1,
                     testing_rollout=testing_rollout)
testing_rollout.render(repeats=10)