Example #1
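    # Unit-test excerpt from the library's test suite: the surrounding TestCase
    # class, the DummyEnv helper and the numpy / trickster imports are not shown
    # here. The test rolls an A2C agent for a fixed number of steps and checks
    # that transitions which are invalid in the temporal-difference setting
    # (presumably those crossing episode boundaries) are not stored in memory.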
    def test_a2c_doesnt_store_invalid_transitions_in_td_setting(self):

        STEPS = 35

        env = DummyEnv()
        agent = A2C.from_environment(env, discount_gamma=0.)
        rollout = Rolling(agent, env)

        rollout.roll(STEPS, verbose=0, push_experience=True)

        data = agent.memory_sampler.sample(-1)

        self.assertEqual(agent.episodes, 3)
        np.testing.assert_array_less(data["state"], 10)
        self.assertEqual(len(data["state"]), STEPS - 4)
Example #2
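# NOTE: this excerpt begins mid-script. The setup below is a plausible
# reconstruction added for readability: the import paths, the CartPole-v1
# environment (matching the test rollout further down) and the actor's hidden
# layers (mirroring the critic) are assumptions, not part of the original.
import gym

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from trickster.agent import A2C
from trickster.experience import Experience
from trickster.rollout import Rolling, Trajectory, RolloutConfig

env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

actor = Sequential([
    Dense(16,
          activation="relu",
          input_shape=input_shape,
          kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(num_actions, activation="softmax", kernel_initializer="he_uniform")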
])
actor.compile(loss="categorical_crossentropy", optimizer=Adam(1e-4))

critic = Sequential([
    Dense(16,
          activation="relu",
          input_shape=input_shape,
          kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(1, activation="linear", kernel_initializer="he_uniform")
])
critic.compile(loss="mse", optimizer=Adam(5e-4))

agent = A2C(actor,
            critic,
            action_space=env.action_space,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
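    # ... (the rest of the training loop -- rolling the agent, fitting it and
    # appending the returned metrics to the lists above -- is truncated in this excerpt)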
Example #3
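# Complete, minimal script: builds a wide MLP actor-critic pair with the
# library's model factory and trains A2C on a reward-scaled gym environment
# through the high-level Rolling.fit() loop, then renders the trained agent.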
from trickster.agent import A2C
from trickster.rollout import Rolling, Trajectory, RolloutConfig
from trickster.model import mlp
from trickster.utility import gymic

env = gymic.rwd_scaled_env()
input_shape = env.observation_space.shape
num_actions = env.action_space.n

actor, critic = mlp.wide_pg_actor_critic(input_shape, num_actions)

agent = A2C(actor,
            critic,
            action_space=env.action_space,
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gymic.rwd_scaled_env())

rollout.fit(episodes=1000,
            updates_per_episode=64,
            step_per_update=1,
            testing_rollout=test_rollout,
            plot_curves=True)
test_rollout.render(repeats=10)
Example #4
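# NOTE: this excerpt begins mid-script. The setup below is a plausible
# reconstruction added for readability: the import paths and the environment
# construction are assumptions, not part of the original (`Lunar` is the
# LunarLander wrapper used further down; its definition is not shown).
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

from trickster.agent import A2C
from trickster.rollout import Rolling, Trajectory
from trickster.utility import history

env = Lunar()
input_shape = env.observation_space.shape
num_actions = env.action_space.n

actor = Sequential([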
    Dense(400, activation="relu", input_shape=input_shape),
    Dense(300, activation="relu"),
    Dense(num_actions, activation="softmax")
])
actor.compile(loss="categorical_crossentropy", optimizer=Adam(1e-4))

critic = Sequential([
    Dense(400, activation="relu", input_shape=input_shape),
    Dense(300, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(1e-4))

agent = A2C(actor,
            critic,
            action_space=num_actions,
            absolute_memory_limit=10000,
            discount_factor_gamma=0.99,
            entropy_penalty_coef=0.05)

rollout = Rolling(agent.create_workers(1)[0], env)
test_rollout = Trajectory(agent, Lunar())


def train():
    hst = history.History("reward_sum", "actor_loss", "actor_utility",
                          "actor_utility_std", "actor_entropy", "values",
                          "advantages", "critic_loss")

    for episode in range(1, 1001):

        for update in range(1, 4):
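            # ... (the body of the update loop -- rolling the agent, fitting it
            # and recording metrics into `hst` -- is truncated in this excerpt)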
Example #5
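# Parallel-rollout variant: one A2C agent gathers experience from eight
# LunarLander-v2 instances at once via MultiRolling, and is evaluated on a
# separate test environment.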
from trickster.agent import A2C
from trickster.model import mlp
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig
from trickster.utility import gymic

NUM_ENVS = 8

envs = [gymic.rwd_scaled_env("LunarLander-v2") for _ in range(NUM_ENVS)]
test_env = gymic.rwd_scaled_env("LunarLander-v2")
input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

actor, critic = mlp.wide_pg_actor_critic(input_shape, num_actions)

agent = A2C(actor,
            critic,
            action_space=num_actions,
            discount_factor_gamma=0.99,
            entropy_penalty_coef=0.05)

rollout = MultiRolling(agent,
                       envs,
                       rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

rollout.fit(episodes=1000,
            updates_per_episode=16,
            steps_per_update=1,
            update_batch_size=-1,
            testing_rollout=test_rollout,
            plot_curves=True)
test_rollout.render(repeats=10)
Example #6
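# NOTE: this excerpt begins mid-script. The elided part above defines the
# pixel-based environments (`envs`, `test_env`, the `MOVES` action space), the
# convolutional actor stream ending in `action_probs`, and the start of the
# critic stream (`critic_input` / `critic_stream`), along with the Keras,
# collections.deque and trickster imports used below.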
critic_stream = Conv2D(32, (4, 4), strides=(2, 2), padding="same")(critic_stream)  # 16
critic_stream = LeakyReLU()(critic_stream)
critic_stream = Conv2D(64, (4, 4), strides=(2, 2), padding="same")(critic_stream)  # 8
critic_stream = LeakyReLU()(critic_stream)
critic_stream = Conv2D(1, (1, 1), padding="valid")(critic_stream)
value_estimate = GlobalAveragePooling2D()(critic_stream)
# value_estimate = Flatten()(critic_stream)

actor = Model(actor_input, action_probs, name="Actor")
actor.compile(SGD(1e-4, momentum=0.9), "categorical_crossentropy")
critic = Model(critic_input, value_estimate, name="Critic")
critic.compile(SGD(5e-4, momentum=0.9), "mse")

agent = A2C(actor, critic,
            action_space=MOVES,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.995,
            entropy_penalty_coef=0.0,
            state_preprocessor=lambda state: state / 255.)  # scale raw pixel values to [0, 1]

episode = 1

reward_memory = deque(maxlen=10)
step_lengths = deque(maxlen=10)
critic_losses = deque(maxlen=10)
actor_losses = deque(maxlen=10)
actor_utility = deque(maxlen=10)
actor_entropy = deque(maxlen=10)

rollout = MultiRolling(agent, envs, rollout_configs=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, test_env, config=RolloutConfig(max_steps=512, skipframes=2))
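# ... (the training loop that drives `rollout`, fits the agent and fills the
# deques above is truncated in this excerpt)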
Example #7
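# Multi-agent game example. `Match` and `MatchConfig` belong to an external game
# environment package whose import sits above this excerpt and is not shown here.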
from trickster.agent import A2C
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig
from trickster.model import mlp

cfg = MatchConfig(canvas_size=(100, 100),
                  players_per_side=2,
                  learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
                  observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR)

envs = [Match(cfg) for _ in range(8)]
test_env = Match(cfg)

actor, critic = mlp.wide_pg_actor_critic(envs[0].observation_space.shape,
                                         envs[0].action_space.n,
                                         actor_lr=1e-4,
                                         critic_lr=1e-4)

agent = A2C(actor, critic, test_env.action_space, entropy_penalty_coef=0.1)

rcfg = RolloutConfig(max_steps=512, skipframes=2)

training_rollout = MultiRolling(agent, envs, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

training_rollout.fit(episodes=1000,
                     updates_per_episode=512,
                     steps_per_update=1,
                     testing_rollout=testing_rollout)
testing_rollout.render(repeats=10)