Example no. 1
    def test_dqn_doesnt_store_invalid_transitions(self):

        STEPS = 55

        env = DummyEnv()
        test_env = DummyEnv()

        # Zero learning rate and zero discount: the network never changes, so the
        # test only exercises how transitions are stored in the replay memory.
        model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_dim=2)])
        model.compile(tf.keras.optimizers.SGD(learning_rate=0.), loss="mse")

        agent = DQN.from_environment(env,
                                     model,
                                     discount_gamma=0.,
                                     use_target_network=False)

        rollout = Rolling(agent, env)
        test_rollout = Trajectory(agent, test_env)

        rollout.fit(epochs=10,
                    updates_per_epoch=12,
                    steps_per_update=STEPS,
                    update_batch_size=8,
                    testing_rollout=test_rollout,
                    buffer_warmup=False)

        data = agent.memory_sampler.sample(-1)

        np.testing.assert_array_less(data["state"], 10)
        np.testing.assert_equal(
            (data["state_next"] - data["state"]).sum(axis=1), 1)
Example no. 2
    def test_td3_doesnt_store_invalid_transitions(self):

        STEPS = 55

        env = DummyEnv(action_space="continuous")
        test_env = DummyEnv(action_space="continuous")

        actor_out = tf.convert_to_tensor([[0., 0.]])
        critic_out = tf.convert_to_tensor([1.])

        actor = arch.TestingModel(actor_out)
        critic = arch.TestingModel(critic_out)
        critic2 = arch.TestingModel(critic_out)

        actor_target = arch.TestingModel(actor_out)
        critic_target = arch.TestingModel(critic_out)
        critic2_target = arch.TestingModel(critic_out)

        actor.optimizer = tf.keras.optimizers.SGD(0)
        critic.optimizer = tf.keras.optimizers.SGD(0)
        critic2.optimizer = tf.keras.optimizers.SGD(0)

        agent = TD3(actor,
                    actor_target,
                    critic,
                    critic_target,
                    critic2,
                    critic2_target,
                    discount_gamma=0.,
                    polyak_tau=0.,
                    action_minima=-1.,
                    action_maxima=1.,
                    update_actor_every=1)

        rollout = Rolling(agent, env)
        test_rollout = Trajectory(agent, test_env)

        rollout.fit(epochs=10,
                    updates_per_epoch=12,
                    steps_per_update=STEPS,
                    update_batch_size=8,
                    testing_rollout=test_rollout,
                    buffer_warmup=False)

        data = agent.memory_sampler.sample(-1)

        np.testing.assert_array_less(data["state"], 10)
        np.testing.assert_equal(
            (data["state_next"] - data["state"]).sum(axis=1), 1)
Example no. 3
    def test_a2c_doesnt_store_invalid_transitions_in_td_setting(self):

        STEPS = 35

        env = DummyEnv()
        agent = A2C.from_environment(env, discount_gamma=0.)
        rollout = Rolling(agent, env)

        rollout.roll(STEPS, verbose=0, push_experience=True)

        data = agent.memory_sampler.sample(-1)

        self.assertEqual(agent.episodes, 3)
        np.testing.assert_array_less(data["state"], 10)
        self.assertEqual(len(data["state"]), STEPS - 4)
Example no. 4
          activation="relu",
          input_shape=input_shape,
          kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(1, activation="linear", kernel_initializer="he_uniform")
])
critic.compile(loss="mse", optimizer=Adam(5e-4))

agent = A2C(actor,
            critic,
            action_space=env.action_space,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
    episode_actor_entropy = []
    episode_critic_loss = []

    for update in range(32):
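        # The snippet breaks off here. Modeled on the other training loops in this
        # collection (see Example no. 10 and Example no. 11 below), the loop body
        # presumably rolls the environment, fits the agent, and collects the
        # per-update metrics, roughly like this assumed sketch:
        rollout.roll(steps=2, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=-1, verbose=0)
        episode_actor_loss.append(agent_history["actor_loss"])
        episode_actor_utility.append(agent_history["actor_utility"])
        episode_actor_entropy.append(agent_history["actor_entropy"])
        episode_critic_loss.append(agent_history["critic_loss"])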
Example no. 5
from trickster.agent import DoubleDQN
from trickster.rollout import Rolling, Trajectory, RolloutConfig
from trickster.experience import Experience
from trickster.model import mlp
from trickster.utility import gymic

env = gymic.rwd_scaled_env("CartPole-v1")
test_env = gymic.rwd_scaled_env("CartPole-v1")

input_shape = env.observation_space.shape
num_actions = env.action_space.n

ann = mlp.wide_mlp_critic_network(input_shape, num_actions, adam_lr=1e-3)

agent = DoubleDQN(ann,
                  action_space=env.action_space,
                  memory=Experience(max_length=10000),
                  epsilon=1.,
                  epsilon_decay=0.99995,
                  epsilon_min=0.1,
                  discount_factor_gamma=0.98)


rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

rollout.fit(episodes=500, updates_per_episode=32, step_per_update=2, update_batch_size=32,
            testing_rollout=test_rollout, plot_curves=True)
test_rollout.render(repeats=10)
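
With epsilon starting at 1.0, epsilon_decay=0.99995 and epsilon_min=0.1, exploration stays high for a long stretch of training; assuming the decay is applied multiplicatively once per decay step, the floor is reached after roughly 46,000 steps:

import math

# Rough check of the exploration schedule above (assumes multiplicative per-step decay).
steps_to_floor = math.log(0.1) / math.log(0.99995)
print(round(steps_to_floor))  # ~46051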
Example no. 6
num_actions = env.action_space.shape[0]

actor, critics = mlp.wide_ddpg_actor_critic(input_shape,
                                            output_dim=num_actions,
                                            action_range=2,
                                            num_critics=2,
                                            actor_lr=5e-4,
                                            critic_lr=5e-4)

agent = TD3(actor,
            critics,
            action_space=spaces.CONTINUOUS,
            memory=Experience(max_length=int(1e4)),
            discount_factor_gamma=0.99,
            action_noise_sigma=0.1,
            action_noise_sigma_decay=1.,
            action_minima=-2,
            action_maxima=2,
            target_noise_sigma=0.2,
            target_noise_clip=0.5)

rollout = Rolling(agent, env)
test_rollout = Trajectory(agent, env, RolloutConfig(testing_rollout=True))

rollout.fit(episodes=1000,
            updates_per_episode=64,
            step_per_update=1,
            update_batch_size=32,
            testing_rollout=test_rollout)
test_rollout.render(repeats=10)
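
target_noise_sigma=0.2 and target_noise_clip=0.5 configure TD3's target-policy smoothing: the target action used in the critic update is perturbed with clipped Gaussian noise and re-clipped to the action bounds. Schematically, with the values used above (an illustration of the standard TD3 rule, not the library's own code):

import numpy as np

# Standard TD3 target-policy smoothing with the values configured above.
def smoothed_target_action(target_action, sigma=0.2, clip=0.5, minima=-2., maxima=2.):
    noise = np.clip(np.random.normal(0., sigma, size=np.shape(target_action)), -clip, clip)
    return np.clip(target_action + noise, minima, maxima)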
Example no. 7
test_env = Match(cfg)

ann = mlp.wide_dueling_q_network(env.observation_space.shape,
                                 env.action_space.n,
                                 adam_lr=1e-4)

experience = Experience(10000)
agent = DoubleDQN(ann,
                  env.action_space,
                  experience,
                  epsilon=1.,
                  epsilon_decay=1.,
                  epsilon_min=0.1)

rcfg = RolloutConfig(max_steps=1024, skipframes=2)
training_rollout = Rolling(agent, env, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

print("Filling experience...")
while experience.N < 10000:
    training_rollout.roll(steps=32, verbose=0, push_experience=True)
    print(f"\r{experience.N/10000:.2%} 10000/{experience.N}", end="")
print()
agent.epsilon_decay = 0.99995

logger = history.History("reward_sum", *agent.history_keys, "epsilon")

for episode in range(1, 501):

    for update in range(32):
        training_rollout.roll(steps=32, verbose=0, push_experience=True)
Example no. 8
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 5
    keras.layers.GlobalAveragePooling2D(),  # 16
    keras.layers.Dense(4, kernel_initializer="he_uniform"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.Dense(2, kernel_initializer="he_uniform")
])
qnet.compile(keras.optimizers.Adam(1e-3), "mse")

agent = DQN(qnet, 2, Experience(max_length=10_000), discount_factor_gamma=0.99,
            epsilon=1.0, epsilon_decay=0.99999, epsilon_min=0.3, use_target_network=True,
            state_preprocessor=None)

rollout = Rolling(agent, env, RolloutConfig(skipframes=2))
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=100)
losses = deque(maxlen=100)

episode = 0

while True:

    episode += 1

    rollout.roll(steps=4, verbose=0, push_experience=True)
    if agent.memory.N < 1000:
        print(f"\rFilling memory... {agent.memory.N}/1000", end="")
        continue
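
RolloutConfig(skipframes=2) presumably applies the usual frame-skip trick: each selected action is repeated for two consecutive environment frames and the rewards are accumulated. A generic sketch of that behaviour (the library may implement it differently):

# Generic frame-skip step: repeat one action for `skipframes` frames,
# summing the rewards and stopping early if the episode ends.
def frame_skip_step(env, action, skipframes=2):
    total_reward, state, done, info = 0., None, False, {}
    for _ in range(skipframes):
        state, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return state, total_reward, done, info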
Example no. 9
from trickster.agent import A2C
from trickster.rollout import Rolling, Trajectory, RolloutConfig
from trickster.model import mlp
from trickster.utility import gymic

env = gymic.rwd_scaled_env()
input_shape = env.observation_space.shape
num_actions = env.action_space.n

actor, critic = mlp.wide_pg_actor_critic(input_shape, num_actions)

agent = A2C(actor,
            critic,
            action_space=env.action_space,
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gymic.rwd_scaled_env())

rollout.fit(episodes=1000, updates_per_episode=64, step_per_update=1, testing_rollout=test_rollout, plot_curves=True)
test_rollout.render(repeats=10)
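
entropy_penalty_coef=0.01 weights an entropy bonus in the actor objective, which discourages the policy from collapsing prematurely onto a single action. Schematically, the standard A2C actor loss looks like this (an illustration, not necessarily the library's exact formulation):

import tensorflow as tf

# Standard A2C actor loss with an entropy bonus: maximise log-prob-weighted
# advantages while penalising low policy entropy.
def a2c_actor_loss(log_probs, advantages, entropy, entropy_penalty_coef=0.01):
    utility = tf.reduce_mean(log_probs * tf.stop_gradient(advantages))
    return -utility - entropy_penalty_coef * tf.reduce_mean(entropy)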
Example no. 10
critic = Sequential([
    Dense(400, activation="relu", input_shape=input_shape),
    Dense(300, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(1e-4))

agent = A2C(actor,
            critic,
            action_space=num_actions,
            absolute_memory_limit=10000,
            discount_factor_gamma=0.99,
            entropy_penalty_coef=0.05)

rollout = Rolling(agent.create_workers(1)[0], env)
test_rollout = Trajectory(agent, Lunar())


def train():
    hst = history.History("reward_sum", "actor_loss", "actor_utility",
                          "actor_utility_std", "actor_entropy", "values",
                          "advantages", "critic_loss")

    for episode in range(1, 1001):

        for update in range(1, 4):
            rollout.roll(steps=128, verbose=0, push_experience=True)
            agent_history = agent.fit(batch_size=-1,
                                      verbose=0,
                                      reset_memory=True)
Example no. 11
    Dense(24, activation="relu", input_shape=input_shape),
    Dense(24, activation="relu"),
    Dense(num_actions, activation="linear")
])
policy.compile(loss="mse", optimizer=Adam(1e-4))

agent = DQN(policy,
            action_space=num_actions,
            memory=Experience(max_length=10000),
            epsilon=1.,
            epsilon_decay=1.,
            epsilon_min=0.1,
            discount_factor_gamma=0.99,
            use_target_network=True)

rollout = Rolling(agent, env)
test_rollout = Trajectory(agent, env)

hst = history.History("reward_sum", "loss", "epsilon")

# Warm up the replay memory with constant-epsilon exploration (epsilon_decay is 1
# above), then start annealing epsilon.
for warmup in range(1, 33):
    rollout.roll(32, verbose=0, push_experience=True)
agent.epsilon_decay = 0.99999

for episode in range(1, 1001):
    rollout.roll(steps=32, verbose=0, push_experience=True)
    agent_history = agent.fit(updates=10, batch_size=64, verbose=0)
    test_history = test_rollout.rollout(verbose=0,
                                        push_experience=False,
                                        render=False)
    hst.record(reward_sum=test_history["reward_sum"],
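               # The snippet breaks off here; given the History keys declared above
               # ("reward_sum", "loss", "epsilon"), the call plausibly completes like
               # this (assumed continuation, not the original source):
               loss=agent_history["loss"],
               epsilon=agent.epsilon)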