Example #1
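    # Unit test: DQN with a frozen model (SGD, learning_rate=0) and gamma=0 on a
    # dummy environment; checks that only valid transitions end up in the replay memory.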
    def test_dqn_doesnt_store_invalid_transitions(self):

        STEPS = 55

        env = DummyEnv()
        test_env = DummyEnv()

        model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_dim=2)])
        model.compile(tf.keras.optimizers.SGD(learning_rate=0.), loss="mse")

        agent = DQN.from_environment(env,
                                     model,
                                     discount_gamma=0.,
                                     use_target_network=False)

        rollout = Rolling(agent, env)
        test_rollout = Trajectory(agent, test_env)

        rollout.fit(epochs=10,
                    updates_per_epoch=12,
                    steps_per_update=STEPS,
                    update_batch_size=8,
                    testing_rollout=test_rollout,
                    buffer_warmup=False)

        data = agent.memory_sampler.sample(-1)

        np.testing.assert_array_less(data["state"], 10)
        np.testing.assert_equal(
            (data["state_next"] - data["state"]).sum(axis=1), 1)
Example #2
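    # Unit test: same transition-validity check as above, but for TD3 with
    # constant-output testing models for the actor and both critics.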
    def test_td3_doesnt_store_invalid_transitions(self):

        STEPS = 55

        env = DummyEnv(action_space="continuous")
        test_env = DummyEnv(action_space="continuous")

        actor_out = tf.convert_to_tensor([[0., 0.]])
        critic_out = tf.convert_to_tensor([1.])

        actor = arch.TestingModel(actor_out)
        critic = arch.TestingModel(critic_out)
        critic2 = arch.TestingModel(critic_out)

        actor_target = arch.TestingModel(actor_out)
        critic_target = arch.TestingModel(critic_out)
        critic2_target = arch.TestingModel(critic_out)

        actor.optimizer = tf.keras.optimizers.SGD(0)
        critic.optimizer = tf.keras.optimizers.SGD(0)
        critic2.optimizer = tf.keras.optimizers.SGD(0)

        agent = TD3(actor,
                    actor_target,
                    critic,
                    critic_target,
                    critic2,
                    critic2_target,
                    discount_gamma=0.,
                    polyak_tau=0.,
                    action_minima=-1.,
                    action_maxima=1.,
                    update_actor_every=1)

        rollout = Rolling(agent, env)
        test_rollout = Trajectory(agent, test_env)

        rollout.fit(epochs=10,
                    updates_per_epoch=12,
                    steps_per_update=STEPS,
                    update_batch_size=8,
                    testing_rollout=test_rollout,
                    buffer_warmup=False)

        data = agent.memory_sampler.sample(-1)

        np.testing.assert_array_less(data["state"], 10)
        np.testing.assert_equal(
            (data["state_next"] - data["state"]).sum(axis=1), 1)
Example #3
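# REINFORCE with a softmax policy head rolled out over several environments via
# MultiTrajectory; the Sequential actor definition is truncated at the top of this
# excerpt, and the loop below fits on each full batch of collected episodes.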
    Flatten(input_shape=test_env.shape),
    Dense(256),
    BatchNormalization(),
    LeakyReLU(),
    Dense(2, activation="softmax")
])
actor.compile(RMSprop(1e-4, rho=0.99), "categorical_crossentropy")

agent = REINFORCE(actor,
                  2,
                  Experience(),
                  discount_factor_gamma=0.99,
                  state_preprocessor=None)

rollout = MultiTrajectory([agent for _ in range(10)], envs)
test_rollout = Trajectory(agent, test_env)

rewards = deque(maxlen=100)
actor_loss = deque(maxlen=100)
actor_utility = deque(maxlen=100)
actor_entropy = deque(maxlen=100)
critic_loss = deque(maxlen=100)

episode = 0

while True:

    episode += 1

    history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0)
Example #4
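# Builds REINFORCE, A2C or PPO from a string key, trains it on
# LunarLanderContinuous-v2 with a Trajectory rollout, then renders the result.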
import gym

from trickster.agent import REINFORCE, A2C, PPO
from trickster.rollout import Trajectory
from trickster import callbacks

ENV_NAME = "LunarLanderContinuous-v2"
ALGO = "REINFORCE"
TRAJECTORY_MAX_STEPS = 100
EPOCHS = 1000
ROLLOUTS_PER_EPOCH = 4

env = gym.make(ENV_NAME)

algo = {"REINFORCE": REINFORCE, "A2C": A2C, "PPO": PPO}[ALGO]

agent = algo.from_environment(env)
rollout = Trajectory(agent, env, TRAJECTORY_MAX_STEPS)

cbs = [callbacks.ProgressPrinter(keys=rollout.progress_keys)]

rollout.fit(epochs=EPOCHS,
            updates_per_epoch=1,
            rollouts_per_update=ROLLOUTS_PER_EPOCH,
            callbacks=cbs)
rollout.render(repeats=100)
Example #5
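# DoubleDQN with a small MLP Q-network; the manual loop below alternates short
# Rolling.roll steps with batched agent.fit updates and evaluates on a separate
# test Trajectory.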
ann = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="linear")
])
ann.compile(loss="mse", optimizer=Adam(1e-3))

agent = DoubleDQN(ann,
                  action_space=2,
                  memory=Experience(max_length=10000),
                  epsilon=1.,
                  discount_factor_gamma=0.98)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

rewards = []
losses = []

for episode in range(1, 501):
    episode_losses = []

    for update in range(32):
        roll_history = rollout.roll(steps=4, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=32, verbose=0)
        episode_losses.append(agent_history["loss"])

    test_history = test_rollout.rollout(verbose=0,
                                        push_experience=False,
                                        render=False)
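Example #6
# PPO on reward-scaled LunarLander-v2 with 32 parallel environments (MultiRolling)
# and the library's wide MLP actor/critic models.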
from trickster.agent import PPO
from trickster.rollout import MultiRolling, Trajectory
from trickster.utility import gymic
from trickster.model import mlp

envs = [gymic.rwd_scaled_env("LunarLander-v2") for _ in range(32)]
test_env = gymic.rwd_scaled_env("LunarLander-v2")

input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

actor = mlp.wide_mlp_actor_categorical(input_shape, num_actions, adam_lr=1e-4)
critic = mlp.wide_mlp_critic_network(input_shape, output_dim=1, adam_lr=1e-4)
agent = PPO(actor,
            critic,
            action_space=num_actions,
            discount_factor_gamma=0.99,
            entropy_penalty_coef=0.05)

rollout = MultiRolling(agent, envs)
test_rollout = Trajectory(agent, test_env)

rollout.fit(episodes=1000,
            updates_per_episode=1,
            steps_per_update=32,
            update_batch_size=32,
            testing_rollout=test_rollout,
            plot_curves=True)
test_rollout.render(repeats=10)
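Example #7
# DQN or DoubleDQN (selected by key) trained with MultiRolling over several
# environments, using callback-based evaluation and progress printing; ENV_NAME
# and the imports are presumably defined earlier in the original script.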
ALGO = "DQN"
NUM_ENVS = 4
TRAJECTORY_MAX_STEPS = 200
STEPS_PER_UPDATE = 1
UPDATES_PER_EPOCH = 64
EPOCHS = 200
UPDATE_BATCH_SIZE = 100

envs = [gym.make(ENV_NAME) for _ in range(NUM_ENVS)]
test_env = gym.make(ENV_NAME)

algo = {"DQN": DQN, "DoubleDQN": DoubleDQN}[ALGO]

agent = algo.from_environment(envs[0])

rollout = MultiRolling(agent, envs, TRAJECTORY_MAX_STEPS)
test_rollout = Trajectory(agent, test_env, TRAJECTORY_MAX_STEPS)

rollout.fit(epochs=EPOCHS,
            updates_per_epoch=UPDATES_PER_EPOCH,
            steps_per_update=STEPS_PER_UPDATE,
            update_batch_size=UPDATE_BATCH_SIZE,
            warmup_buffer=True,
            callbacks=[
                callbacks.TrajectoryEvaluator(testing_rollout=test_rollout,
                                              repeats=4),
                callbacks.ProgressPrinter(rollout.progress_keys)
            ])

test_rollout.render(repeats=10)
Example #8
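# DQN with a target network: 32 warm-up rollouts are run with learning_batch_size=0,
# then the episode loop rolls two steps at a time and requests 64-sample updates.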
policy = Sequential([
    Dense(24, activation="relu", input_shape=input_shape),
    Dense(24, activation="relu"),
    Dense(num_actions, activation="linear")
])
policy.compile(loss="mse", optimizer=Adam(1e-4))

agent = DQN(policy,
            action_space=num_actions,
            memory=Experience(max_length=10000),
            epsilon=1.,
            discount_factor_gamma=0.98,
            use_target_network=True)

rollout = Trajectory(agent, env, RolloutConfig(max_steps=200))

rewards = []
losses = []

for warmup in range(1, 33):
    rollout.rollout(verbose=0, learning_batch_size=0)

for episode in range(1, 501):
    rollout._reset()
    episode_rewards = []
    episode_losses = []
    while not rollout.finished:
        roll_history = rollout.roll(steps=2, verbose=0, learning_batch_size=64)
        episode_rewards.append(roll_history["reward_sum"])
        episode_losses.append(roll_history["loss"])
Example #9
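# A2C on the Match environment (single-agent learning mode) with 8 parallel
# matches; MatchConfig and Match are presumably imported earlier in the
# original script.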
from trickster.agent import A2C
from trickster.rollout import MultiRolling, Trajectory, RolloutConfig
from trickster.model import mlp

cfg = MatchConfig(canvas_size=(100, 100),
                  players_per_side=2,
                  learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
                  observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR)

envs = [Match(cfg) for _ in range(8)]
test_env = Match(cfg)

actor, critic = mlp.wide_pg_actor_critic(envs[0].observation_space.shape,
                                         envs[0].action_space.n,
                                         actor_lr=1e-4,
                                         critic_lr=1e-4)

agent = A2C(actor, critic, test_env.action_space, entropy_penalty_coef=0.1)

rcfg = RolloutConfig(max_steps=512, skipframes=2)

training_rollout = MultiRolling(agent, envs, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

training_rollout.fit(episodes=1000,
                     updates_per_episode=512,
                     steps_per_update=1,
                     testing_rollout=testing_rollout)
testing_rollout.render(repeats=10)
Example #10
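# A2C on the default reward-scaled gym environment with the wide MLP actor-critic
# pair; trained with Rolling and evaluated with a separate Trajectory.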
from trickster.agent import A2C
from trickster.rollout import Rolling, Trajectory, RolloutConfig
from trickster.model import mlp
from trickster.utility import gymic

env = gymic.rwd_scaled_env()
input_shape = env.observation_space.shape
num_actions = env.action_space.n

actor, critic = mlp.wide_pg_actor_critic(input_shape, num_actions)

agent = A2C(actor,
            critic,
            action_space=env.action_space,
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gymic.rwd_scaled_env())

rollout.fit(episodes=1000,
            updates_per_episode=64,
            step_per_update=1,
            testing_rollout=test_rollout,
            plot_curves=True)
test_rollout.render(repeats=10)
Example #11
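# A2C with a larger Keras critic and a worker-based Rolling rollout; the train()
# loop buffers the returned metrics into a History object. The actor definition,
# history and Lunar are presumably defined earlier in the original script.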
critic = Sequential([
    Dense(400, activation="relu", input_shape=input_shape),
    Dense(300, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(1e-4))

agent = A2C(actor,
            critic,
            action_space=num_actions,
            absolute_memory_limit=10000,
            discount_factor_gamma=0.99,
            entropy_penalty_coef=0.05)

rollout = Rolling(agent.create_workers(1)[0], env)
test_rollout = Trajectory(agent, Lunar())


def train():
    hst = history.History("reward_sum", "actor_loss", "actor_utility",
                          "actor_utility_std", "actor_entropy", "values",
                          "advantages", "critic_loss")

    for episode in range(1, 1001):

        for update in range(1, 4):
            rollout.roll(steps=128, verbose=0, push_experience=True)
            agent_history = agent.fit(batch_size=-1,
                                      verbose=0,
                                      reset_memory=True)
            hst.buffer(**agent_history)
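Example #12
# DQN with epsilon decay and a target network on CartPole-v1, trained over
# 8 parallel reward-scaled environments with MultiRolling.fit.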
envs = [gymic.rwd_scaled_env("CartPole-v1") for _ in range(8)]
test_env = gymic.rwd_scaled_env("CartPole-v1")

input_shape = envs[0].observation_space.shape
num_actions = envs[0].action_space.n

ann = mlp.wide_mlp_critic_network(input_shape, num_actions, adam_lr=1e-4)

agent = DQN(ann,
            action_space=2,
            memory=Experience(max_length=10000),
            epsilon=1.,
            epsilon_decay=0.99995,
            epsilon_min=0.1,
            discount_factor_gamma=0.98,
            use_target_network=True)

rollout = MultiRolling(agent,
                       envs,
                       rollout_configs=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, test_env)

rollout.fit(episodes=500,
            updates_per_episode=128,
            steps_per_update=1,
            update_batch_size=32,
            testing_rollout=test_rollout,
            plot_curves=True)
test_rollout.render()
Example #13
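# Actor-critic training loop on an environment whose states are scaled by 1/255
# (suggesting image observations); the agent constructor and the parallel
# environments are truncated at the top of this excerpt.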
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.995,
            entropy_penalty_coef=0.0,
            state_preprocessor=lambda state: state / 255.)

episode = 1

reward_memory = deque(maxlen=10)
step_lengths = deque(maxlen=10)
critic_losses = deque(maxlen=10)
actor_losses = deque(maxlen=10)
actor_utility = deque(maxlen=10)
actor_entropy = deque(maxlen=10)

rollout = MultiRolling(agent, envs,
                       rollout_configs=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, test_env,
                          config=RolloutConfig(max_steps=512, skipframes=2))

while True:
    episode_a_losses = []
    episode_a_utility = []
    episode_a_entropy = []
    episode_c_losses = []

    for update in range(32):
        rollout.roll(steps=2, verbose=0, push_experience=True)
        agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

        episode_a_losses.append(agent_history["actor_loss"])
        episode_a_utility.append(agent_history["actor_utility"])
        episode_a_entropy.append(agent_history["actor_entropy"])
        episode_c_losses.append(agent_history["critic_loss"])
Example #14
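# Minimal REINFORCE setup: wide MLP categorical policy on the default
# reward-scaled gym environment, trained with Trajectory.fit.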
from trickster.agent import REINFORCE
from trickster.rollout import Trajectory, RolloutConfig
from trickster.utility import gymic
from trickster.model import mlp

env = gymic.rwd_scaled_env()
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = mlp.wide_mlp_actor_categorical(input_shape, num_actions, adam_lr=1e-4)
agent = REINFORCE(policy, action_space=num_actions)
rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rollout.fit(episodes=500, rollouts_per_update=1, update_batch_size=-1)
rollout.render(repeats=10)
Example #15
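# A2C on CartPole-v1 with hand-built Keras networks (the actor definition is
# truncated at the top of this excerpt); the episode loop below collects short
# rollouts, and the rest of the loop body is truncated.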
          input_shape=input_shape,
          kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(1, activation="linear", kernel_initializer="he_uniform")
])
critic.compile(loss="mse", optimizer=Adam(5e-4))

agent = A2C(actor,
            critic,
            action_space=env.action_space,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
    episode_actor_entropy = []
    episode_critic_loss = []

    for update in range(32):
        rollout.roll(steps=2, verbose=0, push_experience=True)
Example #16
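# TD3 with twin critics on a continuous-action environment (actions clipped to
# [-2, 2]); target policy smoothing is configured via target_noise_sigma and
# target_noise_clip. env and input_shape come from earlier in the original script.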
num_actions = env.action_space.shape[0]

actor, critics = mlp.wide_ddpg_actor_critic(input_shape,
                                            output_dim=num_actions,
                                            action_range=2,
                                            num_critics=2,
                                            actor_lr=5e-4,
                                            critic_lr=5e-4)

agent = TD3(actor,
            critics,
            action_space=spaces.CONTINUOUS,
            memory=Experience(max_length=int(1e4)),
            discount_factor_gamma=0.99,
            action_noise_sigma=0.1,
            action_noise_sigma_decay=1.,
            action_minima=-2,
            action_maxima=2,
            target_noise_sigma=0.2,
            target_noise_clip=0.5)

rollout = Rolling(agent, env)
test_rollout = Trajectory(agent, env, RolloutConfig(testing_rollout=True))

rollout.fit(episodes=1000,
            updates_per_episode=64,
            step_per_update=1,
            update_batch_size=32,
            testing_rollout=test_rollout)
test_rollout.render(repeats=10)
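Example #17
# REINFORCE on CartPole-v1 with a small softmax policy; the manual episode loop
# prints running averages of reward and loss. numpy, gym, Keras and the trickster
# classes are presumably imported earlier in the original script.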
from trickster.utility import visual

env = gym.make("CartPole-v1")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
policy.compile(loss="categorical_crossentropy", optimizer=Adam(5e-3))

agent = REINFORCE(policy, action_space=num_actions)

rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=300))

rewards = []
losses = []

for episode in range(1, 501):
    rollout_history = rollout.rollout(verbose=0, push_experience=True)
    agent_history = agent.fit(batch_size=-1, verbose=0, reset_memory=True)

    rewards.append(rollout_history["reward_sum"])
    losses.append(agent_history["loss"])

    print("\rEpisode {:>4} RWD: {:>6.1f}, UTILITY: {: >8.4f}".format(
        episode, np.mean(rewards[-10:]), np.mean(losses[-10:])),
          end="")
import keras

from trickster.agent import REINFORCE
from trickster.rollout import Trajectory
from trickster.experience import Experience
from trickster.utility import gymic
from trickster.model import mlp

env = gymic.rwd_scaled_env("LunarLander-v2")
input_shape = env.observation_space.shape
num_actions = env.action_space.n

policy = mlp.wide_mlp_actor_categorical(input_shape, num_actions)
policy.compile(optimizer=keras.optimizers.SGD(lr=2e-4, momentum=0.9),
               loss="categorical_crossentropy")

agent = REINFORCE(policy,
                  action_space=num_actions,
                  memory=Experience(max_length=10000),
                  discount_factor_gamma=0.99)

rollout = Trajectory(agent, env)
rollout.fit(episodes=1000, rollouts_per_update=16, update_batch_size=-1)
rollout.render(repeats=10)