Example no. 1
0
# Compile the (previously built) policy network: categorical cross-entropy
# with a small Adam learning rate.
actor.compile(loss="categorical_crossentropy", optimizer=Adam(1e-4))

# Value network: two ReLU hidden layers and a single linear output unit,
# all He-uniform initialized.
critic = Sequential([
    Dense(16,
          activation="relu",
          input_shape=input_shape,
          kernel_initializer="he_uniform"),
    Dense(16, activation="relu", kernel_initializer="he_uniform"),
    Dense(1, activation="linear", kernel_initializer="he_uniform")
])
# Value output is fitted with mean squared error.
critic.compile(loss="mse", optimizer=Adam(5e-4))

# Advantage Actor-Critic agent over the env's action space, with a
# 10k-transition memory and entropy regularization.
agent = A2C(actor,
            critic,
            action_space=env.action_space,
            memory=Experience(max_length=10000),
            discount_factor_gamma=0.98,
            entropy_penalty_coef=0.01)

# Training rollouts are capped at 300 steps; evaluation runs full
# trajectories on a fresh CartPole-v1 instance.
rollout = Rolling(agent, env, config=RolloutConfig(max_steps=300))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

# Per-episode metric accumulators, filled by the training loop below.
rewards = []
actor_loss = []
actor_utility = []
actor_entropy = []
critic_loss = []

for episode in range(1, 1001):
    episode_actor_loss = []
    episode_actor_utility = []
Example no. 2
0
    Dense(16, activation="relu"),
    Dense(num_actions, activation="softmax")
])
# Compile the policy network (built above) with the configured Adam rate.
actor.compile(loss="categorical_crossentropy", optimizer=Adam(ACTOR_ADAM_LR))

# Value network: two ReLU hidden layers and one linear output unit.
critic = Sequential([
    Dense(16, activation="relu", input_shape=input_shape),
    Dense(16, activation="relu"),
    Dense(1, activation="linear")
])
critic.compile(loss="mse", optimizer=Adam(CRITIC_ADAM_LR))

# Proximal Policy Optimization agent; hyperparameters come from the
# module-level constants referenced here.
agent = PPO(actor,
            critic,
            action_space=test_env.action_space,
            memory=Experience(max_length=EXPERIENCE_SIZE),
            reward_discount_factor_gamma=DISCOUNT_FACTOR_GAMMA,
            entropy_penalty_coef=ENTROPY_PENALTY_BETA)

# Train on several environments in parallel; evaluate on a single fresh
# CartPole-v1 trajectory.
rollout = MultiRolling(agent,
                       envs,
                       rollout_configs=RolloutConfig(max_steps=MAX_TIMESTEPS))
test_rollout = Trajectory(agent, gym.make("CartPole-v1"))

# Metric accumulators (this variant also records a KL-divergence series).
rewards = []
actor_loss = []
actor_utility = []
actor_kld = []
actor_entropy = []
critic_loss = []
Example no. 3
0
# Ten training environments plus one held-out test environment.
envs = [FakeEnv() for _ in range(10)]
test_env = FakeEnv()

# Policy network over flattened observations; two-way softmax output.
actor = Sequential([  # 200, 160
    Flatten(input_shape=test_env.shape),
    Dense(256),
    BatchNormalization(),
    LeakyReLU(),
    Dense(2, activation="softmax")
])
actor.compile(RMSprop(1e-4, rho=0.99), "categorical_crossentropy")

# Vanilla policy-gradient agent over 2 actions; states are used raw
# (no preprocessor).
agent = REINFORCE(actor,
                  2,
                  Experience(),
                  discount_factor_gamma=0.99,
                  state_preprocessor=None)

# NOTE(review): the list repeats the SAME agent object ten times, so all
# parallel trajectories share one set of weights — confirm this is intended.
rollout = MultiTrajectory([agent for _ in range(10)], envs)
test_rollout = Trajectory(agent, test_env)

# Rolling windows over the last 100 episodes of each metric.
rewards = deque(maxlen=100)
actor_loss = deque(maxlen=100)
actor_utility = deque(maxlen=100)
actor_entropy = deque(maxlen=100)
critic_loss = deque(maxlen=100)

episode = 0

while 1:
Example no. 4
0
class TestExperienceSample(unittest.TestCase):
    """Sampling behaviour of ExperienceSampler over a filled Experience buffer."""

    def setUp(self):
        self.xp = Experience(max_length=100)
        self.sampler = ExperienceSampler(self.xp)
        self.a = np.arange(100)
        self.b = self.a - 100
        self.c = self.a / 10

    def test_sampling_of_next_states(self):
        """Every sampled next_state is the immediate successor of its state."""
        self.xp.remember(self.a)
        states, next_states = self.sampler.sample(10)

        diff = next_states - states

        self.assertTrue(np.all(diff == 1))

    def test_sampling_when_samples_are_fewer_than_sample_size(self):
        """Oversampling returns every usable transition (all states but the last)."""
        self.xp.remember(self.a)
        states, next_states = self.sampler.sample(200)

        # assertEqual reports both operands on failure; the original
        # assertTrue(a == b) only reported "False is not true".
        self.assertEqual(len(states), len(self.a) - 1)
        self.assertEqual(len(next_states), len(self.a) - 1)

    def test_last_state_doesnt_get_sampled(self):
        """The final state has no successor, so it never appears as a state."""
        self.xp.remember(self.a)
        states, next_states = self.sampler.sample(200)

        self.assertNotIn(self.a[-1], states)

    def test_excluded_state_doesnt_get_sampled(self):
        """States listed in `exclude` are never sampled."""
        EXCLUDE = (10, 20, 30)
        self.xp.remember(self.a, exclude=EXCLUDE)

        states, next_states = self.sampler.sample(200)
        for x in EXCLUDE:
            self.assertNotIn(x, states)

    def test_excluding_negative_index_is_correctly_interpreted(self):
        """Negative exclude indices count from the end of the buffer."""
        EXCLUDE = (-1, -10, -20)
        TARGET = (99, 90, 80)
        self.xp.remember(self.a, exclude=EXCLUDE)

        states, next_states = self.sampler.sample(200)
        for t in TARGET:
            self.assertNotIn(t, states)

    def test_excluding_works_after_multiple_remembers(self):
        """Exclusions are honoured even when remember() is called repeatedly."""
        EXCLUDE = (10, 20, 30)
        for _ in range(3):
            self.xp.remember(self.a, exclude=EXCLUDE)

        states, next_states = self.sampler.sample(-1)
        for e in EXCLUDE:
            self.assertNotIn(e, states)
Example no. 5
0
 def setUp(self):
     """Build a bounded Experience, a sampler over it, and three aligned arrays."""
     self.xp = Experience(max_length=100)
     self.sampler = ExperienceSampler(self.xp)
     base = np.arange(100)
     self.a = base
     self.b = base - 100
     self.c = base / 10
Example no. 6
0
from trickster.utility import history, visual
from trickster.model import mlp

# Two-vs-two match; a single learning agent observing a flat vector.
cfg = MatchConfig(canvas_size=(128, 128),
                  players_per_side=2,
                  learning_type=MatchConfig.LEARNING_TYPE_SINGLE_AGENT,
                  observation_type=MatchConfig.OBSERVATION_TYPE_VECTOR)

env = Match(cfg)
test_env = Match(cfg)

# Dueling Q-network sized from the env's observation/action spaces.
ann = mlp.wide_dueling_q_network(env.observation_space.shape,
                                 env.action_space.n,
                                 adam_lr=1e-4)

# Replay-buffer capacity; also the fill target of the warm-up loop below
# (was hard-coded as 10000 in three places).
EXPERIENCE_SIZE = 10000

experience = Experience(EXPERIENCE_SIZE)
agent = DoubleDQN(ann,
                  env.action_space,
                  experience,
                  epsilon=1.,
                  epsilon_decay=1.,
                  epsilon_min=0.1)

rcfg = RolloutConfig(max_steps=1024, skipframes=2)
training_rollout = Rolling(agent, env, rcfg)
testing_rollout = Trajectory(agent, test_env, rcfg)

# Warm-up: collect transitions until the buffer is full.
print("Filling experience...")
while experience.N < EXPERIENCE_SIZE:
    training_rollout.roll(steps=32, verbose=0, push_experience=True)
    # Fixed: the counter previously printed as "10000/<N>" (total/count
    # reversed); show "<N>/<total>" to match the percentage.
    print(f"\r{experience.N / EXPERIENCE_SIZE:.2%} {experience.N}/{EXPERIENCE_SIZE}", end="")
Example no. 7
0
# Continuous LunarLander with rewards scaled by 0.01.
env = gymic.rwd_scaled_env("LunarLanderContinuous-v2", reward_scale=0.01)

input_shape = env.observation_space.shape
num_actions = env.action_space.shape[0]

# Factory returns one actor and a pair of critics (num_critics=2), as used
# by TD3's twin-critic update.
actor, critics = mlp.wide_ddpg_actor_critic(input_shape,
                                            output_dim=num_actions,
                                            action_range=2,
                                            num_critics=2,
                                            actor_lr=5e-4,
                                            critic_lr=5e-4)

# Twin Delayed DDPG agent: exploration noise sigma 0.1 with no decay,
# actions bounded to [-2, 2], target-policy noise 0.2 clipped at 0.5.
agent = TD3(actor,
            critics,
            action_space=spaces.CONTINUOUS,
            memory=Experience(max_length=int(1e4)),
            discount_factor_gamma=0.99,
            action_noise_sigma=0.1,
            action_noise_sigma_decay=1.,
            action_minima=-2,
            action_maxima=2,
            target_noise_sigma=0.2,
            target_noise_clip=0.5)

# NOTE(review): both rollouts reuse the SAME env instance — confirm the
# library tolerates interleaved training/testing resets on one env.
rollout = Rolling(agent, env)
test_rollout = Trajectory(agent, env, RolloutConfig(testing_rollout=True))

rollout.fit(episodes=1000,
            updates_per_episode=64,
            step_per_update=1,
            update_batch_size=32,
Example no. 8
0
    def test_experience_constructor_creates_empty_object(self):
        """A default-constructed Experience starts empty: no memoirs, N == 0."""
        experience = Experience()

        self.assertEqual(experience.N, 0)
        self.assertIsNone(experience.memoirs)
Example no. 9
0
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 10
    keras.layers.Conv2D(16, 3, kernel_initializer="he_uniform", padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.MaxPool2D(),  # 5
    keras.layers.GlobalAveragePooling2D(),  # 16
    keras.layers.Dense(4, kernel_initializer="he_uniform"),
    keras.layers.BatchNormalization(),
    keras.layers.ReLU(),
    keras.layers.Dense(2, kernel_initializer="he_uniform")
])
# Compile the (previously defined) Q-network with mean-squared-error loss.
qnet.compile(keras.optimizers.Adam(1e-3), "mse")

# DQN over 2 actions with a 10k replay buffer, epsilon-greedy exploration
# decaying very slowly toward a floor of 0.3, and a target network.
agent = DQN(qnet, 2, Experience(max_length=10_000), discount_factor_gamma=0.99,
            epsilon=1.0, epsilon_decay=0.99999, epsilon_min=0.3, use_target_network=True,
            state_preprocessor=None)

# Act on every second frame during training rollouts.
rollout = Rolling(agent, env, RolloutConfig(skipframes=2))
test_rollout = Trajectory(agent, test_env)

# Rolling windows over the last 100 episodes.
rewards = deque(maxlen=100)
losses = deque(maxlen=100)

episode = 0

while 1:

    episode += 1
Example no. 10
0
    def test_remember_considers_max_size(self):
        """Remembering more items than max_length keeps only the newest ones."""
        xp = Experience(max_length=100)
        xp.remember(np.arange(120))

        # Fixed: assertTrue(xp.N, 100) always passed — the second argument of
        # assertTrue is the failure *message*, not an expected value.
        self.assertEqual(xp.N, 100)
        self.assertListEqual(xp.memoirs[0].tolist(), list(range(20, 120)))
Example no. 11
0
    def test_experience_remembers_array(self):
        """remember() stores the whole array and updates the element count."""
        data = np.arange(100)

        experience = Experience()
        experience.remember(data)

        self.assertEqual(experience.N, 100)
        self.assertListEqual(experience.memoirs[0].tolist(), data.tolist())
Example no. 12
0
    def test_experience_constructor_considers_max_size_argument(self):
        """The max_length constructor argument is stored on the instance."""
        limit = 3
        experience = Experience(max_length=limit)

        self.assertEqual(experience.max_length, limit)
Example no. 13
0
    def reset(self):
        """Reset the wrapped env, stash its initial observation, return self.empty.

        NOTE(review): the real observation is kept on self.initial_state while
        the caller receives self.empty — presumably a fixed placeholder
        observation for this fake env; confirm against the class definition.
        """
        self.initial_state = self.env.reset()
        return self.empty


# Ten training environments plus one held-out test environment.
envs = [FakeEnv() for _ in range(10)]
test_env = FakeEnv()

# Policy network over flattened observations; two-way softmax output.
actor = Sequential([  # 200, 160
    Flatten(input_shape=test_env.shape),
    Dense(200, activation="relu"),
    Dense(2, activation="softmax")
])
actor.compile(RMSprop(1e-4, rho=0.99), "categorical_crossentropy")

# Vanilla policy-gradient agent over 2 actions; no state preprocessing.
agent = REINFORCE(actor, 2, Experience(), discount_factor_gamma=0.99,
                  state_preprocessor=None)

# NOTE(review): a single agent is passed alongside the env list — elsewhere
# in this file MultiTrajectory receives a list of agents; confirm which
# signature this version of the library expects.
rollout = MultiTrajectory(agent, envs)
test_rollout = Trajectory(agent, test_env)

# NOTE(review): the rewards window (10) is shorter than the loss windows
# (80) — possibly intentional, but verify.
rewards = deque(maxlen=10)
actor_loss = deque(maxlen=80)
actor_utility = deque(maxlen=80)
actor_entropy = deque(maxlen=80)
critic_loss = deque(maxlen=80)

episode = 0

while 1:
Example no. 14
0
# Presumably the canvas and action tensor shapes the env requires — confirm
# against the env's neurons_required property.
canvas_shape, action_shape = env.neurons_required

# NOTE(review): actor_input is never used in the visible code.
actor_input = Input(shape=[64, 64, 3], name="actor_input")
critic_input = Input(shape=[64, 64, 3], name="critic_input")

# Small fully-connected value stream over the flattened 64x64x3 frame.
critic_stream = Flatten()(critic_input)
critic_stream = Dense(64, activation="tanh")(critic_stream)
critic_stream = BatchNormalization()(critic_stream)
critic_stream = Dense(32, activation="tanh")(critic_stream)
critic_stream = BatchNormalization()(critic_stream)
# NOTE(review): a softmax output trained with MSE is unusual for a DQN
# Q-value head — a linear activation is the common choice; confirm intent.
value_estimate = Dense(NUM_MOVES, activation="softmax")(critic_stream)

critic = Model(critic_input, value_estimate, name="Critic")
critic.compile(Adam(5e-4), "mse")

# DQN driven by the critic model; pixel states are rescaled to [-0.5, 0.5].
agent = DQN(critic, action_space=MOVES, memory=Experience(max_length=10000), discount_factor_gamma=0.99, epsilon=0.7,
            state_preprocessor=lambda state: state / 255. - 0.5)

# Frame-skipped rollouts capped at 512 steps each.
rollout = Rolling(agent, env, config=RolloutConfig(max_steps=512, skipframes=2))
test_rollout = Trajectory(agent, env, config=RolloutConfig(max_steps=512, skipframes=2))

# Rolling windows over the last 10 episodes.
episode = 0
reward_memory = deque(maxlen=10)
losses = deque(maxlen=10)

while 1:
    episode += 1

    episode_losses = []
    for update in range(32):
        rollout.roll(steps=4, verbose=0, push_experience=True)