Example #1
    def __init__(self, base_folder: str, models_folder_name: str,
                 load_pi_predict_model: bool, load_v_predict_model: bool,
                 save_episodes: bool, save_episodes_folder: str,
                 save_gifs: bool, gifs_folder_name: str,
                 save_pi_predict_models: bool, save_v_predict_models: bool,
                 run_indefinitely: bool, max_nb_episodes: int,
                 use_keras_gym_train_monitor: bool):
        """It makes little sense to have both save_episodes and save_gifs set to True, since episodes can be watched
        (in better quality, even though the files are smaller) using WatchReplay.py."""

        self.base_folder = base_folder
        self.models_folder_name = models_folder_name
        self.save_episodes = save_episodes
        self.save_episodes_folder = save_episodes_folder
        self.should_save_gifs = save_gifs
        self.gifs_folder_name = gifs_folder_name
        self.should_save_pi_predict_models = save_pi_predict_models
        self.should_save_v_predict_models = save_v_predict_models
        self.run_indefinitely = run_indefinitely
        self.max_nb_episodes = max_nb_episodes
        self.use_keras_gym_train_monitor = use_keras_gym_train_monitor

        self.models_folder = os.path.join(self.base_folder, self.models_folder_name)
        self.gifs_folder = os.path.join(self.base_folder, self.gifs_folder_name)

        if save_episodes_folder and not os.path.exists(save_episodes_folder):
            os.makedirs(save_episodes_folder)
        if models_folder_name and not os.path.exists(self.models_folder):
            os.makedirs(self.models_folder)
        if save_gifs and self.gifs_folder and not os.path.exists(self.gifs_folder):
            os.makedirs(self.gifs_folder)

        self.env = gym.make('Riverraid-v0')
        self.env = km.wrappers.ImagePreprocessor(self.env, height=RL_PREPROCESS_HEIGHT, width=RL_PREPROCESS_WIDTH,
                                                 grayscale=RL_PREPROCESS_GRAYSCALE)
        self.env = km.wrappers.FrameStacker(self.env, num_frames=RL_PREPROCESS_NUM_FRAMES)
        if use_keras_gym_train_monitor:
            self.env = km.wrappers.TrainMonitor(self.env)

        # show logs from TrainMonitor
        km.enable_logging()

        # function approximators
        self.func = km.predefined.AtariFunctionApproximator(self.env)
        self.pi = km.SoftmaxPolicy(self.func, update_strategy=RL_PI_UPDATE_STRATEGY)  # PPO

        self.v = km.V(self.func, gamma=RLTrainer.GAMMA,
                      bootstrap_with_target_model=RLTrainer.BOOTSTRAP_WITH_TARGET_MODEL,
                      bootstrap_n=RLTrainer.BOOTSTRAP_N)

        self.actor_critic = km.ActorCritic(self.pi, self.v)

        # we'll use this to temporarily store our experience
        self.buffer = km.caching.ExperienceReplayBuffer.from_value_function(
            value_function=self.v, capacity=RLTrainer.BUFFER_CAPACITY, batch_size=RLTrainer.BUFFER_BATCH_SIZE)

        if load_pi_predict_model:
            self.load_pi_predict_model_weights()
        if load_v_predict_model:
            self.load_v_predict_model_weights()
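
    # The two weight-loading helpers called above are not part of this snippet.
    # A minimal sketch of what they might look like, assuming keras-gym exposes the
    # underlying Keras model as `predict_model` (as in Example #3 below); the file
    # names are placeholders, not the original paths:
    def load_pi_predict_model_weights(self):
        path = os.path.join(self.models_folder, 'pi_predict_model.h5')  # assumed file name
        self.pi.predict_model.load_weights(path)

    def load_v_predict_model_weights(self):
        path = os.path.join(self.models_folder, 'v_predict_model.h5')  # assumed file name
        self.v.predict_model.load_weights(path)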
Example #2
def test_atari_ppo():
    # env with preprocessing
    env = gym.make('PongDeterministic-v4')
    env = km.wrappers.ImagePreprocessor(env,
                                        height=105,
                                        width=80,
                                        grayscale=True)
    env = km.wrappers.FrameStacker(env, num_frames=3)
    env = km.wrappers.TrainMonitor(env)

    # show logs from TrainMonitor
    km.enable_logging()

    func = Func(env, lr=0.00025)
    pi = km.SoftmaxPolicy(function_approximator=func, update_strategy='ppo')
    v = km.V(function_approximator=func,
             gamma=0.99,
             bootstrap_n=10,
             bootstrap_with_target_model=True)
    actor_critic = km.ActorCritic(pi, v)

    # we'll use this to temporarily store our experience
    buffer = km.caching.ExperienceReplayBuffer.from_value_function(
        value_function=v, capacity=256, batch_size=64)

    # run episodes
    while env.T < 500000:
        s = env.reset()

        for t in range(env.spec.max_episode_steps):
            a = pi(s, use_target_model=True)  # target_model == pi_old
            s_next, r, done, info = env.step(a)

            buffer.add(s, a, r, done, env.ep)

            if len(buffer) >= buffer.capacity:
                # use 4 epochs per round
                num_batches = int(4 * buffer.capacity / buffer.batch_size)
                for _ in range(num_batches):
                    actor_critic.batch_update(*buffer.sample())
                buffer.clear()

                # soft update (tau=1 would be a hard update)
                actor_critic.sync_target_model(tau=0.1)

            if done:
                break

            s = s_next

        if env.G > 0:
            break

    assert env.T < 500000, "test_atari_ppo didn't converge"
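
    # Not in the original test: a sketch of checkpointing the trained models once the
    # run converges, assuming keras-gym exposes the underlying Keras models as
    # `predict_model`; the file names are placeholders.
    pi.predict_model.save_weights('pi_predict_model.h5')
    v.predict_model.save_weights('v_predict_model.h5')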
Example #3
    def __init__(self, predict_model_weights_path: str):
        env = gym.make("Riverraid-v0")  # Dummy so that we can make pi below.
        env = keras_gym.wrappers.ImagePreprocessor(
            env,
            height=RL_PREPROCESS_HEIGHT,
            width=RL_PREPROCESS_WIDTH,
            grayscale=RL_PREPROCESS_GRAYSCALE)
        # The actual preprocessing will be done using the preprocess parameter for super().__init__.
        # This way we can take 'normal' screens as input.
        env = keras_gym.wrappers.FrameStacker(
            env, num_frames=RL_PREPROCESS_NUM_FRAMES)
        env = keras_gym.wrappers.TrainMonitor(env)

        func = keras_gym.predefined.AtariFunctionApproximator(env)
        self.pi = keras_gym.SoftmaxPolicy(
            func, update_strategy=RL_PI_UPDATE_STRATEGY)
        self.pi.predict_model.load_weights(predict_model_weights_path)

        super().__init__(lambda screen: self.pi(screen), False, True,
                         RL_PREPROCESS_NUM_FRAMES)
        self.model.save('REINFORCE_model.h5')

    def load_model(self, path):
        '''loads a trained model from path'''
        return load_model(path)


if __name__ == "__main__":
    """agent=REINFORCE(env)
    agent.train(100)
    import matplotlib.pyplot as plt
    import math"""

    env = KSPPilot()
    function_approximator = MLP(env, lr=0.1)
    pi = km.SoftmaxPolicy(function_approximator, update_strategy='vanilla')
    v = km.V(function_approximator, gamma=0.9, bootstrap_n=1)
    # combine them into a single actor-critic
    actor_critic = km.ActorCritic(pi, v)
    for ep in range(100):
        s = env.reset()

        for t in range(10000):
            a = pi(s, use_target_model=True)
            s_next, r, done, info = env.step(a)

            # small incentive to keep moving
            if np.array_equal(s_next, s):
                r = -0.1

            actor_critic.update(s, a, r, done)
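
            # The snippet breaks off here; the step loop presumably ends the same way
            # as in Example #2 (a sketch, not the original author's code):
            if done:
                break

            s = s_next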
Example #5
env = km.wrappers.TrainMonitor(env)

# show logs from TrainMonitor
km.enable_logging()


class LinearFunc(km.FunctionApproximator):
    """ linear function approximator (body only does one-hot encoding) """
    def body(self, S):
        one_hot_encoding = keras.layers.Lambda(lambda x: K.one_hot(x, 16))
        return one_hot_encoding(S)


# define function approximators
func = LinearFunc(env, lr=0.01)
pi = km.SoftmaxPolicy(func, update_strategy='vanilla')
cache = km.caching.MonteCarloCache(env, gamma=0.99)


# static parameters
num_episodes = 250
num_steps = 30


# train
for ep in range(num_episodes):
    s = env.reset()
    cache.reset()

    for t in range(num_steps):
        a = pi(s)
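
        # The snippet is cut off here; a plausible continuation of the Monte Carlo
        # training loop, assuming the MonteCarloCache add/pop interface and
        # SoftmaxPolicy.batch_update(S, A, Adv) (a sketch, not the original code):
        s_next, r, done, info = env.step(a)
        cache.add(s, a, r, done)

        if done:
            break

        s = s_next

    # end of episode: pop the cached transitions with their Monte Carlo returns
    # and update the policy on them
    while cache:
        S, A, G = cache.pop()
        pi.batch_update(S, A, G)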
Example #6
        def diff_transform(S):
            S = K.cast(S, 'float32') / 255
            M = km.utils.diff_transform_matrix(num_frames=3)
            return K.dot(S, M)

        X = Lambda(diff_transform)(S)
        X = Conv2D(filters=16, kernel_size=8, strides=4, activation='relu')(X)
        X = Conv2D(filters=32, kernel_size=4, strides=2, activation='relu')(X)
        X = Flatten()(X)
        X = Dense(units=256, activation='relu')(X)
        return X
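
# The diff_transform body above rescales pixel intensities and multiplies the frame
# stack by km.utils.diff_transform_matrix, which is meant to turn the three stacked
# frames into difference-like channels (roughly position, velocity, acceleration)
# so the network can see motion. A hand-rolled NumPy illustration of the same idea;
# this matrix demonstrates the concept and is not necessarily the exact one the
# library returns:
import numpy as np

# columns map the stack [x(t-2), x(t-1), x(t)] to
# [x(t), x(t) - x(t-1), x(t) - 2*x(t-1) + x(t-2)]
M = np.array([[0.,  0.,  1.],
              [0., -1., -2.],
              [1.,  1.,  1.]])

frames = np.random.randint(0, 256, size=(105, 80, 3)).astype('float32') / 255
diffs = frames @ M  # (105, 80, 3): current frame, first difference, second difference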


# function approximators
func = Func(env, lr=0.00025)
pi = km.SoftmaxPolicy(func, update_strategy='ppo')
v = km.V(func, gamma=0.99, bootstrap_n=10, bootstrap_with_target_model=True)
actor_critic = km.ActorCritic(pi, v)

# we'll use this to temporarily store our experience
buffer = km.caching.ExperienceReplayBuffer.from_value_function(
    value_function=v, capacity=256, batch_size=64)

# run episodes
while env.T < 3000000:
    s = env.reset()

    for t in range(env.spec.max_episode_steps):
        a = pi(s, use_target_model=True)  # target_model == pi_old
        s_next, r, done, info = env.step(a)
Example #7
import keras_gym as km
import numpy as np

env = km.envs.ConnectFourEnv()
env = km.wrappers.TrainMonitor(env)

# show logs from TrainMonitor
km.enable_logging()

# function approximators
func = km.predefined.ConnectFourFunctionApproximator(env, lr=0.001)
pi = km.SoftmaxPolicy(func, update_strategy='cross_entropy')
v = km.V(func, gamma=0.99, bootstrap_n=10, bootstrap_with_target_model=True)
ac = km.ActorCritic(pi, v)
cache = km.caching.MonteCarloCache(env, gamma=1)

# state_id = '20400000000000000099'
# state_id = '2020000d2c2a86ce6400'
# state_id = '10600000000000005609'  # attack
# state_id = '20600000000000004d7e'  # defend
# state_id = '106000000001a021e87f'
# n = km.planning.MCTSNode(ac, state_id=state_id, random_seed=7)
n = km.planning.MCTSNode(ac, random_seed=17, c_puct=3.5)

n.env.render()

for ep in range(1000):
    n.reset()
    for t in range(env.max_time_steps):
        n.search(n=14)
        n.show(2)
Example #8
        return X


# environment [https://github.com/axb2035/gym-chase]
env = gym.make('Chase-v0')
env = ChasePreprocessor(env)
env = km.wrappers.FrameStacker(env, num_frames=3)
env = km.wrappers.TrainMonitor(env)

# show logs from TrainMonitor
km.enable_logging()

# function approximators
cnn = CNN(env, lr=0.00025)
pi = km.SoftmaxPolicy(cnn, update_strategy='ppo')
v = km.V(cnn, gamma=0.99, bootstrap_n=10, bootstrap_with_target_model=True)
actor_critic = km.ActorCritic(pi, v)
buffer = km.caching.ExperienceReplayBuffer.from_value_function(
    value_function=v, capacity=256, batch_size=16)

for ep in range(10000000):
    s = env.reset()

    for t in range(1000):
        a = actor_critic.policy(s, use_target_model=True)
        s_next, r, done, info = env.step(a)

        buffer.add(s, a, r, done, ep)

        if len(buffer) >= buffer.capacity:
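            # The snippet is cut off here; the body presumably mirrors the
            # flush-and-sync step from Example #2 (a sketch, not the original code):
            num_batches = int(4 * buffer.capacity / buffer.batch_size)
            for _ in range(num_batches):
                actor_critic.batch_update(*buffer.sample())
            buffer.clear()

            # soft update of the target models (pi_old / v_old)
            actor_critic.sync_target_model(tau=0.1)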