Example #1
    def __init__(self, base_folder: str, models_folder_name: str,
                 load_pi_predict_model: bool, load_v_predict_model: bool,
                 save_episodes: bool, save_episodes_folder: str,
                 save_gifs: bool, gifs_folder_name: str,
                 save_pi_predict_models: bool, save_v_predict_models: bool,
                 run_indefinitely: bool, max_nb_episodes: int,
                 use_keras_gym_train_monitor: bool):
        """It makes little sense to have both save_episodes and save_gifs set to True, since episodes can be watched
        (in better quality, even though the files are smaller) using WatchReplay.py."""

        self.base_folder = base_folder
        self.models_folder_name = models_folder_name
        self.save_episodes = save_episodes
        self.save_episodes_folder = save_episodes_folder
        self.should_save_gifs = save_gifs
        self.gifs_folder_name = gifs_folder_name
        self.should_save_pi_predict_models = save_pi_predict_models
        self.should_save_v_predict_models = save_v_predict_models
        self.run_indefinitely = run_indefinitely
        self.max_nb_episodes = max_nb_episodes
        self.use_keras_gym_train_monitor = use_keras_gym_train_monitor

        self.models_folder = os.path.join(self.base_folder, self.models_folder_name)
        self.gifs_folder = os.path.join(self.base_folder, self.gifs_folder_name)

        if save_episodes_folder and not os.path.exists(save_episodes_folder):
            os.makedirs(save_episodes_folder)
        if models_folder_name and not os.path.exists(self.models_folder):
            os.makedirs(self.models_folder)
        if save_gifs and self.gifs_folder and not os.path.exists(self.gifs_folder):
            os.makedirs(self.gifs_folder)

        self.env = gym.make('Riverraid-v0')
        self.env = km.wrappers.ImagePreprocessor(self.env, height=RL_PREPROCESS_HEIGHT, width=RL_PREPROCESS_WIDTH,
                                                 grayscale=RL_PREPROCESS_GRAYSCALE)
        self.env = km.wrappers.FrameStacker(self.env, num_frames=RL_PREPROCESS_NUM_FRAMES)
        if use_keras_gym_train_monitor:
            self.env = km.wrappers.TrainMonitor(self.env)

        # show logs from TrainMonitor
        km.enable_logging()

        # function approximators
        self.func = km.predefined.AtariFunctionApproximator(self.env)
        self.pi = km.SoftmaxPolicy(self.func, update_strategy=RL_PI_UPDATE_STRATEGY)  # PPO

        self.v = km.V(self.func, gamma=RLTrainer.GAMMA,
                      bootstrap_with_target_model=RLTrainer.BOOTSTRAP_WITH_TARGET_MODEL,
                      bootstrap_n=RLTrainer.BOOTSTRAP_N)

        self.actor_critic = km.ActorCritic(self.pi, self.v)

        # we'll use this to temporarily store our experience
        self.buffer = km.caching.ExperienceReplayBuffer.from_value_function(
            value_function=self.v, capacity=RLTrainer.BUFFER_CAPACITY, batch_size=RLTrainer.BUFFER_BATCH_SIZE)

        if load_pi_predict_model:
            self.load_pi_predict_model_weights()
        if load_v_predict_model:
            self.load_v_predict_model_weights()
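This constructor is an excerpt from a larger class (referred to here as RLTrainer, given the RLTrainer.GAMMA-style references). A minimal sketch of the surrounding context it assumes is shown below; the imports follow from the calls in the body, while every constant value and the two load_* stubs are placeholders mirroring Example #2, not taken from this source:

import os
import gym
import keras_gym as km

# module-level preprocessing constants used by the constructor (placeholder values)
RL_PREPROCESS_HEIGHT = 105
RL_PREPROCESS_WIDTH = 80
RL_PREPROCESS_GRAYSCALE = True
RL_PREPROCESS_NUM_FRAMES = 3
RL_PI_UPDATE_STRATEGY = 'ppo'


class RLTrainer:
    # class-level hyperparameters referenced via RLTrainer.<NAME> (placeholder values)
    GAMMA = 0.99
    BOOTSTRAP_WITH_TARGET_MODEL = True
    BOOTSTRAP_N = 10
    BUFFER_CAPACITY = 256
    BUFFER_BATCH_SIZE = 64

    def load_pi_predict_model_weights(self):
        ...  # would restore saved policy weights from self.models_folder

    def load_v_predict_model_weights(self):
        ...  # would restore saved value-function weights from self.models_folder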
Example #2
def test_atari_ppo():
    # env with preprocessing
    env = gym.make('PongDeterministic-v4')
    env = km.wrappers.ImagePreprocessor(env,
                                        height=105,
                                        width=80,
                                        grayscale=True)
    env = km.wrappers.FrameStacker(env, num_frames=3)
    env = km.wrappers.TrainMonitor(env)

    # show logs from TrainMonitor
    km.enable_logging()

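    # Func is assumed to be a km.FunctionApproximator subclass defined
    # elsewhere in the test module (cf. the one in Example #3)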
    func = Func(env, lr=0.00025)
    pi = km.SoftmaxPolicy(function_approximator=func, update_strategy='ppo')
    v = km.V(function_approximator=func,
             gamma=0.99,
             bootstrap_n=10,
             bootstrap_with_target_model=True)
    actor_critic = km.ActorCritic(pi, v)

    # we'll use this to temporarily store our experience
    buffer = km.caching.ExperienceReplayBuffer.from_value_function(
        value_function=v, capacity=256, batch_size=64)

    # run episodes
    while env.T < 500000:
        s = env.reset()

        for t in range(env.spec.max_episode_steps):
            a = pi(s, use_target_model=True)  # target_model == pi_old
            s_next, r, done, info = env.step(a)

            buffer.add(s, a, r, done, env.ep)

            if len(buffer) >= buffer.capacity:
                # use 4 epochs per round
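                # (with capacity=256 and batch_size=64: 4 * 256 / 64 = 16 batches)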
                num_batches = int(4 * buffer.capacity / buffer.batch_size)
                for _ in range(num_batches):
                    actor_critic.batch_update(*buffer.sample())
                buffer.clear()

                # soft update (tau=1 would be a hard update)
                actor_critic.sync_target_model(tau=0.1)

            if done:
                break

            s = s_next

        if env.G > 0:
            break

    assert env.T < 500000, "test_atari_ppo didn't converge"
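Usage note: env.T, env.ep and env.G are counters exposed by km.wrappers.TrainMonitor (total time steps across all episodes, the episode number, and the return of the current episode, respectively). The loop therefore runs for at most 500,000 frames, stopping earlier as soon as an episode finishes with a positive return, which in Pong means the agent outscored the opponent.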
Example #3
import gym
import keras_gym as km
from tensorflow.keras.layers import Conv2D, Lambda, Dense, Flatten
from tensorflow.keras import backend as K

# env with preprocessing
env = gym.make('PongDeterministic-v4')
env = km.wrappers.ImagePreprocessor(env, height=105, width=80, grayscale=True)
env = km.wrappers.FrameStacker(env, num_frames=3)
env = km.wrappers.TrainMonitor(env, tensorboard_dir='data/sac/tensorboard')

# show logs from TrainMonitor
km.enable_logging()


class Func(km.FunctionApproximator):
    def body(self, S):
        def diff_transform(S):
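            # scale pixel values to [0, 1], then mix the stacked frames through
            # the diff-transform matrix so that frame-to-frame differences
            # (i.e. motion) become explicit input features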
            S = K.cast(S, 'float32') / 255
            M = km.utils.diff_transform_matrix(num_frames=3)
            return K.dot(S, M)

        X = Lambda(diff_transform)(S)
        X = Conv2D(filters=16, kernel_size=8, strides=4, activation='relu')(X)
        X = Conv2D(filters=32, kernel_size=4, strides=2, activation='relu')(X)
        X = Flatten()(X)
        X = Dense(units=256, activation='relu')(X)
        return X


# function approximators
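The listing is truncated here. Judging by the pattern in Example #2, the code would plausibly continue by building the policy, value function and actor-critic on top of Func; the sketch below mirrors Example #2, and its hyperparameter values are assumptions, not part of this source:

func = Func(env, lr=0.00025)
pi = km.SoftmaxPolicy(function_approximator=func, update_strategy='ppo')
v = km.V(function_approximator=func, gamma=0.99,
         bootstrap_n=10, bootstrap_with_target_model=True)
actor_critic = km.ActorCritic(pi, v)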