Example #1
        if args.stack_frames > 1:
            eval_venv = VecFrameStack(eval_venv, args.stack_frames)
        eval_callback = callbacks.EvalCallback_with_prefix(
            eval_env=eval_venv,
            best_model_save_path=str(common.output_data_folder / "models" /
                                     saved_model_filename),
            prefix=f"{test_body}",
            n_eval_episodes=3,
            eval_freq=1000,  # will be implicitly multiplied by train_num_envs
            deterministic=True,
        )
        all_callbacks.append(eval_callback)

    if args.with_checkpoint:
        checkpoint_callback = CheckpointCallback(
            save_freq=1000,
            save_path=str(common.output_data_folder / 'checkpoints'),
            name_prefix=args.train_bodies)
        all_callbacks.append(checkpoint_callback)
        if args.vec_normalize:
            save_vec_callback = callbacks.SaveVecNormalizeCallback(
                save_freq=1000,
                save_path=str(common.output_data_folder / 'checkpoints'),
                name_prefix=args.train_bodies)
            all_callbacks.append(save_vec_callback)

    # Swap the policy's activation function for the custom MyThreshold activation
    hyperparams['policy_kwargs']['activation_fn'] = MyThreshold

    model = PPO('MlpPolicy',
                venv,
                verbose=1,
                tensorboard_log=str(common.output_data_folder / "tensorboard" /
Example #2
    if not args.recodex and args.total_timesteps > 0:

        policy_kwargs = dict(net_arch=[args.controller_size] * args.controller_depth)


        if args.load_from is None:
            if args.discrete_actions:
                model = DQN('MlpPolicy', env, tau=args.tau, learning_rate=args.lr, exploration_initial_eps=args.epsilon, exploration_final_eps=args.epsilon_final, exploration_fraction=args.epsilon_final_at, train_freq=args.train_freq,
                            batch_size=args.batch_size, buffer_size=args.buffer_size, gamma=args.gamma, target_update_interval=args.target_update_interval, learning_starts=args.learning_starts, policy_kwargs=policy_kwargs, verbose=1)

            else:
                # Continuous actions: use DDPG with Gaussian action noise
                n_actions = env.action_space.shape[-1]
                action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                                 sigma=args.action_noise * np.ones(n_actions))

                model = DDPG('MlpPolicy', env, action_noise=action_noise, batch_size=args.batch_size,
                             buffer_size=args.buffer_size, gamma=args.gamma, policy_kwargs=policy_kwargs, verbose=1)

        checkpoint_on_event = CheckpointCallback(save_freq=1, name_prefix=get_params_str(args.seed), save_path='./checkpoints/')
        event_callback = EveryNTimesteps(n_steps=args.checkpoint_every, callback=checkpoint_on_event)

        model.learn(total_timesteps=args.total_timesteps, log_interval=1, callback=event_callback)

        if args.save_to:
            model.save(args.save_to)
        else:
            model.save("saved_models/" + get_params_str(f"envSeed-{args.seed}"))

    if args.evaluate_for:
        evaluate(model, env)
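
The `evaluate` helper called at the end of this example is not part of the listing; a minimal sketch, assuming it simply runs greedy episodes and reports the mean return (the episode count and print format are assumptions):

def evaluate(model, env, episodes=100):
    # Hypothetical helper: run deterministic rollouts and report the average return.
    returns = []
    for _ in range(episodes):
        state, done = env.reset(), False
        episode_return = 0.0
        while not done:
            action, _states = model.predict(state, deterministic=True)
            state, reward, done, _ = env.step(action)
            episode_return += reward
        returns.append(episode_return)
    print("Mean return over", episodes, "episodes:", sum(returns) / len(returns))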
Example #3
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy",
                        env_name,
                        policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200,
                                                     verbose=1)

    eval_callback = EvalCallback(
        eval_env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
        warn=False,
    )
    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1,
                                             save_path=log_folder,
                                             name_prefix="event")

    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100,
                                                      verbose=1)

    callback = CallbackList([
        checkpoint_callback, eval_callback, event_callback,
        callback_max_episodes
    ])
    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callback was called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]
    # Check that internal callback counters match models' counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a timelimit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)

        model = model_class("MlpPolicy",
                            envs,
                            policy_kwargs=dict(net_arch=[32]))

        callback_max_episodes = StopTrainingOnMaxEpisodes(
            max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected one
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
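
The `select_env` helper used at the top of this test is not shown; a plausible minimal version, assuming it just maps the algorithm class to a Gym environment id with a compatible action space:

def select_env(model_class) -> str:
    # DQN only supports discrete actions, so give it CartPole;
    # the other algorithms are exercised on the continuous Pendulum task.
    if model_class is DQN:
        return "CartPole-v1"
    return "Pendulum-v0"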
Example #4
def main(env, args):
    global model
    # Fix the random seed
    np.random.seed(args.seed)

    if args.recodex:
        models = []
        for path in args.load_from:
            models.append(SAC.load(path))

        while True:
            state, done = env.reset(start_evaluation=True), False
            ret = 0
            while not done:
                # Ensemble action: sum the per-model actions, scaled by 1/sqrt(#models)
                action = np.sum(
                    [m.predict(state, deterministic=True)[0] for m in models],
                    axis=0) / len(models)**0.5
                # print(action)

                # action, _states = model.predict(state, deterministic=True)
                # action, _states = model.predict(state)

                ## TODO delete before submitting
                if not args.no_render:
                    env.render()

                state, reward, done, _ = env.step(action)
                ret += reward

            print("Episode return:", ret)

    else:

        tensorboard_log_dir = None if args.tensorboard_log_dir is None else os.path.join(
            args.tensorboard_log_dir, get_exp_name())

        model = SAC("MlpPolicy",
                    env,
                    learning_rate=lr_schedule,
                    buffer_size=args.buffer_size,
                    learning_starts=args.learning_starts,
                    n_episodes_rollout=args.train_episodes,
                    batch_size=args.batch_size,
                    tau=args.tau,
                    gamma=args.gamma,
                    train_freq=args.train_freq,
                    gradient_steps=args.gradient_steps,
                    ent_coef="auto"
                    if args.ent_coef == "auto" else float(args.ent_coef),
                    use_sde=False,
                    policy_kwargs=dict(log_std_init=-3,
                                       net_arch=args.net_arch,
                                       use_expln=True),
                    tensorboard_log=tensorboard_log_dir,
                    rew_skip_thres=args.rew_skip_thres,
                    seed=args.seed)

        model.verbose = 2

        callbacks = [
            CheckpointCallback(20000,
                               "checkpoints",
                               name_prefix=get_exp_name()),
            EvalCallback(
                gym.make(getEnvName()),
                callback_on_new_best=SaveBestModelCallback(
                    save_path="best/" + get_exp_name() + "_best_model.zip"),
                eval_freq=20000,
                n_eval_episodes=5,
                deterministic=True),
            EpisodeCallback(env, model)
        ]

        print(args.log_interval)
        model.learn(args.timesteps,
                    log_interval=args.log_interval,
                    callback=callbacks)

        # Final evaluation
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()),
                                         evaluate_for=200,
                                         seed=args.seed)

        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                action, _states = model.predict(state, deterministic=True)
                state, reward, done, _ = env.step(action)

        model.save(get_exp_name())
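
The `lr_schedule` passed to SAC above is not defined in this excerpt. Stable-Baselines3 accepts a callable that maps the remaining training progress (1.0 at the start, 0.0 at the end) to a learning rate, so a minimal linear-decay sketch could look like this (the 3e-4 starting value is an assumption, not taken from the original script):

def lr_schedule(progress_remaining: float) -> float:
    # Linear decay: progress_remaining goes from 1.0 down to 0.0 over training.
    return 3e-4 * progress_remaining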
Example #5
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import SubprocVecEnv, VecFrameStack
from stable_baselines3.ppo import CnnPolicy
import numpy as np
import gym
from utils import *

gamename = "MortalKombatII-Genesis"

if __name__ == "__main__":
    n_cpu = 16

    env = SubprocVecEnv([make_env] * n_cpu)
    env = VecFrameStack(env, n_stack=4)

    model = PPO(CnnPolicy,
                env,
                n_steps=128,
                verbose=1,
                tensorboard_log="./tboard_log")
    # Use this if you want to continue training a saved model
    # model = PPO.load("training_checkpoints/your_model.zip", tensorboard_log="./tboard_log")
    # model.set_env(env)

    checkpoint_callback = CheckpointCallback(
        save_freq=1000,
        save_path='./training_checkpoints',
        name_prefix='subzero-ppo2')
    model.learn(total_timesteps=20000000, callback=checkpoint_callback)
    model.save('subzero-ppo2')
    env.close()
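
The `make_env` factory comes from the wildcard `utils` import and is not shown; a minimal sketch, assuming it builds a fresh Gym Retro environment in each SubprocVecEnv worker process (the wrappers actually applied in `utils` are unknown):

import retro


def make_env():
    # Each SubprocVecEnv worker calls this factory in its own process,
    # which also satisfies Gym Retro's one-emulator-per-process limit.
    return retro.make(game="MortalKombatII-Genesis")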