if args.stack_frames > 1:
    eval_venv = VecFrameStack(eval_venv, args.stack_frames)
eval_callback = callbacks.EvalCallback_with_prefix(
    eval_env=eval_venv,
    best_model_save_path=str(common.output_data_folder / "models" / saved_model_filename),
    prefix=f"{test_body}",
    n_eval_episodes=3,
    eval_freq=1e3,  # will be implicitly multiplied by train_num_envs
    deterministic=True,
)
all_callbacks.append(eval_callback)

if args.with_checkpoint:
    checkpoint_callback = CheckpointCallback(
        save_freq=1000,
        save_path=str(common.output_data_folder / 'checkpoints'),
        name_prefix=args.train_bodies)
    all_callbacks.append(checkpoint_callback)

if args.vec_normalize:
    save_vec_callback = callbacks.SaveVecNormalizeCallback(
        save_freq=1000,
        save_path=str(common.output_data_folder / 'checkpoints'),
        name_prefix=args.train_bodies)
    all_callbacks.append(save_vec_callback)

hyperparams['policy_kwargs']['activation_fn'] = MyThreshold
model = PPO('MlpPolicy', venv, verbose=1,
            tensorboard_log=str(common.output_data_folder / "tensorboard" /
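# The project's `callbacks` module is not shown above. A minimal sketch of what
# its SaveVecNormalizeCallback could look like, assuming it follows the pattern
# used in rl-baselines3-zoo: periodically pickle the VecNormalize statistics so
# they can be reloaded alongside the model checkpoint. The class body below is
# an illustration, not the project's actual implementation.
import os
from stable_baselines3.common.callbacks import BaseCallback

class SaveVecNormalizeCallback(BaseCallback):
    def __init__(self, save_freq: int, save_path: str, name_prefix: str = "", verbose: int = 0):
        super().__init__(verbose)
        self.save_freq = save_freq
        self.save_path = save_path
        self.name_prefix = name_prefix

    def _init_callback(self) -> None:
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.save_freq == 0:
            vec_normalize = self.model.get_vec_normalize_env()
            if vec_normalize is not None:
                path = os.path.join(self.save_path, f"{self.name_prefix}_vecnormalize_{self.num_timesteps}.pkl")
                vec_normalize.save(path)  # pickles the running obs/return statistics
                if self.verbose > 0:
                    print(f"Saved VecNormalize statistics to {path}")
        return True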
if not args.recodex and args.total_timesteps > 0:
    policy_kwargs = dict(net_arch=[args.controller_size] * args.controller_depth)
    if args.load_from is None:
        if args.discrete_actions:
            model = DQN('MlpPolicy', env, tau=args.tau, learning_rate=args.lr,
                        exploration_initial_eps=args.epsilon,
                        exploration_final_eps=args.epsilon_final,
                        exploration_fraction=args.epsilon_final_at,
                        train_freq=args.train_freq, batch_size=args.batch_size,
                        buffer_size=args.buffer_size, gamma=args.gamma,
                        target_update_interval=args.target_update_interval,
                        learning_starts=args.learning_starts,
                        policy_kwargs=policy_kwargs, verbose=1)
        else:
            # The noise objects for DDPG
            n_actions = env.action_space.shape[-1]
            # action_noise = NormalActionNoise(mean=np.zeros(n_actions),
            #                                  sigma=args.action_noise * np.ones(n_actions))
            # model = DDPG('MlpPolicy', env, action_noise=action_noise,
            #              batch_size=args.batch_size, buffer_size=args.buffer_size,
            #              gamma=args.gamma, policy_kwargs=policy_kwargs, verbose=1)

    checkpoint_on_event = CheckpointCallback(save_freq=1,
                                             name_prefix=get_params_str(args.seed),
                                             save_path='./checkpoints/')
    event_callback = EveryNTimesteps(n_steps=args.checkpoint_every, callback=checkpoint_on_event)
    model.learn(total_timesteps=args.total_timesteps, log_interval=1, callback=event_callback)

    if args.save_to:
        model.save(args.save_to)
    else:
        model.save("saved_models/" + get_params_str(f"envSeed-{args.seed}"))
    if args.evaluate_for:
        evaluate(model, env)
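# The save_freq=1 + EveryNTimesteps combination above is worth spelling out:
# EveryNTimesteps fires its child callback once every `n_steps` environment
# timesteps, and the child CheckpointCallback then saves unconditionally
# because its own save_freq of 1 is always satisfied. A self-contained sketch
# of the equivalence for a single env (CartPole-v1 and the /tmp paths are
# placeholders, not taken from the code above):
import gym
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps

env = gym.make("CartPole-v1")
model = DQN("MlpPolicy", env, verbose=0)

# Two equivalent ways to checkpoint every 5000 steps. Note the caveat:
# CheckpointCallback.save_freq counts callback calls (timesteps // n_envs),
# while EveryNTimesteps.n_steps counts total timesteps, so with one env the
# two line up exactly.
direct = CheckpointCallback(save_freq=5_000, save_path="/tmp/ckpt_direct")
event = EveryNTimesteps(n_steps=5_000,
                        callback=CheckpointCallback(save_freq=1, save_path="/tmp/ckpt_event"))

model.learn(total_timesteps=10_000, callback=[direct, event])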
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"
    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)
    eval_callback = EvalCallback(
        eval_env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
        warn=False,
    )

    # Equivalent to the `checkpoint_callback`,
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if the max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback, callback_max_episodes])
    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callbacks were called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]
    # Check that internal callback counters match the model's counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # A list of callbacks is transformed into a CallbackList automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a timelimit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)
        model = model_class("MlpPolicy", envs, policy_kwargs=dict(net_arch=[32]))
        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected one
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
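# The test above leans heavily on callback.locals; a minimal custom callback
# showing how those locals are exposed through BaseCallback (a sketch for
# illustration, not part of the test suite):
from stable_baselines3.common.callbacks import BaseCallback

class PrintObsShapeCallback(BaseCallback):
    """Logs the shape of the most recent observation batch every 100 calls."""

    def _on_step(self) -> bool:
        if self.n_calls % 100 == 0:
            new_obs = self.locals["new_obs"]  # the same object the test asserts on
            print(f"step {self.num_timesteps}: new_obs shape = {new_obs.shape}")
        return True  # returning False would stop training early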
def main(env, args):
    global model

    # Fix random seeds and number of threads
    np.random.seed(args.seed)

    if args.recodex:
        models = []
        for path in args.load_from:
            models.append(SAC.load(path))
        while True:
            state, done = env.reset(start_evaluation=True), False
            ret = 0
            while not done:
                # Ensemble: sum the deterministic actions of all loaded models,
                # scaled by 1/sqrt(number of models)
                actions = np.array([m.predict(state, deterministic=True)[0] for m in models])
                action = np.sum(actions, axis=0) / len(models) ** 0.5
                if not args.no_render:
                    env.render()
                state, reward, done, _ = env.step(action)
                ret += reward
            print("Episode return:", ret)
    else:
        tensorboard_log_dir = None if args.tensorboard_log_dir is None else os.path.join(
            args.tensorboard_log_dir, get_exp_name())
        model = SAC("MlpPolicy", env,
                    learning_rate=lr_schedule,
                    buffer_size=args.buffer_size,
                    learning_starts=args.learning_starts,
                    n_episodes_rollout=args.train_episodes,
                    batch_size=args.batch_size,
                    tau=args.tau,
                    gamma=args.gamma,
                    train_freq=args.train_freq,
                    gradient_steps=args.gradient_steps,
                    ent_coef="auto" if args.ent_coef == "auto" else float(args.ent_coef),
                    use_sde=False,
                    policy_kwargs=dict(log_std_init=-3, net_arch=args.net_arch, use_expln=True),
                    tensorboard_log=tensorboard_log_dir,
                    rew_skip_thres=args.rew_skip_thres,
                    seed=args.seed)
        model.verbose = 2
        callbacks = [
            CheckpointCallback(20000, "checkpoints", name_prefix=get_exp_name()),
            EvalCallback(
                gym.make(getEnvName()),
                callback_on_new_best=SaveBestModelCallback(
                    save_path="best/" + get_exp_name() + "_best_model.zip"),
                eval_freq=20000, n_eval_episodes=5, deterministic=True),
            EpisodeCallback(env, model),
        ]
        print(args.log_interval)
        model.learn(args.timesteps, log_interval=args.log_interval, callback=callbacks)

        # Final evaluation
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()), evaluate_for=200, seed=args.seed)
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                action, _states = model.predict(state, deterministic=True)
                state, reward, done, _ = env.step(action)
        model.save(get_exp_name())
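# `lr_schedule` is referenced above but not defined in this excerpt. SB3
# accepts either a constant or a callable mapping the remaining training
# progress (1.0 at the start, 0.0 at the end) to a learning rate, so a
# plausible definition is the linear schedule below (the 3e-4 initial value is
# an assumption, not the author's actual setting):
def lr_schedule(progress_remaining: float) -> float:
    """Linearly anneal the learning rate from 3e-4 down to 0 over training."""
    return 3e-4 * progress_remaining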
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import SubprocVecEnv, VecFrameStack
from stable_baselines3.ppo import CnnPolicy
from utils import *

gamename = "MortalKombatII-Genesis"

if __name__ == "__main__":
    n_cpu = 16
    env = SubprocVecEnv([make_env] * n_cpu)
    env = VecFrameStack(env, n_stack=4)
    model = PPO(CnnPolicy, env, n_steps=128, verbose=1, tensorboard_log="./tboard_log")

    # Use this if you want to continue training a saved model
    # model = PPO.load("training_checkpoints/your_model.zip", tensorboard_log="./tboard_log")
    # model.set_env(env)

    checkpoint_callback = CheckpointCallback(
        save_freq=1000,
        save_path='./training_checkpoints',
        name_prefix='subzero-ppo2')

    model.learn(total_timesteps=20000000, callback=checkpoint_callback)
    model.save('subzero-ppo2')
    env.close()
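# `make_env` is imported from the project's `utils` module, which is not shown.
# For a Gym Retro game it plausibly looks like the sketch below (retro.make and
# the absence of extra wrappers are assumptions, not the project's actual code).
# Note also that CheckpointCallback counts callback calls rather than
# timesteps, so with 16 parallel envs, save_freq=1000 means a checkpoint every
# 16000 environment timesteps.
import retro

def make_env():
    # Each SubprocVecEnv worker invokes this factory in its own process
    return retro.make(game="MortalKombatII-Genesis")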