def test_recurrent_eval_callback():
    env_id = 'Pendulum-v0'
    # Create envs
    env = make_vec_env(env_id, n_envs=4)
    eval_env = make_vec_env(env_id, n_envs=1)

    # Create RL model
    model = PPO2('MlpLstmPolicy', env)

    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER, eval_freq=100)

    model.learn(300, callback=eval_callback)

    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
def test_callbacks(model_class):
    env_id = 'Pendulum-v0'
    if model_class in [ACER, DQN]:
        env_id = 'CartPole-v1'

    allowed_failures = []
    # The number of training timesteps is kept short; otherwise the test would
    # take too long or would require custom parameters per algorithm
    if model_class in [PPO1, DQN, TRPO]:
        allowed_failures = ['rollout_end']

    # Create RL model
    model = model_class('MlpPolicy', env_id)

    checkpoint_callback = CheckpointCallback(save_freq=500, save_path=LOG_FOLDER)

    # For testing: use the same training env
    eval_env = model.get_env()
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)
    eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best,
                                 best_model_save_path=LOG_FOLDER,
                                 log_path=LOG_FOLDER, eval_freq=100)

    # Equivalent to the `checkpoint_callback`,
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=LOG_FOLDER,
                                             name_prefix='event')
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback])

    model.learn(500, callback=callback)
    model.learn(200, callback=None)

    custom_callback = CustomCallback()
    model.learn(200, callback=custom_callback)
    # Check that every callback method was called
    custom_callback.validate(allowed_failures=allowed_failures)

    # Transform callback into a callback list automatically
    custom_callback = CustomCallback()
    model.learn(500, callback=[checkpoint_callback, eval_callback, custom_callback])
    # Check that every callback method was called
    custom_callback.validate(allowed_failures=allowed_failures)

    # Automatic wrapping, old way of doing callbacks
    model.learn(200, callback=lambda _locals, _globals: True)

    # Cleanup
    if os.path.exists(LOG_FOLDER):
        shutil.rmtree(LOG_FOLDER)
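# Minimal sketch of a callback in the spirit of the `CustomCallback` used above
# (an assumption: the real test helper is defined elsewhere in the test module).
# It records which BaseCallback hooks fired so `validate` can check them afterwards.
from stable_baselines.common.callbacks import BaseCallback


class CustomCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.called = set()

    def _on_training_start(self):
        self.called.add('training_start')

    def _on_rollout_start(self):
        self.called.add('rollout_start')

    def _on_step(self):
        self.called.add('step')
        return True  # returning False would stop training early

    def _on_rollout_end(self):
        self.called.add('rollout_end')

    def _on_training_end(self):
        self.called.add('training_end')

    def validate(self, allowed_failures=()):
        expected = {'training_start', 'rollout_start', 'step',
                    'rollout_end', 'training_end'}
        missing = expected - self.called - set(allowed_failures)
        assert not missing, "Callback hooks never called: {}".format(missing)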
def train(self, tensorboard_log: str) -> None:
    # Resume from an existing model if possible, otherwise start from scratch
    try:
        self.load_model(tensorboard_log=tensorboard_log)
    except Exception:
        self.create_model(tensorboard_log=tensorboard_log)

    # Stop training once the reward gets close to zero
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-0.1, verbose=1)
    eval_callback = EvalCallback(self.env, callback_on_new_best=callback_on_best, verbose=1)

    # Save the model at regular intervals
    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path='./model_checkpoints/')

    # Chain callbacks together
    callback = CallbackList([eval_callback, checkpoint_callback])

    # Train the model (the callbacks stop training and save checkpoints)
    self.model.learn(total_timesteps=int(1e10), callback=callback, tb_log_name="run")

    print("Training is finished!")
def init_env(env_id):
    # Training env uses `num_cpu` workers; a single-worker env is kept for evaluation
    if parallel:
        env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
        reward_env = SubprocVecEnv([make_env(env_id, i) for i in range(1)])
    else:
        env = DummyVecEnv([make_env(env_id, i) for i in range(num_cpu)])
        reward_env = DummyVecEnv([make_env(env_id, i) for i in range(1)])

    if terminate_early:
        # Stop training once the mean evaluation reward reaches 0.85
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=0.85,
                                                         verbose=verbose)
        eval_callback = EvalCallback(reward_env, callback_on_new_best=callback_on_best,
                                     eval_freq=10_000, verbose=verbose)
        return env, reward_env, eval_callback
    else:
        return env, reward_env, None
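# Usage sketch for `init_env` (an assumption, not part of the original source):
# `make_env`, `parallel`, `num_cpu`, `verbose` and `terminate_early` are
# module-level names that `init_env` relies on; A2C and 'CartPole-v1' are just
# placeholders for the call site.
from stable_baselines import A2C

env, reward_env, eval_callback = init_env('CartPole-v1')
model = A2C('MlpPolicy', env, verbose=1)
# `eval_callback` is None when `terminate_early` is False; `learn` accepts that.
model.learn(total_timesteps=100000, callback=eval_callback)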
def train(self):
    # Load the latest checkpoint if available
    try:
        path = os.getcwd()
        os.chdir(os.getcwd() + '/model_checkpoints')
        files = [x for x in os.listdir() if x.endswith(".zip")]
        num = []
        for file in files:
            num.append([int(x) for x in file.split('_') if x.isdigit()][0])
        filename = "rl_model_" + str(max(num)) + "_steps.zip"
        print("Tentative: " + filename)
        self.model = PPO2.load(load_path=filename,
                               env=DummyVecEnv([lambda: self.env]),
                               tensorboard_log='./a2c_rasp_tensorboard/')
        print("Successfully loaded the previous model: " + filename)
        os.chdir(path)
    except Exception:
        # Wrap our new environment in a DummyVecEnv
        env = DummyVecEnv([lambda: self.env])
        # Create a new model
        self.model = PPO2('MlpPolicy', env, verbose=1,
                          tensorboard_log='./a2c_rasp_tensorboard/')
        print("Successfully created new model")

    # Stop training if the reward gets close to zero
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1e-2, verbose=1)
    eval_callback = EvalCallback(self.env, callback_on_new_best=callback_on_best, verbose=1)

    # Save the model at regular intervals
    checkpoint_callback = CheckpointCallback(save_freq=2000,
                                             save_path='./model_checkpoints/')

    # Chain callbacks together
    callback = CallbackList([eval_callback, checkpoint_callback])

    # Train the model
    episode = 1
    while episode < 10:
        # Update the location of the red dot
        _ = self.env.square

        if self.env.trainable:
            print("Beginning episode number {}".format(episode))
            self.model.learn(total_timesteps=int(1e10), callback=callback,
                             tb_log_name="run")
            episode += 1

    # Save the trained model
    self.model.save("raspberry_agent")
def build_eval_callback(
    self,
    eval_freq=10000,
    reward_threshold=900,
    log_path=None,
    eval_episodes=10,
    eval_env=None,
):
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=reward_threshold,
                                                     verbose=1)
    eval_callback = EvalCallback(
        eval_env=eval_env,
        best_model_save_path=log_path,
        log_path=log_path,
        eval_freq=eval_freq,
        deterministic=True,
        render=False,
        n_eval_episodes=eval_episodes,
        callback_on_new_best=callback_on_best,
        verbose=1,
    )
    self.logger.debug(
        "Eval callback called every {} timesteps: stop training when mean reward "
        "is above {} in {} episodes".format(eval_freq, reward_threshold, eval_episodes)
    )
    return eval_callback
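# Usage sketch for `build_eval_callback` (an assumption: `trainer` is a
# hypothetical instance of the class that defines the method above, holding a
# `logger` and a `model` attribute; the environment and numbers are placeholders).
import gym

eval_env = gym.make('Pendulum-v0')
callback = trainer.build_eval_callback(eval_freq=5000,
                                       reward_threshold=-200,
                                       log_path='./eval_logs/',
                                       eval_episodes=5,
                                       eval_env=eval_env)
trainer.model.learn(total_timesteps=1000000, callback=callback)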
from stable_baselines import PPO2
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from BeautifulBlueSquare.BlueGymEnv import simpleAvoidance

# Separate evaluation env
eval_env = simpleAvoidance()

# Stop training when the model reaches the reward threshold: 800 * 0.9 = 720
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=720, verbose=1)

# Callback that evaluates the model, saves the best one, and stops training
# once the reward threshold above is reached
eval_callback = EvalCallback(eval_env, n_eval_episodes=20, eval_freq=int(800 * 50),
                             callback_on_new_best=callback_on_best,
                             best_model_save_path="model", log_path="model", verbose=1)

# Almost infinite number of timesteps, but training will stop
# early as soon as the reward threshold is reached
env = simpleAvoidance()
model = PPO2(CnnPolicy, env, gamma=0.99, n_steps=256)
model.learn(total_timesteps=int(20e6), callback=eval_callback)
import gym

from stable_baselines import DQN
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.env_checker import check_env
from stable_baselines.common.callbacks import (CheckpointCallback, EvalCallback,
                                               StopTrainingOnRewardThreshold)

from Config import Config
from Result import Result

# Init Battleship gym environments and config
config = Config(5, [3, 2, 2], True, False, False)
env2 = gym.make('Battleships-v0', config=config)
env3 = gym.make('Battleships-v0', config=config)
env = DummyVecEnv([lambda: env2])
env4 = DummyVecEnv([lambda: env3])
check_env(env2, warn=True)

# Define callbacks
# Stop training once the mean reward reaches the environment's maximum
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=env2.calculate_threshold(),
                                                 verbose=1)
# Save the currently best model during evaluation
eval_callback = EvalCallback(env4, callback_on_new_best=callback_on_best, verbose=1,
                             best_model_save_path='./DQN_Models/best/')
checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./model_checkpoints/')

# Uncomment to train a fresh model; otherwise an already trained model is fine-tuned
# model = DQN(MlpPolicy, env, verbose=2, tensorboard_log="./logs/progress_tensorboard/")

# Load the current best model
model = DQN.load("DQN_Models/dqn_5x5_3_SingleShot.zip", verbose=2, env=env,
                 tensorboard_log="./logs/progress_tensorboard/")

# Train model
model.learn(total_timesteps=1000000, callback=[checkpoint_callback, eval_callback])

# Delete the current model and load the best model
del model
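# Sketch of the reload step announced by the comment above (an assumption:
# EvalCallback saves the best model as `best_model.zip` inside its
# `best_model_save_path`, here './DQN_Models/best/').
model = DQN.load("DQN_Models/best/best_model.zip", env=env)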