def test_callbacks(tmp_path, model_class):
    """Exercise the callback machinery: single callbacks, callback lists,
    event-driven callbacks and the legacy callable form."""
    log_folder = tmp_path / "logs/callbacks/"
    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Tiny network keeps this test fast
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    periodic_save = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Halt training once the agent performs well enough
    stop_on_reward = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)
    evaluator = EvalCallback(
        eval_env,
        callback_on_new_best=stop_on_reward,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
    )

    # Event-driven equivalent of `periodic_save`
    save_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    every_n_steps = EveryNTimesteps(n_steps=500, callback=save_on_event)

    combined = CallbackList([periodic_save, evaluator, every_n_steps])
    model.learn(500, callback=combined)
    model.learn(500, callback=None)
    # A plain list is promoted to a CallbackList automatically
    model.learn(500, callback=[periodic_save, evaluator])
    # Legacy style: a callable taking (locals, globals)
    model.learn(500, callback=lambda _locals, _globals: True)

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
def fit(self, env, episodes, verbose, episode_steps, callbacks, log_interval, agent_id=-1):
    """Mask the agent fit function to train the agent.

    Trains the wrapped stable-baselines model for ``log_interval``
    timesteps, showing a progress bar and saving periodic checkpoints.

    :param env: training environment (not used directly here; the wrapped
        model already owns its environment — TODO confirm against callers)
    :param episodes: unused here; kept for interface compatibility
    :param verbose: unused here; kept for interface compatibility
    :param episode_steps: unused here; kept for interface compatibility
    :param callbacks: unused here; kept for interface compatibility
    :param log_interval: total number of timesteps to train for
    :param agent_id: optional agent identifier (unused here)
    """
    logger.info("Starting model training")
    # Checkpoint callback: periodically save the model into the agent's
    # config directory.
    checkpoint_callback = CheckpointCallback(
        save_freq=96,
        save_path=self.agent_helper.config_dir,
        name_prefix='rl_model')
    logger.info(f"Model: {self.model.get_env()}")
    # NOTE(review): the previous version also constructed EvalCallback,
    # StopTrainingOnRewardThreshold, EveryNTimesteps and
    # StopTrainingOnMaxEpisodes instances that were never passed to
    # learn(); that dead code has been removed.
    with ProgressBarManager(log_interval) as progress_callback:
        self.model.learn(total_timesteps=log_interval,
                         callback=[progress_callback, checkpoint_callback])
def main():
    """Train an RL agent on the configured flight environment.

    Reads ``config.yml``, forces FlightGear off and a fixed interaction
    frequency, builds a normalized vectorized environment, trains either a
    fresh or a previously saved model, and announces completion via
    text-to-speech.
    """
    # get init time and use it for save path
    now = datetime.now()
    save_path = './trained/' + now.strftime("%B %d, %Y - %H.%M")
    # makedirs also creates './trained/' on a fresh checkout
    # (os.mkdir would raise FileNotFoundError if the parent is missing)
    os.makedirs(save_path)
    # using sound library for pure fun
    engine = pyttsx3.init()  # object creation
    engine.setProperty('rate', 150)  # setting up new voice rate
    with open('config.yml') as file:
        configurations = yaml.safe_load(file)
    configurations['general']['flightgear'] = 'false'
    configurations['general']['agent_interaction_freq'] = 5
    with open('config.yml', 'w') as file:
        yaml.dump(configurations, file)
    env_make = make_vec_env(configurations['general']['env'], n_envs=1, seed=0)
    env = VecNormalize(env_make, norm_obs=True, norm_reward=True, clip_obs=10.)
    # Stop training when the model reaches the reward threshold
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=300, verbose=1)
    eval_callback = EvalCallback(
        env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=save_path,
        # eval_freq must be an integer step count; '/' produced a float
        eval_freq=max(1, configurations['train']['timesteps'] // 100),
        deterministic=True)
    # Persist the normalization wrapper so evaluation can reuse its stats
    with open(save_path + '/env.pkl', "wb") as file_handler:
        pickle.dump(env, file_handler, pickle.HIGHEST_PROTOCOL)
    if configurations['train']['model'] == "none":
        print("--> Alican's LOG: A new model will be created for training")
        model = Agents.create_model(env, configurations['general']['algorithm'],
                                    save_path)
    else:
        print(
            "--> Alican's LOG: An already existed model will be used for training"
        )
        model = Agents.load_model(
            env, configurations['general']['algorithm'],
            configurations['train']['model'] + '/best_model')
    model.learn(total_timesteps=configurations['train']['timesteps'],
                callback=eval_callback,
                log_interval=20)
    engine.say("Training is finished!")
    engine.runAndWait()
    engine.stop()
def train(
    model: BaseAlgorithm, timesteps: int, eval_env: GymEnv, model_path: Path
) -> None:
    """
    Train the agent in its environment.

    Learning finishes when the agent has performed the given number of
    timesteps or when the mean evaluation reward reaches the value 1.

    :param model: RL agent
    :param timesteps: total number of steps to take (through all episodes)
    :param eval_env: evaluation environment
    :param model_path: location where the model will be saved
    """
    mlflow_callback = MlflowCallback(model_path)
    # Stop as soon as the evaluation mean reward reaches the threshold.
    reward_threshold_callback = StopTrainingOnRewardThreshold(
        reward_threshold=1
    )
    eval_callback = MlflowEvalCallback(
        eval_env=eval_env, callback_on_new_best=reward_threshold_callback
    )
    callbacks = CallbackList([mlflow_callback, eval_callback])
    model.learn(total_timesteps=timesteps, callback=callbacks)
def test_callbacks(tmp_path, model_class):
    """Check callback wiring: lists, events, episode limits and access to
    the training loop's local variables."""
    log_folder = tmp_path / "logs/callbacks/"
    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Small network for a fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    periodic_save = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training once the performance is good enough
    stop_on_reward = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)
    evaluator = EvalCallback(
        eval_env,
        callback_on_new_best=stop_on_reward,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
        warn=False,
    )

    # Event-driven twin of `periodic_save`
    save_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    every_n_steps = EveryNTimesteps(n_steps=500, callback=save_on_event)

    # Cap the total number of episodes
    stop_on_episodes = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

    callback = CallbackList([periodic_save, evaluator, every_n_steps, stop_on_episodes])
    model.learn(500, callback=callback)

    # The training loop's locals are exposed to the callback
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])

    # Child callbacks see the very same locals dict
    assert periodic_save.locals["new_obs"] is callback.locals["new_obs"]
    assert every_n_steps.locals["new_obs"] is callback.locals["new_obs"]
    assert save_on_event.locals["new_obs"] is callback.locals["new_obs"]

    # Callback counters stay in sync with the model's
    assert every_n_steps.num_timesteps == model.num_timesteps
    assert every_n_steps.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # A plain list is promoted to a CallbackList automatically
    model.learn(500, callback=[periodic_save, evaluator])
    # Legacy style: a callable taking (locals, globals)
    model.learn(500, callback=lambda _locals, _globals: True)

    # Models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a timelimit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)
        model = model_class("MlpPolicy", envs, policy_kwargs=dict(net_arch=[32]))
        stop_on_episodes = StopTrainingOnMaxEpisodes(max_episodes=max_episodes, verbose=1)
        model.learn(1000, callback=CallbackList([stop_on_episodes]))
        # Per-env episode and timestep counts must match the expected ones
        assert stop_on_episodes.n_episodes // n_envs == max_episodes
        assert model.num_timesteps // n_envs == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
seed=0) if env_nam == "hover-aviary-v0": eval_env = make_vec_env(HoverAviary, env_kwargs=sa_env_kwargs, n_envs=1, seed=0) if env_name == "flythrugate-aviary-v0": eval_env = make_vec_env(FlyThruGateAviary, env_kwargs=sa_env_kwargs, n_envs=1, seed=0) eval_env = VecTransposeImage(eval_env) #### Train the model ####################################### # checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=filename+'-logs/', name_prefix='rl_model') callback_on_best = StopTrainingOnRewardThreshold( reward_threshold=EPISODE_REWARD_THRESHOLD, verbose=1) eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1, best_model_save_path=filename + '/', log_path=filename + '/', eval_freq=int(5000 / ARGS.cpu), deterministic=True, render=False) model.learn(total_timesteps=int(1e12), callback=eval_callback, log_interval=100) ### Save the model ######################################### model.save(filename + '/success_model.zip') # Possibly never achieved print(filename)
def main():
    """Train a PPO agent on the configured Atari environment, save the model
    and write the evaluation score to results.json."""
    set_random_seed(RANDOM_SEED)
    start_time = time()

    run_name = "LargeFinalLayer"
    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, run_name)
    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, run_name)
    os.makedirs(log_path, exist_ok=True)
    results_path = os.path.join(checkpoint_path, "results.json")

    env_args = dict(
        frame_skip=4,
        screen_size=84,
        terminal_on_life_loss=True,
        clip_reward=True,
    )

    def atari_wrapper(env: gym.Env) -> gym.Env:
        # Standard Atari preprocessing (frame skip, resize, reward clipping)
        return AtariWrapper(env, **env_args)

    def make_env(rank: int, count: int) -> VecEnv:
        # Vectorized env batch; each rank derives its own seed so the
        # train/eval/final-eval environments don't share trajectories.
        return make_vec_env(
            ENV_NAME,
            n_envs=count,
            seed=RANDOM_SEED + rank,
            start_index=0,
            monitor_dir=None,
            wrapper_class=atari_wrapper,
            env_kwargs=None,
            vec_env_cls=None,
            vec_env_kwargs=None,
            monitor_kwargs=None,
        )

    # Channel-first image observations are required by the baselines models
    train_env = VecTransposeImage(make_env(0, N_ENVS))
    eval_env = VecTransposeImage(make_env(1, 1))

    # Periodic checkpoints, a wall-clock limit, and best-model evaluation
    # that stops training once the reward threshold is reached.
    save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=run_name)
    stop_callback = StopTrainingOnRewardThreshold(reward_threshold=EVAL_THRESHOLD)
    time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
    best_callback = EvalCallback(
        eval_env,
        eval_freq=EVAL_FREQ,
        best_model_save_path=checkpoint_path,
        callback_on_new_best=stop_callback,
    )
    list_callback = CallbackList([save_callback, best_callback, time_callback])

    model = PPO(
        CnnPolicy,
        train_env,
        verbose=VERBOSE,
        batch_size=BATCH_SIZE,
        seed=RANDOM_SEED,
        tensorboard_log=log_path,
        learning_rate=LEARNING_RATE,
        n_steps=UPDATE_STEPS,
        n_epochs=N_EPOCHS,
        ent_coef=ENT_COEF,
        vf_coef=VF_COEF,
        clip_range=CLIP_RANGE,
        device=DEVICE_TYPE,
        policy_kwargs=dict(features_extractor_class=FeatureExtractor),
    )

    config_path = os.path.join(checkpoint_path, "cnn_config")
    zip_path = os.path.join(checkpoint_path, "model.zip")

    # output the model config to a file for easier viewing
    with open(config_path, "w") as file:
        file.write(f"{run_name}\n")
        file.write(str(model.policy.features_extractor.cnn))

    print("Beginning training...")
    model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
    model.save(zip_path)

    del train_env
    time_taken = time() - start_time

    print("Beginning evaluation...")
    # score of the game, standard deviation of multiple runs
    reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))
    with open(results_path, "w") as handle:
        handle.write(json.dumps((reward_mean, reward_std, time_taken)))
model_def = [64,64] for task in reward_threshold.keys(): TASK_NAME = task checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/', name_prefix='rl_model') callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100 * 150 /2, verbose=1) env = gym.make(TASK_NAME) log_dir = "./logs" env_m = monitor.Monitor(env, log_dir, allow_early_resets=True) env = DummyVecEnv([lambda: env_m]) env = VecNormalize(env, norm_obs=True, norm_reward=True) # Stop training when the model reaches the reward threshold callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=reward_threshold[TASK_NAME], verbose=1) eval_callback = EvalCallback(env, callback_on_new_best=callback_on_best, verbose=1) callback = CallbackList([callback_max_episodes, eval_callback]) model = A2C('MlpPolicy', env, verbose=1,policy_kwargs=dict(net_arch=model_def)) st = time.time() model.learn(total_timesteps=100 * 150 * 10000, callback=callback) elapse_time = time.time() - st with open("./outdir/"+TASK_NAME + ".plt", "wb") as fd: chkpt = { "elapse_time": elapse_time, "reward_threshold" : reward_threshold, "reward_list" : env_m.get_episode_rewards(), "timestep_list": env_m.get_episode_lengths(), "runtime_list" : env_m.get_episode_times(),
def evaluate(individual: Individual, device: Union[torch.device, str] = "auto") -> Tuple[int]:
    """
    Evaluate a single individual model and return its mean score after the
    training time is elapsed.

    Models are trained and evaluated for a number of timestamps as
    parameterized in the constants at the top of the file.

    :param individual: The individual to evaluate.
    :param device: torch device (or "auto") used for training.
    :return: 1-tuple with the time-weighted fitness value.
    """
    t_start = time()
    layers = individual.weights
    name = individual.encode()

    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)
    if os.path.exists(checkpoint_path):
        # NOTE(review): an existing checkpoint short-circuits with a random
        # score — presumably to skip re-evaluating known individuals; verify.
        return (random.randint(MIN_SCORE, MAX_SCORE), )
    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)
    results_path = os.path.join(checkpoint_path, "results.json")

    if not os.path.exists(results_path):
        env_args = dict(
            frame_skip=4,
            screen_size=84,
            terminal_on_life_loss=True,
            clip_reward=True,
        )

        # Creates a gym environment for an atari game using the specified
        # seed and number of environments. This is a "vectorized
        # environment": Stable Baselines batches updates into vectors.
        def atari_wrapper(env: gym.Env) -> gym.Env:
            env = AtariWrapper(env, **env_args)
            return env

        def make_env(rank: int, count: int) -> VecEnv:
            # Subprocess-vectorized envs, seeded per rank
            return make_vec_env(
                ENV_NAME,
                n_envs=count,
                seed=RANDOM_SEED + rank,
                start_index=0,
                monitor_dir=None,
                wrapper_class=atari_wrapper,
                env_kwargs=None,
                vec_env_cls=SubprocVecEnv,
                vec_env_kwargs=None,
                monitor_kwargs=None,
            )

        train_env = make_env(0, N_ENVS)
        eval_env = make_env(1, 1)
        # required by models in baselines
        train_env = VecTransposeImage(train_env)
        eval_env = VecTransposeImage(eval_env)

        # setup callback to save model at fixed intervals
        save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=checkpoint_path, name_prefix=name)
        stop_callback = StopTrainingOnRewardThreshold(reward_threshold=EVAL_THRESHOLD)
        time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
        best_callback = EvalCallback(
            eval_env,
            eval_freq=EVAL_FREQ,
            best_model_save_path=checkpoint_path,
            callback_on_new_best=stop_callback,
        )
        list_callback = CallbackList(
            [save_callback, best_callback, time_callback])

        model = PPO(
            CnnPolicy,
            train_env,
            verbose=VERBOSE,
            batch_size=BATCH_SIZE,
            seed=RANDOM_SEED * 7,
            tensorboard_log=log_path,
            learning_rate=LEARNING_RATE,
            n_steps=UPDATE_STEPS,
            n_epochs=N_EPOCHS,
            ent_coef=ENT_COEF,
            vf_coef=VF_COEF,
            clip_range=CLIP_RANGE,
            device=device,
            policy_kwargs=dict(features_extractor_class=VariableBenchmark,
                               features_extractor_kwargs=dict(layers=layers)),
        )

        config_path = os.path.join(checkpoint_path, "cnn_config")
        zip_path = os.path.join(checkpoint_path, "model.zip")

        # output the model config to a file for easier viewing
        with open(config_path, "w") as file:
            file.write(f"{name}\n")
            file.write(str(model.policy.features_extractor.cnn))

        print("Beginning training...")
        model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
        model.save(zip_path)

        del train_env
        del eval_env
        time_taken = time() - t_start

        print("Beginning evaluation...")
        # score of the game, standard deviation of multiple runs
        reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))
        with open(results_path, "w") as handle:
            handle.write(json.dumps((reward_mean, reward_std, time_taken)))
    else:
        # FIX: close the cached-results file deterministically
        # (was `json.load(open(results_path, "r"))`, which leaked the handle)
        with open(results_path, "r") as handle:
            reward_mean, reward_std, time_taken = json.load(handle)

    # Shift scores positive, then weight by how long training took
    reward_mean = abs(MIN_SCORE) + reward_mean
    value = (reward_mean * weighted_time(time_taken), )
    print(f"Evaluated {name} with a score of {value} in {(time_taken):.2f}s")
    return value