from pathlib import Path

from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.callbacks import CallbackList, StopTrainingOnRewardThreshold
from stable_baselines3.common.type_aliases import GymEnv

# MlflowCallback and MlflowEvalCallback are project-local helpers.


def train(
    model: BaseAlgorithm,
    timesteps: int,
    eval_env: GymEnv,
    model_path: Path,
) -> None:
    """
    Train the agent in its environment. Training finishes when the agent
    has performed the given number of timesteps or when the mean reward
    over 10 evaluation episodes reaches 1.

    :param model: RL agent
    :param timesteps: total number of steps to take (across all episodes)
    :param eval_env: evaluation environment
    :param model_path: location where the model will be saved
    """
    mlflow_callback = MlflowCallback(model_path)
    reward_threshold_callback = StopTrainingOnRewardThreshold(reward_threshold=1)
    eval_callback = MlflowEvalCallback(
        eval_env=eval_env,
        callback_on_new_best=reward_threshold_callback,
    )
    callbacks = CallbackList([mlflow_callback, eval_callback])
    model.learn(total_timesteps=timesteps, callback=callbacks)
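# Hedged usage sketch for train() above; "CartPole-v1", the PPO choice, and
# the save location are assumptions, not from the source. Requires the
# project-local Mlflow callbacks used inside train() to be importable.
from pathlib import Path

import gym
from stable_baselines3 import PPO

train_env = gym.make("CartPole-v1")
eval_env = gym.make("CartPole-v1")
agent = PPO("MlpPolicy", train_env, verbose=0)
train(agent, timesteps=100_000, eval_env=eval_env, model_path=Path("models/cartpole"))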
def learn(self, model: BaseAlgorithm) -> None:
    """
    :param model: an initialized RL model
    """
    kwargs = {}
    if self.log_interval > -1:
        kwargs = {"log_interval": self.log_interval}

    if len(self.callbacks) > 0:
        kwargs["callback"] = self.callbacks

    try:
        model.learn(self.n_timesteps, **kwargs)
    except KeyboardInterrupt:
        # Swallow the interrupt so the caller can still save the model
        # when training is stopped manually
        pass
    finally:
        # Release resources
        try:
            model.env.close()
        except EOFError:
            pass
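# Hypothetical harness for learn() above: `self` only needs n_timesteps,
# log_interval, and callbacks. The class name and defaults are assumptions,
# not from the source.
from typing import List, Optional

from stable_baselines3.common.callbacks import BaseCallback


class Trainer:
    def __init__(
        self,
        n_timesteps: int,
        log_interval: int = -1,  # -1 keeps the algorithm's default interval
        callbacks: Optional[List[BaseCallback]] = None,
    ):
        self.n_timesteps = n_timesteps
        self.log_interval = log_interval
        self.callbacks = callbacks or []

    learn = learn  # bind the standalone function above as a method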
from stable_baselines3.common.base_class import BaseAlgorithm

# RelativeExchange is a project-local trading environment.


def evaluate(environment: RelativeExchange, model: BaseAlgorithm) -> None:
    """Roll out one episode and print the accumulated PnL and reward."""
    total_pnl = 0
    total_reward = 0
    done = False
    state = environment.reset()
    while not done:
        action, _ = model.predict(state, deterministic=True)
        state, reward, done, info = environment.step(action)
        total_reward += reward
        total_pnl += info["pnl"]
    environment.clean_up()
    print(f"PnL: {total_pnl} / Reward: {total_reward}")
import logging
import os
import pickle
from typing import Optional

from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.vec_env import VecNormalize


def save_stable_model(
    output_dir: str,
    model: BaseAlgorithm,
    vec_normalize: Optional[VecNormalize] = None,
) -> None:
    """Serialize the policy. Load later with `load_policy(..., policy_path=output_dir)`.

    Args:
        output_dir: Path to the save directory.
        model: The Stable Baselines model.
        vec_normalize: Optionally, a VecNormalize to save statistics for.
            `load_policy` automatically applies the `NormalizePolicy` wrapper
            when loading.
    """
    os.makedirs(output_dir, exist_ok=True)
    model.save(os.path.join(output_dir, "model.pkl"))
    if vec_normalize is not None:
        with open(os.path.join(output_dir, "vec_normalize.pkl"), "wb") as f:
            pickle.dump(vec_normalize, f)
    logging.info("Saved policy to %s", output_dir)
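# A minimal sketch of loading what save_stable_model wrote, for when the
# source's own load_policy() is not at hand. PPO is an assumption; use
# whichever BaseAlgorithm subclass was saved.
import os
import pickle

from stable_baselines3 import PPO

output_dir = "policies/run0"  # hypothetical directory
model = PPO.load(os.path.join(output_dir, "model.pkl"))
stats_path = os.path.join(output_dir, "vec_normalize.pkl")
vec_normalize = None
if os.path.exists(stats_path):
    with open(stats_path, "rb") as f:
        vec_normalize = pickle.load(f)  # wrap the eval env with this to match training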
from statistics import mean, stdev

from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.vec_env import VecEnv


def evaluate(model: BaseAlgorithm, env: VecEnv, n_episodes: int = 100):
    """Return the mean and standard deviation of episode rewards."""
    episodical_rewards = []
    for _ in range(n_episodes):
        obs = env.reset()
        episodical_reward = 0
        done = False
        while not done:
            # predict() returns (actions, states); the [0] indexing below
            # assumes a vectorized environment with a single sub-env
            action = model.predict(obs)
            obs, rewards, dones, _ = env.step(action[0])
            episodical_reward += rewards[0]
            done = dones[0]
        episodical_rewards.append(episodical_reward)
    return mean(episodical_rewards), stdev(episodical_rewards)
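# Usage sketch for this evaluate(); the DummyVecEnv wrapper matches the
# rewards[0]/dones[0] indexing in the loop. Env id and algorithm are assumptions.
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv

vec_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = A2C("MlpPolicy", vec_env).learn(10_000)
avg, std = evaluate(model, vec_env, n_episodes=20)
print(f"reward: {avg:.1f} +/- {std:.1f}")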
def save_trained_model(self, model: BaseAlgorithm) -> None:
    """
    Save the trained model, optionally with its replay buffer
    and ``VecNormalize`` statistics.

    :param model: the trained RL model
    """
    print(f"Saving to {self.save_path}")
    model.save(f"{self.save_path}/{self.env_id}")

    if hasattr(model, "save_replay_buffer") and self.save_replay_buffer:
        print("Saving replay buffer")
        model.save_replay_buffer(os.path.join(self.save_path, "replay_buffer.pkl"))

    if self.normalize:
        # Important: save the running averages; they are needed
        # to normalize observations when testing the agent
        model.get_vec_normalize_env().save(os.path.join(self.params_path, "vecnormalize.pkl"))
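# Hedged sketch of restoring what save_trained_model wrote. PPO and
# "CartPole-v1" are assumptions; the paths stand in for save_path/env_id and
# params_path above. Assumes the classic 4-tuple Gym API.
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
venv = VecNormalize.load("logs/ppo/params/vecnormalize.pkl", venv)  # hypothetical path
venv.training = False     # freeze the running statistics at test time
venv.norm_reward = False  # report raw rewards during evaluation
model = PPO.load("logs/ppo/CartPole-v1", env=venv)  # hypothetical save_path/env_id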
from typing import Tuple, Union

import gym
import numpy as np
from tqdm import tqdm

from stable_baselines3.common.base_class import BaseAlgorithm
from stable_baselines3.common.vec_env import VecEnv


def evaluate(
    model: BaseAlgorithm,
    env: Union[VecEnv, gym.Env],
    number_of_episodes: int,
) -> Tuple[np.ndarray, np.ndarray]:
    """Evaluate for a given number of episodes."""
    rewards = []
    episode_lengths = []
    for _ in tqdm(range(number_of_episodes)):
        state = env.reset()
        reward_cum = 0
        steps = 0
        while True:
            actions = model.predict(state)[0]
            state, reward, done, _ = env.step(actions)
            reward_cum += reward
            steps += 1
            if np.any(done):
                break
        rewards.append(reward_cum)
        episode_lengths.append(steps)
    return np.array(rewards), np.array(episode_lengths)
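# Usage sketch for this evaluate(); the environment id and training budget
# are assumptions. Works with a plain gym.Env thanks to np.any(done).
import gym
from stable_baselines3 import PPO

env = gym.make("CartPole-v1")
model = PPO("MlpPolicy", env).learn(10_000)
rewards, lengths = evaluate(model, env, number_of_episodes=50)
print(f"mean reward {rewards.mean():.1f}, mean episode length {lengths.mean():.1f}")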