Example #1
def train(
    model: BaseAlgorithm, timesteps: int, eval_env: GymEnv, model_path: Path
) -> None:
    """
    Train the agent in its environment. Learning finishes when the agent has performed the given number of timesteps or when the mean reward over 10 evaluation episodes reaches 1.
    :param model: RL agent
    :param timesteps: total number of steps to take (through all episodes)
    :param eval_env: evaluation environment
    :param model_path: location where model will be saved
    """
    mlflow_callback = MlflowCallback(model_path)
    reward_threshold_callback = StopTrainingOnRewardThreshold(
        reward_threshold=1
    )
    eval_callback = MlflowEvalCallback(
        eval_env=eval_env, callback_on_new_best=reward_threshold_callback
    )
    callbacks = CallbackList([mlflow_callback, eval_callback])

    model.learn(total_timesteps=timesteps, callback=callbacks)
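A minimal usage sketch for the function above. MlflowCallback and MlflowEvalCallback are project-specific and not shown here; everything below (the PPO algorithm, the environment id, the paths) is an illustrative assumption, not taken from the original.

# Hypothetical usage sketch, assuming stable-baselines3 and a standard Gym environment
from pathlib import Path

import gym
from stable_baselines3 import PPO

train_env = gym.make("CartPole-v1")
eval_env = gym.make("CartPole-v1")
model = PPO("MlpPolicy", train_env, verbose=0)

# train() stops after 100_000 steps or once the evaluation reward threshold is reached
train(model, timesteps=100_000, eval_env=eval_env, model_path=Path("models/ppo_cartpole"))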
Example #2
    def learn(self, model: BaseAlgorithm) -> None:
        """
        :param model: an initialized RL model
        """
        kwargs = {}
        if self.log_interval > -1:
            kwargs = {"log_interval": self.log_interval}

        if len(self.callbacks) > 0:
            kwargs["callback"] = self.callbacks

        try:
            model.learn(self.n_timesteps, **kwargs)
        except KeyboardInterrupt:
            # allows saving the model when training is interrupted (Ctrl+C)
            pass
        finally:
            # Release resources
            try:
                model.env.close()
            except EOFError:
                pass
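For context, a sketch of the kind of wrapper class this method could live in. Only the attribute names (n_timesteps, log_interval, callbacks) are taken from the method body; the class name and constructor are assumptions.

# Hypothetical container for the learn() method above; only the attributes
# the method reads are required.
class Trainer:
    def __init__(self, n_timesteps: int, log_interval: int = -1, callbacks=None):
        self.n_timesteps = n_timesteps
        self.log_interval = log_interval   # -1 means "use the algorithm default"
        self.callbacks = callbacks or []   # list of BaseCallback instances

    # learn(self, model) as defined in Example #2 goes here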
Example #3
def evaluate(environment: RelativeExchange, model: BaseAlgorithm) -> None:
    """Run one deterministic episode and report the cumulative PnL and reward."""
    total_pnl = 0
    total_reward = 0
    done = False
    state = environment.reset()
    while not done:
        action, _ = model.predict(state, deterministic=True)
        state, reward, done, info = environment.step(action)
        total_reward += reward
        total_pnl += info['pnl']

    environment.clean_up()
    print(f'Pnl: {total_pnl} / Reward: {total_reward}')
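The loop above uses the classic Gym 4-tuple step API. If the environment followed the newer Gymnasium 5-tuple API instead, the same rollout would look roughly like this (a sketch, assuming RelativeExchange exposed that interface):

# Sketch of the same rollout under the Gymnasium step API (terminated/truncated split)
state, _ = environment.reset()
done = False
total_pnl = total_reward = 0
while not done:
    action, _ = model.predict(state, deterministic=True)
    state, reward, terminated, truncated, info = environment.step(action)
    done = terminated or truncated
    total_reward += reward
    total_pnl += info['pnl']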
Example #4
def save_stable_model(
    output_dir: str,
    model: BaseAlgorithm,
    vec_normalize: Optional[VecNormalize] = None,
) -> None:
    """Serialize policy.

    Load later with `load_policy(..., policy_path=output_dir)`.

    Args:
        output_dir: Path to the save directory.
        model: The stable baselines model to save.
        vec_normalize: Optionally, a VecNormalize to save statistics for.
            `load_policy` automatically applies `NormalizePolicy` wrapper
            when loading.
    """
    os.makedirs(output_dir, exist_ok=True)
    model.save(os.path.join(output_dir, "model.pkl"))
    if vec_normalize is not None:
        with open(os.path.join(output_dir, "vec_normalize.pkl"), "wb") as f:
            pickle.dump(vec_normalize, f)
    logging.info("Saved policy to %s", output_dir)
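A hedged sketch of the loading side. The load_policy helper referenced in the docstring is not shown in this example; the sketch below simply reverses the two files written above using the algorithm's own load() and pickle. The PPO class is an assumption; substitute whichever algorithm was saved.

# Hypothetical counterpart to save_stable_model(); assumes the saved model was a PPO instance
import os
import pickle

from stable_baselines3 import PPO

def load_stable_model(output_dir: str):
    model = PPO.load(os.path.join(output_dir, "model.pkl"))
    vec_normalize = None
    vec_normalize_path = os.path.join(output_dir, "vec_normalize.pkl")
    if os.path.exists(vec_normalize_path):
        with open(vec_normalize_path, "rb") as f:
            vec_normalize = pickle.load(f)
    return model, vec_normalize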
Example #5
def evaluate(model: BaseAlgorithm, env: Env, n_episodes: int = 100):
    """Evaluate the model over ``n_episodes``. ``env`` is expected to be a
    single-env vectorized environment (rewards and dones are indexed with ``[0]``)."""
    episodical_rewards = []

    for _ in range(n_episodes):
        obs = env.reset()
        episodical_reward = 0
        done = False

        while not done:
            action = model.predict(obs)
            obs, rewards, dones, _ = env.step(action[0])
            episodical_reward += rewards[0]
            done = dones[0]
        episodical_rewards.append(episodical_reward)

    return mean(episodical_rewards), stdev(episodical_rewards)
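Because rewards and dones are indexed with [0], this function expects a single-environment VecEnv rather than a raw gym.Env. A usage sketch under that assumption; the environment id and algorithm are illustrative, not from the original.

# Hypothetical usage; the wrapper makes step() return batched rewards/dones
import gym
from statistics import mean, stdev   # used inside evaluate()

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

vec_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = PPO("MlpPolicy", vec_env).learn(total_timesteps=10_000)
avg, std = evaluate(model, vec_env, n_episodes=20)
print(f"mean reward {avg:.1f} +/- {std:.1f}")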
Example #6
    def save_trained_model(self, model: BaseAlgorithm) -> None:
        """
        Save trained model optionally with its replay buffer
        and ``VecNormalize`` statistics

        :param model: the trained RL model to save
        """
        print(f"Saving to {self.save_path}")
        model.save(f"{self.save_path}/{self.env_id}")

        if hasattr(model, "save_replay_buffer") and self.save_replay_buffer:
            print("Saving replay buffer")
            model.save_replay_buffer(os.path.join(self.save_path, "replay_buffer.pkl"))

        if self.normalize:
            # Important: save the running averages; the same normalization is needed when testing the agent
            model.get_vec_normalize_env().save(os.path.join(self.params_path, "vecnormalize.pkl"))
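A sketch of how the three artifacts written above could be restored with stable-baselines3. The SAC class, paths, and environment id are assumptions (replay buffers only exist for off-policy algorithms); the VecNormalize file is written under self.params_path in the original.

# Hypothetical restore of the artifacts saved by save_trained_model()
import os

import gym
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

save_path = "logs/sac"      # assumed to match self.save_path / self.params_path
env_id = "Pendulum-v1"      # assumed to match self.env_id

model = SAC.load(os.path.join(save_path, env_id))
model.load_replay_buffer(os.path.join(save_path, "replay_buffer.pkl"))

# VecNormalize statistics must be wrapped around a fresh env before evaluation
venv = DummyVecEnv([lambda: gym.make(env_id)])
venv = VecNormalize.load(os.path.join(save_path, "vecnormalize.pkl"), venv)
venv.training = False       # don't update running averages at test time
venv.norm_reward = False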
Example #7
def evaluate(
    model: BaseAlgorithm,
    env: Union[VecEnv, gym.Env],
    number_of_episodes: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Evaluate for a given number of episodes."""
    rewards = []
    episode_lengths = []
    for _ in tqdm(list(range(number_of_episodes))):
        state = env.reset()
        reward_cum = 0
        steps = 0
        while True:
            actions = model.predict(state)[0]
            state, reward, done, _ = env.step(actions)
            reward_cum += reward
            steps += 1
            if np.any(done):
                break
        rewards.append(reward_cum)
        episode_lengths.append(steps)

    return np.array(rewards), np.array(episode_lengths)
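Usage sketch; the algorithm, environment id, and episode count are illustrative. With a multi-env VecEnv the loop stops as soon as any sub-environment finishes, so a single-env wrapper is the safer choice here.

# Hypothetical call of evaluate(); a single-env DummyVecEnv keeps np.any(done) unambiguous
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = A2C("MlpPolicy", env).learn(total_timesteps=10_000)

rewards, lengths = evaluate(model, env, number_of_episodes=50)
print(f"reward: {rewards.mean():.1f} +/- {rewards.std():.1f}, "
      f"mean episode length: {lengths.mean():.1f}")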