def fit(self,
            env,
            episodes,
            verbose,
            episode_steps,
            callbacks,
            log_interval,
            agent_id=-1):
        """Train ``self.model`` while periodically checkpointing it.

        The model is trained for ``log_interval`` timesteps. During training a
        progress-bar callback (from ``ProgressBarManager``) and a
        ``CheckpointCallback`` are active; checkpoints are written to
        ``self.agent_helper.config_dir`` with the file prefix ``rl_model``.

        Args:
            env: Not used by the current implementation; the model trains on
                the environment it already holds.
            episodes: Accepted for interface compatibility; not used.
            verbose: Accepted for interface compatibility; not used.
            episode_steps: Accepted for interface compatibility; not used.
            callbacks: Accepted for interface compatibility; not used.
            log_interval: Number of timesteps to train for. The same value is
                handed to ``ProgressBarManager``.
            agent_id: Accepted for interface compatibility; not used.

        Returns:
            None.
        """
        # FIXME: use a meaningful TensorBoard log name.
        # TODO: evaluation and early-stopping callbacks (EvalCallback,
        # StopTrainingOnRewardThreshold, EveryNTimesteps,
        # StopTrainingOnMaxEpisodes) are not wired into training yet. They were
        # previously constructed here but never passed to ``learn``.
        # TODO: post-training evaluation (evaluate_policy + self.eval_writer)
        # is currently disabled.

        # NOTE(review): the original comment said "every 10 episodes"; the
        # value 96 is passed as save_freq -- confirm it matches that intent.
        checkpoint_callback = CheckpointCallback(
            save_freq=96,
            save_path=self.agent_helper.config_dir,
            name_prefix='rl_model')

        logger.info(f"Model: {self.model.get_env()}")
        with ProgressBarManager(log_interval) as progress_callback:
            self.model.learn(total_timesteps=log_interval,
                             callback=[progress_callback, checkpoint_callback])
def runner(agent, episode, checkpoint, env):
    """Train ``agent`` with logging/checkpoint callbacks and save its rewards.

    Args:
        agent: Object exposing ``learn(total_timesteps=..., callback=...)``.
        episode: Passed to ``LoggerCallback`` and used as ``max_episodes`` for
            ``StopTrainingOnMaxEpisodes``.
        checkpoint: Directory path (string). Used as the checkpoint save path
            and as the folder that receives ``data.csv``.
        env: Not used by this function.

    Returns:
        The ``rewards`` attribute of the ``LoggerCallback`` after training.
        The same values are written to ``<checkpoint>/data.csv``.
    """
    # To resume from an earlier run, load the agent before calling this
    # function (e.g. from ``<checkpoint>/rl_model_<N>_steps``).
    reward_logger = LoggerCallback(episode, checkpoint=checkpoint)
    periodic_saver = CheckpointCallback(
        save_freq=100000,
        save_path=checkpoint,
        name_prefix='rl_model',
    )
    episode_limit = StopTrainingOnMaxEpisodes(max_episodes=episode, verbose=1)
    # Trigger the reward logger through an every-1-timestep event wrapper.
    per_step_logger = EveryNTimesteps(n_steps=1, callback=reward_logger)

    agent.learn(
        total_timesteps=100000000,
        callback=CallbackList([per_step_logger, periodic_saver, episode_limit]),
    )

    scores = reward_logger.rewards
    np.savetxt(checkpoint + '/data.csv', scores, delimiter=',')
    return scores
Example #3
0
    def learn(self, initial_models):
        """Train a TD3 model seeded from ``initial_models`` and roll out one test episode.

        Steps performed:
          1. Build a TD3 ``"MlpPolicy"`` model on ``self.env`` and overwrite
             its parameters with ``to_torch(initial_models)``
             (``exact_match=False``).
          2. Train for 1000 timesteps with a ``TensorboardCallback`` and a
             ``StopTrainingOnMaxEpisodes(max_episodes=5)`` callback.
          3. Run one episode on ``Monitor(self.env)`` using the trained model
             and store every transition in a 100-slot ``ReplayBuffer``.

        Args:
            initial_models: Parameters converted with ``to_torch`` before being
                loaded into the TD3 model.

        Returns:
            tuple: ``(meta_buffer, tf_mesa_models)`` where ``meta_buffer`` is
            ``{"test": <test ReplayBuffer>, "train": <model replay buffer>}``
            and ``tf_mesa_models`` is ``from_torch(...)`` applied to the
            trained model's parameters.
        """
        mesa_algo = TD3(
            "MlpPolicy", self.env, verbose=1, learning_starts=1
        )  # Note: Unnecessarily initializes parameters (could speed up a bit by fixing)

        mesa_algo.set_parameters(to_torch(initial_models), exact_match=False)

        # Evaluation during training is currently disabled. To re-enable it,
        # append an EvalCallback(self.env, best_model_save_path=<models dir>,
        # log_path=<logs dir>, deterministic=True, eval_freq=5,
        # n_eval_episodes=1) to the list below. The directories previously
        # hard-coded here were
        #   /home/jet/catkin_ws/src/marsha/marsha_ai/training/logs/
        #   /home/jet/catkin_ws/src/marsha/marsha_ai/training/models/
        callback_list = [
            TensorboardCallback(),
            StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1),
        ]
        mesa_algo.learn(total_timesteps=1000, callback=callback_list
                        )  #rospy.get_param("/hyperparameters/total_timesteps")

        print("finished training! Testing mesa network...")
        test_buffer = ReplayBuffer(100,
                                   TaskEnv.observation_space,
                                   TaskEnv.action_space,
                                   device="cuda")

        # Roll out a single episode with the trained policy and record it.
        test_env = Monitor(self.env)
        done = False
        ob = test_env.reset()
        while not done:
            # The second value returned by predict() is not needed here.
            action, _ = mesa_algo.predict(ob)
            next_ob, reward, done, info = test_env.step(action)
            test_buffer.add(ob, next_ob, action, reward, done, [info])
            ob = next_ob

        meta_buffer = {"test": test_buffer, "train": mesa_algo.replay_buffer}

        optimized_mesa_parameters = mesa_algo.get_parameters()
        tf_mesa_models = from_torch(optimized_mesa_parameters)

        return meta_buffer, tf_mesa_models
Example #4
0
def test_callbacks(tmp_path, model_class):
    """Exercise the callback classes together with ``model_class``.

    Covers: a ``CallbackList`` combining checkpoint, eval, event-driven
    checkpoint and max-episodes callbacks; access to ``locals`` from parent
    and child callbacks; callback counters versus the model's timestep
    counter; the ``None`` / list / plain-function forms of ``callback``; and,
    for A2C and PPO, ``StopTrainingOnMaxEpisodes`` with two vectorised envs.
    """
    out_dir = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions, so the env depends on the model.
    env_id = select_env(model_class)
    # Small network for a fast test.
    model = model_class("MlpPolicy", env_id, policy_kwargs={"net_arch": [32]})

    periodic_ckpt = CheckpointCallback(save_freq=1000, save_path=out_dir)

    eval_env = gym.make(env_id)
    # Stop training if the performance is good enough.
    stop_on_reward = StopTrainingOnRewardThreshold(
        reward_threshold=-1200, verbose=1
    )
    evaluator = EvalCallback(
        eval_env,
        callback_on_new_best=stop_on_reward,
        best_model_save_path=out_dir,
        log_path=out_dir,
        eval_freq=100,
        warn=False,
    )

    # Same job as `periodic_ckpt`, but driven by an event callback.
    event_ckpt = CheckpointCallback(
        save_freq=1, save_path=out_dir, name_prefix="event"
    )
    every_500 = EveryNTimesteps(n_steps=500, callback=event_ckpt)

    # Stop training if the maximum number of episodes is reached.
    episode_cap = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

    combined = CallbackList([periodic_ckpt, evaluator, every_500, episode_cap])
    model.learn(500, callback=combined)

    # Local variables are reachable from the parent callback ...
    latest_obs = combined.locals["new_obs"]
    assert model.env.observation_space.contains(latest_obs[0])
    # ... and the child callbacks see the very same objects.
    assert periodic_ckpt.locals["new_obs"] is combined.locals["new_obs"]
    assert every_500.locals["new_obs"] is combined.locals["new_obs"]
    assert event_ckpt.locals["new_obs"] is combined.locals["new_obs"]
    # Internal callback counters match the model's counters.
    assert every_500.num_timesteps == model.num_timesteps
    assert every_500.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # A plain list is converted into a callback list automatically.
    model.learn(500, callback=[periodic_ckpt, evaluator])
    # Old-style functional callbacks are wrapped automatically.
    model.learn(500, callback=lambda _locals, _globals: True)

    # Models that support multiple envs.
    if model_class in (A2C, PPO):
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a time limit of 200 timesteps.
        max_episode_length = 200
        vec_envs = make_vec_env(env_id, n_envs=n_envs, seed=0)

        model = model_class(
            "MlpPolicy", vec_envs, policy_kwargs={"net_arch": [32]}
        )

        episode_cap = StopTrainingOnMaxEpisodes(
            max_episodes=max_episodes, verbose=1
        )
        combined = CallbackList([episode_cap])
        model.learn(1000, callback=combined)

        # Episodes and timesteps per env must match the expected values.
        assert episode_cap.n_episodes // n_envs == max_episodes
        assert model.num_timesteps // n_envs == max_episode_length

    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)



# Reward threshold per Gym environment id. Each value is passed to
# StopTrainingOnRewardThreshold for the matching task in the loop below.
reward_threshold = { "MountainCar-v0":-100,
                     "CartPole-v1": 500,
                    "Acrobot-v1":-67, "BipedalWalker-v3":30,
                    "LunarLander-v2":200, 'Pendulum-v0':-100}

# Passed as `net_arch` in the A2C policy kwargs below.
model_def = [64,64]

# Build and train one A2C model per environment id listed in reward_threshold.
for task in reward_threshold.keys():
    TASK_NAME = task
    # NOTE(review): checkpoint_callback is created but is not part of the
    # CallbackList assembled in the lines visible here -- confirm whether it
    # should be added.
    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
                                             name_prefix='rl_model')
    # NOTE(review): `100 * 150 /2` uses true division, so under Python 3 the
    # value passed as max_episodes is the float 7500.0, not an int.
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100 * 150 /2, verbose=1)
    env = gym.make(TASK_NAME)

    log_dir = "./logs"

    # Wrap the raw env: Monitor (given log_dir) -> DummyVecEnv -> VecNormalize
    # (observations and rewards both normalised).
    env_m = monitor.Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env_m])
    env = VecNormalize(env, norm_obs=True, norm_reward=True)
    # Stop training when the model reaches the reward threshold
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=reward_threshold[TASK_NAME], verbose=1)
    # NOTE(review): the same wrapped `env` object is handed to EvalCallback and
    # to the A2C model below -- confirm that sharing it is intended.
    eval_callback = EvalCallback(env, callback_on_new_best=callback_on_best, verbose=1)
    callback = CallbackList([callback_max_episodes, eval_callback])
   
    model = A2C('MlpPolicy', env, verbose=1,policy_kwargs=dict(net_arch=model_def))
    # Wall-clock start time; it is not read in the lines visible here.
    st = time.time()
    # Timestep budget of 100 * 150 * 10000 = 150,000,000; both callbacks in
    # `callback` are active during this call.
    model.learn(total_timesteps=100 * 150 * 10000, callback=callback)