def _init_callback(
        self,
        callback: MaybeCallback,
        eval_env: Optional[VecEnv] = None,
        eval_freq: int = 10000,
        n_eval_episodes: int = 5,
        log_path: Optional[str] = None,
    ) -> BaseCallback:
        """
        :param callback: Callback(s) called at every step with state of the algorithm.
        :param eval_env: Environment to use for evaluation; if None, no evaluation is performed.
        :param eval_freq: Evaluate the agent every ``eval_freq`` timesteps.
        :param n_eval_episodes: Number of episodes to rollout during evaluation.
        :param log_path: Path to a folder where the evaluations will be saved.
        :return: A hybrid callback calling ``callback`` and performing evaluation.
        """
        # Convert a list of callbacks into a callback
        if isinstance(callback, list):
            callback = CallbackList(callback)

        # Convert functional callback to object
        if not isinstance(callback, BaseCallback):
            callback = ConvertCallback(callback)

        # Create eval callback in charge of the evaluation
        if eval_env is not None:
            eval_callback = EvalCallback(eval_env,
                                         best_model_save_path=log_path,
                                         log_path=log_path,
                                         eval_freq=eval_freq,
                                         n_eval_episodes=n_eval_episodes)
            callback = CallbackList([callback, eval_callback])

        callback.init_callback(self)
        return callback
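The helper above normalizes every accepted callback form into a single BaseCallback before training starts. A minimal usage sketch of the three supported forms, assuming stable-baselines3 and Gym's CartPole-v1 (the callback class and step counts below are illustrative, not part of the original code):

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback


class PrintStepCallback(BaseCallback):
    """Tiny illustrative callback: print the timestep count every 1000 calls."""

    def _on_step(self) -> bool:
        if self.n_calls % 1000 == 0:
            print(f"{self.num_timesteps} timesteps so far")
        return True  # returning False would stop training


model = PPO("MlpPolicy", "CartPole-v1", verbose=0)

# All three accepted forms are normalized into one BaseCallback by _init_callback:
model.learn(2_000, callback=PrintStepCallback())                      # BaseCallback instance
model.learn(2_000, callback=[PrintStepCallback(),                     # list -> CallbackList
                             CheckpointCallback(save_freq=1_000, save_path="./logs/")])
model.learn(2_000, callback=lambda _locals, _globals: True)           # callable -> ConvertCallback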
Example #2
    def _init_callback(self,
                       callback: Union[None, Callable, List[BaseCallback], BaseCallback],
                       eval_env: Optional[VecEnv] = None,
                       eval_freq: int = 10000,
                       n_eval_episodes: int = 5,
                       log_path: Optional[str] = None) -> BaseCallback:
        """
        :param callback: (Union[callable, [BaseCallback], BaseCallback, None])
        :return: (BaseCallback)
        """
        # Convert a list of callbacks into a callback
        if isinstance(callback, list):
            callback = CallbackList(callback)

        # Convert functional callback to object
        if not isinstance(callback, BaseCallback):
            callback = ConvertCallback(callback)

        # Create eval callback in charge of the evaluation
        if eval_env is not None:
            eval_callback = EvalCallback(eval_env,
                                         best_model_save_path=log_path,
                                         log_path=log_path,
                                         eval_freq=eval_freq,
                                         n_eval_episodes=n_eval_episodes)
            callback = CallbackList([callback, eval_callback])

        callback.init_callback(self)
        return callback
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Separate environment used for rendering during evaluation
    render_env = factory.make_env()

    callback = CallbackList([])

    # Vectorize the environments; use subprocesses unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs,
                            norm_obs=True,
                            clip_obs=np.inf,
                            norm_reward=False,
                            clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)

        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs,
                      device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
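The script above builds the VecNormalize running averages via random exploration but never persists them. A minimal sketch of how those statistics could be saved and reused, assuming it is placed just before envs.close() in main (the file name is illustrative):

    # Persist the normalization statistics so a later run can load them via args.stats_path:
    envs.save("vecnormalize_stats.pkl")

    # A later run restores them onto a freshly built VecEnv, as done above with
    #   envs = VecNormalize.load(args.stats_path, envs)
    # and keeps them frozen for evaluation via envs.training = False.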
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(
        eval_env, callback_on_new_best=callback_on_best, best_model_save_path=log_folder, log_path=log_folder, eval_freq=100
    )

    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback])

    model.learn(500, callback=callback)
    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)
    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
    def fit(self,
            env,
            episodes,
            verbose,
            episode_steps,
            callbacks,
            log_interval,
            agent_id=-1):
        """Mask the agent fit function
        To train the agent
        """
        logger.info("herer")
        # self.model.learn(total_timesteps=100, log_interval=10)
        #FIXME: use the tb logname meaningful!

        #TODO: Write callback funcs here:
        # List of callback:
        # Checkpoint Callback: save the model every 10 episodes.
        checkpoint_callback = CheckpointCallback(
            save_freq=96,
            save_path=self.agent_helper.config_dir,
            name_prefix='rl_model')
        # Eval Callback: evaluate every eval_freq, save the best model to best_model_save_path.
        eval_env = env
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/',
                                     eval_freq=500,
                                     deterministic=True,
                                     render=False)
        # StopTrainingOnRewardThreshold: stop training once the reward threshold is reached, i.e. the policy is good enough.
        callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=70,
                                                         verbose=1)
        eval_callback_reward_threshold = EvalCallback(
            eval_env, callback_on_new_best=callback_on_best, verbose=1)
        # EveryNTimesteps: trigger the checkpoint callback every n timesteps to save the model.
        checkpoint_on_event = CheckpointCallback(save_freq=1,
                                                 save_path='./logs/')
        event_callback_after_n_steps = EveryNTimesteps(
            n_steps=500, callback=checkpoint_on_event)

        # StopTrainingOnMaxEpisodes:
        # Stops training when the model reaches the maximum number of episodes
        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=5,
                                                          verbose=1)

        # CallbackList: chain several callbacks together.
        callbacklist = CallbackList([checkpoint_callback, eval_callback])

        logger.info(f"Model: {self.model.get_env()}")
        with ProgressBarManager(log_interval) as progress_callback:
            self.model.learn(total_timesteps=log_interval,
                             callback=[progress_callback, checkpoint_callback])
        # mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        # self.eval_writer(mean_reward, std_reward)
        pass
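The ProgressBarManager used above is a project-specific helper that is not shown here. A minimal tqdm-based stand-in built only on the standard BaseCallback hooks might look like the sketch below (class name and structure are an assumption, not the original helper); it would be passed to learn() like any other callback.

from tqdm import tqdm
from stable_baselines3.common.callbacks import BaseCallback


class ProgressBarCallback(BaseCallback):
    """Display a tqdm progress bar that advances with the model's timesteps."""

    def __init__(self, total_timesteps: int):
        super().__init__()
        self.total_timesteps = total_timesteps
        self.pbar = None

    def _on_training_start(self) -> None:
        self.pbar = tqdm(total=self.total_timesteps)

    def _on_step(self) -> bool:
        self.pbar.n = self.num_timesteps
        self.pbar.refresh()
        return True

    def _on_training_end(self) -> None:
        self.pbar.close()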
Example #6
 def lean(
     self,
     callback: MaybeCallback = None,
     log_interval: int = 4,
     eval_env: Optional[GymEnv] = None,
     eval_freq: int = -1,
     n_eval_episodes: int = 5,
     tb_log_name: str = "run",
     eval_log_path: Optional[str] = None,
     reset_num_timesteps: bool = True,
 ):
     callback = CallbackList([self.checkpoint_cb, callback])
     self.model.learn(total_timesteps=self.args.time_steps,
                      log_interval=self.config.sac_log_interval(),
                      tb_log_name="racer_learnig_log",
                      callback=callback)
     return self.model
def runner(agent, episode, checkpoint, env):
    # scores = np.genfromtxt(checkpoint+'/data.csv', delimiter=',')
    # checkpoint2 = checkpoint+'2'
    custom_callback = LoggerCallback(episode, checkpoint=checkpoint)
    checkpoint_callback = CheckpointCallback(save_freq=100000, save_path=checkpoint,
                                             name_prefix='rl_model')
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=episode, verbose=1)
    event_callback = EveryNTimesteps(n_steps=1, callback=custom_callback)
    # load = os.path.abspath(checkpoint+'/rl_model_676000_steps')
    # print(load)
    # agent = DDPG.load(load, env)
    callback_list = CallbackList([event_callback, checkpoint_callback, callback_max_episodes])
    # agent.learn(total_timesteps=100000000, callback=callback_list, reward_function=reward)
    agent.learn(total_timesteps=100000000, callback=callback_list)
    scores = custom_callback.rewards
    np.savetxt(checkpoint+'/data.csv', scores, delimiter=',')

    return scores
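LoggerCallback above is user code that is not shown; judging from how custom_callback.rewards is read afterwards, it collects episode returns. A minimal sketch of such a callback, assuming a Monitor-wrapped environment so info["episode"] is populated (class and attribute names are only illustrative):

from stable_baselines3.common.callbacks import BaseCallback


class EpisodeRewardLogger(BaseCallback):
    """Collect episode returns reported by Monitor-wrapped environments."""

    def __init__(self, verbose: int = 0):
        super().__init__(verbose)
        self.rewards = []

    def _on_step(self) -> bool:
        # Monitor adds an "episode" dict to `info` when an episode ends.
        for info in self.locals.get("infos", []):
            episode = info.get("episode")
            if episode is not None:
                self.rewards.append(episode["r"])
        return True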
Example #8
    def __init__(self,
                 path,
                 env_cls,
                 env_kwargs,
                 agent_kwargs,
                 steps_per_rollout,
                 num_envs,
                 callbacks=[]):
        self.folder = ExperimentFolder(path)
        self.agent, self.env = self.folder.get(env_cls, env_kwargs,
                                               agent_kwargs)
        self.steps_per_rollout = steps_per_rollout
        self.num_envs = num_envs

        store = lambda _: self.folder.store(self.agent, env_kwargs,
                                            agent_kwargs)
        self.get_callback = lambda save_freq: CallbackList(callbacks + [
            EveryNRolloutsPlusStartFinishFunctionCallback(save_freq, store)
        ])
Example #9
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Separate environment used for rendering during evaluation
    render_env = factory.make_env()

    callback = CallbackList([])

    # Vectorize the environments; use subprocesses unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])
    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Example #10
 def lean(
     self,
     callback: MaybeCallback = None,
     log_interval: int = 4,
     eval_env: Optional[GymEnv] = None,
     eval_freq: int = -1,
     n_eval_episodes: int = 5,
     tb_log_name: str = "run",
     eval_log_path: Optional[str] = None,
     reset_num_timesteps: bool = True,
 ):
     # NOTE Avoid NoneType object callback for Simulation. This is problem of subcommand.py .
     callback = CallbackList(
         [c for c in [self.checkpoint_cb, callback] if c is not None])
     self.model.learn(total_timesteps=self.args.time_steps,
                      log_interval=self.config.sac_log_interval(),
                      tb_log_name="racer_learnig_log",
                      callback=callback)
     return self.model
Example #11
def train(
    model: BaseAlgorithm, timesteps: int, eval_env: GymEnv, model_path: Path
) -> None:
    """
    Train agent moves in his environment. Learning will finish when agent performs given number of timesteps or when mean reward of 10 gameplays reachs value 1.
    :param model: RL agent
    :param timesteps: total number of steps to take (through all episodes)
    :param eval_env: evaluation environment
    :param model_path: location where model will be saved
    :param tb_log_name: the name of the run for tensorboard log
    """
    mlflow_callback = MlflowCallback(model_path)
    reward_threshold_callback = StopTrainingOnRewardThreshold(
        reward_threshold=1
    )
    eval_callback = MlflowEvalCallback(
        eval_env=eval_env, callback_on_new_best=reward_threshold_callback
    )
    callbacks = CallbackList([mlflow_callback, eval_callback])

    model.learn(total_timesteps=timesteps, callback=callbacks)
Example #12
def setup_train(config, setup_dirs=True):
    T.set_num_threads(1)
    if setup_dirs:
        for s in ["agents", "agents_cp", "tb"]:
            if not os.path.exists(s):
                os.makedirs(s)

    # Random ID of this session
    if config["default_session_ID"] is None:
        config["session_ID"] = ''.join(
            random.choices('ABCDEFGHJKLMNPQRSTUVWXYZ', k=3))
    else:
        config["session_ID"] = config["default_session_ID"]

    stats_path = "agents/{}_vecnorm.pkl".format(config["session_ID"])

    # Import correct env by name
    env_fun = my_utils.import_env(config["env_name"])
    env = env_fun(config)
    model = make_model(config, env)

    checkpoint_callback = CheckpointCallback(save_freq=100000,
                                             save_path='agents_cp/',
                                             name_prefix=config["session_ID"],
                                             verbose=1)

    # Separate evaluation env
    config_eval = deepcopy(config)
    config_eval["animate"] = False
    eval_env = env_fun(config_eval)
    # Use deterministic actions for evaluation
    eval_callback = EvalCallback(eval_env,
                                 eval_freq=10000,
                                 deterministic=True,
                                 render=False)
    callback_list = CallbackList([checkpoint_callback, eval_callback])

    return env, model, callback_list, stats_path
Example #13
def train_alg(model_alg, reset_optimizers_between_envs,
              reset_optimizers_every_iter, buffer_size, subsave, iteration,
              last_round_no_mer, is_evolving, seed):
    seed_all(seed)
    training_timesteps = META_TRAINING_TIMESTEPS
    params = params_list
    if not is_evolving:
        params = [params[-1]]

    start_time = time()
    env = gym.make(env_name)
    eval_env = gym.make(env_name)
    final_eval_env = gym.make(env_name)
    final_parameters_dict = params_sampler.sample1_means()
    change_env_parameters(final_eval_env, parameter_dict=final_parameters_dict)
    tensorboard_path = subsave + '/tb_' + str(iteration)

    optimizer_kwargs = {}
    policy_kwargs = {
        'optimizer_class': th.optim.Adam,
        'optimizer_kwargs': optimizer_kwargs,
    }
    model = model_alg(
        MlpPolicy,
        env,
        verbose=0,
        buffer_size=buffer_size,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        learning_starts=LEARNING_STARTS,
        gradient_steps=GRADIENT_STEPS,
        policy_kwargs=policy_kwargs,
        mer_s=MER_S,
        mer_gamma=MER_GAMMA,
        monitor_wrapper=True,
        tensorboard_log=tensorboard_path,
        reset_optimizers_during_training=reset_optimizers_every_iter,
        seed=seed)

    for i_param, param in enumerate(params):
        log_name = 'run_' + str(i_param)
        if i_param == (len(params) - 1):
            if not is_evolving:
                training_timesteps = FINAL_TRAINING_TIMESTEPS + NUM_TRAINING_ENVS * META_TRAINING_TIMESTEPS
            else:
                training_timesteps = FINAL_TRAINING_TIMESTEPS
            log_name += '_final'
        change_env_parameters(env, eval_env, parameter_dict=param)
        if model_alg.__name__ == 'SACMER' and last_round_no_mer and (
                i_param == (len(params) - 1)):
            is_reservoir = False
            is_mer = False
        else:  # This will not have any effect on regular SAC
            is_reservoir = True
            is_mer = True
        model.update_env(env,
                         monitor_wrapper=False,
                         is_reservoir=is_reservoir,
                         reset_optimizers=reset_optimizers_between_envs
                         )  # environment already wrapped so
        # monitor_wrapper=False
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=None,
                                     log_path=tensorboard_path + '/' +
                                     log_name + '/running_eval',
                                     eval_freq=EVAL_FREQ,
                                     n_eval_episodes=N_EVAL_EPISODES,
                                     deterministic=True,
                                     render=False)
        if is_evolving:
            final_eval_callback = EvalCallback(final_eval_env,
                                               best_model_save_path=None,
                                               log_path=tensorboard_path +
                                               '/' + log_name + '/final_eval',
                                               eval_freq=EVAL_FREQ,
                                               n_eval_episodes=N_EVAL_EPISODES,
                                               deterministic=True,
                                               render=False)
        else:
            final_eval_callback = EventCallback()
        model.learn(total_timesteps=training_timesteps,
                    log_interval=1,
                    reset_num_timesteps=False,
                    tb_log_name=log_name,
                    is_mer=is_mer,
                    callback=CallbackList([eval_callback,
                                           final_eval_callback]))
        env.reset()
        eval_env.reset()
    if iteration == 0:  # saving models fills up storage, so we only save one (which we will also probably not use)
        model.save(subsave + 'model_' + str(iteration))
    print(f"Done. Total time = {time() - start_time} seconds.")
Example #14
            eval_freq=n_timesteps_episode * args.eval_freq,
            deterministic=True,
            render=False,
            n_eval_episodes=args.eval_length)
        callbacks.append(eval_callback)

    # Set up tensorboard logger
    if args.tensorboard:
        log_callback = LoggerCallback(sinergym_logger=bool(args.logger))
        callbacks.append(log_callback)
        # let's change the default dir for the TensorboardFormatLogger only
        tb_path = args.tensorboard + '/' + name
        new_logger = configure(tb_path, ["tensorboard"])
        model.set_logger(new_logger)

    callback = CallbackList(callbacks)

    # ---------------------------------------------------------------------------- #
    #                                   TRAINING                                   #
    # ---------------------------------------------------------------------------- #
    model.learn(total_timesteps=timesteps,
                callback=callback,
                log_interval=args.log_interval)
    model.save(env.simulator._env_working_dir_parent + '/' + name)

    # If the algorithm doesn't reset or close the environment, this script does it
    # in order to correctly log all the simulation data (EnergyPlus + Sinergym logs)
    if env.simulator._episode_existed:
        env.close()
Example #15
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy",
                        env_name,
                        policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000,
                                             save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200,
                                                     verbose=1)

    eval_callback = EvalCallback(
        eval_env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
        warn=False,
    )
    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1,
                                             save_path=log_folder,
                                             name_prefix="event")

    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100,
                                                      verbose=1)

    callback = CallbackList([
        checkpoint_callback, eval_callback, event_callback,
        callback_max_episodes
    ])
    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callback was called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]
    # Check that internal callback counters match models' counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a timelimit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)

        model = model_class("MlpPolicy",
                            envs,
                            policy_kwargs=dict(net_arch=[32]))

        callback_max_episodes = StopTrainingOnMaxEpisodes(
            max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected one
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
Example #16
                                         save_path=logger.output_dir,
                                         name_prefix='rl_model')

savestats_callback = SaveNormalization(save_path=osp.join(
    logger.output_dir,
    "vec_normalization.pkl"))  # If using normalize, must create this callback

eval_callback = EvalCallback(eval_env=eval_env,
                             n_eval_episodes=5,
                             callback_on_new_best=savestats_callback,
                             eval_freq=1000,
                             best_model_save_path=osp.join(
                                 logger.output_dir, "best_model"),
                             log_path=osp.join(logger.output_dir, "results"))

callback = CallbackList([checkpoint_callback, eval_callback])

if custom_params['algo'] == 'sac':
    model = SAC(policy=custom_params['policy'],
                env=env,
                verbose=1,
                **custom_params['sac_parameters'],
                tensorboard_log=logger.output_dir)
elif custom_params['algo'] == 'dqn':
    model = DQN(policy=custom_params['policy'],
                env=env,
                verbose=1,
                **custom_params['dqn_parameters'],
                tensorboard_log=logger.output_dir)
elif custom_params['algo'] == 'a2c':
    model = A2C(policy=custom_params['policy'],
Example #17
def run(config):
    log.info(f'Beginning run for experiment {config["EXPERIMENT_ID"]}')

    # TODO: clean up
    RESULTS_PATH = config['RESULTS_PATH']
    EXPERIMENTS_PREFIX = f'{RESULTS_PATH}{config["EXPERIMENT_ID"]}{os.sep}'
    ARTIFACTS_PATH = f'{EXPERIMENTS_PREFIX}artifact{os.sep}'
    VIS_RESULTS_PATH = f'{EXPERIMENTS_PREFIX}vis{os.sep}'
    SAVE_GIF_PATH = f'{EXPERIMENTS_PREFIX}gif{os.sep}'
    WANN_OUT_PREFIX = f'{ARTIFACTS_PATH}wann{os.sep}'
    ALG_OUT_PREFIX = f'{ARTIFACTS_PATH}alg{os.sep}'
    NUM_WORKERS = config['NUM_WORKERS']
    GAME_CONFIG = config['GAME_CONFIG']
    AGENT_CONFIG = config['AGENT']

    log.info('RUN CONFIG:')
    log.info(config)

    log.info('Experiment description:')
    log.info(config['DESCRIPTION'])

    paths = [
        ARTIFACTS_PATH, VIS_RESULTS_PATH, SAVE_GIF_PATH, WANN_OUT_PREFIX,
        f'{ALG_OUT_PREFIX}checkpoint{os.sep}checkpoint-alg{os.sep}'
    ]
    for p in paths:
        if not os.path.isdir(p):
            os.makedirs(p)

    ENV_NAME = GAME_CONFIG.env_name

    games = {ENV_NAME: GAME_CONFIG}

    wtrain.init_games_config(games)

    if config['TRAIN_WANN']:
        if "parent" == mpi_fork(NUM_WORKERS + 1): os._exit(0)

    wann_param_config = config['WANN_PARAM_CONFIG']
    wann_args = dict(hyperparam=wann_param_config,
                     outPrefix=WANN_OUT_PREFIX,
                     rank=rank,
                     num_workers=NUM_WORKERS,
                     games=games)

    device = config['DEVICE']

    alg = None
    use_wann = None
    for i in range(1, config['NUM_EPOCHS'] + 1):
        if config['TRAIN_WANN']:
            wtrain.set_device(device)
            wtrain.run(
                wann_args,
                use_checkpoint=True
                if i > 1 or config['USE_PREV_EXPERIMENT'] else False,
                alg_critic=None if alg is None else alg.critic,
                alg_policy=None if alg is None else alg.policy,
                mem=None if alg is None else alg.replay_buffer,
                wann_batch_size=AGENT_CONFIG['wann_batch_size'],
                wann_bootstrap_default=AGENT_CONFIG['wann_bootstrap_default'])

        if rank == 0:
            if i <= 1:
                env = Monitor(task.make_env(ENV_NAME),
                              f'{EXPERIMENTS_PREFIX}log')
                learn_params = AGENT_CONFIG['learn_params']
                checkpoint_callback = CheckpointCallback(
                    save_freq=learn_params['alg_checkpoint_interval'],
                    save_path=
                    f'{ALG_OUT_PREFIX}checkpoint{os.sep}checkpoint-alg')
                eval_env = task.make_env(ENV_NAME)
                eval_callback = EvalCallback(
                    eval_env,
                    best_model_save_path=
                    f'{ALG_OUT_PREFIX}checkpoint{os.sep}eval-best-alg',
                    log_path=f'{EXPERIMENTS_PREFIX}log{os.sep}checkpoint',
                    eval_freq=learn_params['eval_interval'])
                cb = CallbackList([checkpoint_callback, eval_callback])

                use_wann = config['USE_WANN']
                if use_wann:
                    wVec, aVec, _ = wnet.importNet(
                        f'{WANN_OUT_PREFIX}_best.out')
                else:
                    wVec, aVec = None, None

                # TODO: save/load if on wann or SAC optimize step for prev experiment starts
                if GAME_CONFIG.alg_type == task.ALG.SAC:
                    if config['USE_PREV_EXPERIMENT']:
                        alg = SAC.load(
                            f'{config["PREV_EXPERIMENT_PATH"]}{os.sep}alg'
                        )  # TODO: load SAC model here
                    else:
                        alg = SAC(
                            AGENT_CONFIG['policy'],
                            env,
                            verbose=learn_params['log_verbose'],
                            tensorboard_log=
                            f'{EXPERIMENTS_PREFIX}log{os.sep}tb-log',
                            buffer_size=learn_params['mem_size'],
                            learning_rate=learn_params['learn_rate'],
                            learning_starts=learn_params['start_steps'],
                            batch_size=learn_params['train_batch_size'],
                            tau=learn_params['tau'],
                            gamma=learn_params['gamma'],
                            train_freq=learn_params['n_trains_per_step'],
                            target_update_interval=learn_params[
                                'replay_sample_ratio'],
                            gradient_steps=learn_params[
                                'gradient_steps_per_step'],
                            n_episodes_rollout=learn_params['episode_len'],
                            target_entropy=learn_params['target_entropy'],
                            device=device,
                            use_wann=use_wann,
                            wVec=wVec,
                            aVec=aVec)
                else:
                    raise Exception(
                        'Configured algorithm is not currently supported')

            # if alg is not None and use_wann:
            #     alg.sync_buffer()

            if i > 1:
                alg.learning_starts = 0

            if i % LOG_INTERVAL == 0:
                log.info(
                    f'performing learning step {i}/{config["NUM_EPOCHS"]}...'
                )
            log.info('PERFORMING ALG TRAIN STEP')
            alg.learn(total_timesteps=learn_params['timesteps'],
                      log_interval=learn_params['log_interval'],
                      callback=cb)
            alg.save(
                f'{ALG_OUT_PREFIX}checkpoint{os.sep}full-run-checkpoint{os.sep}checkpoint-step-{i}'
            )
        else:
            return  # return if subprocess

        if i % LOG_INTERVAL == 0:
            log.info(f'step {i}/{config["NUM_EPOCHS"]} complete')

    if rank == 0:  # if main process
        if config["RENDER_TEST_GIFS"]:
            vid_len = config['VIDEO_LENGTH']

            render_agent(alg,
                         ENV_NAME,
                         vid_len,
                         SAVE_GIF_PATH,
                         filename=f'{config["EXPERIMENT_ID"]}-agent.gif')
            render_agent(alg,
                         ENV_NAME,
                         vid_len,
                         SAVE_GIF_PATH,
                         filename='random.gif')

        if use_wann:
            wtrain.run(None, kill_slaves=True)
Example #18
def main():

    set_random_seed(RANDOM_SEED)

    t_start = time()
    name = "LargeFinalLayer"

    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)
    os.makedirs(checkpoint_path, exist_ok=True)

    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)

    results_path = os.path.join(checkpoint_path, "results.json")

    env_args = dict(
        frame_skip=4,
        screen_size=84,
        terminal_on_life_loss=True,
        clip_reward=True,
    )

    # Create a gym environment for an Atari game using the specified seed and number of environments.
    # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
    # for improved performance.
    # train_env = make_atari_env(ENV_NAME, n_envs=N_ENVS, seed=RANDOM_SEED, wrapper_kwargs=env_args)

    def atari_wrapper(env: gym.Env) -> gym.Env:
        env = AtariWrapper(env, **env_args)
        return env

    def make_env(rank: int, count: int) -> VecEnv:
        return make_vec_env(
            ENV_NAME,
            n_envs=count,
            seed=RANDOM_SEED + rank,
            start_index=0,
            monitor_dir=None,
            wrapper_class=atari_wrapper,
            env_kwargs=None,
            vec_env_cls=None,
            vec_env_kwargs=None,
            monitor_kwargs=None,
        )

    train_env = make_env(0, N_ENVS)
    eval_env = make_env(1, 1)

    # required by models in baselines
    train_env = VecTransposeImage(train_env)
    eval_env = VecTransposeImage(eval_env)

    # setup callback to save model at fixed intervals
    save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ,
                                       save_path=checkpoint_path,
                                       name_prefix=name)
    stop_callback = StopTrainingOnRewardThreshold(
        reward_threshold=EVAL_THRESHOLD)
    time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
    best_callback = EvalCallback(
        eval_env,
        eval_freq=EVAL_FREQ,
        best_model_save_path=checkpoint_path,
        callback_on_new_best=stop_callback,
    )
    list_callback = CallbackList([save_callback, best_callback, time_callback])

    model = PPO(
        CnnPolicy,
        train_env,
        verbose=VERBOSE,
        batch_size=BATCH_SIZE,
        seed=RANDOM_SEED,
        tensorboard_log=log_path,
        learning_rate=LEARNING_RATE,
        n_steps=UPDATE_STEPS,
        n_epochs=N_EPOCHS,
        ent_coef=ENT_COEF,
        vf_coef=VF_COEF,
        clip_range=CLIP_RANGE,
        device=DEVICE_TYPE,
        policy_kwargs=dict(features_extractor_class=FeatureExtractor),
    )

    config_path = os.path.join(checkpoint_path, "cnn_config")
    zip_path = os.path.join(checkpoint_path, "model.zip")

    # output the model config to a file for easier viewing
    with open(config_path, "w") as file:
        file.write(f"{name}\n")
        file.write(str(model.policy.features_extractor.cnn))

    print("Beginning training...")

    model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
    # model.learn(TRAIN_STEPS, tb_log_name="run")
    model.save(zip_path)

    del train_env
    # del eval_env

    time_taken = time() - t_start

    print("Beginning evaluation...")

    # score of the game, standard deviation of multiple runs
    reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

    with open(results_path, "w") as handle:
        handle.write(json.dumps((reward_mean, reward_std, time_taken)))
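TimeLimitCallback used above is a custom class that is not part of stable-baselines3 and is not shown here. A minimal sketch of a wall-clock budget callback with the same role, stopping training by returning False from _on_step (the name and max_time argument mirror the usage above; treating max_time as seconds is an assumption):

from time import time

from stable_baselines3.common.callbacks import BaseCallback


class TimeLimitCallback(BaseCallback):
    """Stop training once a wall-clock budget (assumed to be in seconds) is exhausted."""

    def __init__(self, max_time: float, verbose: int = 0):
        super().__init__(verbose)
        self.max_time = max_time
        self.start_time = None

    def _on_training_start(self) -> None:
        self.start_time = time()

    def _on_step(self) -> bool:
        # Returning False asks the algorithm to stop training gracefully.
        return (time() - self.start_time) < self.max_time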
Example #19
def evaluate(individual: Individual,
             device: Union[torch.device, str] = "auto") -> Tuple[int]:
    """
    Evaluate a single individual model and return it's mean score after the training time is elapsed.
    Models are trained and evaluated for a number of timestamps as parameterized in the constants at the
    top of the file.
    :param individual: The individual to evaluate.
    :return:
    """

    t_start = time()
    layers = individual.weights
    name = individual.encode()

    checkpoint_path = os.path.join(BASE_CHECKPOINT_PATH, "PPO", ENV_NAME, name)

    if os.path.exists(checkpoint_path):
        return (random.randint(MIN_SCORE, MAX_SCORE), )

    os.makedirs(checkpoint_path, exist_ok=True)
    log_path = os.path.join(BASE_LOG_PATH, "PPO", ENV_NAME, name)
    os.makedirs(log_path, exist_ok=True)

    results_path = os.path.join(checkpoint_path, "results.json")

    if not os.path.exists(results_path):
        env_args = dict(
            frame_skip=4,
            screen_size=84,
            terminal_on_life_loss=True,
            clip_reward=True,
        )

        # Create a gym environment for an Atari game using the specified seed and number of environments.
        # This is a "vectorized environment", which means Stable Baselines batches the updates into vectors
        # for improved performance.
        def atari_wrapper(env: gym.Env) -> gym.Env:
            env = AtariWrapper(env, **env_args)
            return env

        def make_env(rank: int, count: int) -> VecEnv:
            return make_vec_env(
                ENV_NAME,
                n_envs=count,
                seed=RANDOM_SEED + rank,
                start_index=0,
                monitor_dir=None,
                wrapper_class=atari_wrapper,
                env_kwargs=None,
                vec_env_cls=SubprocVecEnv,
                vec_env_kwargs=None,
                monitor_kwargs=None,
            )

        train_env = make_env(0, N_ENVS)
        eval_env = make_env(1, 1)

        # required by models in baselines
        train_env = VecTransposeImage(train_env)
        eval_env = VecTransposeImage(eval_env)

        # setup callback to save model at fixed intervals
        save_callback = CheckpointCallback(save_freq=CHECKPOINT_FREQ,
                                           save_path=checkpoint_path,
                                           name_prefix=name)
        stop_callback = StopTrainingOnRewardThreshold(
            reward_threshold=EVAL_THRESHOLD)
        time_callback = TimeLimitCallback(max_time=TIME_LIMIT)
        best_callback = EvalCallback(
            eval_env,
            eval_freq=EVAL_FREQ,
            best_model_save_path=checkpoint_path,
            callback_on_new_best=stop_callback,
        )
        list_callback = CallbackList(
            [save_callback, best_callback, time_callback])

        model = PPO(
            CnnPolicy,
            train_env,
            verbose=VERBOSE,
            batch_size=BATCH_SIZE,
            seed=RANDOM_SEED * 7,
            tensorboard_log=log_path,
            learning_rate=LEARNING_RATE,
            n_steps=UPDATE_STEPS,
            n_epochs=N_EPOCHS,
            ent_coef=ENT_COEF,
            vf_coef=VF_COEF,
            clip_range=CLIP_RANGE,
            device=device,
            policy_kwargs=dict(features_extractor_class=VariableBenchmark,
                               features_extractor_kwargs=dict(layers=layers)),
        )

        config_path = os.path.join(checkpoint_path, "cnn_config")
        zip_path = os.path.join(checkpoint_path, "model.zip")

        # output the model config to a file for easier viewing
        with open(config_path, "w") as file:
            file.write(f"{name}\n")
            file.write(str(model.policy.features_extractor.cnn))

        print("Beginning training...")

        model.learn(TRAIN_STEPS, callback=list_callback, tb_log_name="run")
        model.save(zip_path)

        del train_env
        del eval_env

        time_taken = time() - t_start

        print("Beginning evaluation...")

        # score of the game, standard deviation of multiple runs
        reward_mean, reward_std = evaluate_policy(model, make_env(2, 1))

        with open(results_path, "w") as handle:
            handle.write(json.dumps((reward_mean, reward_std, time_taken)))
    else:
        reward_mean, reward_std, time_taken = json.load(open(
            results_path, "r"))

    reward_mean = abs(MIN_SCORE) + reward_mean
    value = (reward_mean * weighted_time(time_taken), )

    print(f"Evaluated {name} with a score of {value}  in {(time_taken):.2f}s")

    return value
for task in reward_threshold.keys():
    TASK_NAME = task
    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/',
                                             name_prefix='rl_model')
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100 * 150 / 2, verbose=1)
    env = gym.make(TASK_NAME)

    log_dir = "./logs"

    env_m = monitor.Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env_m])
    env = VecNormalize(env, norm_obs=True, norm_reward=True)
    # Stop training when the model reaches the reward threshold
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=reward_threshold[TASK_NAME], verbose=1)
    eval_callback = EvalCallback(env, callback_on_new_best=callback_on_best, verbose=1)
    callback = CallbackList([callback_max_episodes, eval_callback])
   
    model = A2C('MlpPolicy', env, verbose=1, policy_kwargs=dict(net_arch=model_def))
    st = time.time()
    model.learn(total_timesteps=100 * 150 * 10000, callback=callback)
    elapse_time = time.time() - st

    with open("./outdir/"+TASK_NAME + ".plt", "wb") as fd:
        chkpt = {
            "elapse_time": elapse_time,
            "reward_threshold" : reward_threshold,
            "reward_list" : env_m.get_episode_rewards(),
            "timestep_list": env_m.get_episode_lengths(),
            "runtime_list" : env_m.get_episode_times(),
            "totall_steps": env_m.get_total_steps()
        }
        # assumed completion (the write appears truncated in the source; requires `import pickle`):
        pickle.dump(chkpt, fd)
Example #21
def main():
    if(StartFresh):
        # Create Environment
        env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        # Separate evaluation env
        eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # Create Model
        model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log)

    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')
        # # Load Enironment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()
        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()
        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if(DoTraining):
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)
        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env, best_model_save_path=best_path,
                                    log_path=best_path, eval_freq=eval_freq,
                                    deterministic=True, render=False)
        # Video Update Callback 
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)
        
        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
            tb_log_name=tb_log_name, 
            reset_num_timesteps=False,
            callback=callbacks) #, callback=callback, =TensorboardCallback()

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if(DoVideo):
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
def main():
    if(StartFresh):
        # Create Environment
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        # Separate evaluation env
        eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # Create Model
        # model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log, device="auto")
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[dict(pi=[256, 256], vf=[256, 256])])

        model = PPO('MlpPolicy', 
            env, 
            learning_rate = 3e-5,
            n_steps=512,
            batch_size=128,
            n_epochs=20,
            gamma=0.99,
            gae_lambda = 0.9,
            clip_range = 0.4,
            vf_coef = 0.5,
            use_sde = True,
            sde_sample_freq = 4,
            policy_kwargs = policy_kwargs, 
            verbose=1, 
            tensorboard_log=tb_log,
            device="auto")


    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')
        # # Load Enironment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()
        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()
        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if(DoTraining):
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)
        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env, best_model_save_path=best_path,
                                    log_path=best_path, eval_freq=eval_freq,
                                    deterministic=True, render=False)
        # Video Update Callback 
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        # nStep_callback_list = CallbackList([envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)
        
        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])
        # callbacks = CallbackList([checkpoint_callback, eval_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
            tb_log_name=tb_log_name, 
            reset_num_timesteps=False,
            callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if(DoVideo):
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
Example #23
        n_epochs=10,  # number of passes to do over the whole rollout buffer (of size 2048*n_cpus) during one training iter
        create_eval_env=False,  # todo
        seed=None,
        verbose=2,
        tensorboard_log="./ppo_logs/")

    # evaluate
    # mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
    # print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

    # save a checkpoint every n steps
    checkpoint_callback = CheckpointCallback(save_freq=49_000,
                                             save_path='./ppo_checkpoints/',
                                             name_prefix='debug_model')
    callbacks = CallbackList([checkpoint_callback, CustomCallback()])
    # cf /Users/nathan/opt/anaconda3/envs/vae/lib/python3.7/site-packages/stable_baselines3/common/callbacks.py
    # to make your own checkpoint callback and have more control
    # the save_freq of the checkpoint callback doesn't take parallelism into account, so e.g. 10k train steps with
    # 3 agents is only ~3333 steps for the callback, not enough to reach a save_freq of 5k
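    # Hedged sketch (not from the original script): save_freq is counted in callback
    # calls, and each call covers n_envs environment steps, so dividing by n_envs
    # keeps checkpoints roughly every 49_000 environment steps, e.g.:
    #   n_envs = 3  # illustrative
    #   checkpoint_callback = CheckpointCallback(save_freq=max(49_000 // n_envs, 1),
    #                                            save_path='./ppo_checkpoints/',
    #                                            name_prefix='debug_model')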

    # train
    model.learn(total_timesteps=10_000_000, callback=callbacks)

    # evaluate
    # mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
    # print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

    # ------------------------------------------------------------------------------------------------------
    # model = PPO.load("model_save")
    # env = CoinrunEnv()