Example #1
def create_env(env_name, normalized, Training=False):
    env = gym.make(env_name)

    if normalized:
        from stable_baselines.common.vec_env import VecNormalize, DummyVecEnv
        vec_env = DummyVecEnv([lambda: env])
        # Load the saved running statistics used for observation normalization
        vec_env = VecNormalize.load('data/models/env_stats/' + env_name + '.pkl',
                                    venv=vec_env)
        vec_env.training = Training
        vec_env.reward_range = env.reward_range
        return vec_env

    return env
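
# --- Usage sketch for create_env (illustration only, not part of the original
# example): the environment id and model path are hypothetical, and the
# statistics file data/models/env_stats/<env_name>.pkl is assumed to exist.
def _example_create_env_usage():
    from stable_baselines import PPO2

    eval_env = create_env('LunarLanderContinuous-v2', normalized=True)
    model = PPO2.load('data/models/ppo2_lander')  # hypothetical model file
    obs = eval_env.reset()
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = eval_env.step(action)
    return reward
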
def main():

    args = get_args()
    choose_device(args.device)
    set_global_seeds(args.seed)

    env_id = args.env
    exp_id = args.exp_id
    algo = args.algo
    env_name = env_id[:-3]
    env_index = env_list.index(env_id)

    # Pass CustomEnv arguments: follow this for your CustomEnv if reward not known prior to training
    env_kwargs = {} if args.env_kwargs is None else args.env_kwargs
    if (args.env_kwargs is not None) and (env_id in ['AirSim-v0']):
        if 'rew_land' in env_kwargs:
            if (int(env_kwargs['rew_land']) in [500, 1000, 10000]):
                env_success[-1] = int(env_kwargs['rew_land'])
            else:
                raise ValueError(
                    'Given env reward not acceptable. Please try again')

    params = [exp_id, env_name.lower()]
    folder = [exp_id, env_name.lower(), args.algo.lower()]
    tensorboard_path, monitor_path, callback_path = None, None, None

    if args.tensorboard:
        tensorboard_path = "tensorboard/{}_{}".format(*params)
        make_dir(tensorboard_path)

    # if args.train_RL: # Begin training here (location of this condition also decides experiment performance)

    # Load hyperparameters from yaml file
    with open('hyperparams/{}.yml'.format(args.algo), 'r') as f:
        hyperparams_dict = yaml.safe_load(f)
        if env_id in list(hyperparams_dict.keys()):
            hyperparams = hyperparams_dict[env_id]
        else:
            raise ValueError("Hyperparameters not found for {}-{}".format(
                args.algo, env_id))

    if args.hyperparams is not None:
        # Overwrite hyperparams if needed
        hyperparams.update(args.hyperparams)

    # OPTIONAL: Print saved hyperparams
    saved_hyperparams = OrderedDict([(key, hyperparams[key])
                                     for key in sorted(hyperparams.keys())])
    if args.verbose > 0:
        pprint(saved_hyperparams)

    if args.n_envs > 1:
        # if args.verbose:
        print("Overwriting n_envs with n={}".format(args.n_envs))
        n_envs = args.n_envs
    else:
        n_envs = hyperparams.get('n_envs', 1)

    # choose Monitor log path according to multiprocessing setting
    if args.monitor:
        if n_envs == 1:
            monitor_path = 'logs/single/{}_{}_{}'.format(*folder)
        else:
            if algo not in ['dqn', 'her', 'sac', 'td3']:
                monitor_path = 'logs/multi/{}_{}_{}'.format(*folder)
        if monitor_path is not None:
            make_dir(monitor_path)

    if int(float(args.timesteps_RL)) > 0:
        # if args.verbose:
        print("Overwriting n_timesteps with n={}".format(
            int(float(args.timesteps_RL))))
        n_timesteps = int(float(args.timesteps_RL))
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    # Convert to python object if needed
    if 'policy_kwargs' in hyperparams.keys() and isinstance(
            hyperparams['policy_kwargs'], str):
        hyperparams['policy_kwargs'] = eval(hyperparams['policy_kwargs'])

    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']  # avoid passing an unknown kwarg to the model constructor

    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    # if (algo=='ppo2' and ('learning_rate' in hyperparams.keys())):
    #     hyperparams['learning_rate'] = linear_schedule(hyperparams['learning_rate'])

    def create_env(n_envs, eval_env=False):
        if algo in ['a2c', 'acer', 'acktr', 'ppo2']:
            if n_envs > 1:
                env = SubprocVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            else:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             log_dir=monitor_path,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs)
                ])
            # Note: recreating the env with a plain DummyVecEnv here would
            # discard the vectorized env built above (make_env already applies
            # env_wrapper), so this single-env variant is left commented out:
            # env = DummyVecEnv([lambda: gym.make(env_id, **env_kwargs)])
            # if env_wrapper is not None:
            #     env = env_wrapper(env)
        elif ((algo in ['dqn', 'her', 'sac', 'td3']) and n_envs > 1):
            raise ValueError(
                "Error: {} does not support multiprocessing!".format(algo))
        elif ((algo in ['ddpg', 'ppo1', 'trpo', 'gail']) and n_envs > 1):
            raise ValueError(
                "Error: {} uses MPI for multiprocessing!".format(algo))
        else:
            env = make_vec_env(env_id,
                               n_envs=n_envs,
                               seed=args.seed,
                               monitor_dir=monitor_path,
                               wrapper_class=env_wrapper,
                               env_kwargs=env_kwargs)

        if args.normalize:  # choose from multiple options
            # env = VecNormalize(env, clip_obs=np.inf)
            env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
            # env = VecNormalize(env, norm_reward=False, clip_obs=np.inf, **normalize_kwargs)
        return env

    # Zoo: env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs) for i in range(n_envs)])
    # Zoo: env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)])
    env = create_env(n_envs)

    # if args.train_RL: # checking impact of the if-condition position on experiment reproducibility

    callback, callback_path = [], "callbacks/{}_{}_{}".format(*folder)
    save_freq = eval_freq = 100 * episode_len[env_index]
    save_freq = max(save_freq // n_envs, 1)
    eval_freq = max(eval_freq // n_envs, 1)
    make_dir(callback_path)
    if args.check_callback:
        callback.append(
            CheckpointCallback(save_freq=save_freq,
                               save_path=callback_path,
                               name_prefix='rl_model',
                               verbose=1))
    if args.eval_callback:
        callback.append(
            EvalCallback(create_env(1, eval_env=True),
                         best_model_save_path=callback_path,
                         log_path=callback_path,
                         eval_freq=eval_freq,
                         verbose=1))

    model = (algo_list[args.algo])(env=env,
                                   seed=args.seed,
                                   tensorboard_log=tensorboard_path,
                                   n_cpu_tf_sess=1,
                                   verbose=args.verbose,
                                   **hyperparams)
    print('\nTraining {} on {} now... \n'.format(algo, env_id))

    start_time = time.time()
    model.learn(total_timesteps=n_timesteps, callback=callback)
    total_time = time.time() - start_time

    if args.normalize:
        env.save(os.path.join(callback_path, "vec_normalize.pkl"))

    if n_envs > 1 or (algo in ['ddpg', 'trpo', 'gail']):
        print("Took {:.2f}s for multiprocessed version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))
    else:
        print("Took {:.2f}s for single process version - {:.2f} FPS".format(
            total_time, n_timesteps / total_time))

    env = DummyVecEnv([make_env(env_id, 0, args.seed, env_kwargs=env_kwargs)])

    if args.normalize:
        env = VecNormalize.load(
            os.path.join(callback_path, "vec_normalize.pkl"), env)
        env.training = False
        env.norm_reward = False
        env.seed(args.seed)

    # Evaluate RL model - choose either best model or last available model
    model = (algo_list[algo]).load(os.path.join(callback_path, 'best_model'))
    # model = (algo_list[algo]).load("models/{}_{}_{}".format(*folder))
    model.set_env(env)
    evaluate('policy', model, env_id, env, algo, 100)

    if args.monitor:
        results_plotter.plot_results([monitor_path], n_timesteps,
                                     results_plotter.X_TIMESTEPS,
                                     "{} {}".format(algo, env_id))
        plot_results(monitor_path)

    if args.test:
        print('\nTesting policy...\n')
        # Initialize counters for the test rollout
        episode_reward, total_reward = 0.0, 0.0
        done_count, success_count = 0, 0
        obs = env.reset()
        for _ in range(n_timesteps):
            action, _states = model.predict(obs, deterministic=True)
            if isinstance(env.action_space, gym.spaces.Box):
                action = np.clip(action, env.action_space.low,
                                 env.action_space.high)
            obs, rewards, dones, info = env.step(action)
            episode_reward += rewards
            env.render()
            if dones:
                done_count += 1
                success_count = check_success(env_index, env_success,
                                              success_count)
                total_reward += episode_reward
                episode_reward = 0
                env.reset()
        print('\n{}/{} successful episodes'.format(success_count, done_count))
        average_reward = total_reward / done_count
        print('\nAverage reward: {}'.format(average_reward))
        env.close()
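
The training and evaluation flow in main() above, reduced to a hedged,
self-contained sketch. The environment id, output directory and PPO2 settings
are placeholders; the point is the pairing of VecNormalize.save() during
training with VecNormalize.load() (training disabled, raw rewards) at
evaluation time.

import os

import gym
import numpy as np
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

env_id = 'CartPole-v1'          # placeholder environment
save_dir = 'callbacks/example'  # placeholder output directory
os.makedirs(save_dir, exist_ok=True)

# Train with normalized observations (rewards left unnormalized, as above).
env = DummyVecEnv([lambda: gym.make(env_id)])
env = VecNormalize(env, norm_reward=False, clip_obs=np.inf)
model = PPO2('MlpPolicy', env, verbose=0)
model.learn(total_timesteps=10000)

# Save the model together with the normalization statistics.
model.save(os.path.join(save_dir, 'model'))
env.save(os.path.join(save_dir, 'vec_normalize.pkl'))

# Reload for evaluation: freeze the running statistics and keep raw rewards.
eval_env = DummyVecEnv([lambda: gym.make(env_id)])
eval_env = VecNormalize.load(os.path.join(save_dir, 'vec_normalize.pkl'), eval_env)
eval_env.training = False
eval_env.norm_reward = False
model = PPO2.load(os.path.join(save_dir, 'model'), env=eval_env)
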
Example #3
def normalize_env(
    env,
    orig_log_dir,
    sb_version,
    vectorize=True,
    continue_learning=False,
    evaluate=False,
    evaluate_during_learning=False,
    normalize_kwargs=None,
):
    if normalize_kwargs is None:
        normalize_kwargs = {}

    if vectorize:
        env = DummyVecEnv([lambda: env])

    logger.debug("Normalize: {}".format(normalize_kwargs))
    if evaluate:
        # FIXME in continue learning training should be True so that we update the running average of obs and
        #  rewards with new samples; if I do that, the algo performs very poorly even with no changes in the env
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        if not evaluate_during_learning or continue_learning:
            if not os.path.exists(
                    os.path.join(orig_log_dir, "vecnormalize.pkl")):
                env_name = get_env_name(env=env.unwrapped,
                                        sb_version=sb_version)
                index_last_separator = orig_log_dir.rindex("/")
                new_orig_log_dir = os.path.join(
                    orig_log_dir[0:index_last_separator], "logs_" + env_name)
                logger.debug(
                    "{} does not exist. Trying to search it in the original model directory {}"
                    .format(os.path.join(orig_log_dir, "vecnormalize.pkl"),
                            new_orig_log_dir))
                assert os.path.exists(new_orig_log_dir), \
                    "{} does not exist".format(new_orig_log_dir)
                assert os.path.exists(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")), (
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl") +
                        " does not exist")
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"),
                        env)
                else:
                    env = VecNormalize.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"),
                        env)
            else:
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
                else:
                    env = VecNormalize.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

        # Deactivate training and reward normalization
        env.training = False
        env.norm_reward = False

    elif continue_learning:
        # FIXME: don't know why but during continue learning I have to disable training otherwise performance
        #  is not the same as in the model trained from scratch even without changing the params of the environment.
        #  in rl-baselines-zoo this is not done during continue learning:
        #  https://github.com/araffin/rl-baselines-zoo/blob/master/train.py#L365
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        assert os.path.exists(os.path.join(
            orig_log_dir, "vecnormalize.pkl")), (
                os.path.join(orig_log_dir, "vecnormalize.pkl") +
                " does not exist")
        logger.debug("[continue_learning] Loading {}".format(
            os.path.join(orig_log_dir, "vecnormalize.pkl")))
        if sb_version == "sb3":
            env = VecNormalize3.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
        else:
            env = VecNormalize.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

    else:
        if sb_version == "sb3":
            env = VecNormalize3(env, **normalize_kwargs)
        else:
            env = VecNormalize(env, **normalize_kwargs)

    return env
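
# --- Usage sketch for normalize_env (illustration only, not part of the original
# file). The environment id, log directory and kwargs are placeholders, and the
# module-level logger and VecNormalize imports used above are assumed to be set
# up. Any sb_version other than "sb3" selects the stable-baselines VecNormalize
# branch rather than VecNormalize3.
def _example_normalize_env_usage():
    import gym

    raw_env = gym.make('CartPole-v1')  # placeholder environment id
    train_env = normalize_env(raw_env,
                              orig_log_dir='logs/example',  # placeholder path
                              sb_version='sb2',
                              normalize_kwargs={'norm_reward': True,
                                                'clip_obs': 10.0})
    return train_env
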
if __name__ == '__main__':
    log_dir = 'models/hover/empty_world_small/finalVec'
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env_id = 'CrazyflieObstacleEval-v0'

    # Load the agent
    model = PPO2.load(log_dir + '/ppo2_final')

    # Load the saved statistics
    env = DummyVecEnv([
        lambda: gym.make(env_id, n_obstacles=1, avoidance_method='Heuristic')
    ])
    env = VecNormalize.load(stats_path, env)
    #  do not update them at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False

    eval_episodes = 50

    total_goals_reached = 0
    total_collisions = 0
    total_flips = 0
    total_steps_exceeded = 0
    total_potential_collisions = 0
    total_collisions_avoided = 0
    total_timesteps = 0

    # Observe trained agent
    for i_episode in range(eval_episodes):