Example #1
def stable_baseline_training(algorithm, steps, number_maps, random_spawn,
                             random_textures, lstm, random_keys, keys,
                             dimensions, num_cpus, n_stack, clip, complexity,
                             density, mp, eval_occurrence, eval_episodes, eval,
                             experiment_name, env_seed, alg_seed,
                             episode_timeout):
    """
    Runs the Stable Baselines implementation of the specified algorithm on the specified environment with the specified training configuration.
    Note: For scenarios that use the vizdoom wrapper rather than MazeExplorer, the .T transpose on the array fed into the process image method needs to be removed.
    Note: Ensure the relevant maps are in the specified paths under the mazes folder for the simpler and manual scenarios.

    :param algorithm: which algorithm to run (currently supports PPO and A2C)
    :param steps: number of steps to run training
    :param number_maps: number of maps to generate and train on
    :param random_spawn: whether or not to randomise the spawn position of the agent
    :param random_textures: whether or not to randomise textures in generated maps
    :param lstm: whether or not to add an lstm to the network
    :param random_keys: whether to randomise key placement upon each new training episode in a given map
    :param keys: number of keys to place in generated maps
    :param dimensions: x, y dimensions of maps to be generated
    :param num_cpus: number of environments in which to train
    :param n_stack: number of frames to stack to feed as a state to the agent
    :param clip: whether or not to clip rewards from the environment
    :param complexity: float between 0 and 1 describing the complexity of the generated mazes
    :param density: float between 0 and 1 describing the density of the generated mazes
    :param mp: whether or not to use multiprocessing for workers
    :param eval_occurrence: number of training steps between evaluation runs
    :param eval_episodes: number of episode rollouts to perform during each evaluation
    :param eval: whether or not to use evaluation during training
    :param experiment_name: name of experiment for use in logging and file saving
    :param env_seed: seed to be used for environment generation
    :param alg_seed: seed to be used for stable-baseline algorithms
    :param episode_timeout: number of steps after which to terminate episode
    """

    timestamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H-%M-%S')

    # generate a file in log directory containing training configuration information.
    experiment_name = experiment_name + "/" if experiment_name else ""
    OUTPUT_PATH = os.path.join(DIR_PATH, 'results', experiment_name, timestamp)
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    with open(os.path.join(OUTPUT_PATH, 'params.txt'), 'w+') as f:
        f.write(
            str({
                'algorithm': algorithm,
                'number_maps': number_maps,
                'random_spawn': random_spawn,
                'random_textures': random_textures,
                'lstm': lstm,
                'random_keys': random_keys,
                'keys': keys,
                'dimensions': dimensions,
                'num_cpus': num_cpus,
                'clip': clip,
                'mp': mp,
                'n_stack': n_stack,
                'env_seed': env_seed,
                'alg_seed': alg_seed,
                'experiment_name': experiment_name,
                "eval_occurrence": eval_occurrence,
                "eval_episodes": eval_episodes,
                "episode_timeout": episode_timeout
            }))

    if clip:
        clip_range = (-1, 1)
    else:
        clip_range = False

    mazeexplorer_env = MazeExplorer(number_maps=number_maps,
                                    random_spawn=random_spawn,
                                    random_textures=random_textures,
                                    random_key_positions=random_keys,
                                    keys=keys,
                                    size=dimensions,
                                    clip=clip_range,
                                    seed=env_seed,
                                    complexity=complexity,
                                    density=density)

    if mp:
        env = SubprocVecEnv(
            [mazeexplorer_env.create_env() for _ in range(num_cpus)])
    else:
        env = DummyVecEnv([
            mazeexplorer_env.create_env() for _ in range(num_cpus)
        ])  # vectorise env

    if n_stack > 0:
        env = VecFrameStack(env, n_stack=n_stack)

    if algorithm == 'ppo':
        algo = PPO2
    elif algorithm == 'a2c':
        algo = A2C
    else:
        raise NotImplementedError("Only supports PPO and A2C")

    if lstm:
        model = algo(CnnLstmPolicy,
                     env,
                     verbose=1,
                     tensorboard_log=OUTPUT_PATH)
    else:
        model = algo(CnnPolicy, env, verbose=1, tensorboard_log=OUTPUT_PATH)

    if eval:
        evaluator = Evaluator(os.path.join(DIR_PATH, "eval_maps"), OUTPUT_PATH,
                              num_cpus, mp, n_stack)

        steps_taken = 0

        print("Training started...")

        while steps_taken < steps:
            print("Training...")
            model.learn(total_timesteps=min(eval_occurrence,
                                            (steps - steps_taken)),
                        reset_num_timesteps=False,
                        seed=alg_seed)

            steps_taken += eval_occurrence

            print("Evaluating...")

            evaluator.evaluate(model, steps_taken, eval_episodes,
                               save=True)  # perform eval_episodes rollouts and save scores

        print("Training completed.")

    else:
        model.learn(total_timesteps=steps, seed=alg_seed)
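# Illustrative invocation (all argument values below are assumptions, not project defaults):
# stable_baseline_training(algorithm="ppo", steps=1000000, number_maps=10,
#                          random_spawn=True, random_textures=True, lstm=False,
#                          random_keys=True, keys=6, dimensions=(10, 10),
#                          num_cpus=4, n_stack=4, clip=True, complexity=0.7,
#                          density=0.7, mp=True, eval_occurrence=100000,
#                          eval_episodes=10, eval=True,
#                          experiment_name="mazeexplorer_ppo",
#                          env_seed=1, alg_seed=1, episode_timeout=2100)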
Example #2
import pacman
import gym
import numpy as np
import time

from stable_baselines.common.policies import MlpPolicy, CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, DQN

from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

# env = gym.make('MsPacman-v0')
env = make_atari_env('MsPacmanNoFrameskip-v0', num_env=1, seed=0)
# env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)
model = PPO2(CnnPolicy,
             env,
             verbose=1,
             vf_coef=1,
             n_steps=128,
             tensorboard_log="./logs/baseline_MDP")

eval_scores = []
for ep in range(1):
    obs = env.reset()
    total_reward = 0
    done = False
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
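        # Plausible completion (assumed; the original loop does not accumulate reward):
        total_reward += reward[0]
    eval_scores.append(total_reward)
    print("Episode {} reward: {}".format(ep, total_reward))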
Example #3
def init():
    env = make_atari_env(id, num_env=num_env, seed=seed)
    env = VecFrameStack(env, n_stack=n_stack)
    Globals.env = RewardWrapper(env)
tensorboard_folder = './tensorboard/Pacman/action_mask/'
model_folder = './models/Pacman/base/'
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

if __name__ == '__main__':
    env = DummyVecEnv([lambda: ActionMaskEnv() for i in range(4)])
    env = VecFrameStack(env, 3)

    model = PPO2.load(model_folder + "PPO2" + model_tag)

    done = [False, False, False, False]
    states = None
    action_masks = []
    obs = env.reset()

    while not done[0]:
        action, states = model.predict(obs, states, action_mask=action_masks)
        obs, _, done, infos = env.step(action)
        env.render()
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
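            # Plausible completion (assumed): collect each env's mask so the next
            # model.predict() call can use it.
            action_masks.append(env_action_mask)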
Example #5
def create_test_env(env_id, n_envs=1, n_agents=1, is_atari=False,
                    stats_path=None, norm_reward=False, seed=0,
                    log_dir='', should_render=True):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param n_agents: (int) number of agents per environment
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param norm_reward: (bool) Whether to normalize rewards or not when using Vecnormalize
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir) for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        class_ = load(spec._entry_point)
        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        use_subproc = 'renders' not in inspect.getfullargspec(class_.__init__).args

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, renders=should_render)
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env, os.path.join(log_dir, "0"), allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir)])
        else:
            env = DummyVecEnv([_init])
    elif 'Marathon' in env_id:
        from UnityVecEnv import UnityVecEnv
        # env = UnityVecEnv(env_id, inference_mode=True)
        from gym_unity.envs import UnityEnv
        env_path = UnityVecEnv.GetFilePath(env_id, inference_mode=True)
        env = UnityEnv(env_path)
        env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if os.path.exists(os.path.join(stats_path, 'obs_rms.pkl')):
            print("Loading running average")
            env = VecNormalize(env, training=False, norm_reward=norm_reward)
            env.load_running_average(stats_path)

        n_stack_file = os.path.join(stats_path, 'n_stack')
        if os.path.isfile(n_stack_file):
            with open(n_stack_file, 'r') as f:
                n_stack = int(f.read())
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
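# Illustrative usage (env id and log directory are assumptions):
# test_env = create_test_env('BreakoutNoFrameskip-v4', n_envs=1, is_atari=True,
#                            stats_path=None, seed=0, log_dir='logs/test')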
Example #6
def create_test_env(env_id,
                    n_envs=1,
                    is_atari=False,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param env_wrapper: (type) A subclass of gym.Wrapper to wrap the original
                        env with
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([
            make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
            for i in range(n_envs)
        ])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        try:
            class_ = load(spec.entry_point)
        except AttributeError:
            # Backward compatibility with gym
            class_ = load(spec._entry_point)
        # HACK: force SubprocVecEnv for Bullet env that does not
        # have a render argument
        render_name = None
        use_subproc = 'renders' not in inspect.getfullargspec(
            class_.__init__).args
        if not use_subproc:
            render_name = 'renders'
        # Dev branch of pybullet
        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
        # if not use_subproc and render_name is None:
        #     render_name = 'render'

        # Create the env, with the original kwargs, and the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, **{render_name: should_render})
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env,
                              os.path.join(log_dir, "0"),
                              allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([
                make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)
            ])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv(
            [make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env,
                               training=False,
                               **hyperparams['normalize_kwargs'])
            env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
Example #7
parser.add_argument('--val-interval', type=int, default=100)
parser.add_argument('--val-episodes', type=int, default=1)
parser.add_argument('--num-epochs', type=int, default=50)
args = parser.parse_args()

set_global_seeds(args.seed)

logger.configure(os.path.join('logs', args.env, args.note))

dataset = ExpertDataset(expert_path=args.expert,
                        batch_size=128,
                        train_fraction=0.99,
                        verbose=1)

if 'NoFrameskip' in args.env:
    env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
else:
    import gym
    env = gym.make(args.env)

model = PPO2(args.policy, env, verbose=1)

# Pretrain the PPO2 model
# Data should be abundant, so train only one epoch
model.pretrain(dataset,
               peer=args.peer,
               val_interval=args.val_interval,
               val_episodes=args.val_episodes,
               n_epochs=args.num_epochs)

# As an option, you can train the RL agent
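# A hedged sketch of that optional step (timestep budget and save path are assumptions):
# model.learn(total_timesteps=int(1e6))
# model.save(os.path.join('logs', args.env, args.note, 'ppo2_pretrained'))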
Example #8
        if isinstance(normalize, str):
            normalize_kwargs = eval(normalize)
            normalize = True
        del hyperparams['normalize']

    # Delete keys so the dict can be passed to the model constructor
    if 'n_envs' in hyperparams.keys():
        del hyperparams['n_envs']
    del hyperparams['n_timesteps']

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif args.algo in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id)
        env.seed(args.seed)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed)])
        else:
            env = SubprocVecEnv(
                [make_env(env_id, i, args.seed) for i in range(n_envs)])
        if normalize:
            print("Normalizing input and return")
            env = VecNormalize(env, **normalize_kwargs)
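        # For illustration only (assumed config format): 'normalize' may be given as a
        # dict-like string, e.g. "{'norm_obs': True, 'norm_reward': False}", which the
        # eval() call above turns into the keyword arguments passed to VecNormalize.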
Example #9
def snake_wrapper(env, stack_length=3):
    env = DummyVecEnv([lambda: env])
    env = VecFrameStack(env, stack_length)
    return env
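# Illustrative usage (the Gym id is an assumption; any image-observation env works):
# wrapped_env = snake_wrapper(gym.make('Snake-v0'), stack_length=3)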
Example #10
def create_env():
    """Creation of MsPacman environment"""
    env = make_atari_env("MsPacmanNoFrameskip-v0", num_env=16, seed=817)
    env = VecFrameStack(env, n_stack=4)
    return env 
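# Illustrative follow-up (assumed; PPO2 and CnnPolicy imported as in the earlier examples):
# model = PPO2(CnnPolicy, create_env(), verbose=1)
# model.learn(total_timesteps=int(1e7))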
Example #11
    def create_env(n_envs, eval_env=False, no_log=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether it is an environment used for evaluation or not
        :param no_log: (bool) Do not log training when doing hyperparameter optim
            (issue with writing the same file)
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams
        global env_kwargs

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env or no_log else save_path

        # Set initializer and action type for the environment; the standard implementation
        # currently does not support custom types, so pass them here (env_kwargs is global,
        # so the conversion persists across repeated calls)
        if "initializer" in env_kwargs.keys() and isinstance(
                env_kwargs["initializer"], int):
            if env_kwargs["initializer"] == 0:
                env_kwargs["initializer"] = RandomInitializer(
                    env_kwargs.pop("difficulty"))
            elif env_kwargs["initializer"] == 1:
                env_kwargs["initializer"] = CompletelyRandomInitializer()
            else:
                raise RuntimeError('Unsupported initializer "{}"'.format(
                    env_kwargs["initializer"]))

        if "action_type" in env_kwargs.keys() and isinstance(
                env_kwargs["action_type"], int):
            if env_kwargs["action_type"] == "POSITION":
                env_kwargs["action_type"] = ActionType.POSITION
            elif env_kwargs["action_type"] == "TORQUE":
                env_kwargs["action_type"] = ActionType.TORQUE
            elif env_kwargs["action_type"] == "TORQUE_AND_POSITION":
                env_kwargs["action_type"] = ActionType.TORQUE_AND_POSITION
            else:
                raise RuntimeError('Unsupported Action Type"{}"'.format(
                    kwargs["action_type"]))
        else:
            env_kwargs["action_type"] = ActionType.POSITION

        if is_atari:
            if args.verbose > 0:
                print("Using Atari wrapper")
            env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
            # Frame-stacking with 4 frames
            env = VecFrameStack(env, n_stack=4)
        elif algo_ in ['dqn', 'ddpg']:
            if hyperparams.get('normalize', False):
                print("WARNING: normalization not supported yet for DDPG/DQN")
            env = gym.make(env_id, **env_kwargs)
            env.seed(args.seed)
            if env_wrapper is not None:
                env = env_wrapper(env)
        else:
            if n_envs == 1:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             wrapper_class=env_wrapper,
                             log_dir=log_dir,
                             env_kwargs=env_kwargs)
                ])
            else:
                # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                # On most env, SubprocVecEnv does not help and is quite memory hungry
                env = DummyVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=log_dir,
                             wrapper_class=env_wrapper,
                             env_kwargs=env_kwargs) for i in range(n_envs)
                ])
            if normalize:
                # Copy to avoid changing default values by reference
                local_normalize_kwargs = normalize_kwargs.copy()
                # Do not normalize reward for env used for evaluation
                if eval_env:
                    if len(local_normalize_kwargs) > 0:
                        local_normalize_kwargs['norm_reward'] = False
                    else:
                        local_normalize_kwargs = {'norm_reward': False}

                if args.verbose > 0:
                    if len(local_normalize_kwargs) > 0:
                        print("Normalization activated: {}".format(
                            local_normalize_kwargs))
                    else:
                        print("Normalizing input and reward")
                env = VecNormalize(env, **local_normalize_kwargs)

        # Optional Frame-stacking
        if hyperparams.get('frame_stack', False):
            n_stack = hyperparams['frame_stack']
            env = VecFrameStack(env, n_stack)
            print("Stacking {} frames".format(n_stack))
        if args.algo == 'her':
            # Wrap the env if need to flatten the dict obs
            if isinstance(env, VecEnv):
                env = _UnvecWrapper(env)
            env = HERGoalEnvWrapper(env)
        return env
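    # Illustrative usage (assumed): a training env plus a separate evaluation env
    # whose rewards are left unnormalised by the eval_env branch above.
    # env = create_env(n_envs)
    # eval_env = create_env(1, eval_env=True)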
Example #12
    import argparse
    import numpy as np
    from stable_baselines import PPO2, logger
    from stable_baselines.common.cmd_util import make_atari_env

    parser = argparse.ArgumentParser()
    parser.add_argument('expert', type=str, help='Expert path (*.zip)')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed for env.')
    parser.add_argument('--note',
                        type=str,
                        default='test',
                        help='Logging directory')
    parser.add_argument('--env',
                        type=str,
                        default='PongNoFrameskip-v4',
                        help='Environment ID')
    args = parser.parse_args()

    logdir = os.path.join('logs', args.env, args.note)
    logger.configure(logdir)
    logger.info(args)

    env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    model = PPO2.load(args.expert)
    generate_expert_traj(model,
                         save_path=os.path.join(logdir, 'expert'),
                         env=env)
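    # Illustrative link to Example #7 (file name assumed): generate_expert_traj writes an
    # .npz archive that ExpertDataset can load for pretraining, e.g.
    # dataset = ExpertDataset(expert_path=os.path.join(logdir, 'expert.npz'), batch_size=128)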
Example #13
import numpy as np
import torch
import torch.nn as nn

from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

torch.cuda.empty_cache()

seq_len = 4
batch_size = 64
num_envs = 32
seed = 0
n = 8
torch.manual_seed(seed)
np.random.seed(seed)
env = make_atari_env('BreakoutNoFrameskip-v4', num_env=num_envs, seed=seed)
env = VecFrameStack(env, n_stack=seq_len)

filename = "Breakout2"


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(seq_len, 16, 8, stride=4)
        self.conv2 = nn.Conv2d(16, 32, 4, stride=2)
        self.lstm1 = nn.Linear(2592, 512)
        self.lin1 = nn.Linear(512, 128)
        self.q = nn.Linear(128, 1)
        self.lin2 = nn.Linear(128, 4)
        self.training_hidden = (
            torch.zeros((num_envs, seq_len, 128)).to(device),
            torch.zeros((num_envs, seq_len, 128)).to(device))
Example #14
def _create_vectorized_env(env_id, env_kwargs, n_envs, multiprocessing, seed,
                           log_dir, wrappers, normalize, frame_stack,
                           video_path, evaluation, scale, curiosity,
                           buffer_step_data, algorithm_name):
    if n_envs == 1:
        env = DummyVecEnv([
            make_env(env_id,
                     env_kwargs,
                     0,
                     seed,
                     log_dir,
                     wrappers,
                     evaluation=evaluation)
        ])
    else:
        if multiprocessing:
            env = SubprocVecEnv([
                make_env(env_id,
                         env_kwargs,
                         i,
                         seed,
                         log_dir,
                         wrappers,
                         evaluation=evaluation) for i in range(n_envs)
            ])
        else:
            env = DummyVecEnv([
                make_env(env_id,
                         env_kwargs,
                         i,
                         seed,
                         log_dir,
                         wrappers,
                         evaluation=evaluation) for i in range(n_envs)
            ])

    if video_path:
        env = VecImageRecorder(env, video_path, record_obs=True)

    if evaluation:
        env = VecEvaluationWrapper(env)

    # Add normalization wrapper for all algorithms except dqn here to save computations before frame stack
    if normalize and "dqn" not in algorithm_name:
        env = _add_normalization_wrapper(env, n_envs, normalize)

    if curiosity:
        if isinstance(curiosity, bool):
            env = CuriosityWrapper(env)
        else:
            if 'trained_agent' in curiosity:
                path = curiosity.pop('trained_agent')
                env = CuriosityWrapper.load(path, env, **curiosity)
                if len(env.int_rwd_rms.mean) != n_envs:
                    logging.warning(
                        "Skipping loading of curiosity wrapper due to a mismatch in numbers of environments ({} vs {})"
                        .format(len(env.int_ret), n_envs))
                    env = env.venv
            else:
                env = CuriosityWrapper(env, **curiosity)

    if scale:
        if isinstance(scale, dict):
            env = VecScaledFloatFrame(env, **scale)
        else:
            env = VecScaledFloatFrame(env)

    if frame_stack:
        env = VecFrameStack(env, **frame_stack)

    # Add normalization wrapper here to include frame stack when training with dqn.
    if normalize and "dqn" in algorithm_name:
        env = _add_normalization_wrapper(env, n_envs, normalize)

    if buffer_step_data:
        env = VecStepSave(env)

    return env
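# Illustrative call (all argument values are assumptions; the helpers come from the
# surrounding project):
# env = _create_vectorized_env('BreakoutNoFrameskip-v4', env_kwargs={}, n_envs=8,
#                              multiprocessing=True, seed=0, log_dir='logs',
#                              wrappers=[], normalize=False,
#                              frame_stack={'n_stack': 4}, video_path=None,
#                              evaluation=False, scale=False, curiosity=False,
#                              buffer_step_data=False, algorithm_name='ppo2')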
Example #15
        normalize_kwargs = eval(normalize)
        normalize = True
    del hyperparams['normalize']

# Delete keys so the dict can be passed to the model constructor
if 'n_envs' in hyperparams.keys():
    del hyperparams['n_envs']
del hyperparams['n_timesteps']

############### Create the environment and wrap it if necessary

if is_atari:
    print("Using Atari wrapper")
    env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
    # Frame-stacking with 4 frames
    env = VecFrameStack(env, n_stack=4)
    if not args.no_monitor:
        print("WARNING: monitor is not supported yet for atari env")
elif args.algo in ['dqn', 'ddpg']:
    if hyperparams.get('normalize', False):
        print("WARNING: normalization not supported yet for DDPG/DQN")
    env = gym.make(env_id)
    if len(env_params) > 0:
        env = modify_env_params(env, params_path, **env_params)
    elif len(params_ranges) > 0:
        env = RandomUniformEnvParams(env, params_path, params_ranges)
    env.seed(args.seed)
    if not args.no_monitor:
        env = Monitor(env, monitor_log, allow_early_resets=True)
else:
    if n_envs == 1:
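        # Plausible completion, mirroring the equivalent branch in Example #8 (assumed):
        env = DummyVecEnv([make_env(env_id, 0, args.seed)])
    else:
        env = SubprocVecEnv(
            [make_env(env_id, i, args.seed) for i in range(n_envs)])
    if normalize:
        print("Normalizing input and return")
        env = VecNormalize(env, **normalize_kwargs)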
Example #16
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

from agent import ICMAgent
from runner import Runner
from utils import get_args

# constants


if __name__ == '__main__':

    """Argument parsing"""
    args = get_args()

    """Environment"""
    # create the atari environments
    # NOTE: this wrapper automatically resets each env if the episode is done
    env = make_atari_env(args.env_name, num_env=args.num_envs, seed=args.seed)
    env = VecFrameStack(env, n_stack=args.n_stack)

    """Agent"""
    agent = ICMAgent(args.n_stack, args.num_envs, env.action_space.n, lr=args.lr)


    """Train"""
    runner = Runner(agent, env, args.num_envs, args.n_stack, args.rollout_size, args.num_updates,
                    args.max_grad_norm, args.value_coeff, args.entropy_coeff,
                    args.tensorboard, args.log_dir, args.cuda, args.seed)
    runner.train()
Example #17
    def create_env(n_envs, eval_env=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether it is an environment used for evaluation or not
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams
        global env_kwargs

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env else save_path

        if is_atari:
            if args.verbose > 0:
                print("Using Atari wrapper")
            env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
            # Frame-stacking with 4 frames
            env = VecFrameStack(env, n_stack=4)
        elif algo_ in ['dqn', 'ddpg']:
            if hyperparams.get('normalize', False):
                print("WARNING: normalization not supported yet for DDPG/DQN")
            env = gym.make(env_id, **env_kwargs)
            env.seed(args.seed)
            if env_wrapper is not None:
                env = env_wrapper(env)
        else:
            # hacky way to get multiple gui outputs in test environments
            if "Test" in env_id:
                if n_envs == 1:
                    env = SubprocVecEnv([
                        make_env(env_id,
                                 0,
                                 args.seed,
                                 wrapper_class=env_wrapper,
                                 log_dir=log_dir,
                                 env_kwargs=env_kwargs)
                    ])
                else:
                    # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                    # On most env, SubprocVecEnv does not help and is quite memory hungry
                    env = SubprocVecEnv([
                        make_env(env_id,
                                 i,
                                 args.seed,
                                 log_dir=log_dir,
                                 wrapper_class=env_wrapper,
                                 env_kwargs=env_kwargs) for i in range(n_envs)
                    ])
            else:
                if n_envs == 1:
                    env = DummyVecEnv([
                        make_env(env_id,
                                 0,
                                 args.seed,
                                 wrapper_class=env_wrapper,
                                 log_dir=log_dir,
                                 env_kwargs=env_kwargs)
                    ])
                else:
                    # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                    # On most env, SubprocVecEnv does not help and is quite memory hungry
                    env = DummyVecEnv([
                        make_env(env_id,
                                 i,
                                 args.seed,
                                 log_dir=log_dir,
                                 wrapper_class=env_wrapper,
                                 env_kwargs=env_kwargs) for i in range(n_envs)
                    ])
            if normalize:
                if args.verbose > 0:
                    if len(normalize_kwargs) > 0:
                        print("Normalization activated: {}".format(
                            normalize_kwargs))
                    else:
                        print("Normalizing input and reward")
                env = VecNormalize(env, **normalize_kwargs)
        # Optional Frame-stacking
        if hyperparams.get('frame_stack', False):
            n_stack = hyperparams['frame_stack']
            env = VecFrameStack(env, n_stack)
            print("Stacking {} frames".format(n_stack))
            del hyperparams['frame_stack']
        return env
Example #18
def create_test_env(env_id, n_envs=1, is_atari=False,
                    stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param env_wrapper: (type) A subclass of gym.Wrapper to wrap the original
                        env with
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper) for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
    env = make_env(args.seed, vae=vae, teleop=args.teleop,
                   n_stack=hyperparams.get('frame_stack', 1))()

if normalize:
    if hyperparams.get('normalize', False) and args.algo in ['ddpg']:
        print("WARNING: normalization not supported yet for DDPG")
    else:
        print("Normalizing input and return")
        env = VecNormalize(env, **normalize_kwargs)

# Optional Frame-stacking
n_stack = 1
if hyperparams.get('frame_stack', False):
    n_stack = hyperparams['frame_stack']
    if not args.teleop:
        env = VecFrameStack(env, n_stack)
    print("Stacking {} frames".format(n_stack))
    del hyperparams['frame_stack']

# Parse noise string for DDPG
if args.algo == 'ddpg' and hyperparams.get('noise_type') is not None:
    noise_type = hyperparams['noise_type'].strip()
    noise_std = hyperparams['noise_std']
    n_actions = env.action_space.shape[0]
    if 'adaptive-param' in noise_type:
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                            desired_action_stddev=noise_std)
    elif 'normal' in noise_type:
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(n_actions),
                                                        sigma=noise_std * np.ones(n_actions))
    elif 'ornstein-uhlenbeck' in noise_type:
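        # Plausible branch body (assumed; OrnsteinUhlenbeckActionNoise imported alongside
        # NormalActionNoise), mirroring the normal-noise case above:
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))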
Example #20
    def get_rewards(self,
                    skills=[],
                    train_total_timesteps=5000000,
                    eval_times=100,
                    eval_max_steps=int(1e6),
                    model_save_name=None,
                    add_info={}):
        """
        
        :param skills: (list) the available action sequences for the agent,
        e.g. [[0,2,2],[0,1,1]]
        :param train_total_timesteps: (int) total timesteps to train
        :param eval_times: (int) number of evaluation episodes,
        e.g. eval_times=100 evaluates the policy by averaging the reward over 100 episodes
        :param eval_max_steps: (int) maximum timesteps per episode during evaluation
        (deprecated) :param model_save_name: (str) name of the saved model (should not repeat)
        :param add_info: (dict) other information to log in log.txt
        """

        if self.save_tensorboard and self.save_path is not None:
            tensorboard_log = os.path.join(self.save_path,
                                           "model_" + str(self._serial_num))
        else:
            tensorboard_log = None

        env_creator = lambda env: SkillWrapper(
            self.env_creator(env), skills=skills, gamma=self.gamma)

        if self.save_monitor is True:
            monitor_path = os.path.join(self.save_path, "monitor")
            try:
                os.makedirs(monitor_path)
            except OSError as ex:
                if ex.errno == errno.EEXIST and os.path.exists(monitor_path):
                    print("{} exists. ignore".format(monitor_path))
                    pass
                else:
                    raise
        else:
            monitor_path = None

        if "cfg" in self.env_id:

            env = make_doom_env(self.env_id,
                                self.num_cpu,
                                self.seed,
                                extra_wrapper_func=env_creator,
                                logdir=monitor_path)

        else:
            env = VecFrameStack(
                make_atari_env(self.env_id,
                               self.num_cpu,
                               self.seed,
                               extra_wrapper_func=env_creator,
                               logdir=monitor_path), 4)

        model = None
        if self.use_converge_parameter is True:
            model = self.model(self.policy,
                               env,
                               verbose=self.verbose,
                               tensorboard_log=tensorboard_log,
                               n_steps=128,
                               nminibatches=4,
                               lam=0.95,
                               gamma=0.99,
                               noptepochs=4,
                               ent_coef=.01,
                               learning_rate=lambda f: f * 2.5e-4,
                               cliprange=lambda f: f * 0.1)
        else:
            model = self.model(self.policy,
                               env,
                               verbose=self.verbose,
                               tensorboard_log=tensorboard_log)

        self.strat_time = time.time()
        print("start to train agent...")

        callback = None
        if self.evaluate_freq is not None and self.evaluate_freq > 0:
            period_eval_path = os.path.join(self.save_path, "period_eval")
            mkdirs(period_eval_path)
            if "cfg" in self.env_id:

                eval_env = make_doom_env(self.env_id,
                                         self.num_cpu,
                                         self.seed,
                                         extra_wrapper_func=env_creator,
                                         logdir=monitor_path,
                                         wrapper_kwargs={
                                             "episode_life": False,
                                             "clip_rewards": False
                                         })
            else:
                eval_env = VecFrameStack(
                    make_atari_env(self.env_id,
                                   self.num_cpu,
                                   self.seed,
                                   extra_wrapper_func=env_creator,
                                   logdir=period_eval_path,
                                   wrapper_kwargs={
                                       "episode_life": False,
                                       "clip_rewards": False
                                   }), 4)
            callback = self.eval_callback(eval_env,
                                          freq=self.evaluate_freq,
                                          eval_times=eval_times,
                                          eval_max_steps=eval_max_steps,
                                          save_path=period_eval_path)

        model.learn(total_timesteps=train_total_timesteps,
                    reset_num_timesteps=self.reset_num_timesteps,
                    callback=callback)
        print("Finish train agent")

        # evaluate once more because the total timesteps may not be divisible by the evaluation frequency
        if callback is not None:
            callback({"self": model, "eval_now": True}, None)

        if self.save_path is not None:
            if self.preserve_model > 0:

                self.save_model(model, skills=skills)

        env.close()
        # evaluate
        env = VecFrameStack(
            make_atari_env(self.env_id,
                           self.num_cpu,
                           self.seed,
                           extra_wrapper_func=env_creator,
                           logdir=None), 4)
        info = self.evaluate(env, model, eval_times, eval_max_steps)
        try:
            env.close()
        except AttributeError as e:
            print("Ignore : {}".format(e))
        try:
            del model
        except AttributeError as e:
            print("Ignore del model : {}".format(e))

        #log result
        info.update(add_info)
        self.log(info)

        self._serial_num = self._serial_num + 1
        return info["ave_score"], info["ave_action_reward"]
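    # Illustrative usage (the caller object and skill set are assumptions):
    # ave_score, ave_action_reward = trainer.get_rewards(
    #     skills=[[0, 2, 2], [0, 1, 1]], train_total_timesteps=int(5e6), eval_times=100)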