def stable_baseline_training(algorithm, steps, number_maps, random_spawn, random_textures, lstm, random_keys,
                             keys, dimensions, num_cpus, n_stack, clip, complexity, density, mp,
                             eval_occurrence, eval_episodes, eval, experiment_name, env_seed, alg_seed,
                             episode_timeout):
    """
    Runs the OpenAI Stable Baselines implementation of the specified algorithm on the specified
    environment with the specified training configuration.

    Note: For scenarios not using MazeExplorer but using the VizDoom wrapper, the .T transpose on the
    array being fed into the process image method needs to be removed.

    Note: Ensure the relevant maps are in the specified paths under the mazes folder for the simpler
    and manual scenarios.

    :param algorithm: which algorithm to run (currently supports PPO and A2C)
    :param steps: number of steps to run training
    :param number_maps: number of maps to generate and train on
    :param random_spawn: whether or not to randomise the spawn position of the agent
    :param random_textures: whether or not to randomise textures in generated maps
    :param lstm: whether or not to add an LSTM to the network
    :param random_keys: whether or not to randomise key placement upon each new training episode in a given map
    :param keys: number of keys to place in generated maps
    :param dimensions: x, y dimensions of maps to be generated
    :param num_cpus: number of environments in which to train
    :param n_stack: number of frames to stack to feed as a state to the agent
    :param clip: whether or not to clip rewards from the environment
    :param complexity: float between 0 and 1 describing the complexity of the generated mazes
    :param density: float between 0 and 1 describing the density of the generated mazes
    :param mp: whether or not to use multiprocessing for workers
    :param eval_occurrence: period (in steps) at which to run evaluation
    :param eval_episodes: number of episode rollouts to perform during evaluation
    :param eval: whether or not to run evaluation during training
    :param experiment_name: name of the experiment, used in logging and file saving
    :param env_seed: seed to be used for environment generation
    :param alg_seed: seed to be used for the stable-baselines algorithms
    :param episode_timeout: number of steps after which to terminate an episode
    """
    timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H-%M-%S')

    # Generate a file in the log directory containing the training configuration.
    experiment_name = experiment_name + "/" if experiment_name else ""
    OUTPUT_PATH = os.path.join(DIR_PATH, 'results', experiment_name, timestamp)
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    with open(os.path.join(OUTPUT_PATH, 'params.txt'), 'w+') as f:
        f.write(str({'algorithm': algorithm, 'number_maps': number_maps, 'random_spawn': random_spawn,
                     'random_textures': random_textures, 'lstm': lstm, 'random_keys': random_keys,
                     'keys': keys, 'dimensions': dimensions, 'num_cpus': num_cpus, 'clip': clip,
                     'mp': mp, 'n_stack': n_stack, 'env_seed': env_seed, 'alg_seed': alg_seed,
                     'experiment_name': experiment_name, 'eval_occurrence': eval_occurrence,
                     'eval_episodes': eval_episodes, 'episode_timeout': episode_timeout}))

    clip_range = (-1, 1) if clip else False

    mazeexplorer_env = MazeExplorer(number_maps=number_maps, random_spawn=random_spawn,
                                    random_textures=random_textures, random_key_positions=random_keys,
                                    keys=keys, size=dimensions, clip=clip_range, seed=env_seed,
                                    complexity=complexity, density=density)

    # Vectorise the environment, optionally across subprocesses.
    if mp:
        env = SubprocVecEnv([mazeexplorer_env.create_env() for _ in range(num_cpus)])
    else:
        env = DummyVecEnv([mazeexplorer_env.create_env() for _ in range(num_cpus)])

    if n_stack > 0:
        env = VecFrameStack(env, n_stack=n_stack)

    if algorithm == 'ppo':
        algo = PPO2
    elif algorithm == 'a2c':
        algo = A2C
    else:
        raise NotImplementedError("Only PPO and A2C are supported")

    policy = CnnLstmPolicy if lstm else CnnPolicy
    model = algo(policy, env, verbose=1, tensorboard_log=OUTPUT_PATH)

    if eval:
        evaluator = Evaluator(os.path.join(DIR_PATH, "eval_maps"), OUTPUT_PATH, num_cpus, mp, n_stack)
        steps_taken = 0
        print("Training started...")
        while steps_taken < steps:
            print("Training...")
            model.learn(total_timesteps=min(eval_occurrence, steps - steps_taken),
                        reset_num_timesteps=False, seed=alg_seed)
            steps_taken += eval_occurrence
            print("Evaluating...")
            evaluator.evaluate(model, steps_taken, eval_episodes, save=True)  # roll out eval_episodes episodes and save scores
        print("Training completed.")
    else:
        model.learn(total_timesteps=steps, seed=alg_seed)
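# A minimal invocation sketch for stable_baseline_training above. Every argument value here is an
# illustrative assumption (paths, seeds, sizes), not a default taken from the source.
if __name__ == '__main__':
    stable_baseline_training(algorithm='ppo', steps=1000000, number_maps=10, random_spawn=True,
                             random_textures=True, lstm=False, random_keys=True, keys=6,
                             dimensions=(15, 15), num_cpus=4, n_stack=4, clip=True, complexity=0.7,
                             density=0.7, mp=True, eval_occurrence=100000, eval_episodes=100,
                             eval=True, experiment_name='ppo_maze_baseline', env_seed=1, alg_seed=1,
                             episode_timeout=2100)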
import time

import gym
import numpy as np

import pacman
from stable_baselines import PPO2, DQN
from stable_baselines.common.policies import MlpPolicy, CnnPolicy
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import DummyVecEnv, VecFrameStack

# env = gym.make('MsPacman-v0')
env = make_atari_env('MsPacmanNoFrameskip-v0', num_env=1, seed=0)
# env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)

model = PPO2(CnnPolicy, env, verbose=1, vf_coef=1, n_steps=128,
             tensorboard_log="./logs/baseline_MDP")

eval_scores = []
for ep in range(1):
    obs = env.reset()
    total_reward = 0
    done = False
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
def init():
    env = make_atari_env(id, num_env=num_env, seed=seed)
    env = VecFrameStack(env, n_stack=n_stack)
    Globals.env = RewardWrapper(env)
tensorboard_folder = './tensorboard/Pacman/action_mask/'
model_folder = './models/Pacman/base/'
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

if __name__ == '__main__':
    env = DummyVecEnv([lambda: ActionMaskEnv() for i in range(4)])
    env = VecFrameStack(env, 3)
    model = PPO2.load(model_folder + "PPO2" + model_tag)

    done = [False, False, False, False]
    states = None
    action_masks = []
    obs = env.reset()
    while not done[0]:
        action, states = model.predict(obs, states, action_mask=action_masks)
        obs, _, done, infos = env.step(action)
        env.render()
        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
def create_test_env(env_id, n_envs=1, n_agents=1, is_atari=False, stats_path=None, norm_reward=False,
                    seed=0, log_dir='', should_render=True):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param n_agents: (int) number of agents per environment
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param norm_reward: (bool) Whether to normalize rewards or not when using VecNormalize
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet envs, display the GUI
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir) for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        class_ = load(spec._entry_point)
        # HACK: force SubprocVecEnv for Bullet envs that do not have a render argument
        use_subproc = 'renders' not in inspect.getfullargspec(class_.__init__).args

        # Create the env with the original kwargs, the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, renders=should_render)
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env, os.path.join(log_dir, "0"), allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir)])
        else:
            env = DummyVecEnv([_init])
    elif 'Marathon' in env_id:
        from UnityVecEnv import UnityVecEnv
        # env = UnityVecEnv(env_id, inference_mode=True)
        from gym_unity.envs import UnityEnv
        env_path = UnityVecEnv.GetFilePath(env_id, inference_mode=True)
        env = UnityEnv(env_path)
        env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir)])

    # Load saved stats for normalizing input and rewards, and optionally stack frames
    if stats_path is not None:
        if os.path.isfile(os.path.join(stats_path, 'obs_rms.pkl')):
            print("Loading running average")
            env = VecNormalize(env, training=False, norm_reward=norm_reward)
            env.load_running_average(stats_path)
        n_stack_file = os.path.join(stats_path, 'n_stack')
        if os.path.isfile(n_stack_file):
            with open(n_stack_file, 'r') as f:
                n_stack = int(f.read())
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
def create_test_env(env_id, n_envs=1, is_atari=False, stats_path=None, seed=0, log_dir='',
                    should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet envs, display the GUI
    :param hyperparams: (dict) Additional hyperparams (e.g. n_stack, or an env_wrapper entry
        giving a gym.Wrapper subclass to wrap the original env with)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method='spawn' for thread safety
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
                             for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        spec = gym.envs.registry.env_specs[env_id]
        try:
            class_ = load(spec.entry_point)
        except AttributeError:
            # Backward compatibility with gym
            class_ = load(spec._entry_point)
        # HACK: force SubprocVecEnv for Bullet envs that do not have a render argument
        render_name = None
        use_subproc = 'renders' not in inspect.getfullargspec(class_.__init__).args
        if not use_subproc:
            render_name = 'renders'
        # Dev branch of pybullet
        # use_subproc = use_subproc and 'render' not in inspect.getfullargspec(class_.__init__).args
        # if not use_subproc and render_name is None:
        #     render_name = 'render'

        # Create the env with the original kwargs, the new ones overriding them if needed
        def _init():
            # TODO: fix for pybullet locomotion envs
            env = class_(**{**spec._kwargs}, **{render_name: should_render})
            env.seed(0)
            if log_dir is not None:
                env = Monitor(env, os.path.join(log_dir, "0"), allow_early_resets=True)
            return env

        if use_subproc:
            env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
        else:
            env = DummyVecEnv([_init])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards, and optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])
            env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
parser.add_argument('--val-interval', type=int, default=100)
parser.add_argument('--val-episodes', type=int, default=1)
parser.add_argument('--num-epochs', type=int, default=50)
args = parser.parse_args()

set_global_seeds(args.seed)
logger.configure(os.path.join('logs', args.env, args.note))

dataset = ExpertDataset(expert_path=args.expert, batch_size=128, train_fraction=0.99, verbose=1)

if 'NoFrameskip' in args.env:
    env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
else:
    import gym
    env = gym.make(args.env)

model = PPO2(args.policy, env, verbose=1)

# Pretrain the PPO2 model.
# Data should be abundant, so a small number of epochs suffices.
model.pretrain(dataset, peer=args.peer, val_interval=args.val_interval,
               val_episodes=args.val_episodes, n_epochs=args.num_epochs)

# As an option, you can train the RL agent
if isinstance(normalize, str):
    normalize_kwargs = eval(normalize)
    normalize = True
del hyperparams['normalize']

# Delete keys so the dict can be passed to the model constructor
if 'n_envs' in hyperparams.keys():
    del hyperparams['n_envs']
del hyperparams['n_timesteps']

# Create the environment and wrap it if necessary
if is_atari:
    print("Using Atari wrapper")
    env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
    # Frame-stacking with 4 frames
    env = VecFrameStack(env, n_stack=4)
elif args.algo in ['dqn', 'ddpg']:
    if hyperparams.get('normalize', False):
        print("WARNING: normalization not supported yet for DDPG/DQN")
    env = gym.make(env_id)
    env.seed(args.seed)
else:
    if n_envs == 1:
        env = DummyVecEnv([make_env(env_id, 0, args.seed)])
    else:
        env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
    if normalize:
        print("Normalizing input and return")
        env = VecNormalize(env, **normalize_kwargs)
def snake_wrapper(env, stack_length=3):
    env = DummyVecEnv([lambda: env])
    env = VecFrameStack(env, stack_length)
    return env
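# A minimal usage sketch for snake_wrapper above. 'Snake-v0' is a hypothetical env id standing in
# for whatever single snake environment the wrapper is meant to receive.
import gym

env = snake_wrapper(gym.make('Snake-v0'), stack_length=3)  # hypothetical env id
obs = env.reset()  # observations now carry stack_length frames along the channel axis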
def create_env():
    """Creation of the MsPacman environment."""
    env = make_atari_env("MsPacmanNoFrameskip-v0", num_env=16, seed=817)
    env = VecFrameStack(env, n_stack=4)
    return env
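# A minimal training sketch on top of create_env above; the algorithm choice and step count are
# illustrative assumptions, not taken from the original script.
from stable_baselines import PPO2
from stable_baselines.common.policies import CnnPolicy

env = create_env()
model = PPO2(CnnPolicy, env, verbose=1)
model.learn(total_timesteps=100000)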
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optimisation
        (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path

    # Set initializer and action type for the environment. The standard implementation currently
    # does not support custom types, so pass them here (env_kwargs is global, so do not set it
    # again during repeated calls).
    if "initializer" in env_kwargs.keys() and isinstance(env_kwargs["initializer"], int):
        if env_kwargs["initializer"] == 0:
            env_kwargs["initializer"] = RandomInitializer(env_kwargs.pop("difficulty"))
        elif env_kwargs["initializer"] == 1:
            env_kwargs["initializer"] = CompletelyRandomInitializer()
        else:
            raise RuntimeError('Unsupported initializer "{}"'.format(env_kwargs["initializer"]))

    if "action_type" in env_kwargs.keys() and isinstance(env_kwargs["action_type"], str):
        if env_kwargs["action_type"] == "POSITION":
            env_kwargs["action_type"] = ActionType.POSITION
        elif env_kwargs["action_type"] == "TORQUE":
            env_kwargs["action_type"] = ActionType.TORQUE
        elif env_kwargs["action_type"] == "TORQUE_AND_POSITION":
            env_kwargs["action_type"] = ActionType.TORQUE_AND_POSITION
        else:
            raise RuntimeError('Unsupported action type "{}"'.format(env_kwargs["action_type"]))
    else:
        env_kwargs["action_type"] = ActionType.POSITION

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id, **env_kwargs)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        if n_envs == 1:
            env = DummyVecEnv([make_env(env_id, 0, args.seed, wrapper_class=env_wrapper,
                                        log_dir=log_dir, env_kwargs=env_kwargs)])
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most envs, SubprocVecEnv does not help and is quite memory hungry
            env = DummyVecEnv([make_env(env_id, i, args.seed, log_dir=log_dir,
                                        wrapper_class=env_wrapper, env_kwargs=env_kwargs)
                               for i in range(n_envs)])
        if normalize:
            # Copy to avoid changing default values by reference
            local_normalize_kwargs = normalize_kwargs.copy()
            # Do not normalize reward for the env used for evaluation
            if eval_env:
                if len(local_normalize_kwargs) > 0:
                    local_normalize_kwargs['norm_reward'] = False
                else:
                    local_normalize_kwargs = {'norm_reward': False}
            if args.verbose > 0:
                if len(local_normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(local_normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **local_normalize_kwargs)

    # Optional frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))

    if args.algo == 'her':
        # Wrap the env if the dict obs need to be flattened
        if isinstance(env, VecEnv):
            env = _UnvecWrapper(env)
        env = HERGoalEnvWrapper(env)
    return env
import argparse
import os

import numpy as np
from stable_baselines import PPO2, logger
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.gail import generate_expert_traj

parser = argparse.ArgumentParser()
parser.add_argument('expert', type=str, help='Expert path (*.zip)')
parser.add_argument('--seed', type=int, default=0, help='Random seed for env.')
parser.add_argument('--note', type=str, default='test', help='Logging directory')
parser.add_argument('--env', type=str, default='PongNoFrameskip-v4', help='Environment ID')
args = parser.parse_args()

logdir = os.path.join('logs', args.env, args.note)
logger.configure(logdir)
logger.info(args)

env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
model = PPO2.load(args.expert)
generate_expert_traj(model, save_path=os.path.join(logdir, 'expert'), env=env)
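# A minimal follow-up sketch, assuming the script above has run: generate_expert_traj writes an
# .npz next to save_path, which can then feed a behaviour-cloning pretrain. Using the expert model
# itself here is purely illustrative.
from stable_baselines.gail import ExpertDataset

dataset = ExpertDataset(expert_path=os.path.join(logdir, 'expert.npz'), batch_size=128, verbose=1)
model.pretrain(dataset, n_epochs=10)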
import numpy as np
import torch
import torch.nn as nn

from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()

seq_len = 4
batch_size = 64
num_envs = 32
seed = 0
n = 8

torch.manual_seed(seed)
np.random.seed(seed)

env = make_atari_env('BreakoutNoFrameskip-v4', num_env=num_envs, seed=seed)
env = VecFrameStack(env, n_stack=seq_len)

filename = "Breakout2"


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(seq_len, 16, 8, stride=4)
        self.conv2 = nn.Conv2d(16, 32, 4, stride=2)
        self.lstm1 = nn.Linear(2592, 512)
        self.lin1 = nn.Linear(512, 128)
        self.q = nn.Linear(128, 1)
        self.lin2 = nn.Linear(128, 4)
        self.training_hidden = (torch.zeros((num_envs, seq_len, 128)).to(device),
                                torch.zeros((num_envs, seq_len, 128)).to(device))
def _create_vectorized_env(env_id, env_kwargs, n_envs, multiprocessing, seed, log_dir, wrappers,
                           normalize, frame_stack, video_path, evaluation, scale, curiosity,
                           buffer_step_data, algorithm_name):
    if n_envs == 1:
        env = DummyVecEnv([make_env(env_id, env_kwargs, 0, seed, log_dir, wrappers,
                                    evaluation=evaluation)])
    elif multiprocessing:
        env = SubprocVecEnv([make_env(env_id, env_kwargs, i, seed, log_dir, wrappers,
                                      evaluation=evaluation) for i in range(n_envs)])
    else:
        env = DummyVecEnv([make_env(env_id, env_kwargs, i, seed, log_dir, wrappers,
                                    evaluation=evaluation) for i in range(n_envs)])

    if video_path:
        env = VecImageRecorder(env, video_path, record_obs=True)
    if evaluation:
        env = VecEvaluationWrapper(env)

    # Add the normalization wrapper for all algorithms except DQN here, to save computation
    # before the frame stack.
    if normalize and "dqn" not in algorithm_name:
        env = _add_normalization_wrapper(env, n_envs, normalize)

    if curiosity:
        if isinstance(curiosity, bool):
            env = CuriosityWrapper(env)
        elif 'trained_agent' in curiosity:
            path = curiosity.pop('trained_agent')
            env = CuriosityWrapper.load(path, env, **curiosity)
            if len(env.int_rwd_rms.mean) != n_envs:
                logging.warning("Skipping loading of curiosity wrapper due to a mismatch in the "
                                "number of environments ({} vs {})"
                                .format(len(env.int_rwd_rms.mean), n_envs))
                env = env.venv
        else:
            env = CuriosityWrapper(env, **curiosity)

    if scale:
        if isinstance(scale, dict):
            env = VecScaledFloatFrame(env, **scale)
        else:
            env = VecScaledFloatFrame(env)

    if frame_stack:
        env = VecFrameStack(env, **frame_stack)

    # Add the normalization wrapper here to include the frame stack when training with DQN.
    if normalize and "dqn" in algorithm_name:
        env = _add_normalization_wrapper(env, n_envs, normalize)

    if buffer_step_data:
        env = VecStepSave(env)
    return env
normalize_kwargs = eval(normalize)
normalize = True
del hyperparams['normalize']

# Delete keys so the dict can be passed to the model constructor
if 'n_envs' in hyperparams.keys():
    del hyperparams['n_envs']
del hyperparams['n_timesteps']

###############

# Create the environment and wrap it if necessary
if is_atari:
    print("Using Atari wrapper")
    env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
    # Frame-stacking with 4 frames
    env = VecFrameStack(env, n_stack=4)
    if not args.no_monitor:
        print("WARNING: monitor is not supported yet for atari env")
elif args.algo in ['dqn', 'ddpg']:
    if hyperparams.get('normalize', False):
        print("WARNING: normalization not supported yet for DDPG/DQN")
    env = gym.make(env_id)
    if len(env_params) > 0:
        env = modify_env_params(env, params_path, **env_params)
    elif len(params_ranges) > 0:
        env = RandomUniformEnvParams(env, params_path, params_ranges)
    env.seed(args.seed)
    if not args.no_monitor:
        env = Monitor(env, monitor_log, allow_early_resets=True)
else:
    if n_envs == 1:
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack

from agent import ICMAgent
from runner import Runner
from utils import get_args

# constants

if __name__ == '__main__':
    """Argument parsing"""
    args = get_args()

    """Environment"""
    # create the atari environments
    # NOTE: this wrapper automatically resets each env if the episode is done
    env = make_atari_env(args.env_name, num_env=args.num_envs, seed=args.seed)
    env = VecFrameStack(env, n_stack=args.n_stack)

    """Agent"""
    agent = ICMAgent(args.n_stack, args.num_envs, env.action_space.n, lr=args.lr)

    """Train"""
    runner = Runner(agent, env, args.num_envs, args.n_stack, args.rollout_size, args.num_updates,
                    args.max_grad_norm, args.value_coeff, args.entropy_coeff, args.tensorboard,
                    args.log_dir, args.cuda, args.seed)
    runner.train()
def create_env(n_envs, eval_env=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env else save_path

    if is_atari:
        if args.verbose > 0:
            print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif algo_ in ['dqn', 'ddpg']:
        if hyperparams.get('normalize', False):
            print("WARNING: normalization not supported yet for DDPG/DQN")
        env = gym.make(env_id, **env_kwargs)
        env.seed(args.seed)
        if env_wrapper is not None:
            env = env_wrapper(env)
    else:
        # hacky way to get multiple GUI outputs in test environments
        if "Test" in env_id:
            if n_envs == 1:
                env = SubprocVecEnv([make_env(env_id, 0, args.seed, wrapper_class=env_wrapper,
                                              log_dir=log_dir, env_kwargs=env_kwargs)])
            else:
                env = SubprocVecEnv([make_env(env_id, i, args.seed, log_dir=log_dir,
                                              wrapper_class=env_wrapper, env_kwargs=env_kwargs)
                                     for i in range(n_envs)])
        else:
            if n_envs == 1:
                env = DummyVecEnv([make_env(env_id, 0, args.seed, wrapper_class=env_wrapper,
                                            log_dir=log_dir, env_kwargs=env_kwargs)])
            else:
                # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                # On most envs, SubprocVecEnv does not help and is quite memory hungry
                env = DummyVecEnv([make_env(env_id, i, args.seed, log_dir=log_dir,
                                            wrapper_class=env_wrapper, env_kwargs=env_kwargs)
                                   for i in range(n_envs)])
        if normalize:
            if args.verbose > 0:
                if len(normalize_kwargs) > 0:
                    print("Normalization activated: {}".format(normalize_kwargs))
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **normalize_kwargs)

    # Optional frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print("Stacking {} frames".format(n_stack))
        del hyperparams['frame_stack']
    return env
def create_test_env(env_id, n_envs=1, is_atari=False, stats_path=None, seed=0, log_dir='',
                    should_render=True, hyperparams=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param is_atari: (bool)
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet envs, display the GUI
    :param hyperparams: (dict) Additional hyperparams (e.g. n_stack, or an env_wrapper entry
        giving a gym.Wrapper subclass to wrap the original env with)
    :return: (gym.Env)
    """
    # HACK to save logs
    if log_dir is not None:
        os.environ["OPENAI_LOG_FORMAT"] = 'csv'
        os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
        os.makedirs(log_dir, exist_ok=True)
        logger.configure()

    if hyperparams is None:
        hyperparams = {}

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    elif n_envs > 1:
        # start_method='spawn' for thread safety
        env = SubprocVecEnv([make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper)
                             for i in range(n_envs)])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet envs
        env = SubprocVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])
    else:
        env = DummyVecEnv([make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper)])

    # Load saved stats for normalizing input and rewards, and optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])
            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
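# A minimal sketch of calling create_test_env above for an Atari agent; the env id and log
# directory are illustrative assumptions.
env = create_test_env('PongNoFrameskip-v4', n_envs=1, is_atari=True, seed=0, log_dir='logs/enjoy')
obs = env.reset()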
env = make_env(args.seed, vae=vae, teleop=args.teleop,
               n_stack=hyperparams.get('frame_stack', 1))()

if normalize:
    if hyperparams.get('normalize', False) and args.algo in ['ddpg']:
        print("WARNING: normalization not supported yet for DDPG")
    else:
        print("Normalizing input and return")
        env = VecNormalize(env, **normalize_kwargs)

# Optional frame-stacking
n_stack = 1
if hyperparams.get('frame_stack', False):
    n_stack = hyperparams['frame_stack']
    if not args.teleop:
        env = VecFrameStack(env, n_stack)
    print("Stacking {} frames".format(n_stack))
    del hyperparams['frame_stack']

# Parse noise string for DDPG
if args.algo == 'ddpg' and hyperparams.get('noise_type') is not None:
    noise_type = hyperparams['noise_type'].strip()
    noise_std = hyperparams['noise_std']
    n_actions = env.action_space.shape[0]
    if 'adaptive-param' in noise_type:
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                            desired_action_stddev=noise_std)
    elif 'normal' in noise_type:
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(n_actions),
                                                        sigma=noise_std * np.ones(n_actions))
    elif 'ornstein-uhlenbeck' in noise_type:
def get_rewards(self, skills=[], train_total_timesteps=5000000, eval_times=100,
                eval_max_steps=int(1e6), model_save_name=None, add_info={}):
    """
    :param skills: (list) the available action sequences for the agent, e.g. [[0, 2, 2], [0, 1, 1]]
    :param train_total_timesteps: (int) total timesteps to train
    :param eval_times: (int) number of evaluation episodes, e.g. eval_times=100 evaluates the
        policy by averaging the reward over 100 episodes
    :param eval_max_steps: (int) maximum timesteps per episode when evaluating
    :param model_save_name: (str, deprecated) specify the name of the saved model (should not repeat)
    :param add_info: (dict) other information to log in log.txt
    """
    if self.save_tensorboard and self.save_path is not None:
        tensorboard_log = os.path.join(self.save_path, "model_" + str(self._serial_num))
    else:
        tensorboard_log = None

    env_creator = lambda env: SkillWrapper(self.env_creator(env), skills=skills, gamma=self.gamma)

    if self.save_monitor is True:
        monitor_path = os.path.join(self.save_path, "monitor")
        try:
            os.makedirs(monitor_path)
        except OSError as ex:
            if ex.errno == errno.EEXIST and os.path.exists(monitor_path):
                print("{} exists. ignore".format(monitor_path))
            else:
                raise
    else:
        monitor_path = None

    if "cfg" in self.env_id:
        env = make_doom_env(self.env_id, self.num_cpu, self.seed,
                            extra_wrapper_func=env_creator, logdir=monitor_path)
    else:
        env = VecFrameStack(make_atari_env(self.env_id, self.num_cpu, self.seed,
                                           extra_wrapper_func=env_creator,
                                           logdir=monitor_path), 4)

    if self.use_converge_parameter is True:
        model = self.model(self.policy, env, verbose=self.verbose,
                           tensorboard_log=tensorboard_log, n_steps=128, nminibatches=4,
                           lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                           learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1)
    else:
        model = self.model(self.policy, env, verbose=self.verbose,
                           tensorboard_log=tensorboard_log)

    self.strat_time = time.time()
    print("start to train agent...")

    callback = None
    if self.evaluate_freq is not None and self.evaluate_freq > 0:
        period_eval_path = os.path.join(self.save_path, "period_eval")
        mkdirs(period_eval_path)
        if "cfg" in self.env_id:
            eval_env = make_doom_env(self.env_id, self.num_cpu, self.seed,
                                     extra_wrapper_func=env_creator, logdir=monitor_path,
                                     wrapper_kwargs={"episode_life": False,
                                                     "clip_rewards": False})
        else:
            eval_env = VecFrameStack(make_atari_env(self.env_id, self.num_cpu, self.seed,
                                                    extra_wrapper_func=env_creator,
                                                    logdir=period_eval_path,
                                                    wrapper_kwargs={"episode_life": False,
                                                                    "clip_rewards": False}), 4)
        callback = self.eval_callback(eval_env, freq=self.evaluate_freq, eval_times=eval_times,
                                      eval_max_steps=eval_max_steps, save_path=period_eval_path)

    model.learn(total_timesteps=train_total_timesteps,
                reset_num_timesteps=self.reset_num_timesteps, callback=callback)
    print("Finished training agent")

    # Evaluate once more, because the total timesteps are sometimes not divisible by the
    # evaluation frequency.
    if callback is not None:
        callback({"self": model, "eval_now": True}, None)

    if self.save_path is not None and self.preserve_model > 0:
        self.save_model(model, skills=skills)

    env.close()

    # evaluate
    env = VecFrameStack(make_atari_env(self.env_id, self.num_cpu, self.seed,
                                       extra_wrapper_func=env_creator, logdir=None), 4)
    info = self.evaluate(env, model, eval_times, eval_max_steps)
    try:
        env.close()
    except AttributeError as e:
        print("Ignore: {}".format(e))
    try:
        del model
    except AttributeError as e:
        print("Ignore del model: {}".format(e))

    # log the result
    info.update(add_info)
    self.log(info)
    self._serial_num = self._serial_num + 1
    return info["ave_score"], info["ave_action_reward"]