def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
    env.seed(seed)
    return env
def make_mujoco_env(env_id, seed, reward_scale=1.0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    myseed = seed + 1000 * rank if seed is not None else None
    set_global_seeds(myseed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(logger.get_dir(), str(rank))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)
    if reward_scale != 1.0:
        from common.retro_wrappers import RewardScaler
        env = RewardScaler(env, reward_scale)
    return env
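# Usage sketch (illustrative only, not part of the original source). It assumes MPI is
# initialized and that a MuJoCo env id such as 'HalfCheetah-v2' is installed; each MPI
# rank gets its own Monitor file and a rank-offset seed via make_mujoco_env above.
def _example_mujoco_rollout():
    env = make_mujoco_env('HalfCheetah-v2', seed=0, reward_scale=1.0)
    obs = env.reset()
    for _ in range(10):
        # Old-style gym API: step returns (obs, reward, done, info).
        obs, reward, done, info = env.step(env.action_space.sample())
        if done:
            obs = env.reset()
    env.close()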
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0,
             gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None,
             logger_dir=None):
    wrapper_kwargs = wrapper_kwargs or {}

    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(
            game=env_id, max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE, state=gamestate)
    else:
        env = gym.make(env_id)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        keys = env.observation_space.spaces.keys()
        env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
def _thunk():
    env = make_atari(env_id)
    env.seed(seed + rank)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    return wrap_deepmind(env, **wrapper_kwargs)
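# A minimal sketch (assumed context, not from the original source) of how thunks like
# the one above are typically collected and handed to a vectorized env wrapper. The
# SubprocVecEnv import path follows the OpenAI baselines layout and is an assumption
# about this repo's dependencies; the helper name make_atari_vec_env is hypothetical.
def make_atari_vec_env(env_id, num_env, seed, wrapper_kwargs=None):
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
    wrapper_kwargs = wrapper_kwargs or {}

    def make_thunk(rank):
        # Each rank gets its own seed offset and Monitor file, mirroring _thunk above.
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    return SubprocVecEnv([make_thunk(rank) for rank in range(num_env)])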
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = Monitor(env, logger.get_dir())
    env = wrap_deepmind(env)

    model = cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    fit(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )

    env.close()
    sess = tf.get_default_session()
    del sess
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for the goal-based robotics tasks.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
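# Example call (a hedged sketch, not from the original source). FetchReach is a
# goal-based env whose Dict observation is flattened by make_robotics_env above;
# 'is_success' then appears in the Monitor's episode logs. The env id is illustrative
# and assumes gym's robotics extras and mujoco_py are installed.
def _example_robotics_rollout():
    env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
    obs = env.reset()
    obs, reward, done, info = env.step(env.action_space.sample())
    print(info.get('is_success'))
    env.close()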
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0,
             gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None,
             env_kwargs=None, logger_dir=None, initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}

    # Support env ids of the form 'module:EnvId' by importing the module first.
    if ':' in env_id:
        import re
        import importlib
        module_name = re.sub(':.*', '', env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)

    env = gym.make(env_id, **env_kwargs)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        keys = env.observation_space.spaces.keys()
        env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0,
             gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None,
             logger_dir=None, cloth_cfg_path=None, render_path=None,
             start_state_path=None):
    """Daniel: make a single instance of the env, to be wrapped in a VecEnv for parallelism.

    We need a special case for the cloth env, which doesn't actually use
    `gym.make(...)` because we have a custom configuration.
    """
    wrapper_kwargs = wrapper_kwargs or {}

    if env_type == 'cloth':
        print("Env Type is Cloth")
        assert cloth_cfg_path is not None
        from gym_cloth.envs import ClothEnv
        env = ClothEnv(cloth_cfg_path, subrank=subrank, start_state_path=start_state_path)
        print('Created ClothEnv, seed {}, mpi_rank {}, subrank {}.'.format(
            seed, mpi_rank, subrank))
        print('start_state_path: {}'.format(start_state_path))
        # Daniel: render, but currently only works if we have one env, not a vec ...
        if render_path is not None:
            env.render(filepath=render_path)
    elif env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(
            game=env_id, max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE, state=gamestate)
    else:
        print("USING WRONG COMMAND")
        env = gym.make(env_id)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        keys = env.observation_space.spaces.keys()
        env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    # Adi: Let's return the actual env for now instead of the wrapped version for
    # simplicity. Can change this back later.
    env = env.unwrapped
    return env
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0,
             gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None,
             env_kwargs=None, logger_dir=None, initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}

    if ':' in env_id:
        import re
        import importlib
        module_name = re.sub(':.*', '', env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)

    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(
            game=env_id, max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE, state=gamestate)
    else:
        env = gym.make(env_id, **env_kwargs)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
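# Example call (a minimal sketch, not from the original source), assuming the usual
# Atari wrappers defined above are importable: build one monitored Atari env for
# MPI rank 0, subrank 0, with frame stacking enabled in wrap_deepmind.
def _example_make_env():
    env = make_env('BreakoutNoFrameskip-v4', 'atari', mpi_rank=0, subrank=0, seed=42,
                   wrapper_kwargs={'frame_stack': True}, logger_dir=logger.get_dir())
    obs = env.reset()
    env.close()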
def fit(environ, env_id, num_timesteps, seed, model_path=None):
    # atari
    if environ == 'atari':
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
        workerseed = seed + 10000 * rank if seed is not None else None
        set_global_seeds(workerseed)
        env = make_atari(env_id)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Cnn(name=name, ob_space=ob_space, ac_space=ac_space)

        env = Monitor(env, logger.get_dir() and
                      os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)
        env = wrap_deepmind(env)
        env.seed(workerseed)

        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
                    optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                    gamma=0.99, lam=0.95, max_timesteps=int(num_timesteps * 1.1),
                    schedule='linear')
        env.close()
        sess.close()
        return pi

    # mujoco
    if environ == 'mujoco':
        from utils.cmd import make_mujoco_env

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=64, num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        pi = PPOSGD(
            env, policy_fn, env.observation_space, env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
        env.close()
        sess.close()
        return pi

    # humanoid
    if environ == 'humanoid':
        import gym
        from utils.cmd import make_mujoco_env

        env_id = 'Humanoid-v2'

        class RewScale(gym.RewardWrapper):
            def __init__(self, env, scale):
                gym.RewardWrapper.__init__(self, env)
                self.scale = scale

            def reward(self, r):
                return r * self.scale

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=64, num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)

        # The parameters below were the best found in a simple random search;
        # they are good enough to make the humanoid walk, but whether they are
        # an absolute best is not certain.
        env = RewScale(env, 0.1)
        pi = PPOSGD(
            env, policy_fn, env.observation_space, env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
        )
        env.close()
        if model_path:
            Model().save_state(model_path)
        sess.close()
        return pi

    # robotics
    if environ == 'robotics':
        import mujoco_py
        from utils.cmd import make_robotics_env

        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        mujoco_py.ignore_mujoco_warnings().__enter__()
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=256, num_hid_layers=3)

        pi = PPOSGD(
            env, policy_fn, env.observation_space, env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
            gamma=0.99, lam=0.95, schedule='linear',
        )
        env.close()
        sess.close()
        return pi
def main():
    parser = arg_parser()
    parser.add_argument('--platform', help='environment choice',
                        choices=['atari', 'mujoco'], default='atari')
    platform_args, environ_args = parser.parse_known_args()
    platform = platform_args.platform

    rank = MPI.COMM_WORLD.Get_rank()

    # atari
    if platform == 'atari':
        from bench import Monitor
        from utils.cmd import atari_arg_parser, make_atari, wrap_deepmind
        from policies.nohashingcnn import CnnPolicy

        args = atari_arg_parser().parse_known_args()[0]
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
        workerseed = args.seed + 10000 * rank
        set_global_seeds(workerseed)

        env = make_atari(args.env)
        env = Monitor(env, logger.get_dir() and
                      os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)
        env = wrap_deepmind(env)
        env.seed(workerseed)

        model = TRPO(CnnPolicy, env.observation_space, env.action_space)
        sess = model.single_threaded_session().__enter__()
        # model.reset_graph_and_vars()
        model.init_vars()
        fit(model, env, timesteps_per_batch=512, max_kl=0.001, cg_iters=10,
            cg_damping=1e-3, max_timesteps=int(args.num_timesteps * 1.1),
            gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
        sess.close()
        env.close()

    # mujoco
    if platform == 'mujoco':
        from policies.ppo1mlp import PPO1Mlp
        from utils.cmd import make_mujoco_env, mujoco_arg_parser

        args = mujoco_arg_parser().parse_known_args()[0]
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * rank

        env = make_mujoco_env(args.env, workerseed)

        def policy(name, observation_space, action_space):
            return PPO1Mlp(name, observation_space, action_space,
                           hid_size=32, num_hid_layers=2)

        model = TRPO(policy, env.observation_space, env.action_space)
        sess = model.single_threaded_session().__enter__()
        model.init_vars()
        fit(model, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
            cg_damping=0.1, max_timesteps=args.num_timesteps,
            gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
        sess.close()
        env.close()