# Imports needed by the code below. Third-party modules are standard; the
# project-local paths (bench, utils.cmd, policies.*) are taken from imports
# that already appear later in this file. Names whose origin is not shown
# anywhere here (logger, set_global_seeds, Model, PPOSGD, TRPO, PPO1Cnn,
# arg_parser) still need imports matching the actual project layout.
import os

import gym
from mpi4py import MPI

from bench import Monitor
from policies.ppo1mlp import PPO1Mlp
from utils.cmd import make_atari, wrap_deepmind
def make_mujoco_env(env_id, seed, reward_scale=1.0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    # Offset the seed per MPI worker so parallel rollouts are decorrelated.
    myseed = seed + 1000 * rank if seed is not None else None
    set_global_seeds(myseed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(
        logger.get_dir(), str(rank))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)
    if reward_scale != 1.0:
        from common.retro_wrappers import RewardScaler
        env = RewardScaler(env, reward_scale)
    return env
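# Usage sketch (illustrative only: 'Hopper-v2' is a stand-in env id, and the
# script is assumed to run under an MPI launch, e.g. `mpirun -np 4 python ...`):
#
#     env = make_mujoco_env('Hopper-v2', seed=0, reward_scale=1.0)
#     ob = env.reset()
#     ob, rew, done, _ = env.step(env.action_space.sample())
#     env.close()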
def fit(environ, env_id, num_timesteps, seed, model_path=None):
    # atari
    if environ == 'atari':
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() \
            if seed is not None else None
        set_global_seeds(workerseed)
        env = make_atari(env_id)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Cnn(name=name, ob_space=ob_space, ac_space=ac_space)

        env = Monitor(env, logger.get_dir() and
                      os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)
        env = wrap_deepmind(env)
        env.seed(workerseed)
        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    timesteps_per_actorbatch=256,
                    clip_param=0.2, entcoeff=0.01,
                    optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                    gamma=0.99, lam=0.95,
                    max_timesteps=int(num_timesteps * 1.1),
                    schedule='linear')
        env.close()
        sess.close()
        return pi

    # mujoco
    if environ == 'mujoco':
        from utils.cmd import make_mujoco_env
        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=64, num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    max_timesteps=num_timesteps,
                    timesteps_per_actorbatch=2048,
                    clip_param=0.2, entcoeff=0.0,
                    optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                    gamma=0.99, lam=0.95, schedule='linear')
        env.close()
        sess.close()
        return pi

    # humanoid (hard-coded to the Humanoid-v2 task)
    if environ == 'humanoid':
        import gym
        from utils.cmd import make_mujoco_env
        env_id = 'Humanoid-v2'

        class RewScale(gym.RewardWrapper):
            def __init__(self, env, scale):
                gym.RewardWrapper.__init__(self, env)
                self.scale = scale

            def reward(self, r):
                return r * self.scale

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=64, num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        # The parameters below were the best found in a simple random search;
        # they are good enough to make the humanoid walk, but whether they
        # are an absolute optimum is uncertain.
        env = RewScale(env, 0.1)
        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    max_timesteps=num_timesteps,
                    timesteps_per_actorbatch=2048,
                    clip_param=0.2, entcoeff=0.0,
                    optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                    gamma=0.99, lam=0.95, schedule='linear')
        env.close()
        if model_path:
            Model().save_state(model_path)
        sess.close()
        return pi

    # robotics
    if environ == 'robotics':
        import mujoco_py
        from utils.cmd import make_robotics_env
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        mujoco_py.ignore_mujoco_warnings().__enter__()
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=256, num_hid_layers=3)

        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    max_timesteps=num_timesteps,
                    timesteps_per_actorbatch=2048,
                    clip_param=0.2, entcoeff=0.0,
                    optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
                    gamma=0.99, lam=0.95, schedule='linear')
        env.close()
        sess.close()
        return pi
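# Example invocations of `fit` (hypothetical values; `fit` returns the trained
# policy, which can then be rolled out for evaluation). The humanoid branch
# ignores env_id, so any placeholder works there:
#
#     pi = fit('mujoco', 'Hopper-v2', num_timesteps=int(1e6), seed=0)
#     pi = fit('humanoid', None, num_timesteps=int(2e7), seed=0,
#              model_path='/tmp/humanoid_policy')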
def main():
    parser = arg_parser()
    parser.add_argument('--platform', help='platform choice',
                        choices=['atari', 'mujoco'], default='atari')
    platform_args, environ_args = parser.parse_known_args()
    platform = platform_args.platform
    rank = MPI.COMM_WORLD.Get_rank()

    # atari
    if platform == 'atari':
        from bench import Monitor
        from utils.cmd import atari_arg_parser, make_atari, wrap_deepmind
        from policies.nohashingcnn import CnnPolicy

        args = atari_arg_parser().parse_known_args()[0]
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
        workerseed = args.seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_atari(args.env)
        env = Monitor(env, logger.get_dir() and
                      os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)
        env = wrap_deepmind(env)
        env.seed(workerseed)

        model = TRPO(CnnPolicy, env.observation_space, env.action_space)
        sess = model.single_threaded_session().__enter__()
        # model.reset_graph_and_vars()
        model.init_vars()
        # NOTE: `fit` here is the TRPO training routine that pairs with
        # `TRPO`/`init_vars`, not the PPO `fit` defined above; its import is
        # not shown in this file.
        fit(model, env, timesteps_per_batch=512, max_kl=0.001, cg_iters=10,
            cg_damping=1e-3, max_timesteps=int(args.num_timesteps * 1.1),
            gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4,
            entcoeff=0.00)
        sess.close()
        env.close()

    # mujoco
    if platform == 'mujoco':
        from policies.ppo1mlp import PPO1Mlp
        from utils.cmd import make_mujoco_env, mujoco_arg_parser

        args = mujoco_arg_parser().parse_known_args()[0]
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * rank
        env = make_mujoco_env(args.env, workerseed)

        def policy(name, observation_space, action_space):
            return PPO1Mlp(name, observation_space, action_space,
                           hid_size=32, num_hid_layers=2)

        model = TRPO(policy, env.observation_space, env.action_space)
        sess = model.single_threaded_session().__enter__()
        model.init_vars()
        fit(model, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
            cg_damping=0.1, max_timesteps=args.num_timesteps,
            gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
        sess.close()
        env.close()
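# Standard entry-point guard so training runs when the script is executed
# directly; `main()` is otherwise never invoked.
if __name__ == '__main__':
    main()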