def test_lstm_example():
    import tensorflow as tf
    from common import policies, models, cmd_util
    from common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break

        assert step_counter > 5
def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
    # learn_fn(env=env_test)
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3)
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''

    if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(
                v, variables_dict2[k], atol=0.01,
                err_msg='saved and loaded variable {} value mismatch'.format(k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
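# --- Usage sketch (not from the original source) -----------------------------
# A minimal example of how a make_vec_env-style helper like the one above is
# typically called. The environment id 'PongNoFrameskip-v4' and all argument
# values are illustrative assumptions; the call relies on the surrounding
# module providing make_vec_env together with its dependencies (make_env,
# SubprocVecEnv, DummyVecEnv, MPI, logger, set_global_seeds).
def _example_make_vec_env_usage():
    venv = make_vec_env('PongNoFrameskip-v4', env_type='atari',
                        num_env=4, seed=0)
    obs = venv.reset()  # batched observations: (num_env,) + observation shape
    # sample one action per sub-environment and take a single vectorized step
    actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
    obs, rewards, dones, infos = venv.step(actions)
    venv.close()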
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 cloth_cfg_path=None,
                 render_path=None,
                 start_state_path=None):
    """Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.

    Daniel: the above docs from baselines seem out of date; do ALL env types go
    here? Also, we're adding arguments for the cloth env: the config path, the
    render path, and the starting state path (the last one is optional for the
    cloth).
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank, cloth_cfg_path=None, render_path=None, start_state_path=None):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir,
            cloth_cfg_path=cloth_cfg_path,
            render_path=render_path,
            start_state_path=start_state_path,
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([
            make_thunk(i + start_index,
                       cloth_cfg_path=cloth_cfg_path,
                       render_path=None,  # Daniel: for now
                       start_state_path=start_state_path)
            for i in range(num_env)
        ])
    else:
        return DummyVecEnv([
            make_thunk(start_index,
                       cloth_cfg_path=cloth_cfg_path,
                       render_path=render_path,
                       start_state_path=start_state_path)
        ])
def make_mujoco_env(env_id, seed, normalize=False, training=True):
    def make_env():
        env = gym.make(env_id)
        env.seed(seed)
        return env

    env = DummyVecEnv([make_env])
    np.random.seed(seed)
    torch.manual_seed(seed)
    if normalize:
        env = VecNormalize(env, training=training)
    return env
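# --- Usage sketch (not from the original source) -----------------------------
# Shows how make_mujoco_env above might be used to build a single, optionally
# normalized MuJoCo environment. 'HalfCheetah-v2', the seed, and the loop
# length are illustrative assumptions; the call relies on the surrounding
# module's gym, numpy, torch, DummyVecEnv, and VecNormalize imports.
def _example_make_mujoco_env_usage():
    venv = make_mujoco_env('HalfCheetah-v2', seed=0, normalize=True, training=True)
    obs = venv.reset()                         # shape: (1,) + observation shape
    for _ in range(10):
        action = [venv.action_space.sample()]  # one action for the single env
        obs, reward, done, info = venv.step(action)
    venv.close()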
def make_vec_envs(evaluation):
    def env_thunk(rank):
        return lambda: self.make_env(
            seed=int(seed), rank=rank, evaluation=evaluation, env_id=env_id)

    env_fns = [env_thunk(i) for i in range(num_processes)]
    use_dummy = len(env_fns) == 1 or sys.platform == "darwin" or synchronous
    return VecPyTorch(
        DummyVecEnv(env_fns, render=render) if use_dummy else SubprocVecEnv(env_fns))
def make_vec_envs(
    self,
    num_processes,
    gamma,
    render,
    synchronous,
    env_id,
    add_timestep,
    seed,
    evaluation,
    time_limit,
    num_frame_stack=None,
    **env_args,
):
    envs = [
        functools.partial(  # thunk
            self.make_env,
            rank=i,
            env_id=env_id,
            add_timestep=add_timestep,
            seed=seed,
            evaluation=evaluation,
            time_limit=time_limit,
            evaluating=evaluation,
            **env_args,
        )
        for i in range(num_processes)
    ]

    if len(envs) == 1 or sys.platform == "darwin" or synchronous:
        envs = DummyVecEnv(envs, render=render)
    else:
        envs = SubprocVecEnv(envs)

    # if (
    #     envs.observation_space.shape
    #     and len(envs.observation_space.shape) == 1
    # ):
    #     if gamma is None:
    #         envs = VecNormalize(envs, ret=False)
    #     else:
    #         envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack)
    # elif len(envs.observation_space.shape) == 3:
    #     envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''

    if learn_fn == 'deepq':
        # TODO enable multiple DQN models to be useable at the same time
        # github issue https://github.com/openai/baselines/issues/656
        return

    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())
def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None,
                 start_index=0, reward_scale=1.0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id) if env_type == 'atari' else gym.make(env_id)
            env.seed(seed + 10000 * mpi_rank + rank if seed is not None else None)
            env = Monitor(env,
                          logger.get_dir() and os.path.join(logger.get_dir(),
                                                            str(mpi_rank) + '.' + str(rank)),
                          allow_early_resets=True)
            if env_type == 'atari':
                return wrap_deepmind(env, **wrapper_kwargs)
            elif reward_scale != 1:
                return RewardScaler(env, reward_scale)
            else:
                return env
        return _thunk

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_env(start_index)])
def main():
    parser = arg_parser()
    parser.add_argument('--platform', help='environment choice',
                        choices=['atari', 'mujoco', 'humanoid', 'robotics'],
                        default='atari')
    platform_args, environ_args = parser.parse_known_args()
    platform = platform_args.platform
    logger.configure()

    # atari
    if platform == 'atari':
        parser = atari_arg_parser()
        parser.add_argument('--policy', help='Policy architecture',
                            choices=['cnn', 'lstm', 'lnlstm', 'mlp'],
                            default='cnn')
        args = parser.parse_known_args()[0]

        # fit(
        #     args.env,
        #     num_timesteps=args.num_timesteps,
        #     seed=args.seed,
        #     policy=args.policy
        # )

        sess = Agent().init_session().__enter__()

        env = VecFrameStack(make_atari_env(args.env, 8, args.seed), 4)
        policy = {'cnn': Convnet,
                  'lstm': Lstm,
                  'lnlstm': LnLstm,
                  'mlp': Mlp}[args.policy]
        fit(
            policy=policy,
            env=env,
            nsteps=128,
            nminibatches=8,
            lam=0.95,
            gamma=0.99,
            noptepochs=4,
            log_interval=1,
            ent_coef=.01,
            lr=lambda f: f * 2.5e-4,
            cliprange=lambda f: f * 0.1,
            total_timesteps=int(args.num_timesteps * 1.1)
        )

        sess.close()
        env.close()
        del sess

    # mujoco
    if platform == 'mujoco':
        args = mujoco_arg_parser().parse_known_args()[0]
        sess = Agent().init_session().__enter__()

        from utils.monitor import Monitor

        def make_env():
            env = make_mujoco_env(args.env, args.seed)
            # env = gym.make(env_id)
            env = Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        model = fit(
            policy=Mlp,
            env=env,
            nsteps=2048,
            nminibatches=32,
            lam=0.95,
            gamma=0.99,
            noptepochs=10,
            log_interval=1,
            ent_coef=0.0,
            lr=3e-4,
            cliprange=0.2,
            total_timesteps=args.num_timesteps
        )
        # return model, env

        if args.play:
            logger.log("Running trained model")
            obs = np.zeros((env.num_envs,) + env.observation_space.shape)
            obs[:] = env.reset()
            while True:
                actions = model.step(obs)[0]
                obs[:] = env.step(actions)[0]
                env.render()

        sess.close()
        env.close()
        del sess