def test_lstm_example():
    import tensorflow as tf
    from baselines_merl.common import policies, models, cmd_util
    from baselines_merl.common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break

        assert step_counter > 5

def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)

    # train a reference model with the default model class
    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    # train an identically seeded model using microbatched gradient accumulation
    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
    # learn_fn(env=env_test)
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    # the resulting weights should match up to numerical tolerance
    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3)

def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
    def seeded_env_fn():
        env = env_fn()
        env.seed(0)
        return env

    np.random.seed(0)
    env = DummyVecEnv([seeded_env_fn])

    with tf.Graph().as_default(), tf.Session(config=_sess_config).as_default():
        tf.set_random_seed(0)
        model = learn_fn(env)

        sum_rew = 0
        done = True
        for i in range(n_trials):
            if done:
                # start a new episode and reset the recurrent state (if any)
                obs = env.reset()
                state = model.initial_state
            if state is not None:
                # recurrent policy: thread the state and done mask through step
                a, v, state, _ = model.step(obs, S=state, M=[False])
            else:
                a, v, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(a)
            sum_rew += float(rew)

        print("Reward in {} trials is {}".format(n_trials, sum_rew))
        assert sum_rew > min_reward_fraction * n_trials, \
            'sum of rewards {} is less than {} of the total number of trials {}'.format(
                sum_rew, min_reward_fraction, n_trials)

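# Example usage of simple_test (a sketch, not part of the test suite). It
# assumes `get_learn_function`, `gym`, and `partial` are available here, as
# they are in the other tests in this section; the 'ppo2' algorithm, 'mlp'
# network, timestep budget, and reward fraction are illustrative choices only.
#
#   learn = get_learn_function('ppo2')
#   simple_test(
#       env_fn=lambda: gym.make('CartPole-v0'),
#       learn_fn=partial(learn, network='mlp', total_timesteps=10000, seed=0),
#       min_reward_fraction=0.5,
#   )
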
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''

    if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(v, variables_dict2[k], atol=0.01,
                                       err_msg='saved and loaded variable {} value mismatch'.format(k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)

def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''

    if learn_fn == 'deepq':
        # TODO enable multiple DQN models to be usable at the same time
        # github issue https://github.com/openai/baselines/issues/656
        return

    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)

    # build each model in its own graph and session, then make sure both
    # can still be stepped
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())

def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES):
    env = DummyVecEnv([env_fn])

    with tf.Graph().as_default(), tf.Session(config=_sess_config).as_default():
        model = learn_fn(env)

        # roll out n_trials episodes and compare the average episode reward
        # against the required minimum
        observations, actions, rewards = rollout(env, model, n_trials)
        rewards = [sum(r) for r in rewards]

        avg_rew = sum(rewards) / n_trials
        print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
        assert avg_rew > min_avg_reward, \
            'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)

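# Example usage of reward_per_episode_test (a sketch only). It assumes
# `get_learn_function` and `gym` are importable here, as in the other tests in
# this section; the 'ppo2' algorithm, timestep budget, and reward threshold of
# 100 are illustrative assumptions, not values mandated by this helper.
#
#   learn = get_learn_function('ppo2')
#   reward_per_episode_test(
#       env_fn=lambda: gym.make('CartPole-v0'),
#       learn_fn=lambda env: learn(env=env, network='mlp', total_timesteps=30000, seed=0),
#       min_avg_reward=100,
#   )
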
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])

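# Example usage of make_vec_env (a sketch with illustrative arguments): build
# four parallel environments; with num_env=1 a DummyVecEnv is returned instead
# of a SubprocVecEnv. 'PongNoFrameskip-v4' is just a representative Atari env id.
#
#   venv = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=4, seed=0)
#   obs = venv.reset()
#   venv.close()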