def test_get_qval_max_pooling(self, filters, strides, pool_strides,
                              pool_shapes):
    """Check get_qval of a ContinuousCNNQFunction built with max pooling.

    The CNN and MLP-merge models are patched with simple stand-ins, and
    every Q-value returned is asserted to equal 0.5.

    Args:
        filters: Forwarded to ContinuousCNNQFunction as ``filters``.
        strides: Forwarded to ContinuousCNNQFunction as ``strides``.
        pool_strides: Forwarded as ``pool_strides``.
        pool_shapes: Forwarded as ``pool_shapes``.

    """
    env = MetaRLEnv(DummyDiscretePixelEnv())
    env.reset()

    cnn_target = ('metarl.tf.models.'
                  'cnn_mlp_merge_model.CNNModelWithMaxPooling')
    mlp_target = ('metarl.tf.models.'
                  'cnn_mlp_merge_model.MLPMergeModel')
    # Both patches only need to be active while the Q-function is built.
    with mock.patch(cnn_target, new=SimpleCNNModelWithMaxPooling), \
            mock.patch(mlp_target, new=SimpleMLPMergeModel):
        qf = ContinuousCNNQFunction(env_spec=env.spec,
                                    filters=filters,
                                    strides=strides,
                                    max_pooling=True,
                                    pool_strides=pool_strides,
                                    pool_shapes=pool_shapes)

    obs, _, _, _ = env.step(1)
    act = np.full(env.action_space.shape, 0.5)
    expected = np.full((1, ), 0.5)

    # Single observation/action pair.
    single = qf.get_qval([obs], [act])
    assert np.array_equal(single[0], expected)

    # Batch of three identical pairs.
    batch = qf.get_qval([obs] * 3, [act] * 3)
    for q_value in batch:
        assert np.array_equal(q_value, expected)
def test_dm_control_tf_policy(self):
    """Run one TRPO epoch on the first task in ALL_TASKS."""
    first_task = ALL_TASKS[0]

    trpo_kwargs = dict(
        max_path_length=5,
        discount=0.99,
        max_kl_step=0.01,
    )

    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = MetaRLEnv(DmControlEnv.from_suite(*first_task))
        spec = env.spec

        policy = GaussianMLPPolicy(env_spec=spec, hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=spec)
        algo = TRPO(env_spec=spec,
                    policy=policy,
                    baseline=baseline,
                    **trpo_kwargs)

        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=10)

        env.close()
def test_is_pickleable(self, obs_dim, action_dim):
    """Test that ContinuousMLPPolicy survives a pickle round trip.

    Args:
        obs_dim: Observation dimension passed to DummyBoxEnv.
        action_dim: Action dimension passed to DummyBoxEnv.

    """
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    model_target = ('metarl.tf.policies.'
                    'continuous_mlp_policy.MLPModel')
    with mock.patch(model_target, new=SimpleMLPModel):
        policy = ContinuousMLPPolicy(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)
    flat_obs = obs.flatten()

    with tf.compat.v1.variable_scope('ContinuousMLPPolicy/MLPModel',
                                     reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # Overwrite the variable with ones so the comparison is non-trivial.
    return_var.load(tf.ones_like(return_var).eval())

    before = self.sess.run(policy.model.outputs,
                           feed_dict={policy.model.input: [flat_obs]})

    serialized = pickle.dumps(policy)
    # Restore into a brand-new graph and session.
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        restored = pickle.loads(serialized)
        after = sess.run(restored.model.outputs,
                         feed_dict={restored.model.input: [flat_obs]})
        assert np.array_equal(before, after)
def test_get_qval_sym(self, obs_dim, action_dim):
    """Check that get_qval_sym agrees with get_qval.

    Args:
        obs_dim: Observation dimension passed to DummyBoxEnv.
        action_dim: Action dimension passed to DummyBoxEnv; also the
            shape of the constant 0.5 action.

    """
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    merge_target = ('metarl.tf.q_functions.'
                    'continuous_mlp_q_function.MLPMergeModel')
    with mock.patch(merge_target, new=SimpleMLPMergeModel):
        qf = ContinuousMLPQFunction(env_spec=env.spec)

    env.reset()
    raw_obs, _, _, _ = env.step(1)
    obs = raw_obs.flatten()
    act = np.full(action_dim, 0.5).flatten()

    direct = qf.get_qval([obs], [act])

    # Fresh placeholders for the symbolic graph built under 'another'.
    obs_ph = tf.compat.v1.placeholder(tf.float32,
                                      shape=(None, obs.shape[0]))
    act_ph = tf.compat.v1.placeholder(tf.float32,
                                      shape=(None, act.shape[0]))
    q_sym = qf.get_qval_sym(obs_ph, act_ph, 'another')
    symbolic = self.sess.run(q_sym, feed_dict={obs_ph: [obs], act_ph: [act]})

    assert np.array_equal(direct, symbolic)
    assert np.array_equal(symbolic[0], np.full((1, ), 0.5))
def setup_method(self):
    """Create a dummy box env and a matching observation placeholder."""
    super().setup_method()
    self.env = MetaRLEnv(DummyBoxEnv())
    flat_obs_dim = self.env.observation_space.flat_dim
    self.obs_var = tf.compat.v1.placeholder(tf.float32,
                                            shape=[None, None, flat_obs_dim],
                                            name='obs')
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two PointEnv instances with different goals.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    goals = ((-1., 0.), (1., 0.))
    with LocalTFRunner(ctxt) as runner:
        # One normalized PointEnv per goal, wrapped into a single env.
        sub_envs = [MetaRLEnv(normalize(PointEnv(goal=goal))) for goal in goals]
        env = MultiEnvWrapper(sub_envs)
        spec = env.spec

        policy = GaussianMLPPolicy(env_spec=spec)
        baseline = LinearFeatureBaseline(env_spec=spec)

        hyperparams = dict(max_path_length=100,
                           discount=0.99,
                           gae_lambda=0.95,
                           lr_clip_range=0.2,
                           policy_ent_coeff=0.0)
        algo = TRPO(env_spec=spec,
                    policy=policy,
                    baseline=baseline,
                    **hyperparams)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
def test_get_action_state_include_action(self, obs_dim, action_dim,
                                         hidden_dim):
    """Check sampled actions when the GRU state includes the action.

    Args:
        obs_dim: Observation dimension passed to DummyBoxEnv.
        action_dim: Action dimension passed to DummyBoxEnv; its product
            is added to the placeholder's last dimension.
        hidden_dim: Forwarded to GaussianGRUPolicy as ``hidden_dim``.

    """
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    # Input width is flat observation size plus flat action size.
    input_dim = env.observation_space.flat_dim + np.prod(action_dim)
    obs_var = tf.compat.v1.placeholder(tf.float32,
                                       shape=[None, None, input_dim],
                                       name='obs')
    policy = GaussianGRUPolicy(env_spec=env.spec,
                               hidden_dim=hidden_dim,
                               state_include_action=True)
    policy.build(obs_var)

    policy.reset()
    obs = env.reset()
    single_action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(single_action)

    policy.reset()
    batch_actions, _ = policy.get_actions([obs.flatten()])
    for sampled in batch_actions:
        assert env.action_space.contains(sampled)
def test_maml_trpo_pendulum():
    """Test MAML-TRPO on the HalfCheetahDir environment.

    Trains for five epochs and asserts that the value returned by
    ``runner.train`` is greater than -5. Despite the function name, no
    pendulum environment is involved.
    """
    env = MetaRLEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.))
    # Close the environment even when training or the assertion fails.
    try:
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None,
        )

        value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                                  hidden_sizes=(32, 32))

        rollouts_per_task = 5
        max_path_length = 100

        runner = LocalRunner(snapshot_config)
        algo = MAMLTRPO(env=env,
                        policy=policy,
                        value_function=value_function,
                        max_path_length=max_path_length,
                        meta_batch_size=5,
                        discount=0.99,
                        gae_lambda=1.,
                        inner_lr=0.1,
                        num_grad_updates=1)

        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=5,
                                    batch_size=rollouts_per_task *
                                    max_path_length)

        assert last_avg_ret > -5
    finally:
        env.close()
def test_trpo_cnn_cubecrash(self):
    """Train TRPO with a CNN policy and CNN baseline on CubeCrash-v0."""
    # Network settings shared by the policy and the baseline regressor.
    cnn_kwargs = dict(filters=((32, (8, 8)), (64, (4, 4))),
                      strides=(4, 2),
                      padding='VALID',
                      hidden_sizes=(32, 32))

    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = MetaRLEnv(normalize(gym.make('CubeCrash-v0')))
        spec = env.spec

        policy = CategoricalCNNPolicy(env_spec=spec, **cnn_kwargs)

        baseline = GaussianCNNBaseline(
            env_spec=spec,
            regressor_args=dict(cnn_kwargs, use_trust_region=True))

        algo = TRPO(env_spec=spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.98,
                    max_kl_step=0.01,
                    policy_ent_coeff=0.0,
                    flatten_input=False)

        runner.setup(algo, env)
        final_return = runner.train(n_epochs=10, batch_size=2048)
        assert final_return > -1.5

        env.close()
def setup_method(self):
    """Build a 4x4 grid world, a scripted policy and a mocked algorithm."""
    scripted_actions = [2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1]

    self.env = MetaRLEnv(GridWorldEnv(desc='4x4'))
    self.policy = ScriptedPolicy(scripted_actions=scripted_actions)
    # The algorithm is a Mock exposing only these three attributes.
    self.algo = Mock(env_spec=self.env.spec,
                     policy=self.policy,
                     max_path_length=16)
def test_is_pickleable(self, obs_dim, action_dim):
    """Test that CategoricalMLPPolicy survives a pickle round trip.

    Args:
        obs_dim: Observation dimension passed to DummyDiscreteEnv.
        action_dim: Action dimension passed to DummyDiscreteEnv.

    """
    env = MetaRLEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))

    def make_obs_placeholder():
        """Create the 'obs' placeholder in the current default graph."""
        return tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, env.observation_space.flat_dim],
            name='obs')

    policy = CategoricalMLPPolicy(env_spec=env.spec)
    policy.build(make_obs_placeholder())

    obs = env.reset()
    feed_obs = [[obs.flatten()]]

    with tf.compat.v1.variable_scope(
            'CategoricalMLPPolicy/CategoricalMLPModel', reuse=True):
        bias = tf.compat.v1.get_variable('mlp/hidden_0/bias')
    # Overwrite the bias with ones so the comparison is non-trivial.
    bias.load(tf.ones_like(bias).eval())

    before = self.sess.run([policy.distribution.probs],
                           feed_dict={policy.model.input: feed_obs})

    serialized = pickle.dumps(policy)
    # Restore and rebuild the policy inside a brand-new graph.
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        restored = pickle.loads(serialized)
        restored.build(make_obs_placeholder())
        after = sess.run([restored.distribution.probs],
                         feed_dict={restored.model.input: feed_obs})
        assert np.array_equal(before, after)
def test_ppo_pendulum_recurrent_continuous_baseline(self):
    """Test PPO with an LSTM policy on InvertedDoublePendulum-v2."""
    ppo_kwargs = dict(
        max_path_length=100,
        discount=0.99,
        gae_lambda=0.95,
        lr_clip_range=0.2,
        optimizer_args=dict(
            batch_size=32,
            max_epochs=10,
        ),
        stop_entropy_gradient=True,
        entropy_method='max',
        policy_ent_coeff=0.02,
        center_adv=False,
    )

    with LocalTFRunner(snapshot_config) as runner:
        env = MetaRLEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        spec = env.spec

        policy = GaussianLSTMPolicy(env_spec=spec)
        baseline = ContinuousMLPBaseline(
            env_spec=spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(env_spec=spec,
                   policy=policy,
                   baseline=baseline,
                   **ppo_kwargs)

        runner.setup(algo, env)
        final_return = runner.train(n_epochs=10, batch_size=2048)
        assert final_return > 100

        env.close()
class TestQfDerivedPolicy(TfGraphTestCase):
    """Tests for DiscreteQfDerivedPolicy backed by a SimpleQFunction."""

    def setup_method(self):
        """Build env, Q-function and policy, then initialize variables."""
        super().setup_method()
        self.env = MetaRLEnv(DummyDiscreteEnv())
        spec = self.env.spec
        self.qf = SimpleQFunction(spec)
        self.policy = DiscreteQfDerivedPolicy(env_spec=spec, qf=self.qf)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        self.env.reset()

    def test_discrete_qf_derived_policy(self):
        """Actions from get_action/get_actions lie in the action space."""
        action_space = self.env.action_space
        obs, _, _, _ = self.env.step(1)

        single_action, _ = self.policy.get_action(obs)
        assert action_space.contains(single_action)

        batch_actions, _ = self.policy.get_actions([obs])
        for chosen in batch_actions:
            assert action_space.contains(chosen)

    def test_is_pickleable(self):
        """The pickled policy picks the same action as the original."""
        with tf.compat.v1.variable_scope('SimpleQFunction/SimpleMLPModel',
                                         reuse=True):
            return_var = tf.compat.v1.get_variable('return_var')
        # Overwrite the variable with ones before pickling.
        return_var.load(tf.ones_like(return_var).eval())

        obs, _, _, _ = self.env.step(1)
        action_before, _ = self.policy.get_action(obs)

        serialized = pickle.dumps(self.policy)
        # Restore inside a brand-new graph and session.
        with tf.compat.v1.Session(graph=tf.Graph()):
            restored = pickle.loads(serialized)
            action_after, _ = restored.get_action(obs)
            assert action_before == action_after
class TestPPO:
    """Test class for PPO."""

    def setup_method(self):
        """Create the environment, policy and value function per test."""
        wrapped = normalize(gym.make('InvertedDoublePendulum-v2'))
        self.env = MetaRLEnv(wrapped)
        spec = self.env.spec
        self.policy = GaussianMLPPolicy(
            env_spec=spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=torch.tanh,
            output_nonlinearity=None,
        )
        self.value_function = GaussianMLPValueFunction(env_spec=spec)

    def teardown_method(self):
        """Close the environment after every test."""
        self.env.close()

    @pytest.mark.mujoco
    def test_ppo_pendulum(self):
        """Train PPO on InvertedDoublePendulum-v2 and check the return."""
        deterministic.set_seed(0)

        hyperparams = dict(max_path_length=100,
                           discount=0.99,
                           gae_lambda=0.97,
                           lr_clip_range=2e-1)

        runner = LocalRunner(snapshot_config)
        algo = PPO(env_spec=self.env.spec,
                   policy=self.policy,
                   value_function=self.value_function,
                   **hyperparams)

        runner.setup(algo, self.env)
        final_return = runner.train(n_epochs=10, batch_size=100)
        assert final_return > 0
def test_is_pickleable(self):
    """Test that CategoricalLSTMPolicy survives a pickle round trip."""
    env = MetaRLEnv(DummyDiscreteEnv(obs_dim=(1, ), action_dim=1))

    def make_obs_placeholder():
        """Create the 'obs' placeholder in the current default graph."""
        return tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, env.observation_space.flat_dim],
            name='obs')

    policy = CategoricalLSTMPolicy(env_spec=env.spec,
                                   state_include_action=False)
    policy.build(make_obs_placeholder())

    policy.reset()
    obs = env.reset()
    flat_obs = obs.flatten()
    # Two entries, each holding one flattened observation.
    feed_obs = [[flat_obs], [flat_obs]]

    # Overwrite the first LSTM weight with ones before pickling.
    first_weight = policy.model._lstm_cell.weights[0]
    first_weight.load(tf.ones_like(first_weight).eval())

    before = self.sess.run([policy.distribution.probs],
                           feed_dict={policy.model.input: feed_obs})

    serialized = pickle.dumps(policy)
    # Restore and rebuild the policy inside a brand-new graph.
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        restored = pickle.loads(serialized)
        restored.build(make_obs_placeholder())
        after = sess.run([restored.distribution.probs],
                         feed_dict={restored.model.input: feed_obs})
        assert np.array_equal(before, after)
def test_is_pickleable(self, obs_dim, action_dim):
    """Test that ContinuousMLPQFunction survives a pickle round trip.

    Args:
        obs_dim: Observation dimension passed to DummyBoxEnv.
        action_dim: Action dimension passed to DummyBoxEnv; also the
            shape of the constant 0.5 action.

    """
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    merge_target = ('metarl.tf.q_functions.'
                    'continuous_mlp_q_function.MLPMergeModel')
    with mock.patch(merge_target, new=SimpleMLPMergeModel):
        qf = ContinuousMLPQFunction(env_spec=env.spec)

    env.reset()
    raw_obs, _, _, _ = env.step(1)
    obs = raw_obs.flatten()
    act = np.full(action_dim, 0.5).flatten()

    with tf.compat.v1.variable_scope(
            'ContinuousMLPQFunction/SimpleMLPMergeModel', reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # Overwrite the variable with ones before pickling.
    return_var.load(tf.ones_like(return_var).eval())

    before = qf.get_qval([obs], [act])

    serialized = pickle.dumps(qf)
    # Restore inside a brand-new graph and session.
    with tf.compat.v1.Session(graph=tf.Graph()):
        restored = pickle.loads(serialized)
        after = restored.get_qval([obs], [act])

    assert np.array_equal(before, after)
def test_get_action(self, obs_dim, task_num, latent_dim, action_dim):
    """Check every action getter of GaussianMLPTaskEmbeddingPolicy.

    Args:
        obs_dim: Observation dimension passed to DummyBoxEnv.
        task_num: Length of the one-hot task vector and of the encoder
            input space.
        latent_dim: Length of the latent vector and of the encoder
            output space.
        action_dim: Action dimension passed to DummyBoxEnv.

    """
    env = MetaRLEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    task_space = akro.Box(low=np.zeros(task_num), high=np.ones(task_num))
    latent_space = akro.Box(low=np.zeros(latent_dim),
                            high=np.ones(latent_dim))
    embedding_spec = InOutSpec(input_space=task_space,
                               output_space=latent_space)
    encoder = GaussianMLPEncoder(embedding_spec)
    policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                            encoder=encoder)

    env.reset()
    obs, _, _, _ = env.step(1)
    latent = np.random.random((latent_dim, ))
    # One-hot vector selecting the first task.
    task = np.zeros(task_num)
    task[0] = 1
    aug_obs = np.concatenate([obs.flatten(), task])

    from_latent, _ = policy.get_action_given_latent(obs, latent)
    from_task, _ = policy.get_action_given_task(obs, task)
    from_aug_obs, _ = policy.get_action(aug_obs)

    action_space = env.action_space
    assert action_space.contains(from_latent)
    assert action_space.contains(from_task)
    assert action_space.contains(from_aug_obs)

    # Batched variants, each fed three copies of the same input.
    batch = 3
    obses = [obs] * batch
    latents = [latent] * batch
    tasks = [task] * batch
    aug_obses = [np.concatenate([obs.flatten(), task])] * batch

    batch_from_latent, _ = policy.get_actions_given_latents(obses, latents)
    batch_from_task, _ = policy.get_actions_given_tasks(obses, tasks)
    batch_from_aug_obs, _ = policy.get_actions(aug_obses)

    for sampled in chain(batch_from_latent, batch_from_task,
                         batch_from_aug_obs):
        assert action_space.contains(sampled)
def maml_trpo_metaworld_ml10(ctxt, seed, epochs, rollouts_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)

    def wrap(raw_env):
        """Normalize a raw environment and wrap it in MetaRLEnv."""
        return MetaRLEnv(normalize(raw_env, expected_action_scale=10.))

    env = wrap(mwb.ML10.get_train_tasks())
    spec = env.spec

    policy = GaussianMLPPolicy(
        env_spec=spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_path_length = 100

    # One wrapped environment per ML10 test task, sampled from a pool.
    test_task_names = mwb.ML10.get_test_tasks().all_task_names
    test_tasks = [wrap(mwb.ML10.from_task(name)) for name in test_task_names]
    test_sampler = EnvPoolSampler(test_tasks)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=len(test_task_names))

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_path_length=max_path_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    steps_per_epoch = rollouts_per_task * max_path_length
    runner.train(n_epochs=epochs, batch_size=steps_per_epoch)
def test_update_envs_env_update():
    """Check that ``env_update`` gives workers different tasks.

    Samples with one task per worker and asserts that mean rewards and
    goals vary across trajectories, then asserts that passing one more
    task than there are workers raises ValueError.
    """
    max_path_length = 16
    n_workers = 8
    env = MetaRLEnv(PointEnv())
    # Close the environment even when an assertion fails.
    try:
        policy = FixedPolicy(env.spec,
                             scripted_actions=[
                                 env.action_space.sample()
                                 for _ in range(max_path_length)
                             ])
        tasks = SetTaskSampler(PointEnv)
        workers = WorkerFactory(seed=100,
                                max_path_length=max_path_length,
                                n_workers=n_workers)
        sampler = MultiprocessingSampler.from_worker_factory(
            workers, policy, env)
        # Shut the workers down even when an assertion fails, so no
        # sampler processes outlive the test.
        try:
            rollouts = sampler.obtain_samples(
                0,
                161,
                np.asarray(policy.get_param_values()),
                env_update=tasks.sample(n_workers))
            trajectories = list(rollouts.split())
            mean_rewards = [traj.rewards.mean() for traj in trajectories]
            goals = [
                traj.env_infos['task'][0]['goal'] for traj in trajectories
            ]
            assert np.var(mean_rewards) > 0
            assert np.var(goals) > 0
            # One update too many for the number of workers.
            with pytest.raises(ValueError):
                sampler.obtain_samples(
                    0,
                    10,
                    np.asarray(policy.get_param_values()),
                    env_update=tasks.sample(n_workers + 1))
        finally:
            sampler.shutdown_worker()
    finally:
        env.close()
def test_init_with_crashed_worker():
    """Check that sampling still succeeds when one worker's policy crashes.

    One of the two policies raises from ``reset``; the sampler is still
    expected to return at least 160 steps in total.
    """
    max_path_length = 16
    n_workers = 2

    class CrashingPolicy:
        """Policy stand-in whose ``reset`` always raises."""

        def reset(self, **kwargs):
            raise Exception('Intentional subprocess crash')

    env = MetaRLEnv(PointEnv())
    # Close the environment even when an assertion fails.
    try:
        policy = FixedPolicy(env.spec,
                             scripted_actions=[
                                 env.action_space.sample()
                                 for _ in range(max_path_length)
                             ])
        tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
        workers = WorkerFactory(seed=100,
                                max_path_length=max_path_length,
                                n_workers=n_workers)

        bad_policy = CrashingPolicy()

        # This causes worker 2 to crash.
        sampler = MultiprocessingSampler.from_worker_factory(
            workers, [policy, bad_policy], envs=tasks.sample(n_workers))
        # Shut the workers down even when an assertion fails, so no
        # sampler processes outlive the test.
        try:
            rollouts = sampler.obtain_samples(0, 160, None)
            assert sum(rollouts.lengths) >= 160
        finally:
            sampler.shutdown_worker()
    finally:
        env.close()
def test_obtain_exact_trajectories():
    """Check ``obtain_exact_trajectories`` with one fixed policy per worker.

    Each worker's policy repeats a single sampled action. The test
    asserts that the requested number of trajectories comes back, and
    that trajectories are grouped by worker in order, with
    ``n_traj_per_worker`` consecutive trajectories per worker.
    """
    max_path_length = 15
    n_workers = 8
    n_traj_per_worker = 3
    env = MetaRLEnv(PointEnv())
    # Close the environment even when an assertion fails.
    try:
        per_worker_actions = [
            env.action_space.sample() for _ in range(n_workers)
        ]
        policies = [
            FixedPolicy(env.spec, [action] * max_path_length)
            for action in per_worker_actions
        ]
        workers = WorkerFactory(seed=100,
                                max_path_length=max_path_length,
                                n_workers=n_workers)
        sampler = MultiprocessingSampler.from_worker_factory(workers,
                                                             policies,
                                                             envs=env)
        # Shut the workers down even when an assertion fails, so no
        # sampler processes outlive the test.
        try:
            rollouts = sampler.obtain_exact_trajectories(
                n_traj_per_worker, agent_update=policies)
            # At least one action per trajectory.
            assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
            # All of the trajectories.
            assert len(rollouts.lengths) == n_workers * n_traj_per_worker
            for count, rollout in enumerate(rollouts.split()):
                # Every block of n_traj_per_worker consecutive
                # trajectories belongs to the same worker.
                worker = count // n_traj_per_worker
                assert (rollout.actions == per_worker_actions[worker]).all()
        finally:
            sampler.shutdown_worker()
    finally:
        env.close()
def test_ddpg_double_pendulum(self):
    """Test DDPG on the InvertedDoublePendulum-v2 environment."""
    hidden_sizes = [64, 64]

    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = MetaRLEnv(gym.make('InvertedDoublePendulum-v2'))
        spec = env.spec

        policy = ContinuousMLPPolicy(env_spec=spec,
                                     hidden_sizes=list(hidden_sizes),
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        # Exploration wraps the policy with Ornstein-Uhlenbeck noise.
        exploration_policy = AddOrnsteinUhlenbeckNoise(spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(env_spec=spec,
                                    hidden_sizes=list(hidden_sizes),
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e5))

        ddpg_kwargs = dict(
            policy_lr=1e-4,
            qf_lr=1e-3,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(5e3),
        )
        algo = DDPG(env_spec=spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    exploration_policy=exploration_policy,
                    **ddpg_kwargs)

        runner.setup(algo, env)
        final_return = runner.train(n_epochs=10, batch_size=100)
        assert final_return > 60

        env.close()
def test_pickle_meta_evaluator():
    """Check that a MetaEvaluator can still evaluate after cloudpickle.

    Evaluates once, round-trips the evaluator through cloudpickle, and
    evaluates again with the restored copy.
    """
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10,
                                  n_exploration_traj=n_traj)
        policy = RandomPolicy(env.spec.action_space)
        algo = MockAlgo(env, policy, max_path_length, n_traj, meta_eval)
        runner.setup(algo, env)
        # The context manager closes the temporary log file on exit.
        with tempfile.NamedTemporaryFile() as log_file:
            csv_output = CsvOutput(log_file.name)
            logger.add_output(csv_output)
            # The logger is global: always detach the CSV output so it
            # does not leak into later tests.
            try:
                meta_eval.evaluate(algo)
                meta_eval_pickle = cloudpickle.dumps(meta_eval)
                meta_eval2 = cloudpickle.loads(meta_eval_pickle)
                meta_eval2.evaluate(algo)
            finally:
                logger.remove_output_type(CsvOutput)
def test_task_embedding_worker(self):
    """Check the infos recorded by a TaskEmbeddingWorker rollout.

    The policy is a Mock returning fixed latent and action values. The
    rollout must expose the task one-hot in ``env_infos`` and the latent
    and its mean in ``agent_infos``.
    """
    env = MetaRLEnv(DummyBoxEnv(obs_dim=(1, )))
    env.active_task_one_hot = np.array([1., 0., 0., 0.])
    env._active_task_one_hot = lambda: np.array([1., 0., 0., 0.])

    # Fixed values the mocked policy hands back to the worker.
    action = np.random.random(env.action_space.shape)
    latent = np.random.random(5)
    latent_info = dict(mean=np.random.random(5))
    agent_info = dict(dummy='dummy')

    policy = Mock()
    policy.get_latent.return_value = (latent, latent_info)
    policy.latent_space.flatten.return_value = latent
    policy.get_action_given_latent.return_value = (action, agent_info)

    worker = TaskEmbeddingWorker(seed=1,
                                 max_path_length=100,
                                 worker_number=1)
    worker.update_agent(policy)
    worker.update_env(env)

    rollouts = worker.rollout()
    env_infos = rollouts.env_infos
    agent_infos = rollouts.agent_infos

    assert 'task_onehot' in env_infos
    assert np.array_equal(env_infos['task_onehot'][0],
                          env.active_task_one_hot)
    assert 'latent' in agent_infos
    assert np.array_equal(agent_infos['latent'][0], latent)
    assert 'latent_mean' in agent_infos
    assert np.array_equal(agent_infos['latent_mean'][0],
                          latent_info['mean'])
def test_meta_evaluator_with_tf():
    """Check MetaEvaluator with a TF algorithm across a pickle round trip.

    Evaluates once inside a LocalTFRunner, pickles the algorithm with
    cloudpickle, resets the default graph, then restores and trains the
    algorithm in a second runner.
    """
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        ctxt = SnapshotConfig(snapshot_dir=log_dir_name,
                              snapshot_mode='none',
                              snapshot_gap=1)
        # The logger is global: always detach the CSV output so it does
        # not leak into later tests.
        try:
            with LocalTFRunner(ctxt) as runner:
                meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                          max_path_length=max_path_length,
                                          n_test_tasks=10,
                                          n_exploration_traj=n_traj)
                policy = GaussianMLPPolicy(env.spec)
                algo = MockTFAlgo(env, policy, max_path_length, n_traj,
                                  meta_eval)
                runner.setup(algo, env)
                # The context manager closes the temporary log file on
                # exit.
                with tempfile.NamedTemporaryFile() as log_file:
                    csv_output = CsvOutput(log_file.name)
                    logger.add_output(csv_output)
                    meta_eval.evaluate(algo)
                algo_pickle = cloudpickle.dumps(algo)
            # Start from an empty graph before restoring the algorithm.
            tf.compat.v1.reset_default_graph()
            with LocalTFRunner(ctxt) as runner:
                algo2 = cloudpickle.loads(algo_pickle)
                runner.setup(algo2, env)
                runner.train(10, 0)
        finally:
            logger.remove_output_type(CsvOutput)
def test_meta_evaluator():
    """Check the CSV rows that MetaEvaluator logs over two evaluations."""
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    with tempfile.TemporaryDirectory() as log_dir_name:
        snapshot_cfg = SnapshotConfig(snapshot_dir=log_dir_name,
                                      snapshot_mode='last',
                                      snapshot_gap=1)
        runner = LocalRunner(snapshot_cfg)
        env = MetaRLEnv(PointEnv())
        algo = OptimalActionInference(env=env,
                                      max_path_length=max_path_length)
        runner.setup(algo, env)
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10)
        log_file = tempfile.NamedTemporaryFile()
        logger.add_output(CsvOutput(log_file.name))

        # Two evaluations, each followed by a tabular log.
        for _ in range(2):
            meta_eval.evaluate(algo)
            logger.log(tabular)

        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)

        with open(log_file.name, 'r') as file:
            rows = list(csv.DictReader(file))

        def metric(row_index, key):
            """Return one CSV cell of ``rows`` parsed as float."""
            return float(rows[row_index][key])

        assert len(rows) == 2
        assert metric(0, 'MetaTest/__unnamed_task__/CompletionRate') < 1.0
        assert metric(0, 'MetaTest/__unnamed_task__/Iteration') == 0
        # First row must satisfy max >= average >= min.
        assert (metric(0, 'MetaTest/__unnamed_task__/MaxReturn') >= metric(
            0, 'MetaTest/__unnamed_task__/AverageReturn'))
        assert (metric(0, 'MetaTest/__unnamed_task__/AverageReturn') >=
                metric(0, 'MetaTest/__unnamed_task__/MinReturn'))
        assert metric(1, 'MetaTest/__unnamed_task__/Iteration') == 1
def test_time_limit_env(self):
    """Check the time-limit flags in the info after 200 random steps.

    After the final step ``done`` must be false while both
    ``TimeLimit.truncated`` and ``MetaRLEnv.TimeLimitTerminated`` are
    truthy in the returned info.
    """
    env = MetaRLEnv(env_name='Pendulum-v0')
    env.reset()

    n_steps = 200
    for _ in range(n_steps):
        random_action = env.spec.action_space.sample()
        _, _, done, info = env.step(random_action)

    # Only the values from the final step are checked.
    assert not done and info['TimeLimit.truncated']
    assert info['MetaRLEnv.TimeLimitTerminated']
def setup_method(self):
    """Build env, Q-function and policy, then initialize variables."""
    super().setup_method()
    self.env = MetaRLEnv(DummyDiscreteEnv())
    spec = self.env.spec
    self.qf = SimpleQFunction(spec)
    self.policy = DiscreteQfDerivedPolicy(env_spec=spec, qf=self.qf)
    # Initialize all TF variables, then reset the env for the tests.
    init_op = tf.compat.v1.global_variables_initializer()
    self.sess.run(init_op)
    self.env.reset()
def setup_method(self):
    """Create the environment, policy and value function per test."""
    wrapped = normalize(gym.make('InvertedDoublePendulum-v2'))
    self.env = MetaRLEnv(wrapped)
    spec = self.env.spec
    self.policy = GaussianMLPPolicy(
        env_spec=spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )
    self.value_function = GaussianMLPValueFunction(env_spec=spec)
def maml_vpg_half_cheetah_dir(ctxt, seed, epochs, rollouts_per_task,
                              meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)

    def make_env():
        """Build one normalized HalfCheetahDirEnv wrapped in MetaRLEnv."""
        return MetaRLEnv(
            normalize(HalfCheetahDirEnv(), expected_action_scale=10.))

    env = make_env()
    spec = env.spec

    policy = GaussianMLPPolicy(
        env_spec=spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_path_length = 100

    # Test tasks are drawn from freshly constructed environments.
    task_sampler = SetTaskSampler(make_env)

    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)

    runner = LocalRunner(ctxt)
    algo = MAMLVPG(env=env,
                   policy=policy,
                   value_function=value_function,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    steps_per_epoch = rollouts_per_task * max_path_length
    runner.train(n_epochs=epochs, batch_size=steps_per_epoch)