def test_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        env = MetaRLEnv(PointEnv())
        algo = OptimalActionInference(env=env,
                                      max_path_length=max_path_length)
        runner.setup(algo, env)
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
        with open(log_file.name, 'r') as file:
            rows = list(csv.DictReader(file))
        assert len(rows) == 2
        assert float(rows[0]['MetaTest/__unnamed_task__/CompletionRate']) < 1.0
        assert float(rows[0]['MetaTest/__unnamed_task__/Iteration']) == 0
        assert (float(rows[0]['MetaTest/__unnamed_task__/MaxReturn']) >= float(
            rows[0]['MetaTest/__unnamed_task__/AverageReturn']))
        assert (float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/MinReturn']))
        assert float(rows[1]['MetaTest/__unnamed_task__/Iteration']) == 1
def test_pickle_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10,
                                  n_exploration_traj=n_traj)
        policy = RandomPolicy(env.spec.action_space)
        algo = MockAlgo(env, policy, max_path_length, n_traj, meta_eval)
        runner.setup(algo, env)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        meta_eval_pickle = cloudpickle.dumps(meta_eval)
        meta_eval2 = cloudpickle.loads(meta_eval_pickle)
        meta_eval2.evaluate(algo)
def test_meta_evaluator_with_tf():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        ctxt = SnapshotConfig(snapshot_dir=log_dir_name,
                              snapshot_mode='none',
                              snapshot_gap=1)
        with LocalTFRunner(ctxt) as runner:
            meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                      max_path_length=max_path_length,
                                      n_test_tasks=10,
                                      n_exploration_traj=n_traj)
            policy = GaussianMLPPolicy(env.spec)
            algo = MockTFAlgo(env, policy, max_path_length, n_traj, meta_eval)
            runner.setup(algo, env)
            log_file = tempfile.NamedTemporaryFile()
            csv_output = CsvOutput(log_file.name)
            logger.add_output(csv_output)
            meta_eval.evaluate(algo)
            algo_pickle = cloudpickle.dumps(algo)
        tf.compat.v1.reset_default_graph()
        with LocalTFRunner(ctxt) as runner:
            algo2 = cloudpickle.loads(algo_pickle)
            runner.setup(algo2, env)
            runner.train(10, 0)
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env1 = MetaRLEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = MetaRLEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
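In the full example script this launcher is wired up with metarl's experiment decorator before being called; the sketch below shows a minimal invocation, assuming metarl mirrors garage's `wrap_experiment` API (the import path and entry-point boilerplate are assumptions, not part of the snippet above).

# Hypothetical entry point for the launcher above (assumed API); wrap_experiment
# builds the ExperimentContext that is passed to multi_env_trpo as `ctxt`.
from metarl import wrap_experiment  # assumed import path

if __name__ == '__main__':
    wrap_experiment(multi_env_trpo)(seed=1)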
def test_init_with_crashed_worker():
    max_path_length = 16
    env = MetaRLEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    n_workers = 2
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)

    class CrashingPolicy:

        def reset(self, **kwargs):
            raise Exception('Intentional subprocess crash')

    bad_policy = CrashingPolicy()

    # This causes worker 2 to crash.
    sampler = MultiprocessingSampler.from_worker_factory(
        workers, [policy, bad_policy], envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, None)
    assert sum(rollouts.lengths) >= 160
    sampler.shutdown_worker()
    env.close()
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (metarl.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        _ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
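This is the older, pre-`wrap_experiment` launcher style; a minimal sketch of how such a `run_task` is typically handed to the experiment helper, assuming metarl keeps garage's `run_experiment` entry point (the import path and keyword arguments below are assumptions).

# Hypothetical invocation of run_task above (assumed API); run_experiment
# creates the snapshot_config that run_task receives as its first argument.
from metarl.experiment import run_experiment  # assumed import path

run_experiment(
    run_task,
    snapshot_mode='last',  # keep only the most recent snapshot
    seed=1,
)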
def test_observation_dimension_with_max_obs_dim(self):
    env = PointEnv()
    wrapped_env = RL2Env(PointEnv(), max_obs_dim=10)
    assert wrapped_env.spec.observation_space.shape[0] == (
        10 + env.action_space.shape[0] + 2)
    obs = wrapped_env.reset()
    assert 10 + env.action_space.shape[0] + 2 == obs.shape[0]
    obs, _, _, _ = wrapped_env.step(env.action_space.sample())
    assert 10 + env.action_space.shape[0] + 2 == obs.shape[0]
def test_observation_dimension(self):
    env = PointEnv()
    wrapped_env = RL2Env(PointEnv())
    assert wrapped_env.spec.observation_space.shape[0] == (
        env.observation_space.shape[0] + env.action_space.shape[0] + 2)
    obs = env.reset()
    obs2 = wrapped_env.reset()
    assert obs.shape[0] + env.action_space.shape[0] + 2 == obs2.shape[0]
    obs, _, _, _ = env.step(env.action_space.sample())
    obs2, _, _, _ = wrapped_env.step(env.action_space.sample())
    assert obs.shape[0] + env.action_space.shape[0] + 2 == obs2.shape[0]
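The recurring `+ 2` in the two RL2Env tests above accounts for the scalar reward and the terminal flag that the RL²-style wrapper appends to each observation alongside the previous action. A quick sanity check of that arithmetic, assuming that augmentation scheme and the same imports as the tests:

# Assumed RL^2 augmented observation layout: [obs, last_action, reward, done].
env = PointEnv()
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
expected_dim = obs_dim + act_dim + 1 + 1  # +1 reward, +1 terminal flag
assert RL2Env(PointEnv()).spec.observation_space.shape[0] == expected_dim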
def test_wrapped_env_list_produces_correct_onehots():
    envs = [PointEnv(), PointEnv(), PointEnv(), PointEnv()]
    base_len = len(envs[0].reset())
    n_total_tasks = len(envs)
    wrapped = TaskOnehotWrapper.wrap_env_list(envs)
    assert len(wrapped) == n_total_tasks
    for i, env in enumerate(wrapped):
        obs = env.reset()
        assert len(obs) == base_len + n_total_tasks
        onehot = np.zeros(n_total_tasks)
        onehot[i] = 1.
        assert (obs[-n_total_tasks:] == onehot).all()
        next_obs, _, _, _ = env.step(env.action_space.sample())
        assert (next_obs[-n_total_tasks:] == onehot).all()
def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = TfEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                                 agent_update=policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # All of the trajectories.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()
def test_update_envs_env_update():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    rollouts = sampler.obtain_samples(0,
                                      161,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for rollout in rollouts.split():
        mean_rewards.append(rollout.rewards.mean())
        goals.append(rollout.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
def test_pickle():
    max_path_length = 16
    env = MetaRLEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers, policy, env)
    sampler_pickled = pickle.dumps(sampler)
    sampler.shutdown_worker()
    sampler2 = pickle.loads(sampler_pickled)
    rollouts = sampler2.obtain_samples(0,
                                       161,
                                       np.asarray(policy.get_param_values()),
                                       env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for rollout in rollouts.split():
        mean_rewards.append(rollout.rewards.mean())
        goals.append(rollout.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    sampler2.shutdown_worker()
    env.close()
def test_init_with_env_updates():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: TfEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers,
                                               policy,
                                               envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
def test_does_not_modify_action(self):
    inner_env = PointEnv(goal=(1., 2.))
    env = NormalizedEnv(inner_env, scale_reward=10.)
    a = env.action_space.high + 1.
    # Take a real copy so the equality check below actually detects
    # in-place modification of the action.
    a_copy = a.copy()
    env.reset()
    env.step(a)
    assert np.array_equal(a, a_copy)
    env.close()
def test_pickleable(self):
    inner_env = PointEnv(goal=(1., 2.))
    env = NormalizedEnv(inner_env, scale_reward=10.)
    round_trip = pickle.loads(pickle.dumps(env))
    assert round_trip
    assert round_trip._scale_reward == env._scale_reward
    assert np.array_equal(round_trip.env._goal, env.env._goal)
    step_env(round_trip)
    round_trip.close()
    env.close()
def test_init_with_env_updates(ray_local_session_fixture):
    del ray_local_session_fixture
    assert ray.is_initialized()
    max_path_length = 16
    env = MetaRLEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = RaySampler.from_worker_factory(workers,
                                             policy,
                                             envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
class TestSingleWrappedEnv:

    def setup_method(self):
        self.env = PointEnv()
        self.base_len = len(self.env.reset())
        self.n_total_tasks = 5
        self.task_index = 1
        self.wrapped = TaskOnehotWrapper(self.env, self.task_index,
                                         self.n_total_tasks)

    def test_produces_correct_onehots(self):
        obs = self.wrapped.reset()
        assert len(obs) == self.base_len + self.n_total_tasks
        assert (obs[-self.n_total_tasks:] == np.array([0, 1, 0, 0, 0])).all()

    def test_spec_obs_space(self):
        obs = self.wrapped.reset()
        assert self.wrapped.observation_space.contains(obs)
        assert self.wrapped.spec.observation_space.contains(obs)
        assert (self.wrapped.spec.observation_space ==
                self.wrapped.observation_space)
def test_meta_evaluator_n_traj():
    set_seed(100)
    tasks = SetTaskSampler(PointEnv)
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        algo = MockAlgo(env, max_path_length, n_traj)
        runner.setup(algo, env)
        meta_eval = MetaEvaluator(runner,
                                  test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10,
                                  n_exploration_traj=n_traj)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
def setup_method(self):
    super().setup_method()

    def circle(r, n):
        """Generate n points on a circle of radius r.

        Args:
            r (float): Radius of the circle.
            n (int): Number of points to generate.

        Yields:
            tuple(float, float): Coordinate of a point.

        """
        for t in np.arange(0, 2 * np.pi, 2 * np.pi / n):
            yield r * np.sin(t), r * np.cos(t)

    N = 4
    goals = circle(3.0, N)
    tasks = {
        str(i + 1): {
            'args': [],
            'kwargs': {
                'goal': g,
                'never_done': False,
                'done_bonus': 0.0,
            }
        }
        for i, g in enumerate(goals)
    }

    latent_length = 1
    inference_window = 2
    self.batch_size = 100 * len(tasks)
    self.policy_ent_coeff = 2e-2
    self.encoder_ent_coeff = 2.2e-3
    self.inference_ce_coeff = 5e-2
    self.max_path_length = 100
    embedding_init_std = 1.0
    embedding_max_std = 2.0
    embedding_min_std = 0.38
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    task_names = sorted(tasks.keys())
    task_args = [tasks[t]['args'] for t in task_names]
    task_kwargs = [tasks[t]['kwargs'] for t in task_names]

    task_envs = [
        MetaRLEnv(PointEnv(*t_args, **t_kwargs))
        for t_args, t_kwargs in zip(task_args, task_kwargs)
    ]
    self.env = env = MultiEnvWrapper(task_envs,
                                     round_robin_strategy,
                                     mode='vanilla')

    latent_lb = np.zeros(latent_length, )
    latent_ub = np.ones(latent_length, )
    latent_space = akro.Box(latent_lb, latent_ub)

    obs_lb, obs_ub = env.observation_space.bounds
    obs_lb_flat = env.observation_space.flatten(obs_lb)
    obs_ub_flat = env.observation_space.flatten(obs_ub)
    traj_lb = np.stack([obs_lb_flat] * inference_window)
    traj_ub = np.stack([obs_ub_flat] * inference_window)
    traj_space = akro.Box(traj_lb, traj_ub)

    task_embed_spec = InOutSpec(env.task_space, latent_space)
    traj_embed_spec = InOutSpec(traj_space, latent_space)

    self.inference = GaussianMLPEncoder(
        name='inference',
        embedding_spec=traj_embed_spec,
        hidden_sizes=[20, 10],
        std_share_network=True,
        init_std=2.0,
        output_nonlinearity=tf.nn.tanh,
        min_std=embedding_min_std,
    )

    task_encoder = GaussianMLPEncoder(
        name='embedding',
        embedding_spec=task_embed_spec,
        hidden_sizes=[20, 20],
        std_share_network=True,
        init_std=embedding_init_std,
        max_std=embedding_max_std,
        output_nonlinearity=tf.nn.tanh,
        min_std=embedding_min_std,
    )

    self.policy = GaussianMLPTaskEmbeddingPolicy(
        name='policy',
        env_spec=env.spec,
        encoder=task_encoder,
        hidden_sizes=[32, 16],
        std_share_network=True,
        max_std=policy_max_std,
        init_std=policy_init_std,
        min_std=policy_min_std,
    )

    self.baseline = LinearMultiFeatureBaseline(
        env_spec=env.spec, features=['observations', 'tasks', 'latents'])
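As a quick illustration of the nested `circle` helper defined in the setup above, the snippet below recomputes its output standalone; it is only a usage note, not part of the test.

# circle(3.0, 4) yields four (x, y) goals evenly spaced on a radius-3 circle:
# approximately (0, 3), (3, 0), (0, -3) and (-3, 0).
import numpy as np

r, n = 3.0, 4
goals = [(r * np.sin(t), r * np.cos(t))
         for t in np.arange(0, 2 * np.pi, 2 * np.pi / n)]
assert len(goals) == 4
assert np.allclose(goals[0], (0.0, 3.0))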
def te_ppo_pointenv(ctxt, seed, n_epochs, batch_size_per_task):
    """Train Task Embedding PPO with PointEnv.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        n_epochs (int): Total number of epochs for training.
        batch_size_per_task (int): Batch size of samples for each task.

    """
    set_seed(seed)

    tasks = TASKS
    latent_length = 2
    inference_window = 6
    batch_size = batch_size_per_task * len(TASKS)
    policy_ent_coeff = 2e-2
    encoder_ent_coeff = 2.e-4
    inference_ce_coeff = 5e-2
    max_path_length = 100
    embedding_init_std = 0.01
    embedding_max_std = 0.02
    embedding_min_std = 1e-6
    policy_init_std = 1.0
    policy_max_std = None
    policy_min_std = None

    task_names = sorted(tasks.keys())
    task_args = [tasks[t]['args'] for t in task_names]
    task_kwargs = [tasks[t]['kwargs'] for t in task_names]

    with LocalTFRunner(snapshot_config=ctxt) as runner:
        task_envs = [
            MetaRLEnv(PointEnv(*t_args, **t_kwargs))
            for t_args, t_kwargs in zip(task_args, task_kwargs)
        ]
        env = MultiEnvWrapper(task_envs, round_robin_strategy, mode='vanilla')

        task_embed_spec = TEPPO.get_encoder_spec(env.task_space,
                                                 latent_dim=latent_length)

        task_encoder = GaussianMLPEncoder(
            name='embedding',
            embedding_spec=task_embed_spec,
            hidden_sizes=[20, 20],
            std_share_network=True,
            init_std=embedding_init_std,
            max_std=embedding_max_std,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        traj_embed_spec = TEPPO.get_infer_spec(
            env.spec,
            latent_dim=latent_length,
            inference_window_size=inference_window)

        inference = GaussianMLPEncoder(
            name='inference',
            embedding_spec=traj_embed_spec,
            hidden_sizes=[20, 10],
            std_share_network=True,
            init_std=2.0,
            output_nonlinearity=tf.nn.tanh,
            min_std=embedding_min_std,
        )

        policy = GaussianMLPTaskEmbeddingPolicy(
            name='policy',
            env_spec=env.spec,
            encoder=task_encoder,
            hidden_sizes=[32, 16],
            std_share_network=True,
            max_std=policy_max_std,
            init_std=policy_init_std,
            min_std=policy_min_std,
        )

        baseline = LinearMultiFeatureBaseline(
            env_spec=env.spec, features=['observations', 'tasks', 'latents'])

        algo = TEPPO(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     inference=inference,
                     max_path_length=max_path_length,
                     discount=0.99,
                     lr_clip_range=0.2,
                     policy_ent_coeff=policy_ent_coeff,
                     encoder_ent_coeff=encoder_ent_coeff,
                     inference_ce_coeff=inference_ce_coeff,
                     entropy_method='max',
                     use_softplus_entropy=True,
                     optimizer_args=dict(
                         batch_size=32,
                         max_epochs=10,
                         learning_rate=1e-3,
                     ),
                     inference_optimizer_args=dict(
                         batch_size=32,
                         max_epochs=10,
                     ),
                     center_adv=True,
                     stop_entropy_gradient=True,
                     stop_ce_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_cls=LocalSampler,
                     sampler_args=None,
                     worker_class=TaskEmbeddingWorker)
        runner.train(n_epochs=n_epochs, batch_size=batch_size, plot=False)
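The launcher above reads a module-level `TASKS` dict that is not shown in this excerpt. Based on the task dictionary built in the earlier test setup, it presumably has the shape sketched below; the goal values here are illustrative placeholders, not the ones from the actual example script.

# Hypothetical TASKS layout; each entry maps a task name to PointEnv
# constructor arguments ('args' and 'kwargs').
TASKS = {
    '1': {'args': [], 'kwargs': {'goal': (0.0, 3.0)}},
    '2': {'args': [], 'kwargs': {'goal': (3.0, 0.0)}},
    '3': {'args': [], 'kwargs': {'goal': (0.0, -3.0)}},
    '4': {'args': [], 'kwargs': {'goal': (-3.0, 0.0)}},
}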