def test_meta_evaluator_with_tf():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        ctxt = SnapshotConfig(snapshot_dir=log_dir_name,
                              snapshot_mode='none',
                              snapshot_gap=1)
        with LocalTFRunner(ctxt) as runner:
            meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                      max_path_length=max_path_length,
                                      n_test_tasks=10,
                                      n_exploration_traj=n_traj)
            policy = GaussianMLPPolicy(env.spec)
            algo = MockTFAlgo(env, policy, max_path_length, n_traj, meta_eval)
            runner.setup(algo, env)
            log_file = tempfile.NamedTemporaryFile()
            csv_output = CsvOutput(log_file.name)
            logger.add_output(csv_output)
            meta_eval.evaluate(algo)
            algo_pickle = cloudpickle.dumps(algo)
        tf.compat.v1.reset_default_graph()
        with LocalTFRunner(ctxt) as runner:
            algo2 = cloudpickle.loads(algo_pickle)
            runner.setup(algo2, env)
            runner.train(10, 0)
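# Note on the TF test above: the algorithm is pickled while the first
# LocalTFRunner session (and its default graph) is still alive, the graph is
# then reset, and a fresh session unpickles the algorithm and resumes
# training, exercising pickle-based resume of a TF-backed algorithm.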
def test_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        env = MetaRLEnv(PointEnv())
        algo = OptimalActionInference(env=env,
                                      max_path_length=max_path_length)
        runner.setup(algo, env)
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
        with open(log_file.name, 'r') as file:
            rows = list(csv.DictReader(file))
        assert len(rows) == 2
        assert float(rows[0]['MetaTest/__unnamed_task__/CompletionRate']) < 1.0
        assert float(rows[0]['MetaTest/__unnamed_task__/Iteration']) == 0
        assert (float(rows[0]['MetaTest/__unnamed_task__/MaxReturn']) >= float(
            rows[0]['MetaTest/__unnamed_task__/AverageReturn']))
        assert (float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/MinReturn']))
        assert float(rows[1]['MetaTest/__unnamed_task__/Iteration']) == 1
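# Note on the assertions above: MetaEvaluator records tabular entries keyed as
# '<prefix>/<task name>/<metric>'. No prefix was passed, so the default
# 'MetaTest' applies, and PointEnv exposes no task name, so the evaluator
# falls back to '__unnamed_task__'; that is why the CSV columns are addressed
# this way.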
def test_pickle_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10,
                                  n_exploration_traj=n_traj)
        policy = RandomPolicy(env.spec.action_space)
        algo = MockAlgo(env, policy, max_path_length, n_traj, meta_eval)
        runner.setup(algo, env)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        meta_eval_pickle = cloudpickle.dumps(meta_eval)
        meta_eval2 = cloudpickle.loads(meta_eval_pickle)
        meta_eval2.evaluate(algo)
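# The tests above rely on MockAlgo, which is defined elsewhere in the test
# module. A minimal sketch of what MetaEvaluator requires of an algorithm is
# shown below; the attribute and method names are assumptions inferred from
# the calls MetaEvaluator makes (get_exploration_policy / adapt_policy), not
# the repository's actual fixture.
class MockAlgoSketch:
    """Bare-bones meta-RL algorithm sufficient to drive MetaEvaluator."""

    sampler_cls = LocalSampler  # assumed: any single-process sampler works

    def __init__(self, env, policy, max_path_length, n_exploration_traj,
                 meta_eval):
        self.env = env
        self.policy = policy
        self.max_path_length = max_path_length
        self.n_exploration_traj = n_exploration_traj
        self.meta_eval = meta_eval

    def get_exploration_policy(self):
        # Called once per test task to gather exploration trajectories.
        return self.policy

    def adapt_policy(self, exploration_policy, exploration_trajectories):
        # A real algorithm would update the policy from the exploration
        # trajectories here; the mock just checks it received the requested
        # number of trajectories and returns the policy unchanged.
        assert (len(exploration_trajectories.lengths) ==
                self.n_exploration_traj)
        return exploration_policy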
def maml_trpo_metaworld_ml10(ctxt, seed, epochs, rollouts_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = MetaRLEnv(
        normalize(mwb.ML10.get_train_tasks(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_path_length = 100

    test_task_names = mwb.ML10.get_test_tasks().all_task_names
    test_tasks = [
        MetaRLEnv(
            normalize(mwb.ML10.from_task(task), expected_action_scale=10.))
        for task in test_task_names
    ]
    test_sampler = EnvPoolSampler(test_tasks)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=len(test_task_names))

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_path_length=max_path_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
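# Hedged invocation sketch: launchers like the one above are normally
# decorated with @wrap_experiment, which builds the ExperimentContext and
# passes it in as `ctxt`. The import path and hyperparameter values below are
# assumptions for illustration, not taken from the original launcher.
if __name__ == '__main__':
    from metarl import wrap_experiment  # assumed import path

    wrap_experiment(maml_trpo_metaworld_ml10)(seed=1,
                                              epochs=300,
                                              rollouts_per_task=10,
                                              meta_batch_size=20)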
def test_ppo_pendulum(self):
    """Test MAML-VPG with the HalfCheetahDir environment."""
    deterministic.set_seed(0)
    rollouts_per_task = 5
    max_path_length = 100

    task_sampler = SetTaskSampler(lambda: MetaRLEnv(
        normalize(HalfCheetahDirEnv(), expected_action_scale=10.)))
    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)

    runner = LocalRunner(snapshot_config)
    algo = MAMLVPG(env=self.env,
                   policy=self.policy,
                   value_function=self.value_function,
                   max_path_length=max_path_length,
                   meta_batch_size=5,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)

    runner.setup(algo, self.env)
    last_avg_ret = runner.train(n_epochs=10,
                                batch_size=rollouts_per_task *
                                max_path_length)

    assert last_avg_ret > -5
def test_one_folder(self, meta_train_dir, itrs):
    snapshot_config = SnapshotConfig(snapshot_dir=meta_train_dir,
                                     snapshot_mode='all',
                                     snapshot_gap=1)
    runner = LocalRunner(snapshot_config=snapshot_config)
    meta_sampler = AllSetTaskSampler(self.meta_task_cls)
    runner.restore(meta_train_dir)

    meta_evaluator = MetaEvaluator(
        runner,
        test_task_sampler=meta_sampler,
        max_path_length=self.max_path_length,
        n_test_tasks=meta_sampler.n_tasks,
        n_exploration_traj=self.adapt_rollout_per_task,
        prefix='')

    for itr in itrs:
        log_filename = os.path.join(meta_train_dir,
                                    'meta-test-itr_{}.csv'.format(itr))
        logger.add_output(CsvOutput(log_filename))
        logger.log('Writing into {}'.format(log_filename))

        runner.restore(meta_train_dir, from_epoch=itr)
        meta_evaluator.evaluate(runner._algo, self.test_rollout_per_task)

        tabular.record('Iteration', runner._stats.total_epoch)
        tabular.record('TotalEnvSteps', runner._stats.total_env_steps)
        logger.log(tabular)

        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
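# The driver above restores each stored training snapshot in turn
# (runner.restore(..., from_epoch=itr)) and writes one meta-test CSV per
# stored iteration, so meta-test learning curves can be produced offline
# after training has finished.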
def maml_vpg_half_cheetah_dir(ctxt, seed, epochs, rollouts_per_task,
                              meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task
            for training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = MetaRLEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_path_length = 100

    task_sampler = SetTaskSampler(lambda: MetaRLEnv(
        normalize(HalfCheetahDirEnv(), expected_action_scale=10.)))

    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)

    runner = LocalRunner(ctxt)
    algo = MAMLVPG(env=env,
                   policy=policy,
                   value_function=value_function,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
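# Note: `batch_size` for runner.train is counted in environment steps, so
# rollouts_per_task * max_path_length requests rollouts_per_task full-length
# trajectories per task in the meta batch each epoch (the ML10 launcher above
# uses the same convention).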
def __init__(self,
             env,
             inner_policy,
             qf,
             vf,
             num_train_tasks,
             num_test_tasks,
             latent_dim,
             encoder_hidden_sizes,
             test_env_sampler,
             policy_class=ContextConditionedPolicy,
             encoder_class=MLPEncoder,
             policy_lr=3E-4,
             qf_lr=3E-4,
             vf_lr=3E-4,
             context_lr=3E-4,
             policy_mean_reg_coeff=1E-3,
             policy_std_reg_coeff=1E-3,
             policy_pre_activation_coeff=0.,
             soft_target_tau=0.005,
             kl_lambda=.1,
             optimizer_class=torch.optim.Adam,
             use_information_bottleneck=True,
             use_next_obs_in_context=False,
             meta_batch_size=64,
             num_steps_per_epoch=1000,
             num_initial_steps=100,
             num_tasks_sample=100,
             num_steps_prior=100,
             num_steps_posterior=0,
             num_extra_rl_steps_posterior=100,
             batch_size=1024,
             embedding_batch_size=1024,
             embedding_mini_batch_size=1024,
             max_path_length=1000,
             discount=0.99,
             replay_buffer_size=1000000,
             reward_scale=1,
             update_post_train=1):

    self._env = env
    self._qf1 = qf
    self._qf2 = copy.deepcopy(qf)
    self._vf = vf
    self._num_train_tasks = num_train_tasks
    self._num_test_tasks = num_test_tasks
    self._latent_dim = latent_dim

    self._policy_mean_reg_coeff = policy_mean_reg_coeff
    self._policy_std_reg_coeff = policy_std_reg_coeff
    self._policy_pre_activation_coeff = policy_pre_activation_coeff
    self._soft_target_tau = soft_target_tau
    self._kl_lambda = kl_lambda
    self._use_information_bottleneck = use_information_bottleneck
    self._use_next_obs_in_context = use_next_obs_in_context

    self._meta_batch_size = meta_batch_size
    self._num_steps_per_epoch = num_steps_per_epoch
    self._num_initial_steps = num_initial_steps
    self._num_tasks_sample = num_tasks_sample
    self._num_steps_prior = num_steps_prior
    self._num_steps_posterior = num_steps_posterior
    self._num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
    self._batch_size = batch_size
    self._embedding_batch_size = embedding_batch_size
    self._embedding_mini_batch_size = embedding_mini_batch_size
    self.max_path_length = max_path_length
    self._discount = discount
    self._replay_buffer_size = replay_buffer_size
    self._reward_scale = reward_scale
    self._update_post_train = update_post_train
    self._task_idx = None
    self._is_resuming = False

    worker_args = dict(deterministic=True, accum_context=True)
    self._evaluator = MetaEvaluator(test_task_sampler=test_env_sampler,
                                    max_path_length=max_path_length,
                                    worker_class=PEARLWorker,
                                    worker_args=worker_args,
                                    n_test_tasks=num_test_tasks)

    encoder_spec = self.get_env_spec(env[0](), latent_dim, 'encoder')
    encoder_in_dim = int(np.prod(encoder_spec.input_space.shape))
    encoder_out_dim = int(np.prod(encoder_spec.output_space.shape))
    context_encoder = encoder_class(input_dim=encoder_in_dim,
                                    output_dim=encoder_out_dim,
                                    hidden_sizes=encoder_hidden_sizes)

    self._policy = policy_class(
        latent_dim=latent_dim,
        context_encoder=context_encoder,
        policy=inner_policy,
        use_information_bottleneck=use_information_bottleneck,
        use_next_obs=use_next_obs_in_context)

    # buffer for training RL update
    self._replay_buffers = {
        i: PathBuffer(replay_buffer_size)
        for i in range(num_train_tasks)
    }

    self._context_replay_buffers = {
        i: PathBuffer(replay_buffer_size)
        for i in range(num_train_tasks)
    }

    self.target_vf = copy.deepcopy(self._vf)
    self.vf_criterion = torch.nn.MSELoss()

    self._policy_optimizer = optimizer_class(
        self._policy.networks[1].parameters(),
        lr=policy_lr,
    )
    self.qf1_optimizer = optimizer_class(
        self._qf1.parameters(),
        lr=qf_lr,
    )
    self.qf2_optimizer = optimizer_class(
        self._qf2.parameters(),
        lr=qf_lr,
    )
    self.vf_optimizer = optimizer_class(
        self._vf.parameters(),
        lr=vf_lr,
    )
    self.context_optimizer = optimizer_class(
        self._policy.networks[0].parameters(),
        lr=context_lr,
    )
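# Hedged construction sketch for the algorithm whose __init__ appears above
# (PEARL, judging from PEARLWorker). Only the constructor arguments come from
# the signature above; `augment_env_spec`, the network classes, and all sizes
# are assumptions modeled on garage-style PEARL launchers.
num_train_tasks, num_test_tasks, latent_dim = 10, 5, 5
env_sampler = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
env = env_sampler.sample(num_train_tasks)  # PEARL expects env constructors
test_env_sampler = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))

augmented_spec = PEARL.augment_env_spec(env[0](), latent_dim)  # assumed helper
inner_policy = TanhGaussianMLPPolicy(env_spec=augmented_spec,
                                     hidden_sizes=(200, 200))
qf = ContinuousMLPQFunction(env_spec=augmented_spec, hidden_sizes=(200, 200))
vf_spec = PEARL.get_env_spec(env[0](), latent_dim, 'vf')
vf = ContinuousMLPQFunction(env_spec=vf_spec, hidden_sizes=(200, 200))

pearl = PEARL(env=env,
              inner_policy=inner_policy,
              qf=qf,
              vf=vf,
              num_train_tasks=num_train_tasks,
              num_test_tasks=num_test_tasks,
              latent_dim=latent_dim,
              encoder_hidden_sizes=(200, 200),
              test_env_sampler=test_env_sampler)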