def test_meta_evaluator_with_tf():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        ctxt = SnapshotConfig(snapshot_dir=log_dir_name,
                              snapshot_mode='none',
                              snapshot_gap=1)
        with LocalTFRunner(ctxt) as runner:
            meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                      max_path_length=max_path_length,
                                      n_test_tasks=10,
                                      n_exploration_traj=n_traj)
            policy = GaussianMLPPolicy(env.spec)
            algo = MockTFAlgo(env, policy, max_path_length, n_traj,
                              meta_eval)
            runner.setup(algo, env)
            log_file = tempfile.NamedTemporaryFile()
            csv_output = CsvOutput(log_file.name)
            logger.add_output(csv_output)
            meta_eval.evaluate(algo)
            algo_pickle = cloudpickle.dumps(algo)
        tf.compat.v1.reset_default_graph()
        with LocalTFRunner(ctxt) as runner:
            algo2 = cloudpickle.loads(algo_pickle)
            runner.setup(algo2, env)
            runner.train(10, 0)
def test_pickle_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    env = MetaRLEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10,
                                  n_exploration_traj=n_traj)
        policy = RandomPolicy(env.spec.action_space)
        algo = MockAlgo(env, policy, max_path_length, n_traj, meta_eval)
        runner.setup(algo, env)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        meta_eval_pickle = cloudpickle.dumps(meta_eval)
        meta_eval2 = cloudpickle.loads(meta_eval_pickle)
        meta_eval2.evaluate(algo)
def test_one_folder(self, meta_train_dir, itrs):
    snapshot_config = SnapshotConfig(snapshot_dir=meta_train_dir,
                                     snapshot_mode='all',
                                     snapshot_gap=1)
    runner = LocalRunner(snapshot_config=snapshot_config)
    meta_sampler = AllSetTaskSampler(self.meta_task_cls)
    runner.restore(meta_train_dir)
    meta_evaluator = MetaEvaluator(
        runner,
        test_task_sampler=meta_sampler,
        max_path_length=self.max_path_length,
        n_test_tasks=meta_sampler.n_tasks,
        n_exploration_traj=self.adapt_rollout_per_task,
        prefix='')
    for itr in itrs:
        log_filename = os.path.join(meta_train_dir,
                                    'meta-test-itr_{}.csv'.format(itr))
        logger.add_output(CsvOutput(log_filename))
        logger.log('Writing into {}'.format(log_filename))
        runner.restore(meta_train_dir, from_epoch=itr)
        meta_evaluator.evaluate(runner._algo, self.test_rollout_per_task)
        tabular.record('Iteration', runner._stats.total_epoch)
        tabular.record('TotalEnvSteps', runner._stats.total_env_steps)
        logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
def maml_trpo_metaworld_ml10(ctxt, seed, epochs, rollouts_per_task,
                             meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task for
            training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = MetaRLEnv(
        normalize(mwb.ML10.get_train_tasks(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(100, 100),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_path_length = 100

    test_task_names = mwb.ML10.get_test_tasks().all_task_names
    test_tasks = [
        MetaRLEnv(
            normalize(mwb.ML10.from_task(task), expected_action_scale=10.))
        for task in test_task_names
    ]
    test_sampler = EnvPoolSampler(test_tasks)

    meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=len(test_task_names))

    runner = LocalRunner(ctxt)
    algo = MAMLTRPO(env=env,
                    policy=policy,
                    value_function=value_function,
                    max_path_length=max_path_length,
                    meta_batch_size=meta_batch_size,
                    discount=0.99,
                    gae_lambda=1.,
                    inner_lr=0.1,
                    num_grad_updates=1,
                    meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
def test_ppo_pendulum(self):
    """Test MAMLVPG with the HalfCheetahDir environment."""
    deterministic.set_seed(0)

    rollouts_per_task = 5
    max_path_length = 100

    task_sampler = SetTaskSampler(lambda: MetaRLEnv(
        normalize(HalfCheetahDirEnv(), expected_action_scale=10.)))
    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)

    runner = LocalRunner(snapshot_config)
    algo = MAMLVPG(env=self.env,
                   policy=self.policy,
                   value_function=self.value_function,
                   max_path_length=max_path_length,
                   meta_batch_size=5,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)

    runner.setup(algo, self.env)
    last_avg_ret = runner.train(n_epochs=10,
                                batch_size=rollouts_per_task *
                                max_path_length)

    assert last_avg_ret > -5
def maml_vpg_half_cheetah_dir(ctxt, seed, epochs, rollouts_per_task,
                              meta_batch_size):
    """Set up environment and algorithm and run the task.

    Args:
        ctxt (metarl.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        epochs (int): Number of training epochs.
        rollouts_per_task (int): Number of rollouts per epoch per task for
            training.
        meta_batch_size (int): Number of tasks sampled per batch.

    """
    set_seed(seed)
    env = MetaRLEnv(normalize(HalfCheetahDirEnv(), expected_action_scale=10.))

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=torch.tanh,
        output_nonlinearity=None,
    )

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    max_path_length = 100

    task_sampler = SetTaskSampler(lambda: MetaRLEnv(
        normalize(HalfCheetahDirEnv(), expected_action_scale=10.)))

    meta_evaluator = MetaEvaluator(test_task_sampler=task_sampler,
                                   max_path_length=max_path_length,
                                   n_test_tasks=1,
                                   n_test_rollouts=10)

    runner = LocalRunner(ctxt)
    algo = MAMLVPG(env=env,
                   policy=policy,
                   value_function=value_function,
                   max_path_length=max_path_length,
                   meta_batch_size=meta_batch_size,
                   discount=0.99,
                   gae_lambda=1.,
                   inner_lr=0.1,
                   num_grad_updates=1,
                   meta_evaluator=meta_evaluator)

    runner.setup(algo, env)
    runner.train(n_epochs=epochs,
                 batch_size=rollouts_per_task * max_path_length)
def test_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: MetaRLEnv(PointEnv()))
    max_path_length = 200
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        env = MetaRLEnv(PointEnv())
        algo = OptimalActionInference(env=env,
                                      max_path_length=max_path_length)
        runner.setup(algo, env)
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
        with open(log_file.name, 'r') as file:
            rows = list(csv.DictReader(file))
        assert len(rows) == 2
        assert float(rows[0]['MetaTest/__unnamed_task__/CompletionRate']) < 1.0
        assert float(rows[0]['MetaTest/__unnamed_task__/Iteration']) == 0
        assert (float(rows[0]['MetaTest/__unnamed_task__/MaxReturn']) >= float(
            rows[0]['MetaTest/__unnamed_task__/AverageReturn']))
        assert (float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/MinReturn']))
        assert float(rows[1]['MetaTest/__unnamed_task__/Iteration']) == 1
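# Hedged sketch (not from the repository): a distilled version of the pattern
# the tests and launchers above exercise. An algorithm holds a MetaEvaluator
# built from a test-task sampler and calls evaluate(self) once per epoch; the
# evaluator then uses the algorithm's get_exploration_policy() and
# adapt_policy() hooks (see the PEARL class below) to run meta-test rollouts
# and record MetaTest/* tabular keys. The class name `MyMetaAlgo` and its
# internals are hypothetical placeholders, not repository code.


class MyMetaAlgo:
    """Skeleton meta-RL algorithm wired to a MetaEvaluator (illustrative)."""

    def __init__(self, policy, max_path_length, meta_evaluator):
        self.policy = policy
        self.max_path_length = max_path_length
        self._meta_evaluator = meta_evaluator

    def train(self, runner):
        """Run one update per epoch, then meta-test via the evaluator."""
        for _ in runner.step_epochs():
            # ... collect samples with runner and update self.policy here ...
            self._meta_evaluator.evaluate(self)  # logs MetaTest/* metrics
            runner.step_itr += 1

    def get_exploration_policy(self):
        # Policy used by the evaluator to gather adaptation data on a task.
        return self.policy

    def adapt_policy(self, exploration_policy, exploration_trajectories):
        # Return a policy adapted to the test task; a no-op in this skeleton.
        return exploration_policy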
class PEARL(MetaRLAlgorithm):
    r"""A PEARL model based on https://arxiv.org/abs/1903.08254.

    PEARL, which stands for Probabilistic Embeddings for Actor-Critic
    Reinforcement Learning, is an off-policy meta-RL algorithm. It is built
    on top of SAC using two Q-functions and a value function with an addition
    of an inference network that estimates the posterior :math:`q(z \| c)`.
    The policy is conditioned on the latent variable Z in order to adapt its
    behavior to specific tasks.

    Args:
        env (list[MetaRLEnv]): Batch of sampled environment updates
            (EnvUpdate), which, when invoked on environments, will configure
            them with new tasks.
        policy_class (metarl.torch.policies.Policy): Context-conditioned
            policy class.
        encoder_class (metarl.torch.embeddings.ContextEncoder): Encoder class
            for the encoder in context-conditioned policy.
        inner_policy (metarl.torch.policies.Policy): Policy.
        qf (torch.nn.Module): Q-function.
        vf (torch.nn.Module): Value function.
        num_train_tasks (int): Number of tasks for training.
        num_test_tasks (int): Number of tasks for testing.
        latent_dim (int): Size of latent context vector.
        encoder_hidden_sizes (list[int]): Output dimension of dense layer(s)
            of the context encoder.
        test_env_sampler (metarl.experiment.SetTaskSampler): Sampler for test
            tasks.
        policy_lr (float): Policy learning rate.
        qf_lr (float): Q-function learning rate.
        vf_lr (float): Value function learning rate.
        context_lr (float): Inference network learning rate.
        policy_mean_reg_coeff (float): Policy mean regulation weight.
        policy_std_reg_coeff (float): Policy std regulation weight.
        policy_pre_activation_coeff (float): Policy pre-activation weight.
        soft_target_tau (float): Interpolation parameter for doing the soft
            target update.
        kl_lambda (float): KL lambda value.
        optimizer_class (callable): Type of optimizer for training networks.
        use_information_bottleneck (bool): False means latent context is
            deterministic.
        use_next_obs_in_context (bool): Whether or not to use next
            observation in distinguishing between tasks.
        meta_batch_size (int): Meta batch size.
        num_steps_per_epoch (int): Number of iterations per epoch.
        num_initial_steps (int): Number of transitions obtained per task
            before training.
        num_tasks_sample (int): Number of random tasks to obtain data for
            each iteration.
        num_steps_prior (int): Number of transitions to obtain per task with
            z ~ prior.
        num_steps_posterior (int): Number of transitions to obtain per task
            with z ~ posterior.
        num_extra_rl_steps_posterior (int): Number of additional transitions
            to obtain per task with z ~ posterior that are only used to train
            the policy and NOT the encoder.
        batch_size (int): Number of transitions in RL batch.
        embedding_batch_size (int): Number of transitions in context batch.
        embedding_mini_batch_size (int): Number of transitions in mini
            context batch; should be same as embedding_batch_size for
            non-recurrent encoder.
        max_path_length (int): Maximum path length.
        discount (float): RL discount factor.
        replay_buffer_size (int): Maximum samples in replay buffer.
        reward_scale (int): Reward scale.
        update_post_train (int): How often to resample context when obtaining
            data during training (in trajectories).

    """

    # pylint: disable=too-many-statements
    def __init__(self,
                 env,
                 inner_policy,
                 qf,
                 vf,
                 num_train_tasks,
                 num_test_tasks,
                 latent_dim,
                 encoder_hidden_sizes,
                 test_env_sampler,
                 policy_class=ContextConditionedPolicy,
                 encoder_class=MLPEncoder,
                 policy_lr=3E-4,
                 qf_lr=3E-4,
                 vf_lr=3E-4,
                 context_lr=3E-4,
                 policy_mean_reg_coeff=1E-3,
                 policy_std_reg_coeff=1E-3,
                 policy_pre_activation_coeff=0.,
                 soft_target_tau=0.005,
                 kl_lambda=.1,
                 optimizer_class=torch.optim.Adam,
                 use_information_bottleneck=True,
                 use_next_obs_in_context=False,
                 meta_batch_size=64,
                 num_steps_per_epoch=1000,
                 num_initial_steps=100,
                 num_tasks_sample=100,
                 num_steps_prior=100,
                 num_steps_posterior=0,
                 num_extra_rl_steps_posterior=100,
                 batch_size=1024,
                 embedding_batch_size=1024,
                 embedding_mini_batch_size=1024,
                 max_path_length=1000,
                 discount=0.99,
                 replay_buffer_size=1000000,
                 reward_scale=1,
                 update_post_train=1):

        self._env = env
        self._qf1 = qf
        self._qf2 = copy.deepcopy(qf)
        self._vf = vf
        self._num_train_tasks = num_train_tasks
        self._num_test_tasks = num_test_tasks
        self._latent_dim = latent_dim

        self._policy_mean_reg_coeff = policy_mean_reg_coeff
        self._policy_std_reg_coeff = policy_std_reg_coeff
        self._policy_pre_activation_coeff = policy_pre_activation_coeff
        self._soft_target_tau = soft_target_tau
        self._kl_lambda = kl_lambda
        self._use_information_bottleneck = use_information_bottleneck
        self._use_next_obs_in_context = use_next_obs_in_context

        self._meta_batch_size = meta_batch_size
        self._num_steps_per_epoch = num_steps_per_epoch
        self._num_initial_steps = num_initial_steps
        self._num_tasks_sample = num_tasks_sample
        self._num_steps_prior = num_steps_prior
        self._num_steps_posterior = num_steps_posterior
        self._num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self._batch_size = batch_size
        self._embedding_batch_size = embedding_batch_size
        self._embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self._discount = discount
        self._replay_buffer_size = replay_buffer_size
        self._reward_scale = reward_scale
        self._update_post_train = update_post_train
        self._task_idx = None
        self._is_resuming = False

        worker_args = dict(deterministic=True, accum_context=True)
        self._evaluator = MetaEvaluator(test_task_sampler=test_env_sampler,
                                        max_path_length=max_path_length,
                                        worker_class=PEARLWorker,
                                        worker_args=worker_args,
                                        n_test_tasks=num_test_tasks)

        encoder_spec = self.get_env_spec(env[0](), latent_dim, 'encoder')
        encoder_in_dim = int(np.prod(encoder_spec.input_space.shape))
        encoder_out_dim = int(np.prod(encoder_spec.output_space.shape))
        context_encoder = encoder_class(input_dim=encoder_in_dim,
                                        output_dim=encoder_out_dim,
                                        hidden_sizes=encoder_hidden_sizes)

        self._policy = policy_class(
            latent_dim=latent_dim,
            context_encoder=context_encoder,
            policy=inner_policy,
            use_information_bottleneck=use_information_bottleneck,
            use_next_obs=use_next_obs_in_context)

        # buffer for training RL update
        self._replay_buffers = {
            i: PathBuffer(replay_buffer_size)
            for i in range(num_train_tasks)
        }

        self._context_replay_buffers = {
            i: PathBuffer(replay_buffer_size)
            for i in range(num_train_tasks)
        }

        self.target_vf = copy.deepcopy(self._vf)
        self.vf_criterion = torch.nn.MSELoss()

        self._policy_optimizer = optimizer_class(
            self._policy.networks[1].parameters(),
            lr=policy_lr,
        )
        self.qf1_optimizer = optimizer_class(
            self._qf1.parameters(),
            lr=qf_lr,
        )
        self.qf2_optimizer = optimizer_class(
            self._qf2.parameters(),
            lr=qf_lr,
        )
        self.vf_optimizer = optimizer_class(
            self._vf.parameters(),
            lr=vf_lr,
        )
        self.context_optimizer = optimizer_class(
            self._policy.networks[0].parameters(),
            lr=context_lr,
        )

    def __getstate__(self):
        """Object.__getstate__.

        Returns:
            dict: the state to be pickled for the instance.

        """
        data = self.__dict__.copy()
        del data['_replay_buffers']
        del data['_context_replay_buffers']
        return data

    def __setstate__(self, state):
        """Object.__setstate__.

        Args:
            state (dict): unpickled state.

        """
        self.__dict__.update(state)
        self._replay_buffers = {
            i: PathBuffer(self._replay_buffer_size)
            for i in range(self._num_train_tasks)
        }
        self._context_replay_buffers = {
            i: PathBuffer(self._replay_buffer_size)
            for i in range(self._num_train_tasks)
        }
        self._is_resuming = True

    def train(self, runner):
        """Obtain samples, train, and evaluate for each epoch.

        Args:
            runner (LocalRunner): LocalRunner is passed to give algorithm
                the access to runner.step_epochs(), which provides services
                such as snapshotting and sampler control.

        """
        for _ in runner.step_epochs():
            epoch = runner.step_itr / self._num_steps_per_epoch

            # obtain initial set of samples from all train tasks
            if epoch == 0 or self._is_resuming:
                for idx in range(self._num_train_tasks):
                    self._task_idx = idx
                    self._obtain_samples(runner, epoch,
                                         self._num_initial_steps, np.inf)
                    self._is_resuming = False

            # obtain samples from random tasks
            for _ in range(self._num_tasks_sample):
                idx = np.random.randint(self._num_train_tasks)
                self._task_idx = idx
                self._context_replay_buffers[idx].clear()
                # obtain samples with z ~ prior
                if self._num_steps_prior > 0:
                    self._obtain_samples(runner, epoch, self._num_steps_prior,
                                         np.inf)
                # obtain samples with z ~ posterior
                if self._num_steps_posterior > 0:
                    self._obtain_samples(runner, epoch,
                                         self._num_steps_posterior,
                                         self._update_post_train)
                # obtain extra samples for RL training but not encoder
                if self._num_extra_rl_steps_posterior > 0:
                    self._obtain_samples(runner,
                                         epoch,
                                         self._num_extra_rl_steps_posterior,
                                         self._update_post_train,
                                         add_to_enc_buffer=False)

            logger.log('Training...')
            # sample train tasks and optimize networks
            self._train_once()
            runner.step_itr += 1

            logger.log('Evaluating...')
            # evaluate
            self._policy.reset_belief()
            self._evaluator.evaluate(self)

    def _train_once(self):
        """Perform one iteration of training."""
        for _ in range(self._num_steps_per_epoch):
            indices = np.random.choice(range(self._num_train_tasks),
                                       self._meta_batch_size)
            self._optimize_policy(indices)

    def _optimize_policy(self, indices):
        """Perform algorithm optimizing.

        Args:
            indices (list): Tasks used for training.

        """
        num_tasks = len(indices)
        context = self._sample_context(indices)

        # clear context and reset belief of policy
        self._policy.reset_belief(num_tasks=num_tasks)

        # data shape is (task, batch, feat)
        obs, actions, rewards, next_obs, terms = self._sample_data(indices)
        policy_outputs, task_z = self._policy(obs, context)
        new_actions, policy_mean, policy_log_std, log_pi = policy_outputs[:4]

        # flatten out the task dimension
        t, b, _ = obs.size()
        obs = obs.view(t * b, -1)
        actions = actions.view(t * b, -1)
        next_obs = next_obs.view(t * b, -1)

        # optimize qf and encoder networks
        q1_pred = self._qf1(torch.cat([obs, actions], dim=1), task_z)
        q2_pred = self._qf2(torch.cat([obs, actions], dim=1), task_z)
        v_pred = self._vf(obs, task_z.detach())

        with torch.no_grad():
            target_v_values = self.target_vf(next_obs, task_z)

        # KL constraint on z if probabilistic
        self.context_optimizer.zero_grad()
        if self._use_information_bottleneck:
            kl_div = self._policy.compute_kl_div()
            kl_loss = self._kl_lambda * kl_div
            kl_loss.backward(retain_graph=True)

        self.qf1_optimizer.zero_grad()
        self.qf2_optimizer.zero_grad()

        rewards_flat = rewards.view(self._batch_size * num_tasks, -1)
        rewards_flat = rewards_flat * self._reward_scale
        terms_flat = terms.view(self._batch_size * num_tasks, -1)
        q_target = rewards_flat + (
            1. - terms_flat) * self._discount * target_v_values
        qf_loss = torch.mean((q1_pred - q_target)**2) + torch.mean(
            (q2_pred - q_target)**2)
        qf_loss.backward()

        self.qf1_optimizer.step()
        self.qf2_optimizer.step()
        self.context_optimizer.step()

        # compute min Q on the new actions
        q1 = self._qf1(torch.cat([obs, new_actions], dim=1), task_z.detach())
        q2 = self._qf2(torch.cat([obs, new_actions], dim=1), task_z.detach())
        min_q = torch.min(q1, q2)

        # optimize vf
        v_target = min_q - log_pi
        vf_loss = self.vf_criterion(v_pred, v_target.detach())
        self.vf_optimizer.zero_grad()
        vf_loss.backward()
        self.vf_optimizer.step()
        self._update_target_network()

        # optimize policy
        log_policy_target = min_q
        policy_loss = (log_pi - log_policy_target).mean()

        mean_reg_loss = self._policy_mean_reg_coeff * (policy_mean**2).mean()
        std_reg_loss = self._policy_std_reg_coeff * (policy_log_std**2).mean()
        pre_tanh_value = policy_outputs[-1]
        pre_activation_reg_loss = self._policy_pre_activation_coeff * (
            (pre_tanh_value**2).sum(dim=1).mean())
        policy_reg_loss = (mean_reg_loss + std_reg_loss +
                           pre_activation_reg_loss)
        policy_loss = policy_loss + policy_reg_loss

        self._policy_optimizer.zero_grad()
        policy_loss.backward()
        self._policy_optimizer.step()

    def _obtain_samples(self,
                        runner,
                        itr,
                        num_samples,
                        update_posterior_rate,
                        add_to_enc_buffer=True):
        """Obtain samples.

        Args:
            runner (LocalRunner): LocalRunner.
            itr (int): Index of iteration (epoch).
            num_samples (int): Number of samples to obtain.
            update_posterior_rate (int): How often (in trajectories) to infer
                posterior of policy.
            add_to_enc_buffer (bool): Whether or not to add samples to encoder
                buffer.

        """
        self._policy.reset_belief()
        total_samples = 0

        if update_posterior_rate != np.inf:
            num_samples_per_batch = (update_posterior_rate *
                                     self.max_path_length)
        else:
            num_samples_per_batch = num_samples

        while total_samples < num_samples:
            paths = runner.obtain_samples(itr, num_samples_per_batch,
                                          self._policy,
                                          self._env[self._task_idx])
            total_samples += sum([len(path['rewards']) for path in paths])

            for path in paths:
                p = {
                    'observations': path['observations'],
                    'actions': path['actions'],
                    'rewards': path['rewards'].reshape(-1, 1),
                    'next_observations': path['next_observations'],
                    'dones': path['dones'].reshape(-1, 1)
                }
                self._replay_buffers[self._task_idx].add_path(p)

                if add_to_enc_buffer:
                    self._context_replay_buffers[self._task_idx].add_path(p)

            if update_posterior_rate != np.inf:
                context = self._sample_context(self._task_idx)
                self._policy.infer_posterior(context)

    def _sample_data(self, indices):
        """Sample batch of training data from a list of tasks.

        Args:
            indices (list): List of task indices to sample from.

        Returns:
            torch.Tensor: Observations, with shape :math:`(X, N, O^*)` where
                X is the number of tasks. N is batch size.
            torch.Tensor: Actions, with shape :math:`(X, N, A^*)`.
            torch.Tensor: Rewards, with shape :math:`(X, N, 1)`.
            torch.Tensor: Next observations, with shape :math:`(X, N, O^*)`.
            torch.Tensor: Dones, with shape :math:`(X, N, 1)`.

        """
        # transitions sampled randomly from replay buffer
        initialized = False
        for idx in indices:
            batch = self._replay_buffers[idx].sample_transitions(
                self._batch_size)
            if not initialized:
                o = batch['observations'][np.newaxis]
                a = batch['actions'][np.newaxis]
                r = batch['rewards'][np.newaxis]
                no = batch['next_observations'][np.newaxis]
                d = batch['dones'][np.newaxis]
                initialized = True
            else:
                o = np.vstack((o, batch['observations'][np.newaxis]))
                a = np.vstack((a, batch['actions'][np.newaxis]))
                r = np.vstack((r, batch['rewards'][np.newaxis]))
                no = np.vstack((no, batch['next_observations'][np.newaxis]))
                d = np.vstack((d, batch['dones'][np.newaxis]))

        o = torch.as_tensor(o, device=global_device()).float()
        a = torch.as_tensor(a, device=global_device()).float()
        r = torch.as_tensor(r, device=global_device()).float()
        no = torch.as_tensor(no, device=global_device()).float()
        d = torch.as_tensor(d, device=global_device()).float()

        return o, a, r, no, d

    def _sample_context(self, indices):
        """Sample batch of context from a list of tasks.

        Args:
            indices (list): List of task indices to sample from.

        Returns:
            torch.Tensor: Context data, with shape :math:`(X, N, C)`. X is
                the number of tasks. N is batch size. C is the combined size
                of observation, action, reward, and next observation if next
                observation is used in context. Otherwise, C is the combined
                size of observation, action, and reward.

        """
        # make method work given a single task index
        if not hasattr(indices, '__iter__'):
            indices = [indices]

        initialized = False
        for idx in indices:
            batch = self._context_replay_buffers[idx].sample_transitions(
                self._embedding_batch_size)
            o = batch['observations']
            a = batch['actions']
            r = batch['rewards']
            context = np.hstack((np.hstack((o, a)), r))
            if self._use_next_obs_in_context:
                context = np.hstack((context, batch['next_observations']))

            if not initialized:
                final_context = context[np.newaxis]
                initialized = True
            else:
                final_context = np.vstack((final_context,
                                           context[np.newaxis]))

        final_context = torch.as_tensor(final_context,
                                        device=global_device()).float()
        if len(indices) == 1:
            final_context = final_context.unsqueeze(0)

        return final_context

    def _update_target_network(self):
        """Update parameters in the target vf network."""
        for target_param, param in zip(self.target_vf.parameters(),
                                       self._vf.parameters()):
            target_param.data.copy_(target_param.data *
                                    (1.0 - self._soft_target_tau) +
                                    param.data * self._soft_target_tau)

    @property
    def policy(self):
        """Return the policy within the model.

        Returns:
            metarl.torch.policies.Policy: Policy within the model.

        """
        return self._policy

    @property
    def networks(self):
        """Return all the networks within the model.

        Returns:
            list: A list of networks.

        """
        return self._policy.networks + [self._policy] + [
            self._qf1, self._qf2, self._vf, self.target_vf
        ]

    def get_exploration_policy(self):
        """Return a policy used before adaptation to a specific task.

        Each time it is retrieved, this policy should only be evaluated in
        one task.

        Returns:
            metarl.Policy: The policy used to obtain samples that are later
                used for meta-RL adaptation.

        """
        return self._policy

    def adapt_policy(self, exploration_policy, exploration_trajectories):
        """Produce a policy adapted for a task.

        Args:
            exploration_policy (metarl.Policy): A policy which was returned
                from get_exploration_policy(), and which generated
                exploration_trajectories by interacting with an environment.
                The caller may not use this object after passing it into this
                method.
            exploration_trajectories (metarl.TrajectoryBatch): Trajectories
                to adapt to, generated by exploration_policy exploring the
                environment.

        Returns:
            metarl.Policy: A policy adapted to the task represented by the
                exploration_trajectories.

        """
        total_steps = sum(exploration_trajectories.lengths)
        o = exploration_trajectories.observations
        a = exploration_trajectories.actions
        r = exploration_trajectories.rewards.reshape(total_steps, 1)
        ctxt = np.hstack((o, a, r)).reshape(1, total_steps, -1)
        context = torch.as_tensor(ctxt, device=global_device()).float()
        self._policy.infer_posterior(context)

        return self._policy

    def to(self, device=None):
        """Put all the networks within the model on device.

        Args:
            device (str): ID of GPU or CPU.

        """
        device = device or global_device()
        for net in self.networks:
            net.to(device)

    @classmethod
    def augment_env_spec(cls, env_spec, latent_dim):
        """Augment environment by a size of latent dimension.

        Args:
            env_spec (metarl.envs.EnvSpec): Environment specs to be
                augmented.
            latent_dim (int): Latent dimension.

        Returns:
            metarl.envs.EnvSpec: Augmented environment specs.

        """
        obs_dim = int(np.prod(env_spec.observation_space.shape))
        action_dim = int(np.prod(env_spec.action_space.shape))
        aug_obs = akro.Box(low=-1,
                           high=1,
                           shape=(obs_dim + latent_dim, ),
                           dtype=np.float32)
        aug_act = akro.Box(low=-1,
                           high=1,
                           shape=(action_dim, ),
                           dtype=np.float32)
        return EnvSpec(aug_obs, aug_act)

    @classmethod
    def get_env_spec(cls, env_spec, latent_dim, module):
        """Get environment specs of encoder with latent dimension.

        Args:
            env_spec (metarl.envs.EnvSpec): Environment specs.
            latent_dim (int): Latent dimension.
            module (str): Module to get environment specs for.

        Returns:
            metarl.envs.InOutSpec: Module environment specs with latent
                dimension.

        """
        obs_dim = int(np.prod(env_spec.observation_space.shape))
        action_dim = int(np.prod(env_spec.action_space.shape))
        if module == 'encoder':
            in_dim = obs_dim + action_dim + 1
            out_dim = latent_dim * 2
        elif module == 'vf':
            in_dim = obs_dim
            out_dim = latent_dim
        in_space = akro.Box(low=-1, high=1, shape=(in_dim, ), dtype=np.float32)
        out_space = akro.Box(low=-1,
                             high=1,
                             shape=(out_dim, ),
                             dtype=np.float32)
        if module == 'encoder':
            spec = InOutSpec(in_space, out_space)
        elif module == 'vf':
            spec = EnvSpec(in_space, out_space)

        return spec