def update_context(self, timestep):
    """Append a single transition [o, a, r, s, no] to the context.

    The context is laid out with one row per timestep:
        [[[o1, o2, o3, ..., a1, a2, ...],   # timestep = 1
          [o1, o2, o3, ..., a1, a2, ...]]]  # timestep = 2
    """
    o = torch.as_tensor(timestep.state[None, None, ...],
                        device=tu.global_device()).float()
    a = torch.as_tensor(timestep.action[None, None, ...],
                        device=tu.global_device()).float()
    r = torch.as_tensor(np.array([timestep.env_reward])[None, None, ...],
                        device=tu.global_device()).float()
    s = torch.as_tensor(np.array([timestep.skill])[None, None, ...],
                        device=tu.global_device()).float()
    no = torch.as_tensor(timestep.next_state[None, None, ...],
                         device=tu.global_device()).float()

    if self._use_next_obs:
        data = torch.cat([o, a, r, s, no], dim=2)
    else:
        data = torch.cat([o, a, r, s], dim=2)

    if self._context is None:
        self._context = data
    else:
        self._context = torch.cat([self._context, data], dim=1)
def update_context(self, timestep): """Append single transition to the current context. Args: timestep (garage._dtypes.TimeStep): Timestep containing transition information to be added to context. """ o = torch.as_tensor(timestep.observation[None, None, ...], device=tu.global_device()).float() a = torch.as_tensor(timestep.action[None, None, ...], device=tu.global_device()).float() r = torch.as_tensor(np.array([timestep.reward])[None, None, ...], device=tu.global_device()).float() no = torch.as_tensor(timestep.next_observation[None, None, ...], device=tu.global_device()).float() if self._use_next_obs: data = torch.cat([o, a, r, no], dim=2) else: data = torch.cat([o, a, r], dim=2) if self._context is None: self._context = data else: self._context = torch.cat([self._context, data], dim=1)
def _skills_reason_optimize_policy(self):
    self._controller.reset_belief()
    # data shape is (task, batch, feat)
    obs, actions, rewards, skills, next_obs, terms, context = \
        self._sample_skill_path()

    # skills_pred is a distribution over skills
    policy_outputs, skills_pred, task_z = self._controller(obs, context)
    _, policy_mean, policy_log_std, policy_log_pi = policy_outputs[:4]

    self.context_optimizer.zero_grad()
    if self._use_information_bottleneck:
        kl_div = self._controller.compute_kl_div()
        kl_loss = self._kl_lambda * kl_div
        kl_loss.backward(retain_graph=True)

    skills_target = skills.clone().detach().requires_grad_(True).to(
        tu.global_device())
    skills_pred = skills_pred.to(tu.global_device())
    policy_loss = F.mse_loss(skills_pred.flatten(),
                             skills_target.flatten()) \
        * self._skills_reason_reward_scale

    mean_reg_loss = self._policy_mean_reg_coeff * (policy_mean**2).mean()
    std_reg_loss = self._policy_std_reg_coeff * (policy_log_std**2).mean()
    # The pre-activation regularization term was removed.
    policy_reg_loss = mean_reg_loss + std_reg_loss
    policy_loss = policy_loss + policy_reg_loss

    self._controller_optimizer.zero_grad()
    policy_loss.backward()
    self._controller_optimizer.step()
def forward(self, states, actions, skills):
    """Return Q-value(s)."""
    if not isinstance(states, torch.Tensor):
        states = torch.from_numpy(states).float().to(tu.global_device())
    if not isinstance(actions, torch.Tensor):
        actions = torch.from_numpy(actions).float().to(tu.global_device())
    if not isinstance(skills, torch.Tensor):
        skills = torch.from_numpy(skills).float().to(tu.global_device())

    return super().forward(torch.cat([states, skills, actions], 1))
def test_utils_set_gpu_mode():
    """Test that set_gpu_mode sets the global device appropriately."""
    if torch.cuda.is_available():
        tu.set_gpu_mode(mode=True)
        assert tu.global_device() == torch.device('cuda:0')
        assert tu._USE_GPU
    else:
        tu.set_gpu_mode(mode=False)
        assert tu.global_device() == torch.device('cpu')
        assert not tu._USE_GPU
        assert not tu._GPU_ID
def forward(self, states, skills):
    if not isinstance(states, torch.Tensor):
        states = torch.from_numpy(states).float().to(tu.global_device())
    if len(states.shape) == 1:
        states = states.unsqueeze(0)
    if not isinstance(skills, torch.Tensor):
        skills = torch.from_numpy(skills).float().to(tu.global_device())
    if len(skills.shape) == 1:
        skills = skills.unsqueeze(0)
    states = states.to(tu.global_device())
    skills = skills.to(tu.global_device())

    return super().forward(torch.cat((states, skills), 1))
def get_actions(self, observations):
    r"""Get actions given observations.

    Args:
        observations (np.ndarray): Observations from the environment.
            Shape is :math:`batch_dim \bullet env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted actions.
                :math:`batch_dim \bullet env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Standard deviation of logarithmic
                    values of the distribution.

    """
    with torch.no_grad():
        if not isinstance(observations, torch.Tensor):
            observations = torch.as_tensor(observations).float().to(
                tu.global_device())
        # Pass the converted tensor directly; re-wrapping it with
        # torch.Tensor() would copy it back to the CPU.
        dist = self.forward(observations)
        return (dist.rsample().numpy(),
                dict(mean=dist.mean.numpy(),
                     log_std=(dist.variance.sqrt()).log().numpy()))
def _sample_path_context(self, indices):
    if not hasattr(indices, '__iter__'):
        indices = [indices]

    initialized = False
    for idx in indices:
        path = self._context_replay_buffers[idx].sample_path()
        o = path['states']
        a = path['actions']
        r = path['env_rewards']
        z = path['skills_onehot']
        context = np.hstack((o, a, r, z))
        if self._use_next_obs_in_context:
            # Append the next states (not the current states again) when
            # next observations are part of the context.
            context = np.hstack((context, path['next_states']))

        if not initialized:
            final_context = context[np.newaxis]
            initialized = True
        else:
            final_context = np.vstack((final_context, context[np.newaxis]))

    final_context = torch.as_tensor(final_context,
                                    device=tu.global_device()).float()
    if len(indices) == 1:
        final_context = final_context.unsqueeze(0)

    return final_context
def adapt_policy(self, exploration_policy, exploration_trajectories): """Produce a policy adapted for a task. Args: exploration_policy (garage.Policy): A policy which was returned from get_exploration_policy(), and which generated exploration_trajectories by interacting with an environment. The caller may not use this object after passing it into this method. exploration_trajectories (garage.TrajectoryBatch): Trajectories to adapt to, generated by exploration_policy exploring the environment. Returns: garage.Policy: A policy adapted to the task represented by the exploration_trajectories. """ total_steps = sum(exploration_trajectories.lengths) o = exploration_trajectories.observations a = exploration_trajectories.actions r = exploration_trajectories.rewards.reshape(total_steps, 1) ctxt = np.hstack((o, a, r)).reshape(1, total_steps, -1) context = torch.as_tensor(ctxt, device=tu.global_device()).float() self._policy.infer_posterior(context) return self._policy
def get_action(self, observation):
    r"""Get a single action given an observation.

    Args:
        observation (np.ndarray): Observation from the environment.
            Shape is :math:`env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted action. Shape is
                :math:`env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Standard deviation of logarithmic
                    values of the distribution.

    """
    with torch.no_grad():
        if not isinstance(observation, torch.Tensor):
            observation = torch.as_tensor(observation).float().to(
                tu.global_device())
        # Add the batch dimension directly; re-wrapping with
        # torch.Tensor() would copy the tensor back to the CPU.
        observation = observation.unsqueeze(0)
        dist = self.forward(observation)
        return (dist.rsample().squeeze(0).numpy(),
                dict(mean=dist.mean.squeeze(0).numpy(),
                     log_std=(dist.variance**.5).log().squeeze(0).numpy()))
def get_action(self, obs):
    z = self.z
    obs = torch.as_tensor(obs[None], device=tu.global_device()).float()
    obs_in = torch.cat([obs, z], dim=1)
    skill_choice, info = self._controller_policy.get_action(obs_in)
    skill_z = torch.eye(self._num_skills)[skill_choice]
    action, _ = self._sub_actor.get_action(obs, skill_z)
    return action, skill_choice, info
def forward(self, states):
    if not isinstance(states, torch.Tensor):
        states = torch.from_numpy(states).float().to(tu.global_device())
    x = super().forward(states)
    return torch.softmax(x, dim=-1)
def test_to(): """Test the torch function that moves modules to GPU. Test that the policy and qfunctions are moved to gpu if gpu is available. """ env_names = ['CartPole-v0', 'CartPole-v1'] task_envs = [GarageEnv(env_name=name) for name in env_names] env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy) deterministic.set_seed(0) policy = TanhGaussianMLPPolicy( env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=torch.nn.ReLU, output_nonlinearity=None, min_std=np.exp(-20.), max_std=np.exp(2.), ) qf1 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=F.relu) qf2 = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=[1, 1], hidden_nonlinearity=F.relu) replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), ) num_tasks = 2 buffer_batch_size = 2 mtsac = MTSAC(policy=policy, qf1=qf1, qf2=qf2, gradient_steps_per_itr=150, max_path_length=150, eval_env=env, env_spec=env.spec, num_tasks=num_tasks, steps_per_epoch=5, replay_buffer=replay_buffer, min_buffer_size=1e3, target_update_tau=5e-3, discount=0.99, buffer_batch_size=buffer_batch_size) tu.set_gpu_mode(torch.cuda.is_available()) mtsac.to() device = tu.global_device() for param in mtsac._qf1.parameters(): assert param.device == device for param in mtsac._qf2.parameters(): assert param.device == device for param in mtsac._qf2.parameters(): assert param.device == device for param in mtsac._policy.parameters(): assert param.device == device assert mtsac._log_alpha.device == device
def _sample_skill_path(self):
    path = self._skills_replay_buffer.sample_path()
    # TODO: trim or extend batch to the same size
    o = path['states']
    a = path['actions']
    r = path['env_rewards']
    z = path['skills_onehot']
    context = np.hstack((o, a, r, z))
    if self._use_next_obs_in_context:
        context = np.hstack((context, path['next_states']))
    context = context[np.newaxis]

    o = path['states'][np.newaxis]
    a = path['actions'][np.newaxis]
    r = path['env_rewards'][np.newaxis]
    z = path['skills_onehot'][np.newaxis]
    no = path['next_states'][np.newaxis]
    d = path['dones'][np.newaxis]

    o = torch.as_tensor(o, device=tu.global_device()).float()
    a = torch.as_tensor(a, device=tu.global_device()).float()
    r = torch.as_tensor(r, device=tu.global_device()).float()
    z = torch.as_tensor(z, device=tu.global_device()).float()
    no = torch.as_tensor(no, device=tu.global_device()).float()
    d = torch.as_tensor(d, device=tu.global_device()).float()
    context = torch.as_tensor(context, device=tu.global_device()).float()
    context = context.unsqueeze(0)

    return o, a, r, z, no, d, context
def get_actions(self, states):
    with torch.no_grad():
        if not isinstance(states, torch.Tensor):
            states = torch.from_numpy(states).float().to(
                tu.global_device())
        states = states.to(tu.global_device())
        dist = self.forward(states).to('cpu').detach()
        probs = dist.numpy()
        actions = np.array([
            np.random.choice(self._action_dim, p=probs[idx])
            for idx in range(probs.shape[0])
        ])
        ret_mean = np.mean(probs)
        ret_log_std = np.log(np.std(probs))
        n = actions.shape[0]  # avoid shadowing the builtin `len`
        ret_log_pi = np.log(dist[np.arange(n), actions])
        return (actions,
                dict(mean=ret_mean,
                     log_std=ret_log_std,
                     log_pi=ret_log_pi,
                     dist=dist))
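# Hedged sketch (not from the source): a vectorized alternative to the
# numpy sampling loop in get_actions above, using
# torch.distributions.Categorical on toy probabilities.
import torch

probs = torch.softmax(torch.randn(4, 3), dim=-1)   # (batch, action_dim)
dist = torch.distributions.Categorical(probs=probs)
actions = dist.sample()                            # (4,) sampled actions
log_pi = dist.log_prob(actions)                    # per-action log-probs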
def get_action(self, state):
    with torch.no_grad():
        if not isinstance(state, torch.Tensor):
            state = torch.from_numpy(state).float().to(
                tu.global_device())
        state = state.to(tu.global_device())
        dist = self.forward(state.unsqueeze(0)).squeeze(0).to(
            'cpu').detach()
        action = np.array([
            np.random.choice(self._action_dim, p=dist.numpy())
        ])
        ret_mean = np.mean(dist.numpy())
        ret_log_std = np.log(np.std(dist.numpy()))
        ret_log_pi = np.log(dist[..., list(action)])
        return (action,
                dict(mean=ret_mean,
                     log_std=ret_log_std,
                     log_pi=ret_log_pi,
                     dist=dist))
def to(self, device=None): """Put all the networks within the model on device. Args: device (str): ID of GPU or CPU. """ device = device or tu.global_device() for net in self.networks: net.to(device)
def adapt_policy(self, exploration_policy, exploration_trajectories):
    total_steps = sum(exploration_trajectories.lengths)
    o = exploration_trajectories.states
    a = exploration_trajectories.actions
    r = exploration_trajectories.env_rewards.reshape(total_steps, 1)
    s = exploration_trajectories.skills_onehot
    ctxt = np.hstack((o, a, r, s)).reshape(1, total_steps, -1)
    context = torch.as_tensor(ctxt, device=tu.global_device()).float()
    self._controller.infer_posterior(context)

    return self._controller
def compute_kl_div(self):
    r"""Compute :math:`KL(q(z|c) \| p(z))`.

    Returns:
        torch.Tensor: :math:`KL(q(z|c) \| p(z))`, summed over tasks and
            latent dimensions.

    """
    prior = torch.distributions.Normal(
        torch.zeros(self._latent_dim).to(tu.global_device()),
        torch.ones(self._latent_dim).to(tu.global_device()))
    posteriors = [
        torch.distributions.Normal(mu, torch.sqrt(var)) for mu, var in zip(
            torch.unbind(self.z_means), torch.unbind(self.z_vars))
    ]
    kl_divs = [
        torch.distributions.kl.kl_divergence(post, prior)
        for post in posteriors
    ]
    kl_div_sum = torch.sum(torch.stack(kl_divs))

    return kl_div_sum
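# Hedged sketch (not from the source): the same diagonal-Gaussian KL that
# compute_kl_div builds above, checked against the closed form on toy
# values. `latent_dim` is an illustrative size.
import torch

latent_dim = 5
prior = torch.distributions.Normal(torch.zeros(latent_dim),
                                   torch.ones(latent_dim))
mu = torch.randn(latent_dim)
var = torch.rand(latent_dim) + 0.1       # keep the variance positive
posterior = torch.distributions.Normal(mu, torch.sqrt(var))
# kl_divergence is elementwise for diagonal Gaussians; sum over
# dimensions, as compute_kl_div sums over tasks and latent dims.
kl = torch.distributions.kl.kl_divergence(posterior, prior).sum()
# Closed form for KL(N(mu, var) || N(0, 1)):
#     0.5 * (var + mu^2 - 1 - log var)
kl_closed = 0.5 * (var + mu**2 - 1 - torch.log(var)).sum()
assert torch.allclose(kl, kl_closed, atol=1e-5)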
def _sample_data(self, indices):
    """Sample batch of training data from a list of tasks.

    Args:
        indices (list): List of task indices to sample from.

    Returns:
        torch.Tensor: Observations, with shape :math:`(X, N, O^*)` where
            X is the number of tasks. N is batch size.
        torch.Tensor: Actions, with shape :math:`(X, N, A^*)`.
        torch.Tensor: Rewards, with shape :math:`(X, N, 1)`.
        torch.Tensor: Next observations, with shape :math:`(X, N, O^*)`.
        torch.Tensor: Dones, with shape :math:`(X, N, 1)`.

    """
    # transitions sampled randomly from replay buffer
    initialized = False
    for idx in indices:
        batch = self._replay_buffers[idx].sample_transitions(
            self._batch_size)
        if not initialized:
            o = batch['observations'][np.newaxis]
            a = batch['actions'][np.newaxis]
            r = batch['rewards'][np.newaxis]
            no = batch['next_observations'][np.newaxis]
            d = batch['dones'][np.newaxis]
            initialized = True
        else:
            o = np.vstack((o, batch['observations'][np.newaxis]))
            a = np.vstack((a, batch['actions'][np.newaxis]))
            r = np.vstack((r, batch['rewards'][np.newaxis]))
            no = np.vstack((no, batch['next_observations'][np.newaxis]))
            d = np.vstack((d, batch['dones'][np.newaxis]))

    o = torch.as_tensor(o, device=tu.global_device()).float()
    a = torch.as_tensor(a, device=tu.global_device()).float()
    r = torch.as_tensor(r, device=tu.global_device()).float()
    no = torch.as_tensor(no, device=tu.global_device()).float()
    d = torch.as_tensor(d, device=tu.global_device()).float()

    return o, a, r, no, d
def _sample_task_path(self, indices):
    if not hasattr(indices, '__iter__'):
        indices = [indices]

    initialized = False
    for idx in indices:
        path = self._replay_buffers[idx].sample_path()
        # TODO: trim or extend batch to the same size
        if not initialized:
            o = path['states'][np.newaxis]
            a = path['actions'][np.newaxis]
            r = path['env_rewards'][np.newaxis]
            z = path['skills_onehot'][np.newaxis]
            no = path['next_states'][np.newaxis]
            d = path['dones'][np.newaxis]
            initialized = True
        else:
            o = np.vstack((o, path['states'][np.newaxis]))
            a = np.vstack((a, path['actions'][np.newaxis]))
            r = np.vstack((r, path['env_rewards'][np.newaxis]))
            z = np.vstack((z, path['skills_onehot'][np.newaxis]))
            no = np.vstack((no, path['next_states'][np.newaxis]))
            d = np.vstack((d, path['dones'][np.newaxis]))

    o = torch.as_tensor(o, device=tu.global_device()).float()
    a = torch.as_tensor(a, device=tu.global_device()).float()
    r = torch.as_tensor(r, device=tu.global_device()).float()
    z = torch.as_tensor(z, device=tu.global_device()).float()
    no = torch.as_tensor(no, device=tu.global_device()).float()
    d = torch.as_tensor(d, device=tu.global_device()).float()

    return o, a, r, z, no, d
def _discriminator_objective(self, samples_data):
    states = samples_data['next_state']
    discriminator_pred = self._discriminator(states)
    # F.cross_entropy expects integer class indices as targets, and the
    # cast must not assume CUDA is available.
    discriminator_target = samples_data['skill'].long().to(
        tu.global_device())
    # F.cross_entropy already averages over the batch.
    discriminator_loss = F.cross_entropy(discriminator_pred,
                                         discriminator_target.flatten())

    return discriminator_loss
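# Hedged sketch (toy values, not from the source): F.cross_entropy takes
# raw logits and long-dtype class indices and returns the batch mean,
# which is why the objective above casts with .long() and skips the
# redundant torch.mean.
import torch
import torch.nn.functional as F

logits = torch.randn(4, 3)               # (batch, num_skills)
targets = torch.tensor([0, 2, 1, 2])     # class indices, dtype int64
loss = F.cross_entropy(logits, targets)  # scalar, already averaged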
def forward(self, observations, skills):
    if not isinstance(observations, torch.Tensor):
        observations = torch.from_numpy(observations).float().to(
            tu.global_device())
    if len(observations.shape) == 1:
        observations = observations.unsqueeze(0)
    if not isinstance(skills, torch.Tensor):
        skills = torch.from_numpy(skills).float().to(tu.global_device())
    if len(skills.shape) == 1:
        skills = skills.unsqueeze(0)

    # `inputs` avoids shadowing the builtin `input`.
    inputs = torch.cat([observations, skills], dim=1).to(tu.global_device())
    log_p_x_t, reg_loss_t, x_t, log_ws_t, mus_t, log_sigs_t = \
        self.distribution.get_p_params(inputs)

    raw_actions = x_t.detach().cpu().numpy()
    actions = np.tanh(raw_actions) if self._squash else raw_actions

    return actions, dict(log_p_x_t=log_p_x_t,
                         reg_loss_t=reg_loss_t,
                         x_t=x_t,
                         log_ws_t=log_ws_t,
                         mus_t=mus_t,
                         log_sigs_t=log_sigs_t)
def reset_belief(self, num_tasks=1):
    r"""Reset :math:`q(z \| c)` to the prior and sample a new z from the prior.

    Args:
        num_tasks (int): Number of tasks.

    """
    # reset distribution over z to the prior
    mu = torch.zeros(num_tasks, self._latent_dim).to(tu.global_device())
    if self._use_information_bottleneck:
        var = torch.ones(num_tasks,
                         self._latent_dim).to(tu.global_device())
    else:
        var = torch.zeros(num_tasks,
                          self._latent_dim).to(tu.global_device())
    self.z_means = mu
    self.z_vars = var
    # sample a new z from the prior
    self.sample_from_belief()
    # reset the context collected so far
    self._context = None
    # reset any hidden state in the encoder network (relevant for RNN)
    self._context_encoder.reset(num_tasks)
def get_p_params(self, input):
    log_ws_t, xz_mus_t, xz_log_sigs_t = self.get_p_xz_params(input)
    # (N x K), (N x K x Dx), (N x K x Dx)
    N = log_ws_t.shape[0]
    xz_sigs_t = torch.exp(xz_log_sigs_t)

    # Sample the latent code
    z_t = torch.multinomial(torch.exp(log_ws_t), num_samples=1)  # N x 1

    # Choose the mixture component corresponding to the latent
    mask_t = torch.eye(self._K)[z_t[:, 0]].to(tu.global_device())
    mask_t = mask_t.ge(1)  # turn into boolean
    # Boolean-mask indexing keeps one component per sample, giving
    # (N, Dx); torch.masked_select would fail here because an (N, K)
    # mask does not broadcast against an (N, K, Dx) input.
    xz_mu_t = xz_mus_t[mask_t]
    xz_sig_t = xz_sigs_t[mask_t]

    # Sample x
    x_t = xz_mu_t + xz_sig_t * torch.normal(
        mean=torch.zeros((N, self._Dx)).to(tu.global_device()), std=1.0)
    if not self._reparameterize:
        x_t = x_t.detach().cpu().numpy()

    # log p(x|z)
    log_p_xz_t = self._create_log_gaussian(xz_mus_t, xz_log_sigs_t,
                                           x_t[:, None, :])  # N x K

    # log p(x)
    log_p_x_t = torch.logsumexp(log_p_xz_t + log_ws_t, dim=1)
    log_p_x_t -= torch.logsumexp(log_ws_t, dim=1)

    reg_loss_t = 0
    reg_loss_t += self._reg * 0.5 * torch.mean(xz_log_sigs_t**2)
    reg_loss_t += self._reg * 0.5 * torch.mean(xz_mus_t**2)

    return log_p_x_t, reg_loss_t, x_t, log_ws_t, xz_mus_t, xz_log_sigs_t
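# Hedged sketch (illustrative sizes, not from the source): selecting one
# mixture component per sample with a multinomial draw and a boolean
# one-hot mask, the same trick get_p_params uses above.
import torch

N, K, Dx = 4, 3, 2                       # samples, components, action dim
probs = torch.full((N, K), 1.0 / K)      # uniform mixture weights
mus = torch.randn(N, K, Dx)              # per-component means
z = torch.multinomial(probs, num_samples=1)      # (N, 1) component ids
mask = torch.eye(K, dtype=torch.bool)[z[:, 0]]   # (N, K) one-hot mask
mu = mus[mask]                           # (N, Dx): one mean per sample
assert mu.shape == (N, Dx)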
def _sample_task_path(self, indices):
    if not hasattr(indices, '__iter__'):
        indices = [indices]

    initialized = False
    for idx in indices:
        # should be replay_buffers[]?
        path = self._context_replay_buffers[idx].sample_path()
        # TODO: trim or extend batch to the same size
        context_o = path['states']
        context_a = path['actions']
        context_r = path['env_rewards']
        context_z = path['skills_onehot']
        context = np.hstack((context_o, context_a, context_r, context_z))
        if self._use_next_obs_in_context:
            context = np.hstack((context, path['next_states']))

        if not initialized:
            final_context = context[np.newaxis]
            o = path['states'][np.newaxis]
            a = path['actions'][np.newaxis]
            r = path['env_rewards'][np.newaxis]
            z = path['skills_onehot'][np.newaxis]
            no = path['next_states'][np.newaxis]
            d = path['dones'][np.newaxis]
            initialized = True
        else:
            o = np.vstack((o, path['states'][np.newaxis]))
            a = np.vstack((a, path['actions'][np.newaxis]))
            r = np.vstack((r, path['env_rewards'][np.newaxis]))
            z = np.vstack((z, path['skills_onehot'][np.newaxis]))
            no = np.vstack((no, path['next_states'][np.newaxis]))
            d = np.vstack((d, path['dones'][np.newaxis]))
            final_context = np.vstack((final_context, context[np.newaxis]))

    o = torch.as_tensor(o, device=tu.global_device()).float()
    a = torch.as_tensor(a, device=tu.global_device()).float()
    r = torch.as_tensor(r, device=tu.global_device()).float()
    z = torch.as_tensor(z, device=tu.global_device()).float()
    no = torch.as_tensor(no, device=tu.global_device()).float()
    d = torch.as_tensor(d, device=tu.global_device()).float()
    final_context = torch.as_tensor(final_context,
                                    device=tu.global_device()).float()
    if len(indices) == 1:
        final_context = final_context.unsqueeze(0)

    return o, a, r, z, no, d, final_context
def to(self, device=None): """Put all the networks within the model on device. Args: device (str): ID of GPU or CPU. """ if device is None: device = tu.global_device() for net in self.networks: net.to(device) self.log_alpha = torch.Tensor([self._initial_log_entropy ]).to(device).requires_grad_() if self.use_automatic_entropy_tuning: self.alpha_optimizer = self._optimizer([self.log_alpha], lr=self.policy_lr)
def to(self, device=None): """Put all the networks within the model on device. Args: device (str): ID of GPU or CPU. """ super().to(device) if device is None: device = tu.global_device() if not self._use_automatic_entropy_tuning: self._log_alpha = torch.Tensor([self._fixed_alpha] * self._num_tasks).log().to(device) else: self._log_alpha = torch.Tensor( [self._initial_log_entropy] * self._num_tasks).to(device).requires_grad_() self._alpha_optimizer = self._optimizer([self._log_alpha], lr=self._policy_lr)
def _sample_context(self, indices):
    """Sample batch of context from a list of tasks.

    Args:
        indices (list): List of task indices to sample from.

    Returns:
        torch.Tensor: Context data, with shape :math:`(X, N, C)`. X is the
            number of tasks. N is batch size. C is the combined size of
            observation, action, reward, and next observation if next
            observation is used in context. Otherwise, C is the combined
            size of observation, action, and reward.

    """
    # make method work given a single task index
    if not hasattr(indices, '__iter__'):
        indices = [indices]

    initialized = False
    for idx in indices:
        batch = self._context_replay_buffers[idx].sample_transitions(
            self._embedding_batch_size)
        o = batch['observations']
        a = batch['actions']
        r = batch['rewards']
        context = np.hstack((o, a, r))
        if self._use_next_obs_in_context:
            context = np.hstack((context, batch['next_observations']))

        if not initialized:
            final_context = context[np.newaxis]
            initialized = True
        else:
            final_context = np.vstack((final_context, context[np.newaxis]))

    final_context = torch.as_tensor(final_context,
                                    device=tu.global_device()).float()
    if len(indices) == 1:
        final_context = final_context.unsqueeze(0)

    return final_context
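# Hedged sketch (toy shapes, not from the source): the hstack + np.newaxis
# layout used by the context samplers above builds a (task, batch, feat)
# array one task at a time.
import numpy as np

o = np.zeros((5, 3))                     # N=5 transitions, obs_dim=3
a = np.zeros((5, 2))                     # act_dim=2
r = np.zeros((5, 1))                     # scalar reward per transition
context = np.hstack((o, a, r))           # (5, 6) = (N, C)
batch = context[np.newaxis]              # (1, 5, 6): add the task axis
batch = np.vstack((batch, context[np.newaxis]))  # (2, 5, 6) for two tasks
assert batch.shape == (2, 5, 6)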
def get_action(self, obs): """Sample action from the policy, conditioned on the task embedding. Args: obs (torch.Tensor): Observation values, with shape :math:`(1, O)`. O is the size of the flattened observation space. Returns: torch.Tensor: Output action value, with shape :math:`(1, A)`. A is the size of the flattened action space. dict: * np.ndarray[float]: Mean of the distribution. * np.ndarray[float]: Standard deviation of logarithmic values of the distribution. """ z = self.z obs = torch.as_tensor(obs[None], device=tu.global_device()).float() obs_in = torch.cat([obs, z], dim=1) action, info = self._policy.get_action(obs_in) action = np.squeeze(action, axis=0) info['mean'] = np.squeeze(info['mean'], axis=0) return action, info