def add_sample(self, observation, action, label, valid):
    # label: label_num x label_dim
    self._observations[self._top] = np_ify(observation)
    self._actions[self._top] = np_ify(action)
    self._labels[self._top] = np_ify(label)
    self._valids[self._top] = np_ify(valid)
    self._advance()
def get_action(self, obs, labels=None, deterministic=False):
    assert len(obs.shape) == 1
    assert (self.policy.a_p == self.sup_learner.a_p).all()
    with torch.no_grad():
        obs_action = (torch_ify(obs)[None, None, :], self.policy.a_p[None, None, :])
        if labels is not None:
            labels = torch_ify(labels)[None, None, :]
        pis, info = self.forward(obs_action,
                                 labels=labels,
                                 latent=self.policy.latent_p,
                                 sup_latent=self.sup_learner.latent_p,
                                 return_info=True)
        sup_probs = Categorical(logits=info['sup_preactivation']).probs
    pis = np_ify(pis[0, 0, :])
    sup_probs = np_ify(sup_probs[0, 0, :, :])
    if deterministic:
        action = np.argmax(pis)
    else:
        action = np.random.choice(np.arange(pis.shape[0]), p=pis)
    # keep the policy's and the supervised learner's recurrent state in sync
    self.policy.a_p = torch_ify(np.array([action]))
    self.policy.latent_p = info['latent']
    self.sup_learner.a_p = torch_ify(np.array([action]))
    self.sup_learner.latent_p = info['sup_latent']
    return action, {'intentions': sup_probs}
def train_from_torch(self, batch):
    rewards = batch['rewards']
    # terminals = batch['terminals']
    obs = batch['observations']
    actions = batch['actions']
    next_obs = batch['next_observations']

    # In order to bootstrap the models, train only one network per batch.
    net_idx = self._n_train_steps_total % len(self.model._nets)
    mean, logvar, predicted_rewards = self.model.forward(
        obs, actions, network_idx=net_idx, return_net_outputs=True)

    # TODO: possibly need to include weight decay
    mean_mse = self.mean_criterion(mean, next_obs)
    model_loss = self.model_criterion(mean, logvar, next_obs)
    bound_loss = self.model.bound_loss()
    if self.reward_criterion:
        reward_loss = self.reward_criterion(predicted_rewards, rewards)
    else:
        reward_loss = 0
    loss = model_loss + bound_loss + reward_loss

    self.model_optimizer.zero_grad()
    loss.backward()
    self.model_optimizer.step()
    self.model.trained_at_all = True

    if self._need_to_update_eval_statistics:
        self._need_to_update_eval_statistics = False
        self.eval_statistics['Model Loss'] = np_ify(model_loss)
        self.eval_statistics['Bound Loss'] = np_ify(bound_loss)
        self.eval_statistics['Reward Loss'] = np_ify(reward_loss)
        self.eval_statistics['Model MSE'] = np_ify(mean_mse)
        self.eval_statistics['Loss'] = np_ify(loss)
    self._n_train_steps_total += 1
def get_action(self, obs, deterministic=False):
    assert len(obs.shape) == 1
    with torch.no_grad():
        obs_action = (torch_ify(obs)[None, None, :], self.a_p[None, None, :])
        pis, info = self.forward(obs_action, latent=self.latent_p, return_info=True)
        sup_probs = self.sup_prob(obs_action, latent=self.latent_p)
    pis = np_ify(pis[0, 0, :])
    sup_probs = np_ify(sup_probs[0, 0, :, :])
    if deterministic:
        action = np.argmax(pis)
    else:
        action = np.random.choice(np.arange(pis.shape[0]), p=pis)
    self.a_p = torch_ify(np.array([action]))
    self.latent_p = info['latent']
    return action, {'intentions': sup_probs}
def forward(self, obs, valid_mask=None):
    # x: (batch * num_node) x output_dim
    # edge_index: 2 x num_edge
    # messages from nodes in edge_index[0] are sent to nodes in edge_index[1]
    batch_size, node_num, obs_dim = obs.shape
    x = torch.zeros(batch_size, self.node_num, self.output_dim).to(ptu.device)
    x[:, :, :self.input_dim] = obs
    x[:, 0, self.input_dim:] = self.ego_init[None, :]
    x[:, 1:, self.input_dim:] = self.other_init[None, None, :]
    x = x.reshape(int(batch_size * self.node_num), self.output_dim)

    # xs = obs[:, :, 0]
    # ys = obs[:, :, 1]
    # upper_indices = torch.where(ys > 4.)
    # lower_indices = torch.where((ys > 0.) & (ys <= 4.))
    obs = np_ify(obs)
    edge_index = get_edge_index(obs)  # batch x 2 x max_edge_num
    edge_index = np.swapaxes(edge_index, 0, 1).reshape(2, -1)
    edge_index = np.unique(edge_index, axis=1)
    edge_index = torch_ify(edge_index).long()
    edge_index = pyg_utils.remove_self_loops(edge_index)[0]
    return x, edge_index
def _start_new_rollout(self):
    self.exploration_policy.reset()
    # Note: we assume we're using a silent env.
    o = self.training_env.reset()

    rgp = self.rollout_goal_params
    if rgp is None:
        self._rollout_goal = o[self.desired_goal_key]
    elif rgp["strategy"] == "ensemble_qs":
        exploration_temperature = rgp["exploration_temperature"]
        assert len(self.ensemble_qs) > 0
        N = 128
        obs = np.tile(o[self.observation_key], (N, 1))
        proposed_goals = self.training_env.sample_goals(N)[self.desired_goal_key]
        new_obs = np.hstack((obs, proposed_goals))
        actions = torch_ify(self.policy.get_action(new_obs)[0])
        q_values = np.zeros((len(self.ensemble_qs), N))
        for i, q in enumerate(self.ensemble_qs):
            q_values[i, :] = np_ify(q(torch_ify(new_obs), actions)).flatten()
        q_std = q_values.std(axis=0)
        p = softmax(q_std / exploration_temperature)
        ind = np.random.choice(np.arange(N), p=p)
        self._rollout_goal = {}
        self._rollout_goal[self.desired_goal_key] = proposed_goals[ind, :]
    elif rgp["strategy"] == "vae_q":
        pass
    else:
        assert False, "bad rollout goal strategy"
    return o
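# A minimal, standalone sketch of the "ensemble_qs" goal-selection idea used in
# _start_new_rollout above: score candidate goals by the disagreement (std) of an
# ensemble of Q-functions and sample one via a softmax over that disagreement.
# The softmax helper, temperature, and random Q-values below are illustrative
# stand-ins, not the trainer's actual ensemble or API.
import numpy as np


def _softmax(x):
    z = x - x.max()
    e = np.exp(z)
    return e / e.sum()


def pick_goal_by_ensemble_disagreement(q_values_per_member, temperature=1.0):
    # q_values_per_member: (ensemble_size, num_candidate_goals)
    q_std = q_values_per_member.std(axis=0)
    p = _softmax(q_std / temperature)
    return np.random.choice(len(p), p=p)


if __name__ == "__main__":
    qs = np.random.randn(5, 128)  # 5 hypothetical ensemble members, 128 candidate goals
    print(pick_goal_by_ensemble_disagreement(qs))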
def beta_eval(goals):
    # goals = np.array([[
    #     *goal
    # ]])
    N = len(goals)
    observations = np.tile(obs, (N, 1))
    new_obs = np.hstack((observations, goals))
    actions = torch_ify(policy.get_action(new_obs)[0])
    return np_ify(q(torch_ify(new_obs), actions)).flatten()
def add_advantages(self, path, path_len, flag):
    if flag:
        next_vf = self.vf(torch_ify(path["next_observations"]))
        cur_vf = self.vf(torch_ify(path["observations"]))
        rewards = torch_ify(path["rewards"])
        term = (1 - torch_ify(path["terminals"].astype(np.float32)))
        delta = rewards + term * self.discount * next_vf - cur_vf

        # GAE and discounted returns, accumulated backwards over the path
        advantages = torch.zeros(path_len)
        returns = torch.zeros(path_len)
        gae = 0
        R = 0
        for i in reversed(range(path_len)):
            advantages[i] = delta[i] + term[i] * (self.discount * self.gae_lambda) * gae
            gae = advantages[i]
            returns[i] = rewards[i] + term[i] * self.discount * R
            R = returns[i]

        advantages = np_ify(advantages)
        if advantages.std() != 0.0:
            advantages = (advantages - advantages.mean()) / advantages.std()
        else:
            advantages = advantages - advantages.mean()
        returns = np_ify(returns)
    else:
        advantages = np.zeros(path_len)
        returns = np.zeros(path_len)

    return dict(
        observations=path["observations"],
        actions=path["actions"],
        rewards=path["rewards"],
        next_observations=path["next_observations"],
        terminals=path["terminals"],
        agent_infos=path["agent_infos"],
        env_infos=path["env_infos"],
        advantages=advantages,
        returns=returns,
    )
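# A minimal, standalone sketch of the GAE recursion used in add_advantages above,
# run on hand-made numbers. The rewards, values, and coefficients below are
# illustrative assumptions, not values produced by the trainer.
import numpy as np


def gae_sketch(rewards, values, next_values, terminals, discount=0.99, gae_lambda=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    term = 1.0 - terminals
    delta = rewards + term * discount * next_values - values
    advantages = np.zeros_like(rewards)
    gae = 0.0
    for i in reversed(range(len(rewards))):
        gae = delta[i] + term[i] * discount * gae_lambda * gae
        advantages[i] = gae
    return advantages


if __name__ == "__main__":
    r = np.array([1.0, 0.0, 1.0])
    v = np.array([0.5, 0.4, 0.3])
    nv = np.array([0.4, 0.3, 0.0])
    done = np.array([0.0, 0.0, 1.0])
    print(gae_sketch(r, v, nv, done))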
def _cost_function(self, ac_seqs):
    '''
    Map a batch of action sequences to costs, using either the learned model
    or the given cost function.
    TODO: add the sampling strategies from the PETS paper

    ac_seqs: batch_size x (cem_horizon * action_dim)
    Requires self.current_obs to be accurately set.
    '''
    batch_size = ac_seqs.shape[0]
    ac_seqs = ac_seqs.reshape((batch_size, self.cem_horizon, self.action_dim))
    obs = np.tile(self.current_obs, reps=(batch_size * self.num_particles, 1))
    ac_seqs = np.tile(ac_seqs[:, np.newaxis, :, :],
                      reps=(1, self.num_particles, 1, 1))
    ac_seqs = ac_seqs.reshape(
        (batch_size * self.num_particles, self.cem_horizon, self.action_dim))
    observations, rewards = self.model.unroll(obs, ac_seqs, self.sampling_strategy)
    rewards = np_ify(rewards).reshape(
        (batch_size, self.num_particles, self.cem_horizon))
    # sum over time, average over particles
    # TODO (maybe): add discounting
    return -rewards.sum(axis=2).mean(axis=1)
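# A minimal sketch of how a cross-entropy-method (CEM) planner could consume a cost
# function shaped like _cost_function above (flat action sequences in, one cost per
# sequence out). The horizon, action_dim, population size, elite count, and the dummy
# cost function are illustrative assumptions, not the optimizer actually used here.
import numpy as np


def cem_plan(cost_fn, horizon=5, action_dim=2, pop_size=64, n_elites=8, n_iters=5):
    mean = np.zeros(horizon * action_dim)
    std = np.ones(horizon * action_dim)
    for _ in range(n_iters):
        samples = mean + std * np.random.randn(pop_size, horizon * action_dim)
        costs = cost_fn(samples)  # lower cost = higher predicted return
        elites = samples[np.argsort(costs)[:n_elites]]
        mean, std = elites.mean(axis=0), elites.std(axis=0) + 1e-6
    return mean.reshape(horizon, action_dim)[0]  # first action of the refined plan


if __name__ == "__main__":
    dummy_cost = lambda acs: np.linalg.norm(acs, axis=1)  # stand-in for self._cost_function
    print(cem_plan(dummy_cost))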
def get_actions(self, obs, deterministic=False):
    outputs = self.forward(obs, deterministic=deterministic)[0]
    return np_ify(outputs)
def get_actions(self, obs):
    outputs = self.forward(obs)[0]
    return np_ify(outputs)
def get_actions(self, obs, deterministic=False):
    obs = torch.unsqueeze(obs, 0)
    outputs = self.forward(obs, deterministic=deterministic)[0]
    return np_ify(outputs)
def train_from_torch(self, batch):
    rewards = batch['rewards'] * self.reward_scale
    terminals = batch['terminals']
    obs = batch['observations']
    actions = batch['actions']
    next_obs = batch['next_observations']
    if self.prioritized_replay:
        indices = batch['indices']
        importance_weights = batch['importance_weights']

    """
    Compute loss
    """
    if self.double_dqn:
        best_action_idxs = self.qf(next_obs).max(1, keepdim=True)[1]
        target_q_values = self.target_qf(next_obs).gather(
            1, best_action_idxs).detach()
    else:
        target_q_values = self.target_qf(next_obs).detach().max(
            1, keepdim=True)[0]
    y_target = rewards + (1. - terminals) * self.discount * target_q_values
    y_target = y_target.detach()
    # actions is a one-hot vector
    y_pred = torch.sum(self.qf(obs) * actions, dim=1, keepdim=True)
    if self.prioritized_replay:
        td_errors = y_pred - y_target
        importance_weights = importance_weights.reshape(y_pred.shape[0], 1)
        qf_loss = self.qf_criterion(importance_weights * y_pred,
                                    importance_weights * y_target)
        td_errors = td_errors.reshape(y_pred.shape[0])
        self.replay_buffer.update_priority(
            np_ify(indices).astype(int), np_ify(td_errors))
    else:
        qf_loss = self.qf_criterion(y_pred, y_target)

    """
    Update the Q-network
    """
    self.qf_optimizer.zero_grad()
    qf_loss.backward()
    if self.clip_gradient > 0.:
        nn.utils.clip_grad_norm_(self.qf.parameters(), self.clip_gradient)
    qf_grad_norm = torch.tensor(0.).to(ptu.device)
    for p in self.qf.parameters():
        param_norm = p.grad.data.norm(2)
        qf_grad_norm += param_norm.item() ** 2
    qf_grad_norm = qf_grad_norm ** (1. / 2)
    self.qf_optimizer.step()

    """
    Soft target network updates
    """
    if self._n_train_steps_total % self.target_update_period == 0:
        ptu.soft_update_from_to(self.qf, self.target_qf, self.soft_target_tau)

    """
    Save some statistics for eval using just one batch.
    """
    if self._need_to_update_eval_statistics:
        self._need_to_update_eval_statistics = False
        self.eval_statistics['QF Loss'] = np.mean(ptu.get_numpy(qf_loss))
        self.eval_statistics.update(
            create_stats_ordered_dict(
                'Y Predictions',
                ptu.get_numpy(y_pred),
            ))
        self.eval_statistics['QF Gradient'] = np.mean(
            ptu.get_numpy(qf_grad_norm))
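# A minimal sketch of the prioritized-replay bookkeeping that the DQN update above
# relies on: priorities proportional to |TD error| and importance weights that correct
# the sampling bias. The class name, alpha/beta values, and capacity are illustrative
# assumptions, not the replay_buffer implementation used by this trainer.
import numpy as np


class TinyPrioritizedBuffer:
    def __init__(self, capacity, alpha=0.6, beta=0.4):
        self.priorities = np.ones(capacity)
        self.alpha, self.beta = alpha, beta

    def sample_indices(self, batch_size):
        p = self.priorities ** self.alpha
        p /= p.sum()
        indices = np.random.choice(len(p), batch_size, p=p)
        weights = (len(p) * p[indices]) ** (-self.beta)
        return indices, weights / weights.max()  # normalized importance weights

    def update_priority(self, indices, td_errors):
        # new priority ~ |TD error|, with a small floor so nothing becomes unsampleable
        self.priorities[indices] = np.abs(td_errors) + 1e-6


if __name__ == "__main__":
    buf = TinyPrioritizedBuffer(capacity=100)
    idx, w = buf.sample_indices(4)
    buf.update_priority(idx, np.random.randn(4))
    print(idx, w)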