Code Example #1
File: sup_replay_buffer.py Project: maxiaoba/rlkit
 def add_sample(self, observation, action, label, valid):
     # labels: label_num x label_dim
     self._observations[self._top] = np_ify(observation)
     self._actions[self._top] = np_ify(action)
     self._labels[self._top] = np_ify(label)
     self._valids[self._top] = np_ify(valid)
     self._advance()
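
All of the snippets on this page revolve around rlkit's `np_ify` / `torch_ify` conversion helpers. As a rough reference, the two functions behave approximately like the sketch below; this is a simplified approximation of the helpers in rlkit's torch utilities, not the verbatim source (`ptu` stands for `rlkit.torch.pytorch_util`):

import numpy as np
import torch

import rlkit.torch.pytorch_util as ptu


def torch_ify(np_array_or_other):
    # If given a numpy array, convert it to a torch tensor on ptu.device;
    # anything else (e.g. an existing tensor) is passed through unchanged.
    if isinstance(np_array_or_other, np.ndarray):
        return ptu.from_numpy(np_array_or_other)
    return np_array_or_other


def np_ify(tensor_or_other):
    # If given a torch tensor, detach it and copy it back to a numpy array;
    # anything else is passed through unchanged.
    if isinstance(tensor_or_other, torch.Tensor):
        return ptu.get_numpy(tensor_or_other)
    return tensor_or_other
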
Code Example #2
    def get_action(self, obs, labels=None, deterministic=False):
        assert len(obs.shape) == 1
        assert (self.policy.a_p == self.sup_learner.a_p).all()
        with torch.no_grad():
            obs_action = (torch_ify(obs)[None, None, :],
                          self.policy.a_p[None, None, :])
            if labels is not None:
                labels = torch_ify(labels)[None, None, :]
            pis, info = self.forward(obs_action,
                                     labels=labels,
                                     latent=self.policy.latent_p,
                                     sup_latent=self.sup_learner.latent_p,
                                     return_info=True)
            sup_probs = Categorical(logits=info['sup_preactivation']).probs
        pis = np_ify(pis[0, 0, :])
        sup_probs = np_ify(sup_probs[0, 0, :, :])
        if deterministic:
            action = np.argmax(pis)
        else:
            action = np.random.choice(np.arange(pis.shape[0]), p=pis)
        self.policy.a_p = torch_ify(np.array([action]))
        self.policy.latent_p = info['latent']
        self.sup_learner.a_p = torch_ify(np.array([action]))
        self.sup_learner.latent_p = info['sup_latent']

        return action, {'intentions': sup_probs}
Code Example #3
File: pets.py Project: fusion-ml/rlkit
    def train_from_torch(self, batch):
        rewards = batch['rewards']
        # terminals = batch['terminals']
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']

        # To bootstrap the ensemble, train only one network per batch.
        net_idx = self._n_train_steps_total % len(self.model._nets)
        mean, logvar, predicted_rewards = self.model.forward(
            obs, actions, network_idx=net_idx, return_net_outputs=True)
        # TODO: possibly need to include weight decay
        mean_mse = self.mean_criterion(mean, next_obs)

        model_loss = self.model_criterion(mean, logvar, next_obs)
        bound_loss = self.model.bound_loss()
        if self.reward_criterion:
            reward_loss = self.reward_criterion(predicted_rewards, rewards)
        else:
            reward_loss = 0
        loss = model_loss + bound_loss + reward_loss
        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()
        self.model.trained_at_all = True

        if self._need_to_update_eval_statistics:
            self._need_to_update_eval_statistics = False
            self.eval_statistics['Model Loss'] = np_ify(model_loss)
            self.eval_statistics['Bound Loss'] = np_ify(bound_loss)
            self.eval_statistics['Reward Loss'] = np_ify(reward_loss)
            self.eval_statistics['Model MSE'] = np_ify(mean_mse)
            self.eval_statistics['Loss'] = np_ify(loss)
        self._n_train_steps_total += 1
Code Example #4
 def get_action(self, obs, deterministic=False):
     assert len(obs.shape) == 1
     with torch.no_grad():
         obs_action = (torch_ify(obs)[None, None, :], self.a_p[None, None, :])
         pis, info = self.forward(obs_action, latent=self.latent_p, return_info=True)
         sup_probs = self.sup_prob(obs_action, latent=self.latent_p)
     pis = np_ify(pis[0, 0, :])
     sup_probs = np_ify(sup_probs[0, 0, :, :])
     if deterministic:
         action = np.argmax(pis)
     else:
         action = np.random.choice(np.arange(pis.shape[0]), p=pis)
     self.a_p = torch_ify(np.array([action]))
     self.latent_p = info['latent']
     return action, {'intentions': sup_probs}
Code Example #5
    def forward(self, obs, valid_mask=None):
        # Returns:
        #   x: (batch * num_node) x output_dim
        #   edge_index: 2 x num_edges
        #   (messages flow from nodes in edge_index[0] to nodes in edge_index[1])

        batch_size, node_num, obs_dim = obs.shape

        x = torch.zeros(batch_size, self.node_num,
                        self.output_dim).to(ptu.device)
        x[:, :, :self.input_dim] = obs
        x[:, 0, self.input_dim:] = self.ego_init[None, :]
        x[:, 1:, self.input_dim:] = self.other_init[None, None, :]
        x = x.reshape(int(batch_size * self.node_num), self.output_dim)

        # xs = obs[:,:,0]
        # ys = obs[:,:,1]
        # upper_indices = torch.where(ys > 4.)
        # lower_indices = torch.where((ys > 0.) and (ys <= 4.))
        obs = np_ify(obs)
        edge_index = get_edge_index(obs)  # batch x 2 x max_edge_num
        edge_index = np.swapaxes(edge_index, 0, 1).reshape(2, -1)
        edge_index = np.unique(edge_index, axis=1)
        edge_index = torch_ify(edge_index).long()
        edge_index = pyg_utils.remove_self_loops(edge_index)[0]

        return x, edge_index
Code Example #6
    def _start_new_rollout(self):
        self.exploration_policy.reset()
        # Note: we assume we're using a silent env.
        o = self.training_env.reset()

        rgp = self.rollout_goal_params
        if rgp is None:
            self._rollout_goal = o[self.desired_goal_key]
        elif rgp["strategy"] == "ensemble_qs":
            exploration_temperature = rgp["exploration_temperature"]
            assert len(self.ensemble_qs) > 0
            N = 128
            obs = np.tile(o[self.observation_key], (N, 1))
            proposed_goals = self.training_env.sample_goals(N)[
                self.desired_goal_key]
            new_obs = np.hstack((obs, proposed_goals))
            actions = torch_ify(self.policy.get_action(new_obs)[0])
            q_values = np.zeros((len(self.ensemble_qs), N))
            for i, q in enumerate(self.ensemble_qs):
                q_values[i, :] = np_ify(q(torch_ify(new_obs),
                                          actions)).flatten()
            q_std = q_values.std(axis=0)
            p = softmax(q_std / exploration_temperature)
            ind = np.random.choice(np.arange(N), p=p)
            self._rollout_goal = {}
            self._rollout_goal[self.desired_goal_key] = proposed_goals[ind, :]
        elif rgp["strategy"] == "vae_q":
            pass
        else:
            assert False, "bad rollout goal strategy"

        return o
Code Example #7
 def beta_eval(goals):
     # goals = np.array([[
     #     *goal
     # ]])
     N = len(goals)
     observations = np.tile(obs, (N, 1))
     new_obs = np.hstack((observations, goals))
     actions = torch_ify(policy.get_action(new_obs)[0])
     return np_ify(q(torch_ify(new_obs), actions)).flatten()
Code Example #8
File: ppo_path_collector.py Project: naruya/DIAYN
    def add_advantages(self, path, path_len, flag):
        if flag:
            next_vf = self.vf(torch_ify(path["next_observations"]))
            cur_vf = self.vf(torch_ify(path["observations"]))
            rewards = torch_ify(path["rewards"])
            term = (1 - torch_ify(path["terminals"].astype(np.float32)))
            delta = rewards + term * self.discount * next_vf - cur_vf
            advantages = torch.zeros(path_len)
            returns = torch.zeros(path_len)
            gae = 0
            R = 0

            # Compute GAE advantages and discounted returns backwards in time.
            for i in reversed(range(path_len)):
                advantages[i] = delta[i] + term[i] * (self.discount *
                                                      self.gae_lambda) * gae
                gae = advantages[i]

                returns[i] = rewards[i] + term[i] * self.discount * R
                R = returns[i]

            advantages = np_ify(advantages)
            if advantages.std() != 0.0:
                advantages = (advantages -
                              advantages.mean()) / advantages.std()
            else:
                advantages = (advantages - advantages.mean())

            returns = np_ify(returns)
        else:
            advantages = np.zeros(path_len)
            returns = np.zeros(path_len)
        return dict(observations=path["observations"],
                    actions=path["actions"],
                    rewards=path["rewards"],
                    next_observations=path["next_observations"],
                    terminals=path["terminals"],
                    agent_infos=path["agent_infos"],
                    env_infos=path["env_infos"],
                    advantages=advantages,
                    returns=returns)
Code Example #9
File: MPC.py Project: fusion-ml/rlkit
    def _cost_function(self, ac_seqs):
        '''
        Map a batch of action sequences to costs, using either the model or the
        given cost function. TODO: add the sampling strategies from the PETS paper.

        ac_seqs: batch_size x (cem_horizon * action_dim)
        Requires self.current_obs to be set accurately.
        '''
        batch_size = ac_seqs.shape[0]
        ac_seqs = ac_seqs.reshape(
            (batch_size, self.cem_horizon, self.action_dim))
        obs = np.tile(self.current_obs,
                      reps=(batch_size * self.num_particles, 1))
        ac_seqs = np.tile(ac_seqs[:, np.newaxis, :, :],
                          reps=(1, self.num_particles, 1, 1))
        ac_seqs = ac_seqs.reshape((batch_size * self.num_particles,
                                   self.cem_horizon, self.action_dim))
        observations, rewards = self.model.unroll(obs, ac_seqs,
                                                  self.sampling_strategy)
        rewards = np_ify(rewards).reshape(
            (batch_size, self.num_particles, self.cem_horizon))
        # sum over time, average over particles
        # TODO (maybe): add discounting
        return -rewards.sum(axis=2).mean(axis=1)
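
For context, `_cost_function` is shaped to be consumed by a planner that optimizes over flattened action sequences, such as the cross-entropy method (CEM) used in PETS-style MPC. The sketch below is a generic, hypothetical CEM loop written to match that interface; it is not taken from the fusion-ml/rlkit MPC code, and the names `cem_plan`, `pop_size`, `n_elites`, and `n_iters` are assumptions for illustration:

import numpy as np


def cem_plan(cost_function, action_dim, cem_horizon,
             pop_size=400, n_elites=40, n_iters=5):
    # Iteratively refit a diagonal Gaussian over flattened action sequences
    # to the lowest-cost samples returned by cost_function.
    dim = cem_horizon * action_dim
    mean, std = np.zeros(dim), np.ones(dim)
    for _ in range(n_iters):
        samples = mean + std * np.random.randn(pop_size, dim)
        costs = cost_function(samples)  # shape: (pop_size,)
        elites = samples[np.argsort(costs)[:n_elites]]
        mean, std = elites.mean(axis=0), elites.std(axis=0) + 1e-6
    # Typically only the first action of the returned plan is executed
    # before re-planning (receding horizon control).
    return mean.reshape(cem_horizon, action_dim)
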
Code Example #10
 def get_actions(self, obs, deterministic=False):
     outputs = self.forward(obs, deterministic=deterministic)[0]
     return np_ify(outputs)
Code Example #11
 def get_actions(self, obs):
     outputs = self.forward(obs)[0]
     return np_ify(outputs)
Code Example #12
 def get_actions(self, obs, deterministic=False):
     obs = torch.unsqueeze(obs, 0)
     outputs = self.forward(obs, deterministic=deterministic)[0]
     return np_ify(outputs)
Code Example #13
 def train_from_torch(self, batch):
     rewards = batch['rewards'] * self.reward_scale
     terminals = batch['terminals']
     obs = batch['observations']
     actions = batch['actions']
     next_obs = batch['next_observations']
     if self.prioritized_replay:
         indices = batch['indices']
         importance_weights = batch['importance_weights']
     """
     Compute loss
     """
     if self.double_dqn:
         best_action_idxs = self.qf(next_obs).max(1, keepdim=True)[1]
         target_q_values = self.target_qf(next_obs).gather(
             1, best_action_idxs).detach()
     else:
         target_q_values = self.target_qf(next_obs).detach().max(
             1, keepdim=True)[0]
     y_target = rewards + (1. - terminals) * self.discount * target_q_values
     y_target = y_target.detach()
     # actions is a one-hot vector
     y_pred = torch.sum(self.qf(obs) * actions, dim=1, keepdim=True)
     if self.prioritized_replay:
         td_errors = y_pred - y_target
         importance_weights = importance_weights.reshape(y_pred.shape[0], 1)
         # print(torch.max(importance_weights),torch.min(importance_weights),torch.mean(importance_weights))
         qf_loss = self.qf_criterion(importance_weights * y_pred,
                                     importance_weights * y_target)
         td_errors = td_errors.reshape(y_pred.shape[0])
         self.replay_buffer.update_priority(
             np_ify(indices).astype(int), np_ify(td_errors))
     else:
         qf_loss = self.qf_criterion(y_pred, y_target)
     """
     Soft target network updates
     """
     self.qf_optimizer.zero_grad()
     qf_loss.backward()
     if self.clip_gradient > 0.:
         nn.utils.clip_grad_norm_(self.qf.parameters(), self.clip_gradient)
     qf_grad_norm = torch.tensor(0.).to(ptu.device)
     for p in self.qf.parameters():
         param_norm = p.grad.data.norm(2)
         qf_grad_norm += param_norm.item()**2
     qf_grad_norm = qf_grad_norm**(1. / 2)
     self.qf_optimizer.step()
     """
     Soft Updates
     """
     if self._n_train_steps_total % self.target_update_period == 0:
         ptu.soft_update_from_to(self.qf, self.target_qf,
                                 self.soft_target_tau)
     """
     Save some statistics for eval using just one batch.
     """
     if self._need_to_update_eval_statistics:
         self._need_to_update_eval_statistics = False
         self.eval_statistics['QF Loss'] = np.mean(ptu.get_numpy(qf_loss))
         self.eval_statistics.update(
             create_stats_ordered_dict(
                 'Y Predictions',
                 ptu.get_numpy(y_pred),
             ))
         self.eval_statistics['QF Gradient'] = np.mean(
             ptu.get_numpy(qf_grad_norm))