Example #1
    def select_action(self, obs):
        if self.is_continuous:
            if self._share_net:
                mu, log_std, value = self.net(obs, rnncs=self.rnncs)  # [B, A]
                self.rnncs_ = self.net.get_rnncs()
            else:
                mu, log_std = self.actor(obs, rnncs=self.rnncs)  # [B, A]
                self.rnncs_ = self.actor.get_rnncs()
                value = self.critic(obs, rnncs=self.rnncs)  # [B, 1]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            action = dist.sample().clamp(-1, 1)  # [B, A]
            log_prob = dist.log_prob(action).unsqueeze(-1)  # [B, 1]
        else:
            if self._share_net:
                logits, value = self.net(obs, rnncs=self.rnncs)  # [B, A], [B, 1]
                self.rnncs_ = self.net.get_rnncs()
            else:
                logits = self.actor(obs, rnncs=self.rnncs)  # [B, A]
                self.rnncs_ = self.actor.get_rnncs()
                value = self.critic(obs, rnncs=self.rnncs)  # [B, 1]
            norm_dist = td.Categorical(logits=logits)
            action = norm_dist.sample()  # [B,]
            log_prob = norm_dist.log_prob(action).unsqueeze(-1)  # [B, 1]

        acts_info = Data(action=action,
                         value=value,
                         log_prob=log_prob + th.finfo().eps)
        if self.use_rnn:
            acts_info.update(rnncs=self.rnncs)
        return action, acts_info
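
The shared trick in both branches is how the log-probability shape is produced: wrapping the Normal in `td.Independent(..., 1)` treats the action dimension as part of the event, so `log_prob` sums over it and returns one value per batch row. A minimal standalone sketch (batch size and action dimension are illustrative, not taken from the example):

import torch as th
import torch.distributions as td

B, A = 4, 3  # illustrative batch size and action dimension
mu, log_std = th.zeros(B, A), th.zeros(B, A)

# Independent(..., 1) reinterprets the last dim as event dims, so log_prob
# sums over the A action dimensions and returns one value per batch row.
dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
action = dist.sample()               # [B, A]
print(dist.log_prob(action).shape)   # torch.Size([4]); unsqueeze(-1) then gives [B, 1]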
Example #2
    def select_action(self, obs):
        output = self.actor(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.actor.get_rnncs()
        if self.is_continuous:
            mu, log_std = output  # [B, A]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            action = dist.sample().clamp(-1, 1)  # [B, A]
        else:
            logits = output  # [B, A]
            norm_dist = td.Categorical(logits=logits)
            action = norm_dist.sample()  # [B,]

        acts_info = Data(action=action)
        if self.use_rnn:
            acts_info.update(rnncs=self.rnncs)
        return action, acts_info
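
Example #2 stores only the sampled action (no value or log-prob), which suggests a setting where those quantities are recomputed later rather than cached at acting time. The `Data` container is used like an attribute dict throughout these examples; a purely illustrative stand-in, not the library's actual implementation, could look like this:

class Data(dict):
    """Illustrative attribute-dict stand-in; the library's real Data class may differ."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError as e:
            raise AttributeError(name) from e


acts_info = Data(action=0)
acts_info.update(rnncs=None)
print(acts_info.action, acts_info.rnncs)  # 0 None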
Example #3
    def episode_step(self, obs: Data, env_rets: Data, begin_mask: np.ndarray):
        super().episode_step()
        if self._store:
            exps = Data(
                obs=obs,
                # [B, ] => [B, 1]
                reward=env_rets.reward[:, np.newaxis],
                obs_=env_rets.obs_fs,
                done=env_rets.done[:, np.newaxis],
                begin_mask=begin_mask)
            exps.update(self._acts_info)
            self._buffer.add({self._agent_id: exps})

        idxs = np.where(env_rets.done)[0]
        self._pre_act[idxs] = 0.
        self.rnncs = self.rnncs_
        if self.rnncs is not None:
            for k in self.rnncs.keys():
                self.rnncs[k][idxs] = 0.
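
The last block carries the recurrent state across steps but zeroes it for the environment copies that just finished, so the next episode starts from a fresh hidden state. A small NumPy sketch of that masking (shapes are illustrative):

import numpy as np

n_copies, hidden = 4, 8
rnncs = {'hx': np.random.randn(n_copies, hidden)}
done = np.array([False, True, False, True])

idxs = np.where(done)[0]      # copies whose episode just ended
for k in rnncs.keys():
    rnncs[k][idxs] = 0.       # reset only those rows
print(rnncs['hx'][1].sum(), rnncs['hx'][3].sum())  # 0.0 0.0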
Example #4
    def select_action(self, obs):
        # [B, P], [B, P, A], [B, P]
        (q, pi, beta) = self.net(obs, rnncs=self.rnncs)
        self.rnncs_ = self.net.get_rnncs()
        options_onehot = F.one_hot(self.options,
                                   self.options_num).float()  # [B, P]
        options_onehot_expanded = options_onehot.unsqueeze(-1)  # [B, P, 1]
        pi = (pi * options_onehot_expanded).sum(-2)  # [B, A]
        if self.is_continuous:
            mu = pi  # [B, A]
            log_std = self.log_std[self.options]  # [B, A]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            action = dist.sample().clamp(-1, 1)  # [B, A]
            log_prob = dist.log_prob(action).unsqueeze(-1)  # [B, 1]
        else:
            logits = pi  # [B, A]
            norm_dist = td.Categorical(logits=logits)
            action = norm_dist.sample()  # [B,]
            log_prob = norm_dist.log_prob(action).unsqueeze(-1)  # [B, 1]
        value = q_o = (q * options_onehot).sum(-1, keepdim=True)  # [B, 1]
        beta_adv = q_o - ((1 - self.eps) * q.max(-1, keepdim=True)[0] +
                          self.eps * q.mean(-1, keepdim=True))  # [B, 1]
        max_options = q.argmax(-1)  # [B, P] => [B, ]
        beta_probs = (beta * options_onehot).sum(-1)  # [B, P] => [B,]
        beta_dist = td.Bernoulli(probs=beta_probs)
        # sample < 1: keep the current option; sample == 1: switch option
        new_options = th.where(beta_dist.sample() < 1, self.options,
                               max_options)
        self.new_options = th.where(self._done_mask, max_options, new_options)
        self.oc_mask = (self.new_options == self.options).float()
        acts_info = Data(
            action=action,
            value=value,
            log_prob=log_prob + th.finfo().eps,
            beta_advantage=beta_adv + self.dc,
            last_options=self.options,
            options=self.new_options,
            reward_offset=-((1 - self.oc_mask) * self.dc).unsqueeze(-1))
        if self.use_rnn:
            acts_info.update(rnncs=self.rnncs)
        return action, acts_info
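
The option-critic example repeatedly uses a one-hot mask to pick, for each batch row, the head belonging to that sample's current option. That is equivalent to a gather along the option axis; a small check (sizes are illustrative):

import torch as th
import torch.nn.functional as F

B, P, A = 2, 3, 4                      # illustrative batch, option, action dims
pi = th.randn(B, P, A)
options = th.tensor([1, 2])

onehot = F.one_hot(options, P).float()               # [B, P]
picked = (pi * onehot.unsqueeze(-1)).sum(-2)         # [B, A] via one-hot mask
gathered = pi[th.arange(B), options]                 # [B, A] via direct indexing
print(th.allclose(picked, gathered))                 # True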
Example #5
    def select_action(self, obs):
        output = self.actor(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.actor.get_rnncs()
        value = self.critic(obs, rnncs=self.rnncs)  # [B, 1]
        if self.is_continuous:
            mu, log_std = output  # [B, A]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            action = dist.sample().clamp(-1, 1)  # [B, A]
            log_prob = dist.log_prob(action).unsqueeze(-1)  # [B, 1]
        else:
            logits = output  # [B, A]
            logp_all = logits.log_softmax(-1)  # [B, A]
            norm_dist = td.Categorical(logits=logp_all)
            action = norm_dist.sample()  # [B,]
            log_prob = norm_dist.log_prob(action).unsqueeze(-1)  # [B, 1]
        acts_info = Data(action=action,
                         value=value,
                         log_prob=log_prob + th.finfo().eps)
        if self.use_rnn:
            acts_info.update(rnncs=self.rnncs)
        if self.is_continuous:
            acts_info.update(mu=mu, log_std=log_std)
        else:
            acts_info.update(logp_all=logp_all)
        return action, acts_info
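
In the discrete branch, the logits are passed through `log_softmax` before the Categorical is built. Since `td.Categorical(logits=...)` normalizes its logits internally, the resulting distribution is the same either way; `logp_all` is presumably computed so the full log-probability vector can be stored in `acts_info`. A quick check:

import torch as th
import torch.distributions as td

logits = th.randn(5, 3)
d_raw = td.Categorical(logits=logits)                     # raw logits
d_norm = td.Categorical(logits=logits.log_softmax(-1))    # pre-normalized logits
print(th.allclose(d_raw.probs, d_norm.probs))             # True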
Example #6
    def get_obs(self, behavior_names=None, only_obs=False):
        """
        Parse the environment feedback and split it into four parts: vector observations, visual observations, reward, and the done signal.
        """
        behavior_names = behavior_names or self.behavior_names

        whole_done = np.full(self._n_copies, False)
        whole_info_max_step = np.full(self._n_copies, False)
        all_obs_fa, all_obs_fs = {}, {}
        all_reward = {}

        for bn in behavior_names:
            ps = []

            # TODO: optimize
            while True:
                ds, ts = self.env.get_steps(bn)
                if len(ts):
                    ps.append(ts)
                if len(ds) == self._n_copies:
                    break
                elif len(ds) == 0:
                    self.env.step()  # some environments are done, but others are not
                else:
                    raise ValueError(
                        f'agents number error. Expected 0 or {self._n_copies}, received {len(ds)}'
                    )

            obs_fs, reward = ds.obs, ds.reward
            obs_fa = deepcopy(obs_fs)
            done = np.full(self._n_copies, False)
            begin_mask = np.full(self._n_copies, False)
            info_max_step = np.full(self._n_copies, False)
            info_real_done = np.full(self._n_copies, False)

            for ts in ps:  # TODO: optimize
                _ids = ts.agent_id
                reward[_ids] = ts.reward
                info_max_step[_ids] = ts.interrupted  # terminated because the episode hit its max step count
                # drop dones caused by max_step; only record dones caused by success/failure
                info_real_done[_ids[~ts.interrupted]] = True
                done[_ids] = True
                begin_mask[_ids] = True
                # zip: vector, visual, ...
                for _obs, _tobs in zip(obs_fa, ts.obs):
                    _obs[_ids] = _tobs

            if self._real_done:
                done = np.array(info_real_done)

            _obs_fa = Data()
            _obs_fs = Data()
            if len(self._vector_idxs[bn]) > 0:
                _obs_fa.update(
                    vector={
                        f'vector_{i}': obs_fa[vi]
                        for i, vi in enumerate(self._vector_idxs[bn])
                    })
                _obs_fs.update(
                    vector={
                        f'vector_{i}': obs_fs[vi]
                        for i, vi in enumerate(self._vector_idxs[bn])
                    })

            if len(self._visual_idxs[bn]) > 0:
                _obs_fa.update(
                    visual={
                        f'visual_{i}': obs_fa[vi]
                        for i, vi in enumerate(self._visual_idxs[bn])
                    })
                _obs_fs.update(
                    visual={
                        f'visual_{i}': obs_fs[vi]
                        for i, vi in enumerate(self._visual_idxs[bn])
                    })

            all_obs_fa[bn] = _obs_fa
            all_obs_fs[bn] = _obs_fs
            all_reward[bn] = reward

        whole_done = np.logical_or(whole_done, done)
        whole_info_max_step = np.logical_or(whole_info_max_step, info_max_step)

        if only_obs:
            all_obs_fa.update({
                'global':
                Data(begin_mask=np.full((self._n_copies, 1), True))
            })
            return all_obs_fa
        else:
            rets = {}
            for bn in self.behavior_names:
                rets[bn] = Data(obs_fa=all_obs_fa[bn],
                                obs_fs=all_obs_fs[bn],
                                reward=all_reward[bn],
                                done=whole_done,
                                info=dict(max_step=whole_info_max_step))
            rets.update({'global':
                         Data(begin_mask=begin_mask[:, np.newaxis])})  # [B, 1]
            return rets
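
get_obs keeps two notions of termination apart: `ts.interrupted` (the episode hit its step limit, surfaced as `info.max_step`) and `info_real_done` (the task actually ended, used when `self._real_done` is set). The usual reason for the distinction, sketched below as a generic illustration rather than code from this library, is that a time-limit cut-off should still bootstrap from the next state's value:

import numpy as np

gamma = 0.99
reward = np.array([1.0, 1.0])
v_next = np.array([5.0, 5.0])
real_done = np.array([True, False])  # task really ended vs. hit max_step only

# Bootstrap unless the episode really terminated; a max_step cut-off keeps
# the next-state value in the target.
target = reward + gamma * (1.0 - real_done.astype(np.float32)) * v_next
print(target)  # [1.   5.95]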