def compute_actions(self,
                    obs_batch,
                    state_batches=None,
                    prev_action_batch=None,
                    prev_reward_batch=None,
                    info_batch=None,
                    episodes=None,
                    **kwargs):
    """Select actions epsilon-greedily from Q-values, honoring the action mask."""
    obs_batch, action_mask = self._unpack_observation(obs_batch)
    # Forward pass only; no gradients are needed for action selection.
    with th.no_grad():
        states = [th.from_numpy(np.array(s)) for s in state_batches]
        q_values, hiddens = _mac(self.model, th.from_numpy(obs_batch), states)
        mask = th.from_numpy(action_mask).float()
        # Forbid unavailable actions by pushing their Q-values to -inf,
        # so the greedy pick can never select them.
        q_masked = q_values.clone()
        q_masked[mask == 0.0] = -float("inf")
        greedy = q_masked.max(dim=2)[1]
        # Epsilon-greedy: with prob. cur_epsilon sample among the actions
        # allowed by the mask, otherwise take the greedy action.
        explore = (th.rand_like(q_values[:, :, 0]) < self.cur_epsilon).long()
        sampled = Categorical(mask).sample().long()
        chosen = explore * sampled + (1 - explore) * greedy
        out_actions = chosen.numpy()
        out_hiddens = [h.numpy() for h in hiddens]
    return TupleActions(list(out_actions.transpose([1, 0]))), out_hiddens, {}
def sample(self):
    """Draw the action tuple autoregressively: a2 is conditioned on a1."""
    dist_a1 = self._a1_distribution()
    first = dist_a1.sample()
    dist_a2 = self._a2_distribution(first)
    second = dist_a2.sample()
    # Joint log-prob factorizes as logp(a1) + logp(a2 | a1).
    self._action_logp = dist_a1.logp(first) + dist_a2.logp(second)
    return TupleActions([first, second])
def compute_actions(self,
                    obs_batch,
                    state_batches=None,
                    prev_action_batch=None,
                    prev_reward_batch=None,
                    info_batch=None,
                    episodes=None,
                    **kwargs):
    """Epsilon-greedy action selection over masked Q-values on self.device."""
    # The third unpacked element (env global state) is deliberately dropped:
    # actions must not be computed from the global state.
    obs_batch, action_mask, _ = self._unpack_observation(obs_batch)
    # Forward pass only; no gradients are needed for action selection.
    with th.no_grad():
        obs_t = th.as_tensor(obs_batch, dtype=th.float, device=self.device)
        state_t = [
            th.as_tensor(np.array(s), dtype=th.float, device=self.device)
            for s in state_batches
        ]
        q_values, hiddens = _mac(self.model, obs_t, state_t)
        mask = th.as_tensor(action_mask, dtype=th.float, device=self.device)
        # Forbid unavailable actions by pushing their Q-values to -inf,
        # so argmax can never select them.
        q_masked = q_values.clone()
        q_masked[mask == 0.0] = -float("inf")
        greedy = q_masked.argmax(dim=2)
        # Epsilon-greedy: with prob. cur_epsilon sample among the actions
        # allowed by the mask, otherwise take the greedy action.
        explore = (th.rand_like(q_values[:, :, 0]) < self.cur_epsilon).long()
        sampled = Categorical(mask).sample().long()
        chosen = explore * sampled + (1 - explore) * greedy
        out_actions = chosen.cpu().numpy()
        out_hiddens = [h.cpu().numpy() for h in hiddens]
    return TupleActions(list(out_actions.transpose([1, 0]))), out_hiddens, {}
def sample(self):
    """Draw one sample from every child distribution, bundled as a tuple action."""
    draws = [dist.sample() for dist in self.child_distributions]
    return TupleActions(draws)
def sample(self):
    """Sample from each sub-distribution and cache the joint log-prob.

    Returns a TupleActions wrapping one sample per distribution in
    self._distributions; the joint log-probability of the drawn tuple is
    stored in self._last_sample_logp for later retrieval.
    """
    samples = [d.sample() for d in self._distributions]
    # Cache the log-prob of exactly this sample so callers can query it
    # without re-evaluating the distributions.
    self._last_sample_logp = self._logp(samples)
    return TupleActions(samples)