def compute_actions(self, obs_batch, state_batches=None, prev_action_batch=None, prev_reward_batch=None, info_batch=None, episodes=None, **kwargs):
    """Compute joint actions for a batch of multi-agent observations.

    The flat observation batch is unpacked into per-agent observations plus a
    per-agent action-availability mask; actions are chosen epsilon-greedily
    over the mask-restricted Q-values.

    Returns:
        (TupleActions over per-agent action arrays,
         next RNN hidden states transposed back to [agent, batch, dim],
         empty extra-fetches dict)
    """
    # Split the packed observation into per-agent obs and the action mask.
    obs_batch, action_mask = self._unpack_observation(obs_batch)
    assert len(state_batches) == self.n_agents, state_batches
    # Stack per-agent states along axis 1 -> [batch, agent, ...] layout.
    state_batches = np.stack(state_batches, axis=1)

    # Compute actions (inference only — no autograd graph needed).
    with th.no_grad():
        q_values, hiddens = _mac(self.model, th.from_numpy(obs_batch), th.from_numpy(state_batches))
        avail = th.from_numpy(action_mask).float()
        # Exclude unavailable actions from the argmax by forcing them to -inf.
        masked_q_values = q_values.clone()
        masked_q_values[avail == 0.0] = -float("inf")
        # epsilon-greedy action selector: with prob. cur_epsilon sample a
        # random *available* action, otherwise take the masked greedy action.
        random_numbers = th.rand_like(q_values[:, :, 0])
        pick_random = (random_numbers < self.cur_epsilon).long()
        # Categorical over the 0/1 mask samples uniformly among available actions.
        random_actions = Categorical(avail).sample().long()
        actions = (pick_random * random_actions + (1 - pick_random) * masked_q_values.max(dim=2)[1])
        actions = var_to_np(actions)
        hiddens = var_to_np(hiddens)
    # Transpose from [batch, agent] back to per-agent lists / [agent, batch, dim].
    return (TupleActions(list(actions.transpose([1, 0]))), hiddens.transpose([1, 0, 2]), {})
def compute_actions(self, obs, state, is_training=False):
    """Sample actions from the model's categorical output for a batch of obs.

    Returns (sampled actions, empty state list, {"vf_preds": value estimates}).
    """
    assert not state, "RNN not supported"
    with self.lock:
        obs_tensor = torch.from_numpy(np.array(obs)).float()
        logits, values = self._model(obs_tensor)
        # Turn the logits into probabilities and draw one sample per row.
        probs = F.softmax(logits, dim=1)
        samples = probs.multinomial(1).squeeze(0)
        return var_to_np(samples), [], {"vf_preds": var_to_np(values)}
def compute_action(self, ob, *args):
    """Should take in a SINGLE ob"""
    with self.lock:
        # Add a leading batch dimension of size 1 for the model.
        batched_ob = Variable(torch.from_numpy(ob).float().unsqueeze(0))
        logits, values = self._model(batched_ob)
        action = self._model.probs(logits).multinomial().squeeze()
        return var_to_np(action), {"value": var_to_np(values.squeeze(0))}
def compute(self, ob, *args):
    """Should take in a SINGLE ob"""
    with self.lock:
        # Model expects a batch; wrap the single observation as batch size 1.
        batched_ob = Variable(torch.from_numpy(ob).float().unsqueeze(0))
        logits, values = self._model(batched_ob)
        action = self._model.probs(logits).multinomial().squeeze()
        return var_to_np(action), {"vf_preds": var_to_np(values.squeeze(0))}
def compute(self, ob, *args):
    """Should take in a SINGLE ob"""
    with self.lock:
        batched = torch.from_numpy(ob).float().unsqueeze(0)
        logits, values = self._model(batched)
        # TODO(alok): Support non-categorical distributions. Multinomial
        # is only for categorical.
        probs = F.softmax(logits, dim=1)
        sampled_actions = probs.multinomial(1).squeeze()
        return var_to_np(sampled_actions), {"vf_preds": var_to_np(values.squeeze())}
def _value(self, obs):
    """Return the value-branch estimate for a single observation."""
    with self.lock:
        batched = torch.from_numpy(obs).float().unsqueeze(0)
        features = self.model.hidden_layers(batched)
        value = self.model.value_branch(features).squeeze()
        return var_to_np(value)
def value(self, ob, *args):
    """Return the value-branch output for one observation."""
    with self.lock:
        batched = Variable(torch.from_numpy(ob).float().unsqueeze(0))
        out = self._model.value_branch(self._model.hidden_layers(batched))
        return var_to_np(out.squeeze(0))
def value(self, ob, *args):
    """Evaluate the value branch for a single (unbatched) observation."""
    with self.lock:
        v_in = Variable(torch.from_numpy(ob).float().unsqueeze(0))
        hidden = self._model.hidden_layers(v_in)
        v_out = self._model.value_branch(hidden)
        v_out = v_out.squeeze(0)
        return var_to_np(v_out)
def compute_actions(
        self, obs_batch, state_batches=None, is_training=False):
    """Sample actions for a batch of observations (no RNN state supported)."""
    if state_batches:
        raise NotImplementedError("Torch RNN support")
    with self.lock, torch.no_grad():
        obs_tensor = torch.from_numpy(np.array(obs_batch)).float()
        model_out = self._model(obs_tensor)
        # First model output is taken to be the action logits.
        logits = model_out[0]
        actions = F.softmax(logits, dim=1).multinomial(1).squeeze(0)
        return var_to_np(actions), [], self.extra_action_out(model_out)
def compute_gradients(self, postprocessed_batch):
    """Run one backward pass on the batch and return the model gradients."""
    with self.lock:
        # Gather the loss inputs in the order declared by _loss_inputs.
        loss_in = [
            torch.from_numpy(postprocessed_batch[key])
            for key in self._loss_inputs
        ]
        loss_out = self._loss(*loss_in)
        self._optimizer.zero_grad()
        loss_out.backward()
        # Note that return values are just references;
        # calling zero_grad will modify the values
        grads = [var_to_np(p.grad.data) for p in self._model.parameters()]
        return grads, {}
def extra_action_out(self, model_out):
    """Build the extra fetches dict; second model output is the value head."""
    value_out = model_out[1]
    return {"vf_preds": var_to_np(value_out)}
def compute_logits(self, ob, *args):
    """Return the raw action logits for a single observation."""
    with self.lock:
        batched = Variable(torch.from_numpy(ob).float().unsqueeze(0))
        features = self._model.hidden_layers(batched)
        return var_to_np(self._model.logits(features))
def compute_logits(self, ob, *args):
    """Compute the policy logits for one (unbatched) observation."""
    with self.lock:
        model_in = Variable(torch.from_numpy(ob).float().unsqueeze(0))
        hidden = self._model.hidden_layers(model_in)
        logits = self._model.logits(hidden)
        return var_to_np(logits)