def model_forward(self, obs, batch_opt=False): """obs is dict. values of obs must in numpy, and first dim is batch dim""" #就算只有一个环境,返回的状态也会是1x2048,不需要unsqueeze model_input = obs.copy() #防止obs被改变,因为obs在外部还被保存了一次 for k in model_input: model_input[k] = toFloatTensor(model_input[k], self.gpu_id) out = self.model.forward(model_input) return out
def model_forward(self, obs, batch_opt=False): """obs is dict. values of obs must in numpy, and first dim is batch dim""" #TODO 需要unsqueeze,或者重新封装一下单环境 model_input = obs.copy() #防止obs被改变,因为obs在外部还被保存了一次 for k in model_input: model_input[k] = toFloatTensor(model_input[k], self.gpu_id) if not batch_opt: model_input[k].unsqueeze_(0) out = self.model.forward(model_input) return out
def model_forward(self, obs, batch_opt=False, params=None):
    model_input = obs.copy()
    for k in model_input:
        model_input[k] = toFloatTensor(model_input[k], self.gpu_id)
        if not batch_opt:
            model_input[k].unsqueeze_(0)
    model_input['hidden'] = self.hidden
    model_input['action_probs'] = self.probs
    out = self.model.forward(model_input, params)
    out['prob'] = F.softmax(out['policy'], dim=1)
    out['log_prob'] = F.log_softmax(out['policy'], dim=1)
    out['entropy'] = (-out['log_prob'] * out['prob']).sum(1)
    return out
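# A self-contained check of the softmax / log-softmax / entropy step used above,
# with a made-up 1 x 4 policy tensor (values chosen only for illustration):
import torch
import torch.nn.functional as F

policy = torch.tensor([[2.0, 0.5, -1.0, 0.0]])   # shape: batch x num_actions
prob = F.softmax(policy, dim=1)                  # action distribution, each row sums to 1
log_prob = F.log_softmax(policy, dim=1)          # numerically stable log of the same distribution
entropy = (-log_prob * prob).sum(1)              # per-sample policy entropy
print(prob.sum(1), entropy)                      # tensor([1.]) and a positive scalar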
def model_forward(self, obs, batch_opt=False): """obs is dict. values of obs must in numpy, and first dim is batch dim""" #就算只有一个环境,返回的状态也会是1x2048,不需要unsqueeze model_input = obs.copy() #防止obs被改变,因为obs在外部还被保存了一次 if batch_opt: model_input['hidden'] = (torch.cat(self.hidden_batch[0][:-1]), torch.cat(self.hidden_batch[1][:-1])) model_input['action_probs'] = torch.cat(self.probs_batch[:-1]) else: model_input['hidden'] = (self.hidden_batch[0][-1], self.hidden_batch[1][-1]) model_input['action_probs'] = self.probs_batch[-1] for k in model_input: model_input[k] = toFloatTensor(model_input[k], self.gpu_id) out = self.model.forward(model_input) return out
def get_pi_v(self, env_state):
    tmp = env_state[list(env_state)[0]]
    target_reper = copy.deepcopy(self.target_reper)
    num_s = 1
    if isinstance(tmp, list):
        # A list of states was passed: tile the target representation across the batch.
        num_s = len(tmp)
        target_reper = {
            k: np.expand_dims(v, 0).repeat(num_s, 0)
            for k, v in self.target_reper.items()
        }
    model_input = {}
    model_input.update(env_state)
    model_input.update(target_reper)
    for k in model_input:
        model_input[k] = toFloatTensor(model_input[k], self.gpu_id)
        model_input[k].squeeze_()
        if num_s == 1:
            model_input[k].unsqueeze_(0)
    out = self.model.forward(model_input)
    return out['policy'], out['value']
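# How get_pi_v tiles the target representation across a batch of states, shown with
# a made-up 2048-dim feature and a batch of 3 states (shapes are illustrative only):
import numpy as np

target_feat = np.zeros(2048, dtype=np.float32)            # single target representation
num_s = 3                                                  # number of states in env_state
batched = np.expand_dims(target_feat, 0).repeat(num_s, 0)
print(batched.shape)                                       # (3, 2048): one copy per state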
def model_forward(self, obs, batch_opt=False):
    model_input = obs.copy()
    for k in model_input:
        model_input[k] = toFloatTensor(model_input[k], self.gpu_id)
        if not batch_opt:
            model_input[k].unsqueeze_(0)
    if batch_opt:
        model_input['hidden'] = (
            self.hidden_batch[0][:-1],
            self.hidden_batch[1][:-1],
        )
        model_input['action_probs'] = self.probs_batch[:-1]
    else:
        model_input['hidden'] = (
            self.hidden_batch[0][-1:],
            self.hidden_batch[1][-1:],
        )
        model_input['action_probs'] = self.probs_batch[-1:]
    out = self.model.forward(model_input)
    return out
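# The last variant slices self.hidden_batch / self.probs_batch with [-1:] for a single
# step and [:-1] for a whole rollout, which implies per-step buffers of LSTM states and
# action probabilities stored as tensors. A hedged sketch of how such buffers might be
# maintained; the class name, shapes, and update method are assumptions, not the
# project's actual code:
import torch

class RolloutBuffers:
    def __init__(self, hidden_size, num_actions):
        h0 = torch.zeros(1, hidden_size)
        c0 = torch.zeros(1, hidden_size)
        p0 = torch.zeros(1, num_actions)
        # Row i holds step i; the last row is the state fed into the next step.
        self.hidden_batch = (h0, c0)
        self.probs_batch = p0

    def append_step(self, hx, cx, probs):
        # Concatenate the newest step so [-1:] selects it and [:-1] selects the history.
        self.hidden_batch = (
            torch.cat([self.hidden_batch[0], hx]),
            torch.cat([self.hidden_batch[1], cx]),
        )
        self.probs_batch = torch.cat([self.probs_batch, probs])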