# Shared imports for the rollout variants below.
import copy

import numpy as np
import torch


def collect_rollout(self):
    # Roll the policy forward for n_steps across all vectorized envs.
    states, actions = [None] * self.n_steps, [None] * self.n_steps
    rewards, dones, values = np.zeros((3, self.n_steps, self.envs.num_envs))
    for step in range(self.n_steps):
        action, values[step] = self.agent.act(self.state)
        states[step], actions[step] = self.state, action
        self.state, rewards[step], dones[step] = self.envs.step(action)
        self.log(rewards[step], dones[step])
    # Bootstrap value estimate for the state following the final step.
    last_value = self.agent.get_value(self.state)
    # Note: `values` is filled above but not returned by this variant.
    return flatten_lists(states), flatten_lists(actions), rewards, dones, last_value
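
# The `last_value` returned above lets the learner bootstrap the truncated
# rollout. A minimal sketch of one possible consumer (illustrative only:
# `n_step_returns` and `gamma` are assumptions, not part of the source):
def n_step_returns(rewards, dones, last_value, gamma=0.99):
    # rewards, dones: (n_steps, num_envs); last_value: (num_envs,).
    # Walk backwards, cutting the bootstrap wherever an episode ended.
    returns = np.zeros_like(rewards)
    running = last_value
    for t in reversed(range(rewards.shape[0])):
        running = rewards[t] + gamma * (1.0 - dones[t]) * running
        returns[t] = running
    return returns
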
def collect_rollout(self):
    states, actions = [None] * self.n_steps, [None] * self.n_steps
    rewards, dones, values, done_values = np.zeros(
        (4, self.n_steps, self.envs.num_envs))
    for step in range(self.n_steps):
        action, values[step] = self.agent.act(self.state)
        states[step], actions[step] = self.state, action
        # self.state is a list of per-env observations (len == num_envs, 4 here).
        self.state, raw_rewards, dones[step] = self.envs.step(action)
        # Split the raw env rewards into scaled training rewards and
        # auxiliary rewards used here only for logging.
        rewards[step], aux_rewards = self.parse_rewards(raw_rewards, self.scale)
        # Record value estimates at steps where at least one episode ended.
        if sum(dones[step]) > 0:
            done_values[step] = self.agent.get_value(states[step])
        self.log(rewards[step], aux_rewards, dones[step])
    last_value = self.agent.get_value(self.state)
    return (flatten_lists(states), flatten_lists(actions),
            rewards, dones, done_values, last_value)
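
# `parse_rewards` is not shown in the source. A hypothetical sketch of its
# contract, assuming it scales the training reward and passes the raw value
# through as the auxiliary reward for logging:
def parse_rewards(self, raw_rewards, scale):
    # Hypothetical: returns (scaled training rewards, auxiliary rewards).
    raw = np.asarray(raw_rewards)
    return raw * scale, raw
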
def collect_rollout(self):
    # Interact with the envs and collect training data.
    states, actions = [None] * self.n_steps, [None] * self.n_steps
    rewards, dones, values = np.zeros(
        (3, self.n_steps, self.envs.num_envs))
    for step in range(self.n_steps):
        action, values[step] = self.agent.act(self.state)
        states[step], actions[step] = self.state, action
        self.state, rewards[step], dones[step] = self.envs.step(action)
        # dones[step] = (dones[step].astype(int) | dones[step-1].astype(int)).astype(float)  # once finished, an env stays finished
        # Log this step's rewards and dones; each rewards[step] holds
        # envs.num_envs entries.
        self.log(rewards[step], dones[step])
    last_value = self.agent.get_value(self.state)
    return (flatten_lists(states), flatten_lists(actions),
            rewards, dones, last_value, self.ep_rews)
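
# `self.log` and `self.ep_rews` are not shown. A hypothetical sketch of the
# bookkeeping they imply (accumulate per-env returns, flush on done):
def log(self, rewards, dones):
    # Hypothetical: self.running_rews is a (num_envs,) float array and
    # self.ep_rews collects the total rewards of finished episodes.
    self.running_rews += rewards
    for i, done in enumerate(dones):
        if done:
            self.ep_rews.append(self.running_rews[i])
            self.running_rews[i] = 0.0
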
def collect_rollout(self, nb_epi, eval_flag, agent):
    states = [None] * self.n_steps
    options = [None] * self.n_steps
    option_masks = [None] * self.n_steps
    rewards, values = torch.zeros(
        (2, self.n_steps, self.envs.num_envs)).to(self.device)
    dones, prev_dones = np.zeros((2, self.n_steps, self.envs.num_envs))
    for step in range(self.n_steps):
        with torch.no_grad():
            # GRProp takes an extra eval_flag argument at evaluation time.
            if eval_flag and isinstance(agent, GRProp):
                option, value, option_mask = agent.get_option(
                    self.obs, self.last_dones, eval_flag)
            else:
                option, value, option_mask = agent.get_option(
                    self.obs, self.last_dones)
        options[step] = copy.deepcopy(option)
        option_masks[step] = copy.deepcopy(option_mask)

        if self.is_warfare and self.frames >= self.prep_time:
            # Preparation time is up: hand control to the battle phase.
            self.obs, reward, done, frames, self.total_counts = warfare(
                self.envs, self.obs)
        else:
            self.obs, reward, done, frames = agent.execute(
                self.obs, option, self.envs)
            if not done:  # for non-warfare maps
                self.total_counts = copy.deepcopy(self.envs.total_counts[0])

        # MSGI: feed the transition to the subtask-graph inference (ILP) module.
        if self.infer and not eval_flag:
            self.ilp.insert(self.obs, option, reward, done)

        # HRL: store the step as training data.
        if self.train and not eval_flag:
            spatials = self.obs['spatials']
            comps, eligs, masks = self.obs['meta_states']
            steps = self.obs['steps']
            states[step] = [spatials, comps, eligs, masks, steps]
            rewards[step], dones[step], values[step] = reward, done, value

        # Compute records.
        self._compute_records(reward, done, value, frames)

        # Updates & log.
        self.epi_count += done
        self.active = (self.epi_count < nb_epi)
        self.last_dones = done
        self.frames += frames
        if self.active.sum() == 0:
            break  # terminate when all episodes are finished

    if self.active.sum() == 0:  # ignore current samples
        return None
    if self.train and not eval_flag:
        with torch.no_grad():
            last_value = agent.get_value(self.obs).detach()
        # Convert to torch tensors (prev_dones is currently unused downstream).
        prev_dones = torch.from_numpy(prev_dones).float().to(self.device)
        dones = torch.from_numpy(dones).float().to(self.device)
        return (flatten_lists(states), torch.cat(options),
                rewards, dones, self.init_values, last_value)
    return None
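
# All four variants flatten the per-step lists before returning them.
# `flatten_lists` is not shown; a minimal sketch, assuming it concatenates
# the per-step lists of per-env items into one flat list of length
# n_steps * num_envs:
def flatten_lists(list_of_lists):
    return [item for sublist in list_of_lists for item in sublist]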