Code Example #1
    def collect_rollout(self):
        states, actions = [None]*self.n_steps, [None]*self.n_steps
        rewards, dones, values = np.zeros((3, self.n_steps, self.envs.num_envs))

        for step in range(self.n_steps):
            action, values[step] = self.agent.act(self.state)
            states[step], actions[step] = self.state, action
            self.state, rewards[step], dones[step] = self.envs.step(action)

            self.log(rewards[step], dones[step])

        last_value = self.agent.get_value(self.state)

        return flatten_lists(states), flatten_lists(actions), rewards, dones, last_value
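For reference, below is a minimal sketch (not taken from any of the projects on this page) of how the tuple returned by collect_rollout() is commonly consumed in an A2C-style update: the per-step rewards and dones are combined with last_value to bootstrap n-step returns. The function name compute_returns and the gamma parameter are assumptions for illustration only.

import numpy as np

def compute_returns(rewards, dones, last_value, gamma=0.99):
    # rewards, dones: arrays of shape (n_steps, num_envs); last_value: shape (num_envs,)
    returns = np.zeros_like(rewards)
    running = last_value  # bootstrap from the value estimate of the state after the last step
    for t in reversed(range(rewards.shape[0])):
        # zero out the bootstrap wherever the episode ended at step t
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns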
Code Example #2
File: runner.py Project: MCCCSunny/PMES
    def collect_rollout(self):
        states, actions = [None] * self.n_steps, [None] * self.n_steps
        rewards, dones, values, done_values = np.zeros(
            (4, self.n_steps, self.envs.num_envs))

        for step in range(self.n_steps):
            action, values[step] = self.agent.act(self.state)
            states[step], actions[step] = self.state, action
            self.state, raw_rewards, dones[step] = self.envs.step(
                action)  # self.state:list, len=4,
            rewards[step], aux_rewards = self.parse_rewards(
                raw_rewards, self.scale)
            if sum(dones[step]) > 0:
                done_values[step] = self.agent.get_value(states[step])
            self.log(rewards[step], aux_rewards, dones[step])

        last_value = self.agent.get_value(self.state)
        return flatten_lists(states), flatten_lists(
            actions), rewards, dones, done_values, last_value
Code Example #3
File: runner.py Project: xiaoxingyu/pysc2-rl-agent
    def collect_rollout(self):  # interact with the environments and collect training data
        states, actions = [None] * self.n_steps, [None] * self.n_steps
        rewards, dones, values = np.zeros(
            (3, self.n_steps, self.envs.num_envs))

        for step in range(self.n_steps):
            action, values[step] = self.agent.act(self.state)
            # print('-----', len(action))
            states[step], actions[step] = self.state, action
            self.state, rewards[step], dones[step] = self.envs.step(action)

            #dones[step] = (dones[step].astype(int) | dones[step-1].astype(int)).astype(float) # once an episode is done, it stays done
            self.log(
                rewards[step], dones[step]
            )  # log the reward info for each of the n_steps; every rewards[step] holds envs.num_envs entries

        last_value = self.agent.get_value(self.state)

        return flatten_lists(states), flatten_lists(
            actions), rewards, dones, last_value, self.ep_rews
Code Example #4
    def collect_rollout(self, nb_epi, eval_flag, agent):
        states, options, option_masks = [None]*self.n_steps, [None]*self.n_steps, [None]*self.n_steps
        rewards, values = torch.zeros((2, self.n_steps, self.envs.num_envs)).to(self.device)
        dones, prev_dones = np.zeros((2, self.n_steps, self.envs.num_envs))

        for step in range(self.n_steps):
            with torch.no_grad():
                if eval_flag:
                    if isinstance(agent, GRProp):
                        option, value, option_mask = agent.get_option(self.obs, self.last_dones, eval_flag)
                    else:
                        option, value, option_mask = agent.get_option(self.obs, self.last_dones)
                else:
                    option, value, option_mask = agent.get_option(self.obs, self.last_dones)

            options[step] = copy.deepcopy(option)
            option_masks[step] = copy.deepcopy(option_mask)

            if self.is_warfare and self.frames >= self.prep_time:
                # time is up and prepare for the battle
                self.obs, reward, done, frames, self.total_counts = warfare(self.envs, self.obs)
            else:
                self.obs, reward, done, frames = agent.execute(self.obs, option, self.envs)

                if not done:  # for non-warfare maps
                    self.total_counts = copy.deepcopy(self.envs.total_counts[0])

            # MSGI
            if self.infer and not eval_flag:
                self.ilp.insert(self.obs, option, reward, done)

            # HRL
            if self.train and not eval_flag:
                spatials = self.obs['spatials']
                comps, eligs, masks = self.obs['meta_states']
                steps = self.obs['steps']
                states[step] = [spatials, comps, eligs, masks, steps]
                rewards[step], dones[step], values[step] = reward, done, value

            # compute records
            self._compute_records(reward, done, value, frames)

            # updates & log
            self.epi_count += done
            self.active = (self.epi_count < nb_epi)
            self.last_dones = done
            self.frames += frames

            if self.active.sum() == 0:
                break

        # terminate when all episodes are finished
        if self.active.sum() == 0: # ignore current samples
            return None

        if self.train and not eval_flag:
            with torch.no_grad():
                last_value = agent.get_value(self.obs).detach()

            # convert to torch tensor
            prev_dones = torch.from_numpy(prev_dones).float().to(self.device)
            dones = torch.from_numpy(dones).float().to(self.device)
            return (flatten_lists(states), torch.cat(options), rewards, dones, self.init_values, last_value)
        else:
            return None