Example #1
def _batch_set_item(
    source: Batch, indices: np.ndarray, target: Batch, size: int
) -> None:
    # for any key chain k, there are four cases
    # 1. source[k] is non-reserved, but target[k] does not exist or is reserved
    # 2. source[k] does not exist or is reserved, but target[k] is non-reserved
    # 3. both source[k] and target[k] are non-reserved
    # 4. both source[k] and target[k] do not exist or are reserved, do nothing.
    # A special case in case 4, if target[k] is reserved but source[k] does
    # not exist, make source[k] reserved, too.
    for k, vt in target.items():
        if not isinstance(vt, Batch) or not vt.is_empty():
            # target[k] is non-reserved
            vs = source.get(k, Batch())
            if isinstance(vs, Batch):
                if vs.is_empty():
                    # case 2, use __dict__ to avoid many type checks
                    source.__dict__[k] = _create_value(vt[0], size)
                else:
                    assert isinstance(vt, Batch)
                    _batch_set_item(source.__dict__[k], indices, vt, size)
        else:
            # target[k] is reserved
            # case 1 or special case of case 4
            if k not in source.__dict__:
                source.__dict__[k] = Batch()
            continue
        source.__dict__[k][indices] = vt
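
To make the four cases concrete, here is a minimal, hypothetical sketch of
what the function does (assuming ``Batch`` from ``tianshou.data`` and numpy
imported as ``np``, as this listing already does):

import numpy as np
from tianshou.data import Batch

source = Batch(act=np.zeros(4))                # storage for all 4 envs
target = Batch(act=np.ones(2), state=Batch())  # data from 2 ready envs
_batch_set_item(source, np.array([1, 3]), target, size=4)
# case 3 for "act": source.act is now [0., 1., 0., 1.]
# special case of case 4 for "state": source gains a reserved (empty) key
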
    def collect(
            self,
            n_step: int = 0,
            n_episode: Union[int, List[int]] = 0,
            random: bool = False,
            render: Optional[float] = None,
            log_fn: Optional[Callable[[dict],
                                      None]] = None) -> Dict[str, float]:
        """Collect a specified number of step or episode.

        :param int n_step: how many steps you want to collect.
        :param n_episode: how many episodes you want to collect (in each
            environment).
        :type n_episode: int or list
        :param bool random: whether to use random policy for collecting data,
            defaults to ``False``.
        :param float render: the sleep time between rendering consecutive
            frames, defaults to ``None`` (no rendering).
        :param function log_fn: a function which receives env info, typically
            for tensorboard logging.

        .. note::

            One and only one collection number specification is permitted,
            either ``n_step`` or ``n_episode``.

        :return: A dict including the following keys

            * ``n/ep`` the collected number of episodes.
            * ``n/st`` the collected number of steps.
            * ``v/st`` the speed of steps per second.
            * ``v/ep`` the speed of episodes per second.
            * ``rew`` the mean reward over collected episodes.
            * ``len`` the mean length over collected episodes.

            plus the custom per-episode statistics accumulated below
            (``ty1s_*``, ``ql_*``, ``ee_*``, ``avg_r`` and ``avg_p``).
        """
        if not self._multi_env:
            n_episode = np.sum(n_episode)
        start_time = time.time()
        assert sum([(n_step != 0), (n_episode != 0)]) == 1, \
            "One and only one collection number specification is permitted!"
        cur_step, cur_episode = 0, np.zeros(self.env_num)
        reward_sum, length_sum = 0., 0

        # change: accumulators for the custom per-episode statistics
        # reported by the environment's info dict
        ty1_succ_rate_1 = 0.
        ty1_succ_rate_2 = 0.
        ty1_succ_rate_3 = 0.
        ty1_succ_rate_4 = 0.
        Q_len_1 = 0.
        Q_len_2 = 0.
        Q_len_3 = 0.
        Q_len_4 = 0.
        energy_effi_1 = 0.
        energy_effi_2 = 0.
        energy_effi_3 = 0.
        energy_effi_4 = 0.
        avg_rate = 0.
        avg_power = 0.

        while True:
            if cur_step >= 100000 and cur_episode.sum() == 0:
                warnings.warn(
                    'There are already many steps in an episode. '
                    'You should add a time limitation to your environment!',
                    Warning)

            # restore the state and the input data
            last_state = self.data.state
            if last_state.is_empty():
                last_state = None
            self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())

            # calculate the next action
            if random:
                action_space = self.env.action_space
                if isinstance(action_space, list):
                    result = Batch(act=[a.sample() for a in action_space])
                else:
                    result = Batch(act=self._make_batch(action_space.sample()))
            else:
                with torch.no_grad():
                    result = self.policy(self.data, last_state)

            # convert None to Batch(), since None is reserved for 0-init
            state = result.get('state', Batch())
            if state is None:
                state = Batch()
            self.data.state = state
            if hasattr(result, 'policy'):
                self.data.policy = to_numpy(result.policy)
            # save hidden state to policy._state, in order to save into buffer
            self.data.policy._state = self.data.state

            self.data.act = to_numpy(result.act)
            if self._action_noise is not None:
                self.data.act += self._action_noise(self.data.act.shape)

            # step in env
            obs_next, rew, done, info = self.env.step(
                self.data.act if self._multi_env else self.data.act[0])

            # move data to self.data
            if not self._multi_env:
                obs_next = self._make_batch(obs_next)
                rew = self._make_batch(rew)
                done = self._make_batch(done)
                info = self._make_batch(info)
            self.data.obs_next = obs_next
            self.data.rew = rew
            self.data.done = done
            self.data.info = info

            if log_fn:
                log_fn(info if self._multi_env else info[0])
            if render:
                self.render()
                if render > 0:
                    time.sleep(render)

            # add data into the buffer
            self.length += 1
            self.reward += self.data.rew
            if self.preprocess_fn:
                result = self.preprocess_fn(**self.data)
                self.data.update(result)
            if self._multi_env:  # cache_buffer branch
                # change: accumulate custom statistics when env 0 finishes
                if self.data.done[0]:
                    ty1_succ_rate_1 += self.data.info[0]['ty1_succ_rate_1']
                    ty1_succ_rate_2 += self.data.info[0]['ty1_succ_rate_2']
                    ty1_succ_rate_3 += self.data.info[0]['ty1_succ_rate_3']
                    ty1_succ_rate_4 += self.data.info[0]['ty1_succ_rate_4']
                    Q_len_1 += self.data.info[0]['Q_len_1']
                    Q_len_2 += self.data.info[0]['Q_len_2']
                    Q_len_3 += self.data.info[0]['Q_len_3']
                    Q_len_4 += self.data.info[0]['Q_len_4']
                    energy_effi_1 += self.data.info[0]['energy_effi_1']
                    energy_effi_2 += self.data.info[0]['energy_effi_2']
                    energy_effi_3 += self.data.info[0]['energy_effi_3']
                    energy_effi_4 += self.data.info[0]['energy_effi_4']
                    avg_rate += self.data.info[0]['avg_rate']
                    avg_power += self.data.info[0]['avg_power']
                for i in range(self.env_num):
                    self._cached_buf[i].add(**self.data[i])
                    if self.data.done[i]:
                        if n_step != 0 or np.isscalar(n_episode) or \
                                cur_episode[i] < n_episode[i]:
                            cur_episode[i] += 1
                            reward_sum += self.reward[i]
                            length_sum += self.length[i]
                            if self._cached_buf:
                                cur_step += len(self._cached_buf[i])
                                if self.buffer is not None:
                                    self.buffer.update(self._cached_buf[i])
                        self.reward[i], self.length[i] = 0., 0
                        if self._cached_buf:
                            self._cached_buf[i].reset()
                        self._reset_state(i)
                obs_next = self.data.obs_next
                if sum(self.data.done):
                    env_ind = np.where(self.data.done)[0]
                    obs_reset = self.env.reset(env_ind)
                    if self.preprocess_fn:
                        obs_next[env_ind] = self.preprocess_fn(
                            obs=obs_reset).get('obs', obs_reset)
                    else:
                        obs_next[env_ind] = obs_reset
                self.data.obs_next = obs_next
                if n_episode != 0:
                    if isinstance(n_episode, list) and \
                            (cur_episode >= np.array(n_episode)).all() or \
                            np.isscalar(n_episode) and \
                            cur_episode.sum() >= n_episode:
                        break
            else:  # single buffer, without cache_buffer
                if self.buffer is not None:
                    self.buffer.add(**self.data[0])
                cur_step += 1
                if self.data.done[0]:
                    # change: accumulate the custom statistics on episode end
                    # (indexing unified to info[0]; this branch is single-env)
                    ty1_succ_rate_1 += self.data.info[0]['ty1_succ_rate_1']
                    ty1_succ_rate_2 += self.data.info[0]['ty1_succ_rate_2']
                    ty1_succ_rate_3 += self.data.info[0]['ty1_succ_rate_3']
                    ty1_succ_rate_4 += self.data.info[0]['ty1_succ_rate_4']
                    Q_len_1 += self.data.info[0]['Q_len_1']
                    Q_len_2 += self.data.info[0]['Q_len_2']
                    Q_len_3 += self.data.info[0]['Q_len_3']
                    Q_len_4 += self.data.info[0]['Q_len_4']
                    energy_effi_1 += self.data.info[0]['energy_effi_1']
                    energy_effi_2 += self.data.info[0]['energy_effi_2']
                    energy_effi_3 += self.data.info[0]['energy_effi_3']
                    energy_effi_4 += self.data.info[0]['energy_effi_4']
                    avg_rate += self.data.info[0]['avg_rate']
                    avg_power += self.data.info[0]['avg_power']
                    cur_episode += 1
                    reward_sum += self.reward[0]
                    length_sum += self.length[0]
                    self.reward, self.length = 0., np.zeros(self.env_num)
                    self.data.state = Batch()
                    obs_next = self._make_batch(self.env.reset())
                    if self.preprocess_fn:
                        obs_next = self.preprocess_fn(obs=obs_next).get(
                            'obs', obs_next)
                    self.data.obs_next = obs_next
                if n_episode != 0 and cur_episode >= n_episode:
                    break
            if n_step != 0 and cur_step >= n_step:
                break
            self.data.obs = self.data.obs_next
        self.data.obs = self.data.obs_next

        # generate the statistics
        cur_episode = sum(cur_episode)
        duration = max(time.time() - start_time, 1e-9)
        self.step_speed.add(cur_step / duration)
        self.episode_speed.add(cur_episode / duration)
        self.collect_step += cur_step
        self.collect_episode += cur_episode
        self.collect_time += duration
        if isinstance(n_episode, list):
            n_episode = np.sum(n_episode)
        else:
            n_episode = max(cur_episode, 1)
        reward_sum /= n_episode
        if np.asanyarray(reward_sum).size > 1:  # non-scalar reward_sum
            reward_sum = self._rew_metric(reward_sum)
        # change: include the custom statistics in the returned dict
        return {
            'n/ep': cur_episode,
            'n/st': cur_step,
            'v/st': self.step_speed.get(),
            'v/ep': self.episode_speed.get(),
            'rew': reward_sum,
            'len': length_sum / n_episode,
            'ty1s_1': ty1_succ_rate_1,
            'ty1s_2': ty1_succ_rate_2,
            'ty1s_3': ty1_succ_rate_3,
            'ty1s_4': ty1_succ_rate_4,
            'ql_1': Q_len_1,
            'ql_2': Q_len_2,
            'ql_3': Q_len_3,
            'ql_4': Q_len_4,
            'ee_1': energy_effi_1,
            'ee_2': energy_effi_2,
            'ee_3': energy_effi_3,
            'ee_4': energy_effi_4,
            'avg_r': avg_rate,
            'avg_p': avg_power,
        }
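
A hedged usage sketch for this variant (``collector`` is a hypothetical
instance of the surrounding collector class, attached to an env whose info
dict carries the custom keys accumulated above):

result = collector.collect(n_episode=4)
print(result['rew'], result['len'])     # standard statistics
print(result['ee_1'], result['avg_r'])  # custom statistics from env info
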
Example #3
    def collect(
        self,
        n_step: Optional[int] = None,
        n_episode: Optional[Union[int, List[int]]] = None,
        random: bool = False,
        render: Optional[float] = None,
        no_grad: bool = True,
    ) -> Dict[str, float]:
        """Collect a specified number of step or episode.

        :param int n_step: how many steps you want to collect.
        :param n_episode: how many episodes you want to collect. If it is an
            int, it means to collect at least ``n_episode`` episodes; if it is
            a list, it means to collect exactly ``n_episode[i]`` episodes in
            the i-th environment.
        :param bool random: whether to use random policy for collecting data,
            defaults to False.
        :param float render: the sleep time between rendering consecutive
            frames, defaults to None (no rendering).
        :param bool no_grad: whether to run ``policy.forward`` under
            ``torch.no_grad()``; defaults to True (no gradient is computed).

        .. note::

            One and only one collection number specification is permitted,
            either ``n_step`` or ``n_episode``.

        :return: A dict including the following keys

            * ``n/ep`` the collected number of episodes.
            * ``n/st`` the collected number of steps.
            * ``v/st`` the speed of steps per second.
            * ``v/ep`` the speed of episodes per second.
            * ``rew`` the mean reward over collected episodes.
            * ``rew_std`` the standard deviation of reward over collected
              episodes.
            * ``len`` the mean length over collected episodes.
        """
        assert (n_step is not None and n_episode is None and n_step > 0) or (
            n_step is None and n_episode is not None and np.sum(n_episode) > 0
        ), ("Only one of n_step or n_episode is allowed in Collector.collect, "
            f"got n_step = {n_step}, n_episode = {n_episode}.")
        start_time = time.time()
        step_count = 0
        # episode of each environment
        episode_count = np.zeros(self.env_num)
        # If n_episode is a list, and some envs have collected the required
        # number of episodes, these envs will be recorded in this list, and
        # they will not be stepped.
        finished_env_ids = []
        rewards = []
        whole_data = Batch()
        if isinstance(n_episode, list):
            assert len(n_episode) == self.get_env_num()
            finished_env_ids = [
                i for i in self._ready_env_ids if n_episode[i] <= 0]
            self._ready_env_ids = np.array(
                [x for x in self._ready_env_ids if x not in finished_env_ids])
        while True:
            if step_count >= 100000 and episode_count.sum() == 0:
                warnings.warn(
                    "There are already many steps in an episode. "
                    "You should add a time limitation to your environment!",
                    Warning)

            is_async = self.is_async or len(finished_env_ids) > 0
            if is_async:
                # self.data are the data for all environments in async
                # simulation or some envs have finished,
                # **only a subset of data are disposed**,
                # so we store the whole data in ``whole_data``, let self.data
                # to be the data available in ready environments, and finally
                # set these back into all the data
                whole_data = self.data
                self.data = self.data[self._ready_env_ids]

            # restore the state and the input data
            last_state = self.data.state
            if isinstance(last_state, Batch) and last_state.is_empty():
                last_state = None
            self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())

            # calculate the next action
            if random:
                spaces = self._action_space
                result = Batch(
                    act=[spaces[i].sample() for i in self._ready_env_ids])
            else:
                if no_grad:
                    with torch.no_grad():  # faster than retain_grad version
                        result = self.policy(self.data, last_state)
                else:
                    result = self.policy(self.data, last_state)

            state = result.get("state", Batch())
            # convert None to Batch(), since None is reserved for 0-init
            if state is None:
                state = Batch()
            self.data.update(state=state, policy=result.get("policy", Batch()))
            # save hidden state to policy._state, in order to save into buffer
            if not (isinstance(state, Batch) and state.is_empty()):
                self.data.policy._state = self.data.state

            self.data.act = to_numpy(result.act)
            if self._action_noise is not None:
                assert isinstance(self.data.act, np.ndarray)
                self.data.act += self._action_noise(self.data.act.shape)

            # step in env
            if not is_async:
                obs_next, rew, done, info = self.env.step(self.data.act)
            else:
                # store computed actions, states, etc
                _batch_set_item(
                    whole_data, self._ready_env_ids, self.data, self.env_num)
                # fetch finished data
                obs_next, rew, done, info = self.env.step(
                    self.data.act, id=self._ready_env_ids)
                self._ready_env_ids = np.array([i["env_id"] for i in info])
                # get the stepped data
                self.data = whole_data[self._ready_env_ids]
            # move data to self.data
            self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)

            if render:
                self.env.render()
                time.sleep(render)

            # add data into the buffer
            if self.preprocess_fn:
                result = self.preprocess_fn(**self.data)  # type: ignore
                self.data.update(result)

            for j, i in enumerate(self._ready_env_ids):
                # j is the index in current ready_env_ids
                # i is the index in all environments
                if self.buffer is None:
                    # users do not want to store data, so we store
                    # small fake data here to make the code clean
                    self._cached_buf[i].add(obs=0, act=0, rew=rew[j], done=0)
                else:
                    self._cached_buf[i].add(**self.data[j])

                if done[j]:
                    if not (isinstance(n_episode, list)
                            and episode_count[i] >= n_episode[i]):
                        episode_count[i] += 1
                        rewards.append(self._rew_metric(
                            np.sum(self._cached_buf[i].rew, axis=0)))
                        step_count += len(self._cached_buf[i])
                        if self.buffer is not None:
                            self.buffer.update(self._cached_buf[i])
                        if isinstance(n_episode, list) and \
                                episode_count[i] >= n_episode[i]:
                            # env i has collected enough data, it has finished
                            finished_env_ids.append(i)
                    self._cached_buf[i].reset()
                    self._reset_state(j)
            obs_next = self.data.obs_next
            if sum(done):
                env_ind_local = np.where(done)[0]
                env_ind_global = self._ready_env_ids[env_ind_local]
                obs_reset = self.env.reset(env_ind_global)
                if self.preprocess_fn:
                    obs_reset = self.preprocess_fn(
                        obs=obs_reset).get("obs", obs_reset)
                obs_next[env_ind_local] = obs_reset
            self.data.obs = obs_next
            if is_async:
                # set data back
                whole_data = deepcopy(whole_data)  # avoid reference in ListBuf
                _batch_set_item(
                    whole_data, self._ready_env_ids, self.data, self.env_num)
                # let self.data be the data in all environments again
                self.data = whole_data
            self._ready_env_ids = np.array(
                [x for x in self._ready_env_ids if x not in finished_env_ids])
            if n_step:
                if step_count >= n_step:
                    break
            else:
                if isinstance(n_episode, int) and \
                        episode_count.sum() >= n_episode:
                    break
                if isinstance(n_episode, list) and \
                        (episode_count >= n_episode).all():
                    break

        # finished envs are ready, and can be used for the next collection
        self._ready_env_ids = np.array(
            self._ready_env_ids.tolist() + finished_env_ids)

        # generate the statistics
        episode_count = sum(episode_count)
        duration = max(time.time() - start_time, 1e-9)
        self.collect_step += step_count
        self.collect_episode += episode_count
        self.collect_time += duration
        return {
            "n/ep": episode_count,
            "n/st": step_count,
            "v/st": step_count / duration,
            "v/ep": episode_count / duration,
            "rew": np.mean(rewards),
            "rew_std": np.std(rewards),
            "len": step_count / episode_count,
        }
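
The docstring's collection modes, as a hypothetical sketch (``collector``
wraps, say, 3 vectorized envs; exactly one of the two arguments may be given
per call):

collector.collect(n_step=1000)          # a total step budget
collector.collect(n_episode=9)          # at least 9 episodes overall
collector.collect(n_episode=[3, 3, 3])  # exactly 3 episodes per env
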
Example #4
    def collect(
            self,
            n_step: int = 0,
            n_episode: Union[int, List[int]] = 0,
            random: bool = False,
            render: Optional[float] = None,
            log_fn: Optional[Callable[[dict],
                                      None]] = None) -> Dict[str, float]:
        """Collect a specified number of step or episode.

        :param int n_step: how many steps you want to collect.
        :param n_episode: how many episodes you want to collect (in each
            environment).
        :type n_episode: int or list
        :param bool random: whether to use random policy for collecting data,
            defaults to ``False``.
        :param float render: the sleep time between rendering consecutive
            frames, defaults to ``None`` (no rendering).
        :param function log_fn: a function which receives env info, typically
            for tensorboard logging.

        .. note::

            One and only one collection number specification is permitted,
            either ``n_step`` or ``n_episode``.

        :return: A dict including the following keys

            * ``n/ep`` the collected number of episodes.
            * ``n/st`` the collected number of steps.
            * ``v/st`` the speed of steps per second.
            * ``v/ep`` the speed of episodes per second.
            * ``rew`` the mean reward over collected episodes.
            * ``len`` the mean length over collected episodes.
        """
        warning_count = 0
        if not self._multi_env:
            n_episode = np.sum(n_episode)
        start_time = time.time()
        assert sum([(n_step != 0), (n_episode != 0)]) == 1, \
            "One and only one collection number specification is permitted!"
        cur_step = 0
        cur_episode = np.zeros(self.env_num) if self._multi_env else 0
        reward_sum = 0
        length_sum = 0
        while True:
            if warning_count >= 100000:
                warnings.warn(
                    'There are already many steps in an episode. '
                    'You should add a time limitation to your environment!',
                    Warning)
            batch = Batch(obs=self._obs,
                          act=self._act,
                          rew=self._rew,
                          done=self._done,
                          obs_next=None,
                          info=self._info,
                          policy=None)
            if random:
                action_space = self.env.action_space
                if isinstance(action_space, list):
                    result = Batch(act=[a.sample() for a in action_space])
                else:
                    result = Batch(act=self._make_batch(action_space.sample()))
            else:
                with torch.no_grad():
                    result = self.policy(batch, self.state)

            # save hidden state to policy._state, in order to save into buffer
            self.state = result.get('state', None)
            if hasattr(result, 'policy'):
                self._policy = to_numpy(result.policy)
                if self.state is not None:
                    self._policy._state = self.state
            elif self.state is not None:
                self._policy = Batch(_state=self.state)
            else:
                self._policy = [{}] * self.env_num

            self._act = to_numpy(result.act)
            if self._action_noise is not None:
                self._act += self._action_noise(self._act.shape)
            obs_next, self._rew, self._done, self._info = self.env.step(
                self._act if self._multi_env else self._act[0])
            if not self._multi_env:
                obs_next = self._make_batch(obs_next)
                self._rew = self._make_batch(self._rew)
                self._done = self._make_batch(self._done)
                self._info = self._make_batch(self._info)
            if log_fn:
                log_fn(self._info if self._multi_env else self._info[0])
            if render:
                self.env.render()
                if render > 0:
                    time.sleep(render)
            self.length += 1
            self.reward += self._rew
            if self.preprocess_fn:
                result = self.preprocess_fn(obs=self._obs,
                                            act=self._act,
                                            rew=self._rew,
                                            done=self._done,
                                            obs_next=obs_next,
                                            info=self._info,
                                            policy=self._policy)
                self._obs = result.get('obs', self._obs)
                self._act = result.get('act', self._act)
                self._rew = result.get('rew', self._rew)
                self._done = result.get('done', self._done)
                obs_next = result.get('obs_next', obs_next)
                self._info = result.get('info', self._info)
                self._policy = result.get('policy', self._policy)
            if self._multi_env:
                for i in range(self.env_num):
                    data = {
                        'obs': self._obs[i],
                        'act': self._act[i],
                        'rew': self._rew[i],
                        'done': self._done[i],
                        'obs_next': obs_next[i],
                        'info': self._info[i],
                        'policy': self._policy[i]
                    }
                    if self._cached_buf:
                        warning_count += 1
                        self._cached_buf[i].add(**data)
                    elif self._multi_buf:
                        warning_count += 1
                        self.buffer[i].add(**data)
                        cur_step += 1
                    else:
                        warning_count += 1
                        if self.buffer is not None:
                            self.buffer.add(**data)
                        cur_step += 1
                    if self._done[i]:
                        if n_step != 0 or np.isscalar(n_episode) or \
                                cur_episode[i] < n_episode[i]:
                            cur_episode[i] += 1
                            reward_sum += self.reward[i]
                            length_sum += self.length[i]
                            if self._cached_buf:
                                cur_step += len(self._cached_buf[i])
                                if self.buffer is not None:
                                    self.buffer.update(self._cached_buf[i])
                        self.reward[i], self.length[i] = 0, 0
                        if self._cached_buf:
                            self._cached_buf[i].reset()
                        self._reset_state(i)
                if sum(self._done):
                    obs_next = self.env.reset(np.where(self._done)[0])
                    if self.preprocess_fn:
                        obs_next = self.preprocess_fn(obs=obs_next).get(
                            'obs', obs_next)
                if n_episode != 0:
                    if isinstance(n_episode, list) and \
                            (cur_episode >= np.array(n_episode)).all() or \
                            np.isscalar(n_episode) and \
                            cur_episode.sum() >= n_episode:
                        break
            else:
                if self.buffer is not None:
                    self.buffer.add(self._obs[0], self._act[0], self._rew[0],
                                    self._done[0], obs_next[0], self._info[0],
                                    self._policy[0])
                cur_step += 1
                if self._done:
                    cur_episode += 1
                    reward_sum += self.reward[0]
                    length_sum += self.length
                    self.reward, self.length = 0, 0
                    self.state = None
                    obs_next = self._make_batch(self.env.reset())
                    if self.preprocess_fn:
                        obs_next = self.preprocess_fn(obs=obs_next).get(
                            'obs', obs_next)
                if n_episode != 0 and cur_episode >= n_episode:
                    break
            if n_step != 0 and cur_step >= n_step:
                break
            self._obs = obs_next
        self._obs = obs_next
        if self._multi_env:
            cur_episode = sum(cur_episode)
        duration = max(time.time() - start_time, 1e-9)
        self.step_speed.add(cur_step / duration)
        self.episode_speed.add(cur_episode / duration)
        self.collect_step += cur_step
        self.collect_episode += cur_episode
        self.collect_time += duration
        if isinstance(n_episode, list):
            n_episode = np.sum(n_episode)
        else:
            n_episode = max(cur_episode, 1)
        return {
            'n/ep': cur_episode,
            'n/st': cur_step,
            'v/st': self.step_speed.get(),
            'v/ep': self.episode_speed.get(),
            'rew': reward_sum / n_episode,
            'len': length_sum / n_episode,
        }
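
This variant threads ``preprocess_fn`` through every key by hand, which makes
the hook's contract plain: it receives the transition as keyword arguments
and may return any subset of keys to override. A minimal sketch under that
assumption (the normalization itself is illustrative, not part of the
original):

import numpy as np

def preprocess_fn(obs=None, **kwargs):
    # return only the keys you change; the caller falls back to
    # result.get(key, original) for everything else
    if obs is not None:
        obs = (obs - obs.mean()) / (obs.std() + 1e-8)
        return {'obs': obs}
    return {}
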
Example #5
    def collect(
        self,
        n_step: Optional[int] = None,
        n_episode: Optional[Union[int, List[int]]] = None,  # how many episodes
        random: bool = False,
        render: Optional[float] = None,
        no_grad: bool = True,
    ) -> Dict[str, float]:
        """Collect a specified number of step or episode.

        :param int n_step: how many steps you want to collect.
        :param n_episode: how many episodes you want to collect. If it is an
            int, it means to collect at least ``n_episode`` episodes; if it is
            a list, it means to collect exactly ``n_episode[i]`` episodes in
            the i-th environment.
        :param bool random: whether to use random policy for collecting data,
            defaults to False.
        :param float render: the sleep time between rendering consecutive
            frames, defaults to None (no rendering).
        :param bool no_grad: whether to run ``policy.forward`` under
            ``torch.no_grad()``; defaults to True (no gradient is computed).

        .. note::

            One and only one collection number specification is permitted,
            either ``n_step`` or ``n_episode``.

        :return: A dict including the following keys

            * ``n/ep`` the collected number of episodes.
            * ``n/st`` the collected number of steps.
            * ``v/st`` the speed of steps per second.
            * ``v/ep`` the speed of episodes per second.
            * ``rew`` the mean reward over collected episodes.
            * ``rew_std`` the standard deviation of reward over collected
              episodes.
            * ``len`` the mean length over collected episodes.

            plus the custom keys ``hit_rate``, ``class_rate`` and
            ``mate_num`` accumulated below.
        """
        assert (n_step is not None and n_episode is None and n_step > 0) or (
            n_step is None and n_episode is not None and np.sum(n_episode) > 0
        ), ("Only one of n_step or n_episode is allowed in Collector.collect, "
            f"got n_step = {n_step}, n_episode = {n_episode}.")
        start_time = time.time()
        step_count = 0
        # episode of each environment
        episode_count = np.zeros(self.env_num)
        # If n_episode is a list, and some envs have collected the required
        # number of episodes, these envs will be recorded in this list, and
        # they will not be stepped.

        finished_env_ids = []
        rewards = []
        whole_data = Batch()
        if isinstance(n_episode, list):
            assert len(n_episode) == self.get_env_num()
            finished_env_ids = [
                i for i in self._ready_env_ids if n_episode[i] <= 0
            ]
            self._ready_env_ids = np.array(
                [x for x in self._ready_env_ids if x not in finished_env_ids])
        right, wrong = 0., 0.
        mate_num = 0.
        right_index = defaultdict(int)
        while True:
            if step_count >= 100000 and episode_count.sum() == 0:
                warnings.warn(
                    "There are already many steps in an episode. "
                    "You should add a time limitation to your environment!",
                    Warning)

            is_async = self.is_async or len(finished_env_ids) > 0
            if is_async:
                # self.data are the data for all environments in async
                # simulation or some envs have finished,
                # **only a subset of data are disposed**,
                # so we store the whole data in ``whole_data``, let self.data
                # to be the data available in ready environments, and finally
                # set these back into all the data
                whole_data = self.data
                self.data = self.data[self._ready_env_ids]

            # restore the state and the input data
            last_state = self.data.state
            if isinstance(last_state, Batch) and last_state.is_empty():
                last_state = None
            self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())
            # print("self.data: ", self.data)
            # print("know: ", self.env.goal_num)
            # calculate the next action
            # print("self.data.obs: ", self.data.obs.shape)
            if random:
                spaces = self._action_space
                result = Batch(
                    act=[spaces[i].sample() for i in self._ready_env_ids])
            else:
                if no_grad:
                    with torch.no_grad():  # faster than retain_grad version
                        result = self.policy(self.data, last_state)
                else:
                    result = self.policy(self.data, last_state)

            # print("result: ", result['logits'].size())
            # print("really: ", self.env.goal_num)
            state = result.get("state", Batch())
            # convert None to Batch(), since None is reserved for 0-init
            if state is None:
                state = Batch()
            self.data.update(state=state, policy=result.get("policy", Batch()))
            # save hidden state to policy._state, in order to save into buffer
            if not (isinstance(state, Batch) and state.is_empty()):
                self.data.policy._state = self.data.state

            self.data.act = to_numpy(result.act)
            if self._action_noise is not None:
                assert isinstance(self.data.act, np.ndarray)
                self.data.act += self._action_noise(self.data.act.shape)

            # step in env
            # (as a last resort, the action could be modified right here)
            if not is_async:
                obs_next, rew, done, info = self.env.step(self.data.act)
                # print("kk: ", self.env.goal_num)
            else:
                # store computed actions, states, etc
                _batch_set_item(whole_data, self._ready_env_ids, self.data,
                                self.env_num)
                # fetch finished data
                obs_next, rew, done, info = self.env.step(
                    self.data.act, id=self._ready_env_ids)
                self._ready_env_ids = np.array([i["env_id"] for i in info])
                # get the stepped data
                self.data = whole_data[self._ready_env_ids]
            # move data to self.data
            # print("in every step info: ",info)
            # print("self.data: ", type(self.data))
            # self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)  # 暂时还不能更新info,得要在更新obs的地方更新info
            self.data.update(obs_next=obs_next, rew=rew,
                             done=done)  # 暂时还不能更新info,得要在更新obs的地方更新info
            # print("what? ", done)
            # print("action: ", self.data.act)
            if render:
                self.env.render()
                time.sleep(render)
            # add data into the buffer
            if self.preprocess_fn:
                result = self.preprocess_fn(**self.data)  # type: ignore
                self.data.update(result)

            # print("self._ready_env_ids: ", self._ready_env_ids)
            # print('len: ', [len(self._cached_buf[i]) for i in self._ready_env_ids])
            # print("",self.data)
            for j, i in enumerate(self._ready_env_ids):
                # j is the index in current ready_env_ids
                # i is the index in all environments
                if self.buffer is None:
                    # users do not want to store data, so we store
                    # small fake data here to make the code clean
                    # each env has a cached buffer that stores the
                    # transitions it has experienced
                    self._cached_buf[i].add(obs=0, act=0, rew=rew[j], done=0)
                else:
                    # add rather than overwrite, so every sample collected
                    # along the way ends up in the buffer
                    self._cached_buf[i].add(**self.data[j])
                # print("maybe: ", self.env.goal_num)
                # print("buffer: ")
                # print(self.buffer)
                # print("done: ",done)
                if done[j]:
                    if not (isinstance(n_episode, list)
                            and episode_count[i] >= n_episode[i]):
                        episode_count[i] += 1
                        rewards.append(
                            self._rew_metric(
                                np.sum(self._cached_buf[i].rew, axis=0)))
                        step_count += len(self._cached_buf[i])
                        if self.buffer is not None:
                            self.buffer.update(self._cached_buf[i])
                        if isinstance(n_episode, list) and \
                                episode_count[i] >= n_episode[i]:
                            # env i has collected enough data, it has finished
                            finished_env_ids.append(i)
                    # print("right? ", info[j]['right'])
                    # print("two: ", self.env.goal_num)
                    mate_num += info[j]['mate_num']
                    # print("mate_num:", mate_num)
                    if info[j]['right']:
                        right += 1
                        right_index[info[j]['ans']] += 1
                    else:
                        wrong += 1
                    self._cached_buf[i].reset()
                    self._reset_state(j)
                    # print("really?", i, j)
                    # print("three: ", self.env.goal_num)
                    # print("after done: ", self.data['obs_next'])
            obs_next = self.data.obs_next
            self.data.info = info
            if sum(done):  # finished envs are reset here with fresh obs
                env_ind_local = np.where(done)[0]
                env_ind_global = self._ready_env_ids[env_ind_local]
                # print("env_ind_global: ", env_ind_global)
                obs_reset = self.env.reset(env_ind_global)
                self.data['info']['history'][env_ind_local] = np.where(
                    obs_reset != 0, np.ones_like(obs_reset),
                    np.zeros_like(obs_reset))
                self.data['info']['turn'][env_ind_local] = np.zeros(
                    len(env_ind_global))
                # print("Data: ", self.data)
                # print("obs_reset: ",obs_reset)
                if self.preprocess_fn:
                    obs_reset = self.preprocess_fn(obs=obs_reset).get(
                        "obs", obs_reset)
                obs_next[env_ind_local] = obs_reset
            self.data.obs = obs_next
            if is_async:
                # set data back
                whole_data = deepcopy(whole_data)  # avoid reference in ListBuf
                _batch_set_item(whole_data, self._ready_env_ids, self.data,
                                self.env_num)
                # let self.data be the data in all environments again
                self.data = whole_data
            self._ready_env_ids = np.array(
                [x for x in self._ready_env_ids if x not in finished_env_ids])
            if n_step:
                if step_count >= n_step:
                    break
            else:
                if isinstance(n_episode, int) and \
                        episode_count.sum() >= n_episode:
                    break
                if isinstance(n_episode, list) and \
                        (episode_count >= n_episode).all():
                    break

        # finished envs are ready, and can be used for the next collection
        self._ready_env_ids = np.array(self._ready_env_ids.tolist() +
                                       finished_env_ids)

        # generate the statistics
        episode_count = sum(episode_count)
        duration = max(time.time() - start_time, 1e-9)
        self.collect_step += step_count
        self.collect_episode += episode_count
        self.collect_time += duration
        return {
            "n/ep": episode_count,
            "n/st": step_count,
            "v/st": step_count / duration,
            "v/ep": episode_count / duration,
            "rew": np.mean(rewards),
            "rew_std": np.std(rewards),
            "len": step_count / episode_count,
            "hit_rate": right / (right + wrong),
            "class_rate": right_index,
            'mate_num': mate_num
        }
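
Episode returns above are reduced by ``self._rew_metric`` before averaging,
which is what lets a vector (e.g. multi-agent) reward collapse to a scalar.
The idea in miniature, with hypothetical numbers:

import numpy as np

rew_metric = np.mean                       # one reasonable reduction
rews = np.array([[1.0, 0.0], [0.5, 0.5]])  # per-step, per-agent rewards
episode_return = np.sum(rews, axis=0)      # -> [1.5, 0.5]
print(rew_metric(episode_return))          # -> 1.0
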
Example #6
    def collect(
        self,
        n_step: Optional[int] = None,
        n_episode: Optional[Union[int, List[int]]] = None,
        random: bool = False,
        render: Optional[float] = None,
    ) -> Dict[str, float]:
        """Collect a specified number of step or episode.

        :param int n_step: how many steps you want to collect.
        :param n_episode: how many episodes you want to collect. If it is an
            int, it means to collect at least ``n_episode`` episodes; if it is
            a list, it means to collect exactly ``n_episode[i]`` episodes in
            the i-th environment.
        :param bool random: whether to use random policy for collecting data,
            defaults to ``False``.
        :param float render: the sleep time between rendering consecutive
            frames, defaults to ``None`` (no rendering).

        .. note::

            One and only one collection number specification is permitted,
            either ``n_step`` or ``n_episode``.

        :return: A dict including the following keys

            * ``n/ep`` the collected number of episodes.
            * ``n/st`` the collected number of steps.
            * ``v/st`` the speed of steps per second.
            * ``v/ep`` the speed of episodes per second.
            * ``rew`` the mean reward over collected episodes.
            * ``len`` the mean length over collected episodes.
        """
        assert (n_step and not n_episode) or (not n_step and n_episode), \
            "One and only one collection number specification is permitted!"
        start_time = time.time()
        step_count = 0
        # episode of each environment
        episode_count = np.zeros(self.env_num)
        reward_total = 0.0
        whole_data = Batch()
        while True:
            if step_count >= 100000 and episode_count.sum() == 0:
                warnings.warn(
                    'There are already many steps in an episode. '
                    'You should add a time limitation to your environment!',
                    Warning)

            if self.is_async:
                # self.data are the data for all environments
                # in async simulation, only a subset of data are disposed
                # so we store the whole data in ``whole_data``, let self.data
                # to be all the data available in ready environments, and
                # finally set these back into all the data
                whole_data = self.data
                self.data = self.data[self._ready_env_ids]

            # restore the state and the input data
            last_state = self.data.state
            if isinstance(last_state, Batch) and last_state.is_empty():
                last_state = None
            self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())

            # calculate the next action
            if random:
                spaces = self._action_space
                result = Batch(
                    act=[spaces[i].sample() for i in self._ready_env_ids])
            else:
                with torch.no_grad():
                    result = self.policy(self.data, last_state)

            state = result.get('state', Batch())
            # convert None to Batch(), since None is reserved for 0-init
            if state is None:
                state = Batch()
            self.data.update(state=state, policy=result.get('policy', Batch()))
            # save hidden state to policy._state, in order to save into buffer
            if not (isinstance(self.data.state, Batch)
                    and self.data.state.is_empty()):
                self.data.policy._state = self.data.state

            self.data.act = to_numpy(result.act)
            if self._action_noise is not None:
                self.data.act += self._action_noise(self.data.act.shape)

            # step in env
            if not self.is_async:
                obs_next, rew, done, info = self.env.step(self.data.act)
            else:
                # store computed actions, states, etc
                _batch_set_item(whole_data, self._ready_env_ids, self.data,
                                self.env_num)
                # fetch finished data
                obs_next, rew, done, info = self.env.step(
                    action=self.data.act, id=self._ready_env_ids)
                self._ready_env_ids = np.array([i['env_id'] for i in info])
                # get the stepped data
                self.data = whole_data[self._ready_env_ids]
            # move data to self.data
            self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)

            if render:
                self.render()
                time.sleep(render)

            # add data into the buffer
            if self.preprocess_fn:
                result = self.preprocess_fn(**self.data)
                self.data.update(result)
            for j, i in enumerate(self._ready_env_ids):
                # j is the index in current ready_env_ids
                # i is the index in all environments
                self._cached_buf[i].add(**self.data[j])
                if self.data.done[j]:
                    if n_step or np.isscalar(n_episode) or \
                            episode_count[i] < n_episode[i]:
                        episode_count[i] += 1
                        reward_total += np.sum(self._cached_buf[i].rew, axis=0)
                        step_count += len(self._cached_buf[i])
                        if self.buffer is not None:
                            self.buffer.update(self._cached_buf[i])
                    self._cached_buf[i].reset()
                    self._reset_state(j)
            obs_next = self.data.obs_next
            if sum(self.data.done):
                env_ind_local = np.where(self.data.done)[0]
                env_ind_global = self._ready_env_ids[env_ind_local]
                obs_reset = self.env.reset(env_ind_global)
                if self.preprocess_fn:
                    obs_next[env_ind_local] = self.preprocess_fn(
                        obs=obs_reset).get('obs', obs_reset)
                else:
                    obs_next[env_ind_local] = obs_reset
            self.data.obs = obs_next
            if self.is_async:
                # set data back
                _batch_set_item(whole_data, self._ready_env_ids, self.data,
                                self.env_num)
                # let self.data be the data in all environments again
                self.data = whole_data
            if n_step:
                if step_count >= n_step:
                    break
            else:
                if isinstance(n_episode, int) and \
                        episode_count.sum() >= n_episode:
                    break
                if isinstance(n_episode, list) and \
                        (episode_count >= n_episode).all():
                    break

        # generate the statistics
        episode_count = sum(episode_count)
        duration = max(time.time() - start_time, 1e-9)
        self.collect_step += step_count
        self.collect_episode += episode_count
        self.collect_time += duration
        # average reward across the number of episodes
        reward_avg = reward_total / episode_count
        if np.asanyarray(reward_avg).size > 1:  # non-scalar reward_avg
            reward_avg = self._rew_metric(reward_avg)
        return {
            'n/ep': episode_count,
            'n/st': step_count,
            'v/st': step_count / duration,
            'v/ep': episode_count / duration,
            'rew': reward_avg,
            'len': step_count / episode_count,
        }
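
The async path above is a gather/compute/scatter pattern: take the ready
subset of ``self.data``, act on it, then write the results back into the full
batch. In miniature (hypothetical data; ``Batch`` from ``tianshou.data`` and
``_batch_set_item`` as defined in Example #1):

import numpy as np
from tianshou.data import Batch

whole = Batch(act=np.zeros(4))          # data for all 4 envs
ready = np.array([0, 2])                # only these envs are ready
view = whole[ready]                     # gather the ready subset
view.act = np.ones(2)                   # compute actions for the subset
_batch_set_item(whole, ready, view, 4)  # scatter back: act == [1, 0, 1, 0]
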
Example #7
    def collect(
        self,
        n_step: Optional[int] = None,
        n_episode: Optional[Union[int, List[int]]] = None,
        random: bool = False,
        render: Optional[float] = None,
        no_grad: bool = True,
    ) -> Dict[str, float]:
        """Collect a specified number of step or episode.

        :param int n_step: how many steps you want to collect.
        :param n_episode: how many episodes you want to collect. If it is an
            int, it means to collect at least ``n_episode`` episodes; if it is
            a list, it means to collect exactly ``n_episode[i]`` episodes in
            the i-th environment.
        :param bool random: whether to use random policy for collecting data,
            defaults to ``False``.
        :param float render: the sleep time between rendering consecutive
            frames, defaults to ``None`` (no rendering).
        :param bool no_grad: whether to run ``policy.forward`` under
            ``torch.no_grad()``; defaults to ``True`` (no gradient is
            computed).

        .. note::

            One and only one collection number specification is permitted,
            either ``n_step`` or ``n_episode``.

        :return: A dict including the following keys

            * ``n/ep`` the collected number of episodes.
            * ``n/st`` the collected number of steps.
            * ``v/st`` the speed of steps per second.
            * ``v/ep`` the speed of episodes per second.
            * ``rew`` the mean reward over collected episodes.
            * ``len`` the mean length over collected episodes.
        """
        assert (n_step is not None and n_episode is None and n_step > 0) or (
            n_step is None and n_episode is not None and np.sum(n_episode) > 0
        ), ("Only one of n_step or n_episode is allowed in Collector.collect, "
            f"got n_step = {n_step}, n_episode = {n_episode}.")
        start_time = time.time()
        step_count = 0
        # number of episodes collected in each environment
        episode_count = np.zeros(self.env_num)
        # If n_episode is a list, and some envs have collected the required
        # number of episodes, these envs will be recorded in this list, and
        # they will not be stepped.
        finished_env_ids = []
        reward_total = 0.0
        whole_data = Batch()
        list_n_episode = False
        # initialization when each env collects a different number of episodes
        if n_episode is not None and not np.isscalar(n_episode):
            assert len(n_episode) == self.get_env_num()
            # mark that n_episode is specified per environment
            list_n_episode = True
            finished_env_ids = [
                i for i in self._ready_env_ids if n_episode[i] <= 0
            ]
            self._ready_env_ids = np.array(
                [x for x in self._ready_env_ids if x not in finished_env_ids])
        while True:
            if step_count >= 100000 and episode_count.sum() == 0:
                warnings.warn(
                    'There are already many steps in an episode. '
                    'You should add a time limitation to your environment!',
                    Warning)

            # go async if the envs are async by design or some have finished
            is_async = self.is_async or len(finished_env_ids) > 0
            if is_async:
                # self.data are the data for all environments in async
                # simulation or some envs have finished,
                # **only a subset of data are disposed**,
                # so we store the whole data in ``whole_data``, let self.data
                # to be the data available in ready environments, and finally
                # set these back into all the data
                whole_data = self.data
                self.data = self.data[self._ready_env_ids]

            # restore the state and the input data
            last_state = self.data.state
            if isinstance(last_state, Batch) and last_state.is_empty():
                last_state = None
            self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())

            # calculate the next action
            if random:
                spaces = self._action_space
                result = Batch(
                    act=[spaces[i].sample() for i in self._ready_env_ids])
            else:
                if no_grad:
                    with torch.no_grad():  # faster than retain_grad version
                        result = self.policy(self.data, last_state)
                else:
                    result = self.policy(self.data, last_state)
            # state is used by recurrent (RNN) policies
            state = result.get('state', Batch())
            # convert None to Batch(), since None is reserved for 0-init
            # could this check be removed???
            if state is None:
                state = Batch()
            self.data.update(state=state, policy=result.get('policy', Batch()))
            # save hidden state to policy._state, in order to save into buffer
            if not (isinstance(state, Batch) and state.is_empty()):
                self.data.policy._state = self.data.state

            self.data.act = to_numpy(result.act)
            if self._action_noise is not None:  # noqa
                self.data.act += self._action_noise(self.data.act.shape)

            # step in env
            if not is_async:
                obs_next, rew, done, info = self.env.step(self.data.act)
            else:
                # store computed actions, states, etc
                # write the new values from self.data back into whole_data
                _batch_set_item(whole_data, self._ready_env_ids, self.data,
                                self.env_num)
                # fetch finished data
                obs_next, rew, done, info = self.env.step(
                    self.data.act, id=self._ready_env_ids)
                # could this line be removed???
                self._ready_env_ids = np.array([i['env_id'] for i in info])
                # get the stepped data
                self.data = whole_data[self._ready_env_ids]
            # move data to self.data
            self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)

            if render:
                self.render()
                time.sleep(render)

            # preprocess the data before adding it to the buffer
            if self.preprocess_fn:
                result = self.preprocess_fn(**self.data)
                self.data.update(result)

            # add data into the buffer
            # first store this step in the matching _cached_buf; if the step
            # ends an episode, flush that _cached_buf into the main buffer
            # and reset it
            for j, i in enumerate(self._ready_env_ids):
                # j is the index in current ready_env_ids
                # i is the index in all environments
                if self.buffer is None:
                    # users do not want to store data, so we store
                    # small fake data here to make the code clean
                    self._cached_buf[i].add(obs=0, act=0, rew=rew[j], done=0)
                else:
                    self._cached_buf[i].add(**self.data[j])

                if done[j]:
                    if not (list_n_episode
                            and episode_count[i] >= n_episode[i]):
                        episode_count[i] += 1
                        reward_total += np.sum(self._cached_buf[i].rew, axis=0)
                        step_count += len(self._cached_buf[i])
                        if self.buffer is not None:
                            self.buffer.update(self._cached_buf[i])
                        if list_n_episode and \
                                episode_count[i] >= n_episode[i]:
                            # env i has collected enough data, it has finished
                            finished_env_ids.append(i)
                    self._cached_buf[i].reset()
                    self._reset_state(j)
            # update the current observation
            obs_next = self.data.obs_next
            # reset() the envs whose episodes have finished
            if sum(done):
                # np.where returns a tuple of index arrays, one per dimension
                env_ind_local = np.where(done)[0]
                env_ind_global = self._ready_env_ids[env_ind_local]
                obs_reset = self.env.reset(env_ind_global)
                if self.preprocess_fn:
                    obs_next[env_ind_local] = self.preprocess_fn(
                        obs=obs_reset).get('obs', obs_reset)
                else:
                    obs_next[env_ind_local] = obs_reset
            self.data.obs = obs_next
            if is_async:
                # set data back
                whole_data = deepcopy(whole_data)  # avoid reference in ListBuf
                _batch_set_item(whole_data, self._ready_env_ids, self.data,
                                self.env_num)
                # let self.data be the data in all environments again
                self.data = whole_data
            self._ready_env_ids = np.array(
                [x for x in self._ready_env_ids if x not in finished_env_ids])
            if n_step:
                if step_count >= n_step:
                    break
            else:
                if isinstance(n_episode, int) and \
                        episode_count.sum() >= n_episode:
                    break
                if isinstance(n_episode, list) and \
                        (episode_count >= n_episode).all():
                    break

        # finished envs are ready, and can be used for the next collection
        self._ready_env_ids = np.array(self._ready_env_ids.tolist() +
                                       finished_env_ids)

        # generate the statistics
        episode_count = sum(episode_count)
        duration = max(time.time() - start_time, 1e-9)
        self.collect_step += step_count
        self.collect_episode += episode_count
        self.collect_time += duration
        # average reward across the number of episodes
        reward_avg = reward_total / episode_count
        if np.asanyarray(reward_avg).size > 1:  # non-scalar reward_avg
            reward_avg = self._rew_metric(reward_avg)
        return {
            'n/ep': episode_count,
            'n/st': step_count,
            'v/st': step_count / duration,
            'v/ep': episode_count / duration,
            'rew': reward_avg,
            'len': step_count / episode_count,
        }
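
The ``deepcopy`` before writing data back guards against aliasing: a
list-style buffer stores references, so a later in-place update would
silently rewrite what was already "stored". A toy illustration of that
failure mode (plain Python, not tianshou code):

import numpy as np

buf = []                     # a list-style buffer keeps references
step = {'act': np.zeros(2)}
buf.append(step)             # "store" the transition
step['act'] += 1             # a later in-place update...
print(buf[0]['act'])         # ...mutates the stored entry: [1. 1.]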