def _batch_set_item(source: Batch, indices: np.ndarray,
                    target: Batch, size: int) -> None:
    # for any key chain k, there are four cases
    # 1. source[k] is non-reserved, but target[k] does not exist or is reserved
    # 2. source[k] does not exist or is reserved, but target[k] is non-reserved
    # 3. both source[k] and target[k] are non-reserved
    # 4. both source[k] and target[k] do not exist or are reserved, do nothing.
    # A special case in case 4, if target[k] is reserved but source[k] does
    # not exist, make source[k] reserved, too.
    for k, vt in target.items():
        if not isinstance(vt, Batch) or not vt.is_empty():
            # target[k] is non-reserved
            vs = source.get(k, Batch())
            if isinstance(vs, Batch):
                if vs.is_empty():
                    # case 2, use __dict__ to avoid many type checks
                    source.__dict__[k] = _create_value(vt[0], size)
                else:
                    assert isinstance(vt, Batch)
                    _batch_set_item(source.__dict__[k], indices, vt, size)
        else:
            # target[k] is reserved
            # case 1 or special case of case 4
            if k not in source.__dict__:
                source.__dict__[k] = Batch()
            continue
        source.__dict__[k][indices] = vt
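
# --- illustrative sketch (not part of the original collector) --------------
# A minimal demonstration of _batch_set_item's write-back semantics, assuming
# tianshou's Batch and the _create_value helper used above are importable in
# this module. The shapes and key names are made up for the demo.
def _demo_batch_set_item() -> None:
    whole = Batch(obs=np.zeros((4, 3)), act=np.zeros(4))  # data of 4 envs
    ready = Batch(obs=np.ones((2, 3)), act=np.ones(2), rew=np.ones(2))
    # write the ready-env rows 1 and 3 back into the whole-data Batch
    _batch_set_item(whole, np.array([1, 3]), ready, size=4)
    assert whole.obs[1].sum() == 3 and whole.act[3] == 1
    # 'rew' existed only in the subset: case 2 allocated it at full size
    assert whole.rew.shape == (4,)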
def collect(self,
            n_step: int = 0,
            n_episode: Union[int, List[int]] = 0,
            random: bool = False,
            render: Optional[float] = None,
            log_fn: Optional[Callable[[dict], None]] = None
            ) -> Dict[str, float]:
    """Collect a specified number of steps or episodes.

    :param int n_step: how many steps you want to collect.
    :param n_episode: how many episodes you want to collect (in each
        environment).
    :type n_episode: int or list
    :param bool random: whether to use random policy for collecting data,
        defaults to ``False``.
    :param float render: the sleep time between rendering consecutive
        frames, defaults to ``None`` (no rendering).
    :param function log_fn: a function which receives env info, typically
        for tensorboard logging.

    .. note::

        One and only one collection number specification is permitted,
        either ``n_step`` or ``n_episode``.

    :return: A dict including the following keys

        * ``n/ep`` the collected number of episodes.
        * ``n/st`` the collected number of steps.
        * ``v/st`` the speed of steps per second.
        * ``v/ep`` the speed of episodes per second.
        * ``rew`` the mean reward over collected episodes.
        * ``len`` the mean length over collected episodes.

        plus the accumulated custom env metrics added below
        (``ty1s_*``, ``ql_*``, ``ee_*``, ``avg_r``, ``avg_p``).
    """
    if not self._multi_env:
        n_episode = np.sum(n_episode)
    start_time = time.time()
    assert sum([(n_step != 0), (n_episode != 0)]) == 1, \
        "One and only one collection number specification is permitted!"
    cur_step, cur_episode = 0, np.zeros(self.env_num)
    reward_sum, length_sum = 0., 0
    # change: accumulators for the custom per-episode env metrics
    ty1_succ_rate_1 = 0.
    ty1_succ_rate_2 = 0.
    ty1_succ_rate_3 = 0.
    ty1_succ_rate_4 = 0.
    Q_len_1 = 0.
    Q_len_2 = 0.
    Q_len_3 = 0.
    Q_len_4 = 0.
    energy_effi_1 = 0.
    energy_effi_2 = 0.
    energy_effi_3 = 0.
    energy_effi_4 = 0.
    avg_rate = 0.
    avg_power = 0.
    while True:
        if cur_step >= 100000 and cur_episode.sum() == 0:
            warnings.warn(
                'There are already many steps in an episode. '
                'You should add a time limitation to your environment!',
                Warning)
        # restore the state and the input data
        last_state = self.data.state
        if last_state.is_empty():
            last_state = None
        self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())
        # calculate the next action
        if random:
            action_space = self.env.action_space
            if isinstance(action_space, list):
                result = Batch(act=[a.sample() for a in action_space])
            else:
                result = Batch(act=self._make_batch(action_space.sample()))
        else:
            with torch.no_grad():
                result = self.policy(self.data, last_state)
        # convert None to Batch(), since None is reserved for 0-init
        state = result.get('state', Batch())
        if state is None:
            state = Batch()
        self.data.state = state
        if hasattr(result, 'policy'):
            self.data.policy = to_numpy(result.policy)
        # save hidden state to policy._state, in order to save into buffer
        self.data.policy._state = self.data.state
        self.data.act = to_numpy(result.act)
        if self._action_noise is not None:
            self.data.act += self._action_noise(self.data.act.shape)
        # step in env
        obs_next, rew, done, info = self.env.step(
            self.data.act if self._multi_env else self.data.act[0])
        # move data to self.data
        if not self._multi_env:
            obs_next = self._make_batch(obs_next)
            rew = self._make_batch(rew)
            done = self._make_batch(done)
            info = self._make_batch(info)
        self.data.obs_next = obs_next
        self.data.rew = rew
        self.data.done = done
        self.data.info = info
        if log_fn:
            log_fn(info if self._multi_env else info[0])
        if render:
            self.render()
            if render > 0:
                time.sleep(render)
        # add data into the buffer
        self.length += 1
        self.reward += self.data.rew
        if self.preprocess_fn:
            result = self.preprocess_fn(**self.data)
            self.data.update(result)
        if self._multi_env:  # cache_buffer branch
            # change: accumulate the custom metrics reported by env 0
            # whenever it finishes an episode
            if self.data.done[0]:
                ty1_succ_rate_1 += self.data.info[0]['ty1_succ_rate_1']
                ty1_succ_rate_2 += self.data.info[0]['ty1_succ_rate_2']
                ty1_succ_rate_3 += self.data.info[0]['ty1_succ_rate_3']
                ty1_succ_rate_4 += self.data.info[0]['ty1_succ_rate_4']
                Q_len_1 += self.data.info[0]['Q_len_1']
                Q_len_2 += self.data.info[0]['Q_len_2']
                Q_len_3 += self.data.info[0]['Q_len_3']
                Q_len_4 += self.data.info[0]['Q_len_4']
                energy_effi_1 += self.data.info[0]['energy_effi_1']
                energy_effi_2 += self.data.info[0]['energy_effi_2']
                energy_effi_3 += self.data.info[0]['energy_effi_3']
                energy_effi_4 += self.data.info[0]['energy_effi_4']
                avg_rate += self.data.info[0]['avg_rate']
                avg_power += self.data.info[0]['avg_power']
            for i in range(self.env_num):
                self._cached_buf[i].add(**self.data[i])
                if self.data.done[i]:
                    if n_step != 0 or np.isscalar(n_episode) or \
                            cur_episode[i] < n_episode[i]:
                        cur_episode[i] += 1
                        reward_sum += self.reward[i]
                        length_sum += self.length[i]
                        if self._cached_buf:
                            cur_step += len(self._cached_buf[i])
                            if self.buffer is not None:
                                self.buffer.update(self._cached_buf[i])
                    self.reward[i], self.length[i] = 0., 0
                    if self._cached_buf:
                        self._cached_buf[i].reset()
                    self._reset_state(i)
            obs_next = self.data.obs_next
            if sum(self.data.done):
                env_ind = np.where(self.data.done)[0]
                obs_reset = self.env.reset(env_ind)
                if self.preprocess_fn:
                    obs_next[env_ind] = self.preprocess_fn(
                        obs=obs_reset).get('obs', obs_reset)
                else:
                    obs_next[env_ind] = obs_reset
                self.data.obs_next = obs_next
            if n_episode != 0:
                if isinstance(n_episode, list) and \
                        (cur_episode >= np.array(n_episode)).all() or \
                        np.isscalar(n_episode) and \
                        cur_episode.sum() >= n_episode:
                    break
        else:  # single buffer, without cache_buffer
            if self.buffer is not None:
                self.buffer.add(**self.data[0])
            cur_step += 1
            if self.data.done[0]:
                # change: accumulate the custom metrics of the single env
                ty1_succ_rate_1 += self.data.info['ty1_succ_rate_1']
                ty1_succ_rate_2 += self.data.info['ty1_succ_rate_2']
                ty1_succ_rate_3 += self.data.info['ty1_succ_rate_3']
                ty1_succ_rate_4 += self.data.info['ty1_succ_rate_4']
                Q_len_1 += self.data.info['Q_len_1']
                Q_len_2 += self.data.info['Q_len_2']
                Q_len_3 += self.data.info['Q_len_3']
                Q_len_4 += self.data.info['Q_len_4']
                energy_effi_1 += self.data.info['energy_effi_1']
                energy_effi_2 += self.data.info['energy_effi_2']
                energy_effi_3 += self.data.info['energy_effi_3']
                energy_effi_4 += self.data.info['energy_effi_4']
                avg_rate += self.data.info['avg_rate']
                avg_power += self.data.info['avg_power']
                cur_episode += 1
                reward_sum += self.reward[0]
                length_sum += self.length[0]
                self.reward, self.length = 0., np.zeros(self.env_num)
                self.data.state = Batch()
                obs_next = self._make_batch(self.env.reset())
                if self.preprocess_fn:
                    obs_next = self.preprocess_fn(obs=obs_next).get(
                        'obs', obs_next)
                self.data.obs_next = obs_next
            if n_episode != 0 and cur_episode >= n_episode:
                break
        if n_step != 0 and cur_step >= n_step:
            break
        self.data.obs = self.data.obs_next
    self.data.obs = self.data.obs_next
    # generate the statistics
    cur_episode = sum(cur_episode)
    duration = max(time.time() - start_time, 1e-9)
    self.step_speed.add(cur_step / duration)
    self.episode_speed.add(cur_episode / duration)
    self.collect_step += cur_step
    self.collect_episode += cur_episode
    self.collect_time += duration
    if isinstance(n_episode, list):
        n_episode = np.sum(n_episode)
    else:
        n_episode = max(cur_episode, 1)
    reward_sum /= n_episode
    if np.asanyarray(reward_sum).size > 1:  # non-scalar reward_sum
        reward_sum = self._rew_metric(reward_sum)
    # change: return the accumulated custom metrics with the statistics
    return {
        'n/ep': cur_episode,
        'n/st': cur_step,
        'v/st': self.step_speed.get(),
        'v/ep': self.episode_speed.get(),
        'rew': reward_sum,
        'len': length_sum / n_episode,
        'ty1s_1': ty1_succ_rate_1,
        'ty1s_2': ty1_succ_rate_2,
        'ty1s_3': ty1_succ_rate_3,
        'ty1s_4': ty1_succ_rate_4,
        'ql_1': Q_len_1,
        'ql_2': Q_len_2,
        'ql_3': Q_len_3,
        'ql_4': Q_len_4,
        'ee_1': energy_effi_1,
        'ee_2': energy_effi_2,
        'ee_3': energy_effi_3,
        'ee_4': energy_effi_4,
        'avg_r': avg_rate,
        'avg_p': avg_power,
    }
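
# --- illustrative sketch (not part of the original collector) --------------
# The ``log_fn`` parameter above receives the env info each step; a minimal
# tensorboard sketch for the single-env case is shown below. The 'avg_rate'
# key is an assumption taken from the custom env metrics accumulated above,
# and the log directory name is arbitrary.
from torch.utils.tensorboard import SummaryWriter

def make_demo_log_fn(logdir: str = 'log/collect'):
    writer = SummaryWriter(logdir)
    step = {'n': 0}

    def log_fn(info: dict) -> None:
        step['n'] += 1
        if 'avg_rate' in info:
            writer.add_scalar('env/avg_rate', info['avg_rate'], step['n'])

    return log_fn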
def collect(
    self,
    n_step: Optional[int] = None,
    n_episode: Optional[Union[int, List[int]]] = None,
    random: bool = False,
    render: Optional[float] = None,
    no_grad: bool = True,
) -> Dict[str, float]:
    """Collect a specified number of steps or episodes.

    :param int n_step: how many steps you want to collect.
    :param n_episode: how many episodes you want to collect. If it is an
        int, it means to collect at least ``n_episode`` episodes; if it is
        a list, it means to collect exactly ``n_episode[i]`` episodes in
        the i-th environment.
    :param bool random: whether to use random policy for collecting data,
        defaults to False.
    :param float render: the sleep time between rendering consecutive
        frames, defaults to None (no rendering).
    :param bool no_grad: whether to retain gradient in policy.forward,
        defaults to True (no gradient retaining).

    .. note::

        One and only one collection number specification is permitted,
        either ``n_step`` or ``n_episode``.

    :return: A dict including the following keys

        * ``n/ep`` the collected number of episodes.
        * ``n/st`` the collected number of steps.
        * ``v/st`` the speed of steps per second.
        * ``v/ep`` the speed of episodes per second.
        * ``rew`` the mean reward over collected episodes.
        * ``rew_std`` the standard deviation of reward over collected
          episodes.
        * ``len`` the mean length over collected episodes.
    """
    assert (n_step is not None and n_episode is None and n_step > 0) or (
        n_step is None and n_episode is not None and np.sum(n_episode) > 0
    ), "Only one of n_step or n_episode is allowed in Collector.collect, " \
        f"got n_step = {n_step}, n_episode = {n_episode}."
    start_time = time.time()
    step_count = 0
    # episode of each environment
    episode_count = np.zeros(self.env_num)
    # If n_episode is a list, and some envs have collected the required
    # number of episodes, these envs will be recorded in this list, and
    # they will not be stepped.
    finished_env_ids = []
    rewards = []
    whole_data = Batch()
    if isinstance(n_episode, list):
        assert len(n_episode) == self.get_env_num()
        finished_env_ids = [
            i for i in self._ready_env_ids if n_episode[i] <= 0]
        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
    while True:
        if step_count >= 100000 and episode_count.sum() == 0:
            warnings.warn(
                "There are already many steps in an episode. "
                "You should add a time limitation to your environment!",
                Warning)
        is_async = self.is_async or len(finished_env_ids) > 0
        if is_async:
            # self.data are the data for all environments in async
            # simulation or some envs have finished,
            # **only a subset of data are disposed**,
            # so we store the whole data in ``whole_data``, let self.data
            # be the data available in ready environments, and finally
            # set these back into all the data
            whole_data = self.data
            self.data = self.data[self._ready_env_ids]
        # restore the state and the input data
        last_state = self.data.state
        if isinstance(last_state, Batch) and last_state.is_empty():
            last_state = None
        self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())
        # calculate the next action
        if random:
            spaces = self._action_space
            result = Batch(
                act=[spaces[i].sample() for i in self._ready_env_ids])
        else:
            if no_grad:
                with torch.no_grad():  # faster than retain_grad version
                    result = self.policy(self.data, last_state)
            else:
                result = self.policy(self.data, last_state)
        state = result.get("state", Batch())
        # convert None to Batch(), since None is reserved for 0-init
        if state is None:
            state = Batch()
        self.data.update(state=state, policy=result.get("policy", Batch()))
        # save hidden state to policy._state, in order to save into buffer
        if not (isinstance(state, Batch) and state.is_empty()):
            self.data.policy._state = self.data.state
        self.data.act = to_numpy(result.act)
        if self._action_noise is not None:
            assert isinstance(self.data.act, np.ndarray)
            self.data.act += self._action_noise(self.data.act.shape)
        # step in env
        if not is_async:
            obs_next, rew, done, info = self.env.step(self.data.act)
        else:
            # store computed actions, states, etc
            _batch_set_item(whole_data, self._ready_env_ids,
                            self.data, self.env_num)
            # fetch finished data
            obs_next, rew, done, info = self.env.step(
                self.data.act, id=self._ready_env_ids)
            self._ready_env_ids = np.array([i["env_id"] for i in info])
            # get the stepped data
            self.data = whole_data[self._ready_env_ids]
        # move data to self.data
        self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)
        if render:
            self.env.render()
            time.sleep(render)
        # add data into the buffer
        if self.preprocess_fn:
            result = self.preprocess_fn(**self.data)  # type: ignore
            self.data.update(result)
        for j, i in enumerate(self._ready_env_ids):
            # j is the index in current ready_env_ids
            # i is the index in all environments
            if self.buffer is None:
                # users do not want to store data, so we store
                # small fake data here to make the code clean
                self._cached_buf[i].add(obs=0, act=0, rew=rew[j], done=0)
            else:
                self._cached_buf[i].add(**self.data[j])
            if done[j]:
                if not (isinstance(n_episode, list)
                        and episode_count[i] >= n_episode[i]):
                    episode_count[i] += 1
                    rewards.append(self._rew_metric(
                        np.sum(self._cached_buf[i].rew, axis=0)))
                    step_count += len(self._cached_buf[i])
                    if self.buffer is not None:
                        self.buffer.update(self._cached_buf[i])
                    if isinstance(n_episode, list) and \
                            episode_count[i] >= n_episode[i]:
                        # env i has collected enough data, it has finished
                        finished_env_ids.append(i)
                self._cached_buf[i].reset()
                self._reset_state(j)
        obs_next = self.data.obs_next
        if sum(done):
            env_ind_local = np.where(done)[0]
            env_ind_global = self._ready_env_ids[env_ind_local]
            obs_reset = self.env.reset(env_ind_global)
            if self.preprocess_fn:
                obs_reset = self.preprocess_fn(
                    obs=obs_reset).get("obs", obs_reset)
            obs_next[env_ind_local] = obs_reset
        self.data.obs = obs_next
        if is_async:
            # set data back
            whole_data = deepcopy(whole_data)  # avoid reference in ListBuf
            _batch_set_item(whole_data, self._ready_env_ids,
                            self.data, self.env_num)
            # let self.data be the data in all environments again
            self.data = whole_data
        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
        if n_step:
            if step_count >= n_step:
                break
        else:
            if isinstance(n_episode, int) and \
                    episode_count.sum() >= n_episode:
                break
            if isinstance(n_episode, list) and \
                    (episode_count >= n_episode).all():
                break
    # finished envs are ready, and can be used for the next collection
    self._ready_env_ids = np.array(
        self._ready_env_ids.tolist() + finished_env_ids)
    # generate the statistics
    episode_count = sum(episode_count)
    duration = max(time.time() - start_time, 1e-9)
    self.collect_step += step_count
    self.collect_episode += episode_count
    self.collect_time += duration
    return {
        "n/ep": episode_count,
        "n/st": step_count,
        "v/st": step_count / duration,
        "v/ep": episode_count / duration,
        "rew": np.mean(rewards),
        "rew_std": np.std(rewards),
        "len": step_count / episode_count,
    }
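
# --- illustrative usage (hypothetical Collector instance) ------------------
# The list form of ``n_episode`` documented above collects an exact per-env
# quota; envs whose quota is 0 go straight into finished_env_ids and are
# never stepped. Construction of the policy and envs follows tianshou's API
# and is omitted here:
#
#   collector = Collector(policy, venv, buffer)   # venv has 3 sub-envs
#   stats = collector.collect(n_episode=[2, 0, 1])
#   assert stats['n/ep'] == 3                     # 2 + 0 + 1 episodes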
def collect(self,
            n_step: int = 0,
            n_episode: Union[int, List[int]] = 0,
            random: bool = False,
            render: Optional[float] = None,
            log_fn: Optional[Callable[[dict], None]] = None
            ) -> Dict[str, float]:
    """Collect a specified number of steps or episodes.

    :param int n_step: how many steps you want to collect.
    :param n_episode: how many episodes you want to collect (in each
        environment).
    :type n_episode: int or list
    :param bool random: whether to use random policy for collecting data,
        defaults to ``False``.
    :param float render: the sleep time between rendering consecutive
        frames, defaults to ``None`` (no rendering).
    :param function log_fn: a function which receives env info, typically
        for tensorboard logging.

    .. note::

        One and only one collection number specification is permitted,
        either ``n_step`` or ``n_episode``.

    :return: A dict including the following keys

        * ``n/ep`` the collected number of episodes.
        * ``n/st`` the collected number of steps.
        * ``v/st`` the speed of steps per second.
        * ``v/ep`` the speed of episodes per second.
        * ``rew`` the mean reward over collected episodes.
        * ``len`` the mean length over collected episodes.
    """
    warning_count = 0
    if not self._multi_env:
        n_episode = np.sum(n_episode)
    start_time = time.time()
    assert sum([(n_step != 0), (n_episode != 0)]) == 1, \
        "One and only one collection number specification is permitted!"
    cur_step = 0
    cur_episode = np.zeros(self.env_num) if self._multi_env else 0
    reward_sum = 0
    length_sum = 0
    while True:
        if warning_count >= 100000:
            warnings.warn(
                'There are already many steps in an episode. '
                'You should add a time limitation to your environment!',
                Warning)
        batch = Batch(obs=self._obs, act=self._act, rew=self._rew,
                      done=self._done, obs_next=None, info=self._info,
                      policy=None)
        if random:
            action_space = self.env.action_space
            if isinstance(action_space, list):
                result = Batch(act=[a.sample() for a in action_space])
            else:
                result = Batch(act=self._make_batch(action_space.sample()))
        else:
            with torch.no_grad():
                result = self.policy(batch, self.state)
        # save hidden state to policy._state, in order to save into buffer
        self.state = result.get('state', None)
        if hasattr(result, 'policy'):
            self._policy = to_numpy(result.policy)
            if self.state is not None:
                self._policy._state = self.state
        elif self.state is not None:
            self._policy = Batch(_state=self.state)
        else:
            self._policy = [{}] * self.env_num
        self._act = to_numpy(result.act)
        if self._action_noise is not None:
            self._act += self._action_noise(self._act.shape)
        obs_next, self._rew, self._done, self._info = self.env.step(
            self._act if self._multi_env else self._act[0])
        if not self._multi_env:
            obs_next = self._make_batch(obs_next)
            self._rew = self._make_batch(self._rew)
            self._done = self._make_batch(self._done)
            self._info = self._make_batch(self._info)
        if log_fn:
            log_fn(self._info if self._multi_env else self._info[0])
        if render:
            self.env.render()
            if render > 0:
                time.sleep(render)
        self.length += 1
        self.reward += self._rew
        if self.preprocess_fn:
            result = self.preprocess_fn(obs=self._obs, act=self._act,
                                        rew=self._rew, done=self._done,
                                        obs_next=obs_next, info=self._info,
                                        policy=self._policy)
            self._obs = result.get('obs', self._obs)
            self._act = result.get('act', self._act)
            self._rew = result.get('rew', self._rew)
            self._done = result.get('done', self._done)
            obs_next = result.get('obs_next', obs_next)
            self._info = result.get('info', self._info)
            self._policy = result.get('policy', self._policy)
        if self._multi_env:
            for i in range(self.env_num):
                data = {
                    'obs': self._obs[i],
                    'act': self._act[i],
                    'rew': self._rew[i],
                    'done': self._done[i],
                    'obs_next': obs_next[i],
                    'info': self._info[i],
                    'policy': self._policy[i],
                }
                if self._cached_buf:
                    warning_count += 1
                    self._cached_buf[i].add(**data)
                elif self._multi_buf:
                    warning_count += 1
                    self.buffer[i].add(**data)
                    cur_step += 1
                else:
                    warning_count += 1
                    if self.buffer is not None:
                        self.buffer.add(**data)
                    cur_step += 1
                if self._done[i]:
                    if n_step != 0 or np.isscalar(n_episode) or \
                            cur_episode[i] < n_episode[i]:
                        cur_episode[i] += 1
                        reward_sum += self.reward[i]
                        length_sum += self.length[i]
                        if self._cached_buf:
                            cur_step += len(self._cached_buf[i])
                            if self.buffer is not None:
                                self.buffer.update(self._cached_buf[i])
                    self.reward[i], self.length[i] = 0, 0
                    if self._cached_buf:
                        self._cached_buf[i].reset()
                    self._reset_state(i)
            if sum(self._done):
                obs_next = self.env.reset(np.where(self._done)[0])
                if self.preprocess_fn:
                    obs_next = self.preprocess_fn(obs=obs_next).get(
                        'obs', obs_next)
            if n_episode != 0:
                if isinstance(n_episode, list) and \
                        (cur_episode >= np.array(n_episode)).all() or \
                        np.isscalar(n_episode) and \
                        cur_episode.sum() >= n_episode:
                    break
        else:
            if self.buffer is not None:
                self.buffer.add(self._obs[0], self._act[0], self._rew[0],
                                self._done[0], obs_next[0], self._info[0],
                                self._policy[0])
            cur_step += 1
            if self._done:
                cur_episode += 1
                reward_sum += self.reward[0]
                length_sum += self.length
                self.reward, self.length = 0, 0
                self.state = None
                obs_next = self._make_batch(self.env.reset())
                if self.preprocess_fn:
                    obs_next = self.preprocess_fn(obs=obs_next).get(
                        'obs', obs_next)
            if n_episode != 0 and cur_episode >= n_episode:
                break
        if n_step != 0 and cur_step >= n_step:
            break
        self._obs = obs_next
    self._obs = obs_next
    if self._multi_env:
        cur_episode = sum(cur_episode)
    duration = max(time.time() - start_time, 1e-9)
    self.step_speed.add(cur_step / duration)
    self.episode_speed.add(cur_episode / duration)
    self.collect_step += cur_step
    self.collect_episode += cur_episode
    self.collect_time += duration
    if isinstance(n_episode, list):
        n_episode = np.sum(n_episode)
    else:
        n_episode = max(cur_episode, 1)
    return {
        'n/ep': cur_episode,
        'n/st': cur_step,
        'v/st': self.step_speed.get(),
        'v/ep': self.episode_speed.get(),
        'rew': reward_sum / n_episode,
        'len': length_sum / n_episode,
    }
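
# --- illustrative sketch (not part of the original collector) --------------
# The keyword protocol used by ``preprocess_fn`` above: it may receive any of
# the transition fields and returns a Batch containing only the fields it
# changed (the 255.0 image-scaling constant is an assumption for the demo).
def demo_preprocess_fn(**kwargs) -> Batch:
    result = Batch()
    if 'obs' in kwargs:
        result.obs = np.asarray(kwargs['obs'], dtype=np.float32) / 255.0
    if 'obs_next' in kwargs:
        result.obs_next = np.asarray(
            kwargs['obs_next'], dtype=np.float32) / 255.0
    return result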
def collect(
    self,
    n_step: Optional[int] = None,
    n_episode: Optional[Union[int, List[int]]] = None,  # how many episodes
    random: bool = False,
    render: Optional[float] = None,
    no_grad: bool = True,
) -> Dict[str, float]:
    """Collect a specified number of steps or episodes.

    :param int n_step: how many steps you want to collect.
    :param n_episode: how many episodes you want to collect. If it is an
        int, it means to collect at least ``n_episode`` episodes; if it is
        a list, it means to collect exactly ``n_episode[i]`` episodes in
        the i-th environment.
    :param bool random: whether to use random policy for collecting data,
        defaults to False.
    :param float render: the sleep time between rendering consecutive
        frames, defaults to None (no rendering).
    :param bool no_grad: whether to retain gradient in policy.forward,
        defaults to True (no gradient retaining).

    .. note::

        One and only one collection number specification is permitted,
        either ``n_step`` or ``n_episode``.

    :return: A dict including the following keys

        * ``n/ep`` the collected number of episodes.
        * ``n/st`` the collected number of steps.
        * ``v/st`` the speed of steps per second.
        * ``v/ep`` the speed of episodes per second.
        * ``rew`` the mean reward over collected episodes.
        * ``rew_std`` the standard deviation of reward over collected
          episodes.
        * ``len`` the mean length over collected episodes.
        * ``hit_rate``, ``class_rate``, ``mate_num`` the custom metrics
          accumulated from the env info below.
    """
    assert (n_step is not None and n_episode is None and n_step > 0) or (
        n_step is None and n_episode is not None and np.sum(n_episode) > 0
    ), "Only one of n_step or n_episode is allowed in Collector.collect, " \
        f"got n_step = {n_step}, n_episode = {n_episode}."
    start_time = time.time()
    step_count = 0
    # episode of each environment
    episode_count = np.zeros(self.env_num)
    # If n_episode is a list, and some envs have collected the required
    # number of episodes, these envs will be recorded in this list, and
    # they will not be stepped.
    finished_env_ids = []
    rewards = []
    whole_data = Batch()
    if isinstance(n_episode, list):
        assert len(n_episode) == self.get_env_num()
        finished_env_ids = [
            i for i in self._ready_env_ids if n_episode[i] <= 0]
        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
    right, wrong = 0., 0.
    mate_num = 0.
    right_index = defaultdict(int)
    while True:
        if step_count >= 100000 and episode_count.sum() == 0:
            warnings.warn(
                "There are already many steps in an episode. "
                "You should add a time limitation to your environment!",
                Warning)
        is_async = self.is_async or len(finished_env_ids) > 0
        if is_async:
            # self.data are the data for all environments in async
            # simulation or some envs have finished,
            # **only a subset of data are disposed**,
            # so we store the whole data in ``whole_data``, let self.data
            # be the data available in ready environments, and finally
            # set these back into all the data
            whole_data = self.data
            self.data = self.data[self._ready_env_ids]
        # restore the state and the input data
        last_state = self.data.state
        if isinstance(last_state, Batch) and last_state.is_empty():
            last_state = None
        self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())
        # calculate the next action
        if random:
            spaces = self._action_space
            result = Batch(
                act=[spaces[i].sample() for i in self._ready_env_ids])
        else:
            if no_grad:
                with torch.no_grad():  # faster than retain_grad version
                    result = self.policy(self.data, last_state)
            else:
                result = self.policy(self.data, last_state)
        state = result.get("state", Batch())
        # convert None to Batch(), since None is reserved for 0-init
        if state is None:
            state = Batch()
        self.data.update(state=state, policy=result.get("policy", Batch()))
        # save hidden state to policy._state, in order to save into buffer
        if not (isinstance(state, Batch) and state.is_empty()):
            self.data.policy._state = self.data.state
        self.data.act = to_numpy(result.act)
        if self._action_noise is not None:
            assert isinstance(self.data.act, np.ndarray)
            self.data.act += self._action_noise(self.data.act.shape)
        # step in env
        if not is_async:
            obs_next, rew, done, info = self.env.step(self.data.act)
        else:
            # store computed actions, states, etc
            _batch_set_item(whole_data, self._ready_env_ids,
                            self.data, self.env_num)
            # fetch finished data
            obs_next, rew, done, info = self.env.step(
                self.data.act, id=self._ready_env_ids)
            self._ready_env_ids = np.array([i["env_id"] for i in info])
            # get the stepped data
            self.data = whole_data[self._ready_env_ids]
        # move data to self.data; info cannot be updated yet, it has to be
        # updated together with obs below
        self.data.update(obs_next=obs_next, rew=rew, done=done)
        if render:
            self.env.render()
            time.sleep(render)
        # add data into the buffer
        if self.preprocess_fn:
            result = self.preprocess_fn(**self.data)  # type: ignore
            self.data.update(result)
        for j, i in enumerate(self._ready_env_ids):
            # j is the index in current ready_env_ids
            # i is the index in all environments
            if self.buffer is None:
                # users do not want to store data, so we store
                # small fake data here to make the code clean
                self._cached_buf[i].add(obs=0, act=0, rew=rew[j], done=0)
            else:
                # each env has a cached buffer that appends (rather than
                # overwrites), so every transition along the way is kept
                self._cached_buf[i].add(**self.data[j])
            if done[j]:
                if not (isinstance(n_episode, list)
                        and episode_count[i] >= n_episode[i]):
                    episode_count[i] += 1
                    rewards.append(self._rew_metric(
                        np.sum(self._cached_buf[i].rew, axis=0)))
                    step_count += len(self._cached_buf[i])
                    if self.buffer is not None:
                        self.buffer.update(self._cached_buf[i])
                    if isinstance(n_episode, list) and \
                            episode_count[i] >= n_episode[i]:
                        # env i has collected enough data, it has finished
                        finished_env_ids.append(i)
                # accumulate the custom metrics from the env info
                mate_num += info[j]['mate_num']
                if info[j]['right']:
                    right += 1
                    right_index[info[j]['ans']] += 1
                else:
                    wrong += 1
                self._cached_buf[i].reset()
                self._reset_state(j)
        obs_next = self.data.obs_next
        self.data.info = info
        if sum(done):
            # resetting the finished envs also produces a fresh initial
            # state for them here
            env_ind_local = np.where(done)[0]
            env_ind_global = self._ready_env_ids[env_ind_local]
            obs_reset = self.env.reset(env_ind_global)
            self.data['info']['history'][env_ind_local] = np.where(
                obs_reset != 0, np.ones_like(obs_reset),
                np.zeros_like(obs_reset))
            self.data['info']['turn'][env_ind_local] = np.zeros(
                len(env_ind_global))
            if self.preprocess_fn:
                obs_reset = self.preprocess_fn(obs=obs_reset).get(
                    "obs", obs_reset)
            obs_next[env_ind_local] = obs_reset
        self.data.obs = obs_next
        if is_async:
            # set data back
            whole_data = deepcopy(whole_data)  # avoid reference in ListBuf
            _batch_set_item(whole_data, self._ready_env_ids,
                            self.data, self.env_num)
            # let self.data be the data in all environments again
            self.data = whole_data
        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
        if n_step:
            if step_count >= n_step:
                break
        else:
            if isinstance(n_episode, int) and \
                    episode_count.sum() >= n_episode:
                break
            if isinstance(n_episode, list) and \
                    (episode_count >= n_episode).all():
                break
    # finished envs are ready, and can be used for the next collection
    self._ready_env_ids = np.array(
        self._ready_env_ids.tolist() + finished_env_ids)
    # generate the statistics
    episode_count = sum(episode_count)
    duration = max(time.time() - start_time, 1e-9)
    self.collect_step += step_count
    self.collect_episode += episode_count
    self.collect_time += duration
    return {
        "n/ep": episode_count,
        "n/st": step_count,
        "v/st": step_count / duration,
        "v/ep": episode_count / duration,
        "rew": np.mean(rewards),
        "rew_std": np.std(rewards),
        "len": step_count / episode_count,
        "hit_rate": right / (right + wrong),
        "class_rate": right_index,
        "mate_num": mate_num,
    }
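
# --- illustrative sketch (not part of the original collector) --------------
# ``self._rew_metric`` above reduces a possibly vector-valued episode return
# (e.g. multi-agent rewards) to a scalar before it is appended to the
# ``rewards`` list. The mean is one reasonable choice, shown here as an
# assumption:
def demo_rew_metric(episode_rew) -> float:
    return float(np.mean(episode_rew))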
def collect(
    self,
    n_step: Optional[int] = None,
    n_episode: Optional[Union[int, List[int]]] = None,
    random: bool = False,
    render: Optional[float] = None,
) -> Dict[str, float]:
    """Collect a specified number of steps or episodes.

    :param int n_step: how many steps you want to collect.
    :param n_episode: how many episodes you want to collect. If it is an
        int, it means to collect at least ``n_episode`` episodes; if it is
        a list, it means to collect exactly ``n_episode[i]`` episodes in
        the i-th environment.
    :param bool random: whether to use random policy for collecting data,
        defaults to ``False``.
    :param float render: the sleep time between rendering consecutive
        frames, defaults to ``None`` (no rendering).

    .. note::

        One and only one collection number specification is permitted,
        either ``n_step`` or ``n_episode``.

    :return: A dict including the following keys

        * ``n/ep`` the collected number of episodes.
        * ``n/st`` the collected number of steps.
        * ``v/st`` the speed of steps per second.
        * ``v/ep`` the speed of episodes per second.
        * ``rew`` the mean reward over collected episodes.
        * ``len`` the mean length over collected episodes.
    """
    assert (n_step and not n_episode) or (not n_step and n_episode), \
        "One and only one collection number specification is permitted!"
    start_time = time.time()
    step_count = 0
    # episode of each environment
    episode_count = np.zeros(self.env_num)
    reward_total = 0.0
    whole_data = Batch()
    while True:
        if step_count >= 100000 and episode_count.sum() == 0:
            warnings.warn(
                'There are already many steps in an episode. '
                'You should add a time limitation to your environment!',
                Warning)
        if self.is_async:
            # self.data are the data for all environments;
            # in async simulation, only a subset of data are disposed,
            # so we store the whole data in ``whole_data``, let self.data
            # be the data available in ready environments, and finally
            # set these back into all the data
            whole_data = self.data
            self.data = self.data[self._ready_env_ids]
        # restore the state and the input data
        last_state = self.data.state
        if isinstance(last_state, Batch) and last_state.is_empty():
            last_state = None
        self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())
        # calculate the next action
        if random:
            spaces = self._action_space
            result = Batch(
                act=[spaces[i].sample() for i in self._ready_env_ids])
        else:
            with torch.no_grad():
                result = self.policy(self.data, last_state)
        state = result.get('state', Batch())
        # convert None to Batch(), since None is reserved for 0-init
        if state is None:
            state = Batch()
        self.data.update(state=state, policy=result.get('policy', Batch()))
        # save hidden state to policy._state, in order to save into buffer
        if not (isinstance(self.data.state, Batch)
                and self.data.state.is_empty()):
            self.data.policy._state = self.data.state
        self.data.act = to_numpy(result.act)
        if self._action_noise is not None:
            self.data.act += self._action_noise(self.data.act.shape)
        # step in env
        if not self.is_async:
            obs_next, rew, done, info = self.env.step(self.data.act)
        else:
            # store computed actions, states, etc
            _batch_set_item(whole_data, self._ready_env_ids,
                            self.data, self.env_num)
            # fetch finished data
            obs_next, rew, done, info = self.env.step(
                action=self.data.act, id=self._ready_env_ids)
            self._ready_env_ids = np.array([i['env_id'] for i in info])
            # get the stepped data
            self.data = whole_data[self._ready_env_ids]
        # move data to self.data
        self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)
        if render:
            self.render()
            time.sleep(render)
        # add data into the buffer
        if self.preprocess_fn:
            result = self.preprocess_fn(**self.data)
            self.data.update(result)
        for j, i in enumerate(self._ready_env_ids):
            # j is the index in current ready_env_ids
            # i is the index in all environments
            self._cached_buf[i].add(**self.data[j])
            if self.data.done[j]:
                if n_step or np.isscalar(n_episode) or \
                        episode_count[i] < n_episode[i]:
                    episode_count[i] += 1
                    reward_total += np.sum(self._cached_buf[i].rew, axis=0)
                    step_count += len(self._cached_buf[i])
                    if self.buffer is not None:
                        self.buffer.update(self._cached_buf[i])
                self._cached_buf[i].reset()
                self._reset_state(j)
        obs_next = self.data.obs_next
        if sum(self.data.done):
            env_ind_local = np.where(self.data.done)[0]
            env_ind_global = self._ready_env_ids[env_ind_local]
            obs_reset = self.env.reset(env_ind_global)
            if self.preprocess_fn:
                obs_next[env_ind_local] = self.preprocess_fn(
                    obs=obs_reset).get('obs', obs_reset)
            else:
                obs_next[env_ind_local] = obs_reset
        self.data.obs = obs_next
        if self.is_async:
            # set data back
            _batch_set_item(whole_data, self._ready_env_ids,
                            self.data, self.env_num)
            # let self.data be the data in all environments again
            self.data = whole_data
        if n_step:
            if step_count >= n_step:
                break
        else:
            if isinstance(n_episode, int) and \
                    episode_count.sum() >= n_episode:
                break
            if isinstance(n_episode, list) and \
                    (episode_count >= n_episode).all():
                break
    # generate the statistics
    episode_count = sum(episode_count)
    duration = max(time.time() - start_time, 1e-9)
    self.collect_step += step_count
    self.collect_episode += episode_count
    self.collect_time += duration
    # average reward across the number of episodes
    reward_avg = reward_total / episode_count
    if np.asanyarray(reward_avg).size > 1:  # non-scalar reward_avg
        reward_avg = self._rew_metric(reward_avg)
    return {
        'n/ep': episode_count,
        'n/st': step_count,
        'v/st': step_count / duration,
        'v/ep': episode_count / duration,
        'rew': reward_avg,
        'len': step_count / episode_count,
    }
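
# --- illustrative sketch (not part of the original collector) --------------
# ``self._action_noise`` above is any callable mapping an action shape to an
# additive noise array (tianshou ships a GaussianNoise class with a similar
# role; the sigma value here is an arbitrary assumption):
class DemoGaussianNoise:
    def __init__(self, sigma: float = 0.1) -> None:
        self.sigma = sigma

    def __call__(self, shape) -> np.ndarray:
        return np.random.normal(0.0, self.sigma, shape)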
def collect(
    self,
    n_step: Optional[int] = None,
    n_episode: Optional[Union[int, List[int]]] = None,
    random: bool = False,
    render: Optional[float] = None,
    no_grad: bool = True,
) -> Dict[str, float]:
    """Collect a specified number of steps or episodes.

    :param int n_step: how many steps you want to collect.
    :param n_episode: how many episodes you want to collect. If it is an
        int, it means to collect at least ``n_episode`` episodes; if it is
        a list, it means to collect exactly ``n_episode[i]`` episodes in
        the i-th environment.
    :param bool random: whether to use random policy for collecting data,
        defaults to ``False``.
    :param float render: the sleep time between rendering consecutive
        frames, defaults to ``None`` (no rendering).
    :param bool no_grad: whether to retain gradient in policy.forward,
        defaults to ``True`` (no gradient retaining).

    .. note::

        One and only one collection number specification is permitted,
        either ``n_step`` or ``n_episode``.

    :return: A dict including the following keys

        * ``n/ep`` the collected number of episodes.
        * ``n/st`` the collected number of steps.
        * ``v/st`` the speed of steps per second.
        * ``v/ep`` the speed of episodes per second.
        * ``rew`` the mean reward over collected episodes.
        * ``len`` the mean length over collected episodes.
    """
    assert (n_step is not None and n_episode is None and n_step > 0) or (
        n_step is None and n_episode is not None and np.sum(n_episode) > 0
    ), "Only one of n_step or n_episode is allowed in Collector.collect, " \
        f"got n_step = {n_step}, n_episode = {n_episode}."
    start_time = time.time()
    step_count = 0
    # number of episodes finished in each environment
    episode_count = np.zeros(self.env_num)
    # If n_episode is a list, and some envs have collected the required
    # number of episodes, these envs will be recorded in this list, and
    # they will not be stepped.
    finished_env_ids = []
    reward_total = 0.0
    whole_data = Batch()
    list_n_episode = False
    # handle the case where each env collects a different episode quota
    if n_episode is not None and not np.isscalar(n_episode):
        assert len(n_episode) == self.get_env_num()
        # mark that the envs run different episode quotas
        list_n_episode = True
        finished_env_ids = [
            i for i in self._ready_env_ids if n_episode[i] <= 0]
        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
    while True:
        if step_count >= 100000 and episode_count.sum() == 0:
            warnings.warn(
                'There are already many steps in an episode. '
                'You should add a time limitation to your environment!',
                Warning)
        # collect asynchronously if the envs are async by design, or if
        # some envs have already finished their quota
        is_async = self.is_async or len(finished_env_ids) > 0
        if is_async:
            # self.data are the data for all environments in async
            # simulation or some envs have finished,
            # **only a subset of data are disposed**,
            # so we store the whole data in ``whole_data``, let self.data
            # be the data available in ready environments, and finally
            # set these back into all the data
            whole_data = self.data
            self.data = self.data[self._ready_env_ids]
        # restore the state and the input data
        last_state = self.data.state
        if isinstance(last_state, Batch) and last_state.is_empty():
            last_state = None
        self.data.update(state=Batch(), obs_next=Batch(), policy=Batch())
        # calculate the next action
        if random:
            spaces = self._action_space
            result = Batch(
                act=[spaces[i].sample() for i in self._ready_env_ids])
        else:
            if no_grad:
                with torch.no_grad():  # faster than retain_grad version
                    result = self.policy(self.data, last_state)
            else:
                result = self.policy(self.data, last_state)
        # state is used by RNN-based RL methods
        state = result.get('state', Batch())
        # convert None to Batch(), since None is reserved for 0-init
        # (could this check be removed?)
        if state is None:
            state = Batch()
        self.data.update(state=state, policy=result.get('policy', Batch()))
        # save hidden state to policy._state, in order to save into buffer
        if not (isinstance(state, Batch) and state.is_empty()):
            self.data.policy._state = self.data.state
        self.data.act = to_numpy(result.act)
        if self._action_noise is not None:  # noqa
            self.data.act += self._action_noise(self.data.act.shape)
        # step in env
        if not is_async:
            obs_next, rew, done, info = self.env.step(self.data.act)
        else:
            # store computed actions, states, etc: write the new values
            # from self.data back into whole_data
            _batch_set_item(whole_data, self._ready_env_ids,
                            self.data, self.env_num)
            # fetch finished data
            obs_next, rew, done, info = self.env.step(
                self.data.act, id=self._ready_env_ids)
            # (could this line be removed?)
            self._ready_env_ids = np.array([i['env_id'] for i in info])
            # get the stepped data
            self.data = whole_data[self._ready_env_ids]
        # move data to self.data
        self.data.update(obs_next=obs_next, rew=rew, done=done, info=info)
        if render:
            self.render()
            time.sleep(render)
        # preprocess the data before adding it to the buffer
        if self.preprocess_fn:
            result = self.preprocess_fn(**self.data)
            self.data.update(result)
        # add data into the buffer: first store this step in the matching
        # _cached_buf; if this step finishes an episode, flush that
        # _cached_buf into the main buffer and clear it
        for j, i in enumerate(self._ready_env_ids):
            # j is the index in current ready_env_ids
            # i is the index in all environments
            if self.buffer is None:
                # users do not want to store data, so we store
                # small fake data here to make the code clean
                self._cached_buf[i].add(obs=0, act=0, rew=rew[j], done=0)
            else:
                self._cached_buf[i].add(**self.data[j])
            if done[j]:
                if not (list_n_episode and
                        episode_count[i] >= n_episode[i]):
                    episode_count[i] += 1
                    reward_total += np.sum(self._cached_buf[i].rew, axis=0)
                    step_count += len(self._cached_buf[i])
                    if self.buffer is not None:
                        self.buffer.update(self._cached_buf[i])
                    if list_n_episode and \
                            episode_count[i] >= n_episode[i]:
                        # env i has collected enough data, it has finished
                        finished_env_ids.append(i)
                self._cached_buf[i].reset()
                self._reset_state(j)
        # update the current observation
        obs_next = self.data.obs_next
        # reset any env that has finished
        if sum(done):
            # np.where returns a tuple with the same dimensionality as
            # the input
            env_ind_local = np.where(done)[0]
            env_ind_global = self._ready_env_ids[env_ind_local]
            obs_reset = self.env.reset(env_ind_global)
            if self.preprocess_fn:
                obs_next[env_ind_local] = self.preprocess_fn(
                    obs=obs_reset).get('obs', obs_reset)
            else:
                obs_next[env_ind_local] = obs_reset
        self.data.obs = obs_next
        if is_async:
            # set data back
            whole_data = deepcopy(whole_data)  # avoid reference in ListBuf
            _batch_set_item(whole_data, self._ready_env_ids,
                            self.data, self.env_num)
            # let self.data be the data in all environments again
            self.data = whole_data
        self._ready_env_ids = np.array(
            [x for x in self._ready_env_ids if x not in finished_env_ids])
        if n_step:
            if step_count >= n_step:
                break
        else:
            if isinstance(n_episode, int) and \
                    episode_count.sum() >= n_episode:
                break
            if isinstance(n_episode, list) and \
                    (episode_count >= n_episode).all():
                break
    # finished envs are ready, and can be used for the next collection
    self._ready_env_ids = np.array(
        self._ready_env_ids.tolist() + finished_env_ids)
    # generate the statistics
    episode_count = sum(episode_count)
    duration = max(time.time() - start_time, 1e-9)
    self.collect_step += step_count
    self.collect_episode += episode_count
    self.collect_time += duration
    # average reward across the number of episodes
    reward_avg = reward_total / episode_count
    if np.asanyarray(reward_avg).size > 1:  # non-scalar reward_avg
        reward_avg = self._rew_metric(reward_avg)
    return {
        'n/ep': episode_count,
        'n/st': step_count,
        'v/st': step_count / duration,
        'v/ep': episode_count / duration,
        'rew': reward_avg,
        'len': step_count / episode_count,
    }
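
# --- illustrative sketch (not part of the original collector) --------------
# Standalone illustration of the finished-envs bookkeeping used above: ready
# ids are filtered against finished ids inside the loop, then merged back at
# the end so that every env is ready for the next collect() call.
def _demo_ready_finished_bookkeeping() -> None:
    ready_ids, finished = np.array([0, 1, 2, 3]), [1, 3]
    # inside the loop: drop finished envs from the ready set
    ready_ids = np.array([x for x in ready_ids if x not in finished])
    assert ready_ids.tolist() == [0, 2]
    # after the loop: merge finished envs back so all are ready again
    ready_ids = np.array(ready_ids.tolist() + finished)
    assert ready_ids.tolist() == [0, 2, 1, 3]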