def collect_evaluation(self, itr):
    assert self.max_trajectories == len(self.envs)
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    completed_traj_infos = list()
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, o in enumerate(observations):
        observation[b] = o
    action = buffer_from_example(self.envs[0].action_space.null_value(),
                                 len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    self.agent.eval_mode(itr)
    live_envs = list(range(len(self.envs)))
    for t in range(self.max_T):
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)

        b = 0
        while b < len(live_envs):  # Don't use a for loop, since live_envs changes over time.
            env_id = live_envs[b]
            o, r, d, env_info = self.envs[env_id].step(action[b])
            traj_infos[env_id].step(observation[b], action[b], r, d,
                                    agent_info[b], env_info)
            if getattr(env_info, "traj_done", d):
                completed_traj_infos.append(traj_infos[env_id].terminate(o))

                observation = delete_ind_from_array(observation, b)
                reward = delete_ind_from_array(reward, b)
                action = delete_ind_from_array(action, b)
                obs_pyt, act_pyt, rew_pyt = torchify_buffer(
                    (observation, action, reward))

                del live_envs[b]
                b -= 1  # live_envs[b] is now the next env, so go back one.
            else:
                observation[b] = o
                reward[b] = r

            b += 1

        if (self.max_trajectories is not None and
                len(completed_traj_infos) >= self.max_trajectories):
            logger.log("Evaluation reached max num trajectories "
                       f"({self.max_trajectories}).")
            return completed_traj_infos

        if t == self.max_T - 1:
            logger.log("Evaluation reached max num time steps "
                       f"({self.max_T}).")
            return completed_traj_infos
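# --- Illustration (not part of the collector above) ---
# The while loop in collect_evaluation() is used because live_envs shrinks while
# it is being traversed; a plain for loop over indices would skip entries after a
# deletion. A minimal, self-contained sketch of that pattern (values are made up):
live = [0, 1, 2, 3]        # indices of environments still running
finished = {1, 2}          # pretend these two finish on this pass

b = 0
while b < len(live):
    env_id = live[b]
    if env_id in finished:
        del live[b]        # element b is removed; the next one slides into slot b
        b -= 1             # step back so the b += 1 below re-visits slot b
    b += 1

print(live)                # [0, 3]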
def sample_batch(self, batch_T):
    """Can dynamically input the length of sequences to return via ``batch_T``;
    if ``None``, the internally set value is used.  Returns batch with leading
    dimensions ``[batch_T, batch_B]``.
    """
    if self.t > batch_T:
        return torchify_buffer(self.samples[0:int(batch_T)])
        # return torchify_buffer(self.samples[self.t - int(batch_T):self.t])
    else:
        return torchify_buffer(self.samples[:self.t])
def _generate_stochastic_minibatches(self, replay_ratio):
    cum_sleep_length = 0
    with self.rw_lock:
        self._async_pull()
    if not self._buffer_full:
        print('buffer not yet filled')
        return
    for minibatch in range(self.T_target):
        indexes = np.random.choice(self.buffer_size, self.optim_batch_B)
        with self.rw_lock:  # Read lock.
            batch = self.samples[:, indexes]
        yield torchify_buffer(batch), torchify_buffer(
            self.samples_prev_rnn_state[indexes]), cum_sleep_length
def extract_batch(self, T_idxs, B_idxs, T):
    """Return the full sequence of each field in ``agent_inputs`` (e.g.
    ``observation``), including all timesteps for the main sequence and for the
    target sequence in one array; many timesteps will likely overlap, so the
    algorithm can make sub-sequences by slicing on device, for reduced memory
    usage.

    Enforces that input ``T_idxs`` align with the RNN state interval.  Uses the
    helper function ``extract_sequences()`` to retrieve samples of length ``T``
    starting at locations ``[T_idxs, B_idxs]``, so the returned data batch has
    leading dimensions ``[T, len(B_idxs)]``."""
    s, rsi = self.samples, self.rnn_state_interval
    if rsi > 1:
        assert np.all(np.asarray(T_idxs) % rsi == 0)
        init_rnn_state = self.samples_prev_rnn_state[T_idxs // rsi, B_idxs]
    elif rsi == 1:
        init_rnn_state = self.samples.prev_rnn_state[T_idxs, B_idxs]
    else:  # rsi == 0
        init_rnn_state = None
    batch = SamplesFromReplay(
        all_observation=self.extract_observation(T_idxs, B_idxs,
            T + self.n_step_return),
        all_action=buffer_func(s.action, extract_sequences, T_idxs - 1,
            B_idxs, T + self.n_step_return),  # Starts at prev_action.
        all_reward=extract_sequences(s.reward, T_idxs - 1, B_idxs,
            T + self.n_step_return),  # Only prev_reward (agent + target).
        return_=extract_sequences(self.samples_return_, T_idxs, B_idxs, T),
        done=extract_sequences(s.done, T_idxs, B_idxs, T),
        done_n=extract_sequences(self.samples_done_n, T_idxs, B_idxs, T),
        init_rnn_state=init_rnn_state,  # (Same state for agent and target.)
    )
    # NOTE: Algo might need to zero prev_action/prev_reward depending on done.
    return torchify_buffer(batch)
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
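# --- Illustration (not part of build_samples_buffer above) ---
# A standalone NumPy sketch of the aliasing trick used above: action and
# prev_action are overlapping views into one (T + 1, B) array, so writing row t
# of action automatically shows up as row t + 1 of prev_action. Sizes here are
# made up for illustration only.
import numpy as np

T, B = 4, 2
all_action = np.zeros((T + 1, B), dtype="int64")
action = all_action[1:]        # rows 1..T of the shared storage
prev_action = all_action[:-1]  # rows 0..T-1 of the same storage

action[0] = 7                  # write at time step 0
print(prev_action[1])          # [7 7] -- the write is visible one step later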
def sample_batch(self, batch_B):
    while True:
        sampled_indices = False
        try:
            self._async_pull()  # Updates from writers.
            (T_idxs, B_idxs), priorities = self.priority_tree.sample(
                batch_B, unique=self.unique)
            sampled_indices = True
            if self.rnn_state_interval > 1:
                T_idxs = T_idxs * self.rnn_state_interval
            batch = self.extract_batch(T_idxs - 1, B_idxs, self.batch_T + 1)
            is_weights = (1. / (priorities + 1e-5)) ** self.beta
            is_weights /= max(is_weights)  # Normalize.
            is_weights = torchify_buffer(is_weights).float()
            batch = SamplesFromReplayPri(*batch, is_weights=is_weights)
            return self.sanitize_batch(batch)
        except Exception:
            print("FAILED TO LOAD BATCH")
            traceback.print_exc()
            if sampled_indices:
                print("B_idxs:", B_idxs, flush=True)
                print("T_idxs:", T_idxs, flush=True)
                print("Batch_T:", self.batch_T, flush=True)
                print("Buffer T:", self.T, flush=True)
def get_example_outputs_single(agent, env, examples, subprocess=False):
    """For pre-batched environments."""
    if subprocess:  # i.e. in subprocess.
        import torch
        torch.set_num_threads(1)  # Some fix to prevent MKL hang.
    o = env.reset()
    a = env.action_space.sample()
    o, r, d, env_info = env.step(a)
    r = np.asarray(r, dtype="float32")  # Must match torch float dtype here.
    agent.reset()
    agent_inputs = torchify_buffer(AgentInputs(o, a, r))
    a, agent_info = agent.step(*agent_inputs)
    if "prev_rnn_state" in agent_info:
        # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
        prev_rnn_state = agent_info.prev_rnn_state[0]
        agent_info_0 = agent_info.__class__(*(i[0] for i in agent_info))
        agent_info_0 = agent_info_0._replace(prev_rnn_state=prev_rnn_state)
    else:
        agent_info_0 = agent_info.__class__(*(i[0] for i in agent_info))
    env_info_0 = env_info.__class__(*(i[0] for i in env_info))
    examples["observation"] = o[0]
    examples["reward"] = r[0]
    examples["done"] = d[0]
    examples["env_info"] = env_info_0
    examples["action"] = a[0]  # OK to put torch tensor here, could numpify.
    examples["agent_info"] = agent_info_0
def get_example_outputs(agent, EnvCls, env_kwargs, examples, subprocess=False, env=None):
    """Do this in a sub-process to avoid setup conflict in master/workers
    (e.g. MKL)."""
    if subprocess:  # i.e. in subprocess.
        import torch
        torch.set_num_threads(1)  # Some fix to prevent MKL hang.
    if env is None:
        env = EnvCls(**env_kwargs)
    if not hasattr(env, 'spaces'):
        env = MVPWrapper(env)
    o = env.reset()
    a = env.action_space.sample()
    o, r, d, env_info = env.step(a)
    r = np.asarray(r, dtype="float32")  # Must match torch float dtype here.
    agent.reset()
    agent_inputs = torchify_buffer(AgentInputs(o, a, r))
    a, agent_info = agent.step(*agent_inputs)
    if "prev_rnn_state" in agent_info:
        # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
        agent_info = agent_info._replace(
            prev_rnn_state=agent_info.prev_rnn_state[0])
    examples["observation"] = o
    examples["reward"] = r
    examples["done"] = d
    examples["env_info"] = env_info
    examples["action"] = a  # OK to put torch tensor here, could numpify.
    examples["agent_info"] = agent_info
def collect_evaluation(self, itr):
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, o in enumerate(observations):
        observation[b] = o
    action = buffer_from_example(self.envs[0].action_space.null_value(),
                                 len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    self.agent.eval_mode(itr)
    for t in range(self.max_T):
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        for b, env in enumerate(self.envs):
            o, r, d, env_info = env.step(action[b])
            traj_infos[b].step(observation[b], action[b], r, d,
                               agent_info[b], env_info)
            if getattr(env_info, "traj_done", d):
                self.traj_infos_queue.put(traj_infos[b].terminate(o))
                traj_infos[b] = self.TrajInfoCls()
                o = env.reset()
            if d:
                action[b] = 0  # Next prev_action.
                r = 0
                self.agent.reset_one(idx=b)
            observation[b] = o
            reward[b] = r
        if self.sync.stop_eval.value:
            break
    self.traj_infos_queue.put(None)  # End sentinel.
def extract_batch(self, T_idxs, B_idxs, T):
    """Return the full sequence of each field, which encompasses all
    subsequences to be used, so the algorithm can make sub-sequences by
    slicing on device, for reduced memory usage."""
    s, rsi = self.samples, self.rnn_state_interval
    if rsi > 1:
        assert np.all(np.asarray(T_idxs) % rsi == 0)
        init_rnn_state = self.samples_prev_rnn_state[T_idxs // rsi, B_idxs]
    elif rsi == 1:
        init_rnn_state = self.samples.prev_rnn_state[T_idxs, B_idxs]
    else:  # rsi == 0
        init_rnn_state = None
    batch = SamplesFromReplay(
        all_observation=self.extract_observation(T_idxs, B_idxs,
            T + self.n_step_return),
        all_action=buffer_func(s.action, extract_sequences, T_idxs - 1,
            B_idxs, T + self.n_step_return),  # Starts at prev_action.
        all_reward=extract_sequences(s.reward, T_idxs - 1, B_idxs,
            T + self.n_step_return),  # Only prev_reward (agent + target).
        return_=extract_sequences(self.samples_return_, T_idxs, B_idxs, T),
        done=extract_sequences(s.done, T_idxs, B_idxs, T),
        done_n=extract_sequences(self.samples_done_n, T_idxs, B_idxs, T),
        init_rnn_state=init_rnn_state,  # (Same state for agent and target.)
    )
    # NOTE: Algo might need to zero prev_action/prev_reward depending on done.
    return torchify_buffer(batch)
def get_example_outputs(agent, env, examples, subprocess=False):
    """Do this in a sub-process to avoid setup conflict in master/workers
    (e.g. MKL)."""
    if subprocess:  # i.e. in subprocess.
        import torch
        torch.set_num_threads(1)  # Some fix to prevent MKL hang.
    o_reset = env.reset()
    a = env.action_space.sample()
    if a.shape == ():  # 'a' gets stored, but if it's array(3), step(3) is needed (e.g. for Mario actions).
        action = int(a)
    else:
        action = a
    o, r, d, env_info = env.step(action)
    r = np.asarray(r, dtype="float32")  # Must match torch float dtype here.
    agent.reset()
    agent_inputs = torchify_buffer(AgentInputs(o, a, r))
    a, agent_info = agent.step(*agent_inputs)
    if "prev_rnn_state" in agent_info:
        # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
        agent_info = agent_info._replace(
            prev_rnn_state=agent_info.prev_rnn_state[0])
    examples["observation"] = o_reset
    examples["reward"] = r
    examples["done"] = d
    examples["env_info"] = env_info
    examples["action"] = a  # OK to put torch tensor here, could numpify.
    examples["agent_info"] = agent_info
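# --- Illustration (not part of get_example_outputs above) ---
# Quick check of the scalar-action branch: a 0-d NumPy array such as np.array(3)
# has shape (), and int() turns it into the plain Python integer that some
# environments (e.g. the Mario wrapper mentioned in the comment) expect.
import numpy as np

a = np.array(3)        # 0-d array, as returned by some discrete action spaces
print(a.shape == ())   # True
print(int(a))          # 3 -- plain int suitable for env.step(3)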
def _get_example_outputs(self):
    examples = dict()
    o = self.env.reset()
    a = np.stack(
        [self.env.action_space.sample() for _ in range(self.batch_spec.B)],
        axis=0)
    o, r, d, env_info = self.env.step(a)
    a = np.asarray(a[0])  # Get first batch entry only.
    o = o[0]  # Get first batch entry only.
    r = np.asarray(r[0], dtype="float32")  # First batch entry only; must match torch float dtype here.
    self.agent.reset()
    agent_inputs = torchify_buffer(AgentInputs(o, a, r))
    a, agent_info = self.agent.step(*agent_inputs)
    if "prev_rnn_state" in agent_info:
        # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
        agent_info = agent_info._replace(
            prev_rnn_state=agent_info.prev_rnn_state[0])
    examples["observation"] = o
    examples["reward"] = r
    examples["done"] = d
    examples["env_info"] = env_info
    examples["action"] = a  # OK to put torch tensor here, could numpify.
    examples["agent_info"] = agent_info
    return examples
def collect_batch(self, agent_inputs, traj_infos, itr):
    # Numpy arrays can be written to from numpy arrays or torch tensors
    # (whereas torch tensors can only be written to from torch tensors).
    agent_buf, env_buf = self.samples_np.agent, self.samples_np.env
    completed_infos = list()
    observation, action, reward = agent_inputs
    b = np.where(self.done)[0]
    observation[b] = self.temp_observation[b]
    self.done[:] = False  # Did resets between batches.
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(agent_inputs)
    agent_buf.prev_action[0] = action  # Leading prev_action.
    if env_buf.prev_reward[0].ndim > reward.ndim:
        reward = reward[:, None].repeat(env_buf.prev_reward[0].shape[-1], -1)
    env_buf.prev_reward[0] = reward
    self.agent.sample_mode(itr)
    for t in range(self.batch_T):
        env_buf.observation[t] = observation
        # Agent inputs and outputs are torch tensors.
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        for b, env in enumerate(self.envs):
            if self.done[b]:
                action[b] = 0  # Record blank.
                reward[b] = 0
                if agent_info:
                    agent_info[b] = 0
                # Leave self.done[b] = True, record that.
                continue
            # Environment inputs and outputs are numpy arrays.
            o, r, d, env_info = env.step(action[b])
            traj_infos[b].step(observation[b], action[b], r, d,
                               agent_info[b], env_info)
            if getattr(env_info, "traj_done", d):
                completed_infos.append(traj_infos[b].terminate(o))
                traj_infos[b] = self.TrajInfoCls()
                self.need_reset[b] = True
            if d:
                self.temp_observation[b] = o
                o = 0  # Record blank.
            observation[b] = o
            reward[b] = r
            self.done[b] = d
            if env_info:
                env_buf.env_info[t, b] = env_info
        agent_buf.action[t] = action
        env_buf.reward[t] = reward
        env_buf.done[t] = self.done
        if agent_info:
            agent_buf.agent_info[t] = agent_info

    if "bootstrap_value" in agent_buf:
        # agent.value() should not advance rnn state.
        agent_buf.bootstrap_value[:] = self.agent.value(obs_pyt, act_pyt, rew_pyt)

    return AgentInputs(observation, action, reward), traj_infos, completed_infos
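# --- Illustration (not part of collect_batch above) ---
# The collector writes into NumPy buffers while the agent reads matching torch
# tensors; this works because torchify_buffer returns torch views that share
# memory with the NumPy arrays (in rlpyt it is built on torch.from_numpy).
# A minimal standalone sketch of that coupling:
import numpy as np
import torch

reward_np = np.zeros(4, dtype="float32")  # sampler-side NumPy buffer
reward_pyt = torch.from_numpy(reward_np)  # zero-copy torch view of the same memory

reward_np[2] = 1.5                        # collector writes NumPy...
print(reward_pyt)                         # ...agent sees tensor([0.0, 0.0, 1.5, 0.0])

reward_pyt[0] = -1.0                      # writes through the tensor land in NumPy too
print(reward_np)                          # [-1.   0.   1.5  0. ]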
def sample_batch(self, batch_B):
    while True:
        sampled_indices = False
        try:
            self._async_pull()  # Updates from writers.
            (T_idxs, B_idxs), priorities = self.priority_tree.sample(
                batch_B, unique=self.unique)
            sampled_indices = True
            if self.rnn_state_interval > 1:
                T_idxs = T_idxs * self.rnn_state_interval
            batch = self.extract_batch(T_idxs, B_idxs, self.batch_T)
        except Exception:
            print("FAILED TO LOAD BATCH")
            traceback.print_exc()
            if sampled_indices:
                print("B_idxs:", B_idxs, flush=True)
                print("T_idxs:", T_idxs, flush=True)
                print("Batch_T:", self.batch_T, flush=True)
                print("Buffer T:", self.T, flush=True)
            continue  # Retry rather than proceed with an incomplete batch.
        is_weights = (1. / (priorities + 1e-5)) ** self.beta
        is_weights /= max(is_weights)  # Normalize.
        is_weights = torchify_buffer(is_weights).float()
        elapsed_iters = self.t + self.T - T_idxs % self.T
        elapsed_samples = self.B * elapsed_iters
        values = torch.from_numpy(extract_sequences(
            self.samples.value, T_idxs, B_idxs,
            self.batch_T + self.n_step_return + 1))
        batch = SamplesFromReplayPriExt(*batch, values=values,
            is_weights=is_weights, age=elapsed_samples)
        if self.batch_T > 1:
            batch = self.sanitize_batch(batch)
        return batch
def extract_batch(self, T_idxs, B_idxs):
    """From buffer locations ``[T_idxs, B_idxs]``, extract data needed for
    training, including target values at ``T_idxs + n_step_return``.  Returns
    a namedarraytuple of torch tensors (see file for all fields).  Each tensor
    has leading batch dimension ``len(T_idxs) == len(B_idxs)``, but individual
    samples are drawn, so there is no leading time dimension."""
    s = self.samples
    target_T_idxs = (T_idxs + self.n_step_return) % self.T
    batch = SamplesFromReplay(
        agent_inputs=AgentInputs(
            observation=self.extract_observation(T_idxs, B_idxs),
            prev_action=s.action[T_idxs - 1, B_idxs],
            prev_reward=s.reward[T_idxs - 1, B_idxs],
        ),
        action=s.action[T_idxs, B_idxs],
        return_=self.samples_return_[T_idxs, B_idxs],
        done=self.samples.done[T_idxs, B_idxs],
        done_n=self.samples_done_n[T_idxs, B_idxs],
        target_inputs=AgentInputs(
            observation=self.extract_observation(target_T_idxs, B_idxs),
            prev_action=s.action[target_T_idxs - 1, B_idxs],
            prev_reward=s.reward[target_T_idxs - 1, B_idxs],
        ),
    )
    # target_... refers to what happened self.n_step_return timesteps later;
    # it serves as the target for predicting the n-step return.
    t_news = np.where(s.done[T_idxs - 1, B_idxs])[0]
    batch.agent_inputs.prev_action[t_news] = 0
    batch.agent_inputs.prev_reward[t_news] = 0
    return torchify_buffer(batch)
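# --- Illustration (not part of extract_batch above) ---
# Small NumPy sketch of the index arithmetic above: the target locations sit
# n_step_return steps ahead of the sampled locations, wrapped around the
# circular buffer of length T. Buffer length and indices here are made up.
import numpy as np

T = 8
n_step_return = 3
T_idxs = np.array([0, 5, 7])
target_T_idxs = (T_idxs + n_step_return) % T
print(target_T_idxs)           # [3 0 2] -- the last two wrap past the end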
def get_example_outputs(agent, env, examples, subprocess=False):
    """Do this in a sub-process to avoid setup conflict in master/workers
    (e.g. MKL).

    In a freshly reset environment (starting from scratch), take one random
    action and record the resulting observation, reward, etc. into the
    ``examples`` dict passed in.
    Note: although ``examples`` is plural, it does not hold the result of
    multiple environment steps, only the result of a single step; "example"
    would arguably be a better name.

    :param agent: an agent instance.
    :param env: an environment instance.
    :param examples: serves as both input and output; may come in as an empty
        dict and is filled in place.
    :param subprocess: whether this runs in a subprocess.
    :return: nothing; the data is returned through the ``examples`` argument.
    """
    if subprocess:  # i.e. in subprocess.
        import torch
        torch.set_num_threads(1)  # Some fix to prevent MKL hang.
    o = env.reset()  # Reset the environment, starting from scratch.
    a = env.action_space.sample()  # Pick a random index from the action space.
    o, r, d, env_info = env.step(a)  # Step the environment with the chosen action (index).
    r = np.asarray(r, dtype="float32")  # Must match torch float dtype here.
    agent.reset()
    agent_inputs = torchify_buffer(AgentInputs(o, a, r))
    a, agent_info = agent.step(*agent_inputs)  # The star unpacks agent_inputs into the 3 arguments step() expects.
    if "prev_rnn_state" in agent_info:
        # Agent leaves B dimension in, strip it: [B,N,H] --> [N,H]
        agent_info = agent_info._replace(prev_rnn_state=agent_info.prev_rnn_state[0])
    examples["observation"] = o
    examples["reward"] = r  # A single-element NumPy array.
    examples["done"] = d  # bool
    examples["env_info"] = env_info  # An EnvInfo object.
    examples["action"] = a  # OK to put torch tensor here, could numpify.
    examples["agent_info"] = agent_info
def collect_evaluation(self, itr):
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    completed_traj_infos = list()
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    action = buffer_from_example(
        self.envs[0].action_space.sample(null=True), len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    for t in range(self.max_T):
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        for b, env in enumerate(self.envs):
            o, r, d, env_info = env.step(action[b])
            traj_infos[b].step(observation[b], action[b], r, d,
                               agent_info[b], env_info)
            if getattr(env_info, "traj_done", d):
                completed_traj_infos.append(traj_infos[b].terminate(o))
                traj_infos[b] = self.TrajInfoCls()
                o = env.reset()
            if d:
                action[b] = 0  # Prev_action for next step.
                r = 0
                self.agent.reset_one(idx=b)
            observation[b] = o
            reward[b] = r
        if (self.max_trajectories is not None and
                len(completed_traj_infos) >= self.max_trajectories):
            break
    return completed_traj_infos
def extract_batch(self, T_idxs, B_idxs):
    batch = super().extract_batch(T_idxs, B_idxs)
    batch = SamplesFromReplayTL(
        *batch,
        timeout=self.samples.timeout[T_idxs, B_idxs],
        timeout_n=self.samples_timeout_n[T_idxs, B_idxs],
    )
    return torchify_buffer(batch)
def build_step_buffer(examples, B):
    step_bufs = {
        k: buffer_from_example(examples[k], B, share_memory=True)
        for k in ["observation", "action", "reward", "done", "agent_info"]
    }
    step_buffer_np = StepBuffer(**step_bufs)
    step_buffer_pyt = torchify_buffer(step_buffer_np)
    return step_buffer_pyt, step_buffer_np
def build_step_buffer(examples, B):
    bufs = tuple(
        buffer_from_example(examples[k], B, shared_memory=True)
        for k in ["observation", "action", "reward", "done", "agent_info"])
    need_reset = buffer_from_example(examples["done"], B, shared_memory=True)
    step_buffer_np = StepBuffer(*bufs, need_reset)
    step_buffer_pyt = torchify_buffer(step_buffer_np)
    return step_buffer_pyt, step_buffer_np
def extract_batch(self, T_idxs, B_idxs, T):
    s = self.samples
    batch = SamplesFromReplay(
        observation=self.extract_observation(T_idxs, B_idxs, T),
        action=buffer_func(s.action, extract_sequences, T_idxs, B_idxs, T),
        reward=extract_sequences(s.reward, T_idxs, B_idxs, T),
        done=extract_sequences(s.done, T_idxs, B_idxs, T),
    )
    return torchify_buffer(batch)
def sample_batch(self, batch_B):
    (T_idxs, B_idxs), priorities = self.priority_tree.sample(
        batch_B, unique=self.unique)
    batch = self.extract_batch(T_idxs, B_idxs)
    is_weights = (1. / (priorities + EPS)) ** self.beta  # Unnormalized.
    is_weights /= max(is_weights)  # Normalize.
    is_weights = torchify_buffer(is_weights).float()
    return SamplesFromReplayPri(*batch, is_weights=is_weights)
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        if agent.dual_model:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            int_bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsvTwin(*agent_buffer, bootstrap_value=bv,
                int_bootstrap_value=int_bv)
        else:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    # All-zero arrays (except the 0th index should equal o_reset).
    observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    next_observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)  # All-zero values.
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        next_observation=next_observation,
        prev_reward=prev_reward,
        reward=reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    # torchify_buffer links the two: changes to samples_np reflect in samples_pyt.
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
def _generate_deterministic_batches(self):
    # Replay ratio of 1, with deterministic batch selection.
    cum_sleep_length = 0
    for i in range(self.T_target):
        while True:
            with self.rw_lock:  # Get read lock.
                self._async_pull()
            if self.t >= self.optim_batch_B * (i + 1) or self._buffer_full:
                break
            time.sleep(self.sleep_length)
            cum_sleep_length += self.sleep_length if i > 0 else 0
        # Batch is available.
        indexes = np.arange(i * self.optim_batch_B, (i + 1) * self.optim_batch_B)
        with self.rw_lock:  # Read lock.
            batch = self.samples[:, indexes]
        yield torchify_buffer(batch), torchify_buffer(
            self.samples_prev_rnn_state[indexes]), cum_sleep_length
def sample_batch(self, batch_B):
    (tree_T_idxs, B_idxs), priorities = self.priority_tree.sample(
        batch_B, unique=self.unique)
    if self.rnn_state_interval > 1:
        T_idxs = tree_T_idxs * self.rnn_state_interval
    else:
        T_idxs = tree_T_idxs
    batch = self.extract_batch(T_idxs, B_idxs, self.batch_T)
    is_weights = (1. / priorities) ** self.beta
    is_weights /= max(is_weights)  # Normalize.
    is_weights = torchify_buffer(is_weights).float()
    return SamplesFromReplayPri(*batch, is_weights=is_weights)
def obtain_samples(self, itr, mode='sample'):
    agent_buf, env_buf = self.samples_np.agent, self.samples_np.env

    # Reset agent inputs.
    observation, action, reward = self.agent_inputs
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(self.agent_inputs)
    action[:], reward[:] = self.env.action_space.null_value(), 0

    # Reset environment and agent.
    observation[:] = self.env.reset()
    self.agent.reset()
    agent_buf.prev_action[0], env_buf.prev_reward[0] = action, reward  # Leading prev_action.

    # Perform episode.
    if mode == 'sample':
        self.agent.sample_mode(itr)
    elif mode == 'eval':
        self.agent.eval_mode(itr)
    traj_infos = [self.TrajInfoCls(**self.traj_info_kwargs)
                  for _ in range(self.batch_spec.B)]
    for t in range(self.batch_spec.T):
        env_buf.observation[t] = observation
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)  # TODO: why do this? They share the same memory.
        o, r, _, env_info = self.env.step(action)
        d = (t == self.batch_spec.T - 1)
        for b in range(self.batch_spec.B):
            traj_infos[b].step(observation[b], action[b], r[b], d,
                               agent_info[b], env_info)
            if env_info:
                env_buf.env_info[t, b] = env_info
        observation[:] = o
        reward[:] = r
        agent_buf.action[t] = action
        env_buf.reward[t] = reward
        if agent_info:
            agent_buf.agent_info[t] = agent_info

    if "bootstrap_value" in agent_buf:
        agent_buf.bootstrap_value[:] = self.agent.value(obs_pyt, act_pyt, rew_pyt)

    return self.samples_pyt, traj_infos
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)

    :param agent: an Agent instance.
    :param env: an environment instance.
    :param batch_spec: a BatchSpec instance.
    """
    if examples is None:
        if subprocess:  # Spawn a child process.
            mgr = mp.Manager()  # The Manager module provides shared resources.
            examples = mgr.dict()  # Examples pickled back to master; shared with the child process.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))  # Worker process runs the target function with the given args.
            w.start()
            w.join()
        else:
            examples = dict()
            # examples is updated inside get_example_outputs(), so there is no return value.
            get_example_outputs(agent, env, examples)

    T, B = batch_spec  # Number of time steps and number of environment instances.
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
def sample_batch(self, batch_B):
    """Returns batch with leading dimensions ``[self.batch_T, batch_B]``,
    with each sequence sampled randomly according to priority.
    (``self.batch_T`` should not be changed.)"""
    (T_idxs, B_idxs), priorities = self.priority_tree.sample(
        batch_B, unique=self.unique)
    if self.rnn_state_interval > 1:
        T_idxs = T_idxs * self.rnn_state_interval
    batch = self.extract_batch(T_idxs, B_idxs, self.batch_T)
    is_weights = (1. / priorities) ** self.beta
    is_weights /= max(is_weights)  # Normalize.
    is_weights = torchify_buffer(is_weights).float()
    return SamplesFromReplayPri(*batch, is_weights=is_weights)
def policy(time_step):
    obs = dmc_wrapper.convertObservation(time_step.observation)
    reward = time_step.reward
    reward = np.asarray(reward) if reward is not None else reward
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (obs, get_prev_action(), reward))
    # obs_pyt, rew_pyt = torchify_buffer((obs, reward))
    act_pyt, agent_info = agent.step(obs_pyt.float(), act_pyt, rew_pyt)
    # prev_action = act_pyt
    return act_pyt
def sample_batch(self, batch_B):
    """Calls on the priority tree to generate random samples.  Returns
    samples data and normalized importance-sampling weights:
    ``is_weights = priorities ** -beta``."""
    (T_idxs, B_idxs), priorities = self.priority_tree.sample(
        batch_B, unique=self.unique)
    batch = self.extract_batch(T_idxs, B_idxs)
    is_weights = (1. / (priorities + EPS)) ** self.beta  # Unnormalized.
    is_weights /= max(is_weights)  # Normalize.
    is_weights = torchify_buffer(is_weights).float()
    return SamplesFromReplayPri(*batch, is_weights=is_weights)
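# --- Illustration (not part of sample_batch above) ---
# Tiny NumPy sketch of the importance-sampling weight computation used in the
# prioritized sample_batch variants above; priorities, EPS, and beta are
# made-up values for illustration.
import numpy as np

EPS = 1e-6
beta = 0.4
priorities = np.array([0.1, 1.0, 5.0])

is_weights = (1. / (priorities + EPS)) ** beta  # Unnormalized: high priority -> small weight.
is_weights /= is_weights.max()                  # Normalize so the largest weight is 1.
print(is_weights)                               # approx [1.0, 0.398, 0.209]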