def async_initialize(self, agent, bootstrap_value=False, traj_info_kwargs=None, seed=None):
    """Instantiate an example environment and use it to initialize the
    agent (on shared memory).  Pre-allocate a double buffer for sample
    batches, and return that buffer along with example data fields
    (e.g. ``observation``, ``action``, etc.).
    """
    self.seed = seed if seed is not None else make_seed()
    # A throwaway env provides spaces and example data for allocation.
    example_env = self.EnvCls(**self.env_kwargs)
    # Sampler always receives new params through shared memory:
    world_B = self.batch_spec.B
    agent.initialize(
        example_env.spaces,
        share_memory=True,
        global_B=world_B,
        env_ranks=list(range(world_B)),
    )
    # Two identically-shaped shared buffers make up the double buffer.
    _, buffer_a, examples = build_samples_buffer(
        agent, example_env, self.batch_spec, bootstrap_value,
        agent_shared=True, env_shared=True, subprocess=True)
    _, buffer_b, _ = build_samples_buffer(
        agent, example_env, self.batch_spec, bootstrap_value,
        agent_shared=True, env_shared=True, subprocess=True)
    example_env.close()
    del example_env
    if traj_info_kwargs:
        for k, v in traj_info_kwargs.items():
            # "_"-prefixed class attributes avoid passing these at init.
            setattr(self.TrajInfoCls, "_" + k, v)
    self.double_buffer = double_buffer = (buffer_a, buffer_b)
    self.samples_np = buffer_a  # In case of leftover use during worker init.
    self.examples = examples
    self.agent = agent
    return double_buffer, examples
def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
        ):
    """Build the environments, initialize the agent, allocate the sample
    buffer, and construct the collector(s); runs environment
    decorrelation and returns example data fields (``observation``,
    ``action``, etc.) which can be used to allocate a replay buffer.
    """
    B = self.batch_spec.B
    env_list = [self.EnvCls(**self.env_kwargs) for _ in range(B)]
    global_B = B * world_size
    env_ranks = list(range(rank * B, (rank + 1) * B))
    # All envs share the same spaces, so the first one suffices here.
    agent.initialize(env_list[0].spaces, share_memory=False,
        global_B=global_B, env_ranks=env_ranks)
    samples_pyt, samples_np, examples = build_samples_buffer(
        agent, env_list[0], self.batch_spec, bootstrap_value,
        agent_shared=False, env_shared=False, subprocess=False)
    if traj_info_kwargs:
        for k, v in traj_info_kwargs.items():
            setattr(self.TrajInfoCls, "_" + k, v)  # Avoid passing at init.
    collector = self.CollectorCls(
        rank=0,
        envs=env_list,
        samples_np=samples_np,
        batch_T=self.batch_spec.T,
        TrajInfoCls=self.TrajInfoCls,
        agent=agent,
        global_B=global_B,
        env_ranks=env_ranks,  # Might get applied redundantly to agent.
    )
    if self.eval_n_envs > 0:  # May do evaluation.
        eval_envs = [self.EnvCls(**self.eval_env_kwargs)
            for _ in range(self.eval_n_envs)]
        eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
        self.eval_collector = eval_CollectorCls(
            envs=eval_envs,
            agent=agent,
            TrajInfoCls=self.TrajInfoCls,
            max_T=self.eval_max_steps // self.eval_n_envs,
            max_trajectories=self.eval_max_trajectories,
        )
    agent_inputs, traj_infos = collector.start_envs(
        self.max_decorrelation_steps)
    collector.start_agent()
    self.agent = agent
    self.samples_pyt = samples_pyt
    self.samples_np = samples_np
    self.collector = collector
    self.agent_inputs = agent_inputs
    self.traj_infos = traj_infos
    logger.log("Serial Sampler initialized.")
    return examples
def _build_buffers(self, env, bootstrap_value):
    """Allocate shared-memory sample buffers (built in a subprocess) and
    return the example data fields."""
    buffers = build_samples_buffer(self.agent, env, self.batch_spec,
        bootstrap_value, agent_shared=True, env_shared=True,
        subprocess=True)
    self.samples_pyt, self.samples_np, examples = buffers
    return examples
def async_initialize(self, agent, bootstrap_value=False, traj_info_kwargs=None, seed=None):
    """Initialize the agent on shared memory using a temporary example
    environment, pre-allocate a double buffer of sample batches, and
    return the double buffer together with example data fields."""
    self.seed = seed if seed is not None else make_seed()
    env = self.EnvCls(**self.env_kwargs)  # Temporary, for spaces/examples.
    # Sampler always receives new params through shared memory:
    agent.initialize(env.spaces, share_memory=True,
        global_B=self.batch_spec.B,
        env_ranks=list(range(self.batch_spec.B)))
    # Allocate the two halves of the double buffer in shared memory.
    _, buf_a, examples = build_samples_buffer(agent, env, self.batch_spec,
        bootstrap_value, agent_shared=True, env_shared=True,
        subprocess=True)
    _, buf_b, _ = build_samples_buffer(agent, env, self.batch_spec,
        bootstrap_value, agent_shared=True, env_shared=True,
        subprocess=True)
    env.close()
    del env
    if traj_info_kwargs:
        for k, v in traj_info_kwargs.items():
            setattr(self.TrajInfoCls, "_" + k, v)
    self.double_buffer = double_buffer = (buf_a, buf_b)
    self.samples_np = buf_a  # In case of leftover use during worker init.
    self.examples = examples
    self.agent = agent
    return double_buffer, examples
def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
        ):
    """Should instantiate all components, including setup of parallel
    process if applicable."""
    B = self.batch_spec.B
    global_B = B * world_size
    env_ranks = list(range(rank * B, (rank + 1) * B))
    agent.initialize(self.env.spaces, share_memory=False,
        global_B=global_B, env_ranks=env_ranks)
    self.env.seed(seed)
    # Pre-compute example outputs in-process; build_samples_buffer then
    # consumes (and rebinds) the examples dict.
    examples = dict()
    get_example_outputs_single(agent, self.env, examples, subprocess=False)
    self.samples_pyt, self.samples_np, examples = build_samples_buffer(
        agent, self.env, self.batch_spec, bootstrap_value,
        agent_shared=False, env_shared=False, subprocess=False,
        examples=examples)
    if traj_info_kwargs:
        for k, v in traj_info_kwargs.items():
            # Applied to both trajectory-info classes; the "_" prefix
            # avoids passing these at init.
            setattr(self.TrajInfoCls, "_" + k, v)
            setattr(self.ReturnTrajInfoCls, "_" + k, v)
    self.agent_inputs, self.traj_infos = self._decorrelate_envs()
    # Collector calls start_agent here, but doesn't apply.
    self.agent = agent
    logger.log("Pomdp Sampler initialized.")
    return examples
def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
        ):
    """Instantiate the batched environment, initialize the agent, and
    allocate the sample buffer and leading agent inputs; returns example
    data fields."""
    # World size is used in async samplers, not relevant for this class.
    assert world_size == 1
    T, B = self.batch_spec
    self.agent = agent
    self.env = self.EnvCls(batch_T=T, batch_B=B, **self.env_kwargs)
    env_ranks = list(range(rank * B, (rank + 1) * B))
    agent.initialize(self.env.spaces, share_memory=False, global_B=B,
        env_ranks=env_ranks)
    self.samples_pyt, self.samples_np, examples = build_samples_buffer(
        agent, self.env, self.batch_spec, bootstrap_value,
        agent_shared=False, env_shared=False, subprocess=False,
        examples=self._get_example_outputs())
    # Mark only the final time step as done across all environments.
    self.samples_np.env.done[:-1, :] = False
    self.samples_np.env.done[-1, :] = True
    self.traj_info_kwargs = traj_info_kwargs
    # One slot per environment for the leading (obs, action, reward).
    self.agent_inputs = AgentInputs(
        buffer_from_example(examples["observation"], (B,)),
        buffer_from_example(examples["action"], (B,)),
        buffer_from_example(examples["reward"], (B,)),
    )
    self._start_agent(B, env_ranks)
    logger.log("BatchedEpisodicSampler initialized.")
    return examples
def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
        ):
    """Store the input arguments.  Instantiate the specified number of
    environment instances (``batch_B``).  Initialize the agent, and
    pre-allocate a memory buffer to hold the samples collected in each
    batch.  Applies ``traj_info_kwargs`` settings to the ``TrajInfoCls``
    by direct class attribute assignment.  Instantiates the Collector
    and, if applicable, the evaluation Collector.

    Returns a structure of individual examples for data fields such as
    ``observation``, ``action``, etc., which can be used to allocate a
    replay buffer.
    """
    B = self.batch_spec.B
    env_list = [self.EnvCls(id=i, **self.env_kwargs) for i in range(B)]
    global_B = B * world_size
    env_ranks = list(range(rank * B, (rank + 1) * B))
    # Spaces are shared by all envs; pass the first env's to the agent.
    agent.initialize(env_list[0].spaces, share_memory=False,
        global_B=global_B, env_ranks=env_ranks)
    samples_pyt, samples_np, examples = build_samples_buffer(
        agent, env_list[0], self.batch_spec, bootstrap_value,
        agent_shared=False, env_shared=False, subprocess=False)
    if traj_info_kwargs:
        for k, v in traj_info_kwargs.items():
            setattr(self.TrajInfoCls, "_" + k, v)  # Avoid passing at init.
    collector = self.CollectorCls(
        rank=0,
        envs=env_list,
        samples_np=samples_np,
        batch_T=self.batch_spec.T,
        TrajInfoCls=self.TrajInfoCls,
        agent=agent,
        global_B=global_B,
        env_ranks=env_ranks,  # Might get applied redundantly to agent.
    )
    if self.eval_n_envs > 0:  # May do evaluation.
        eval_envs = [self.EnvCls(id=i, **self.eval_env_kwargs)
            for i in range(self.eval_n_envs)]
        eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
        self.eval_collector = eval_CollectorCls(
            envs=eval_envs,
            agent=agent,
            TrajInfoCls=self.TrajInfoCls,
            max_T=self.eval_max_steps // self.eval_n_envs,
            max_trajectories=self.eval_max_trajectories,
        )
    agent_inputs, traj_infos = collector.start_envs(
        self.max_decorrelation_steps)
    collector.start_agent()
    self.agent = agent
    self.samples_pyt = samples_pyt
    self.samples_np = samples_np
    self.collector = collector
    self.agent_inputs = agent_inputs
    self.traj_infos = traj_infos
    logger.log("Serial Sampler initialized.")
    return examples
def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
        ):
    """Set up environments, agent, sample buffer, and collector(s), and
    run decorrelation to obtain the first batch of agent inputs.  Called
    from the runner's ``startup()`` (e.g. ``MinibatchRlBase``).  Returns
    example data fields which can be used to allocate a replay buffer.
    """
    B = self.batch_spec.B  # Number of independent environment instances.
    envs = [self.EnvCls(**self.env_kwargs) for _ in range(B)]
    global_B = B * world_size
    env_ranks = list(range(rank * B, (rank + 1) * B))
    # Every env has the same spaces (an EnvSpaces namedtuple exposing
    # observation and action spaces via the `spaces` property), so only
    # the first env's spaces need to be handed to the agent.
    agent.initialize(envs[0].spaces, share_memory=False,
        global_B=global_B, env_ranks=env_ranks)
    samples_pyt, samples_np, examples = build_samples_buffer(
        agent, envs[0], self.batch_spec, bootstrap_value,
        agent_shared=False, env_shared=False, subprocess=False)
    if traj_info_kwargs:
        for k, v in traj_info_kwargs.items():
            setattr(self.TrajInfoCls, "_" + k, v)  # Avoid passing at init.
    # NOTE: both agent.initialize() above and collector.start_agent()
    # below touch env_ranks, so part of that logic runs twice — hence
    # the "redundantly" remark on the argument.
    collector = self.CollectorCls(
        rank=0,
        envs=envs,
        samples_np=samples_np,
        batch_T=self.batch_spec.T,
        TrajInfoCls=self.TrajInfoCls,
        agent=agent,
        global_B=global_B,
        env_ranks=env_ranks,  # Might get applied redundantly to agent.
    )
    if self.eval_n_envs > 0:  # May do evaluation.
        eval_envs = [self.EnvCls(**self.eval_env_kwargs)
            for _ in range(self.eval_n_envs)]
        eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
        self.eval_collector = eval_CollectorCls(
            envs=eval_envs,
            agent=agent,
            TrajInfoCls=self.TrajInfoCls,
            # Total eval steps split evenly (floor) across eval envs.
            max_T=self.eval_max_steps // self.eval_n_envs,
            max_trajectories=self.eval_max_trajectories,
        )
    # Collect the first batch of inputs (observation, action, reward)
    # while decorrelating the envs; later sampling steps continue from
    # these stored agent_inputs.
    agent_inputs, traj_infos = collector.start_envs(
        self.max_decorrelation_steps)
    collector.start_agent()
    self.agent = agent
    self.samples_pyt = samples_pyt  # Samples in PyTorch-tensor form.
    self.samples_np = samples_np  # Samples in numpy-array form.
    self.collector = collector
    self.agent_inputs = agent_inputs
    self.traj_infos = traj_infos  # List of per-trajectory info/statistics.
    logger.log("Serial Sampler initialized.")
    return examples