Esempio n. 1
0
 def async_initialize(self,
                      agent,
                      bootstrap_value=False,
                      traj_info_kwargs=None,
                      seed=None):
     """Instantiate an example environment and use it to initialize the
     agent (on shared memory).  Pre-allocate a double-buffer for sample
     batches, and return that buffer along with example data (e.g.
     `observation`, `action`, etc.)

     :param agent: agent to initialize; receives the env's spaces and is
         set up with ``share_memory=True`` so workers see parameter updates.
     :param bootstrap_value: forwarded to ``build_samples_buffer``
         (reserves space for a bootstrap value estimate when True).
     :param traj_info_kwargs: optional dict; each entry is stored as a
         ``"_"``-prefixed class attribute on ``self.TrajInfoCls``.
     :param seed: RNG seed; drawn via ``make_seed()`` when None.
     :return: tuple ``(double_buffer, examples)``.
     """
     self.seed = make_seed() if seed is None else seed
     # Construct an example of each kind of data that needs to be stored.
     env = self.EnvCls(**self.env_kwargs)
     # Sampler always receives new params through shared memory:
     agent.initialize(
         env.spaces,
         share_memory=True,
         global_B=self.batch_spec.B,
         env_ranks=list(range(self.batch_spec.B)),
     )
     # Two identically-shaped shared buffers are allocated; presumably one
     # is filled while the other is consumed (double-buffering) — the
     # consumer side is not visible here.
     _, samples_np, examples = build_samples_buffer(
         agent,
         env,
         self.batch_spec,
         bootstrap_value,
         agent_shared=True,
         env_shared=True,
         subprocess=True,
     )  # Would like subprocess=True, but might hang?
     _, samples_np2, _ = build_samples_buffer(
         agent,
         env,
         self.batch_spec,
         bootstrap_value,
         agent_shared=True,
         env_shared=True,
         subprocess=True,
     )
     # The example env was only needed to size the buffers; discard it.
     env.close()
     del env
     if traj_info_kwargs:
         for k, v in traj_info_kwargs.items():
             setattr(self.TrajInfoCls, "_" + k, v)
     self.double_buffer = double_buffer = (samples_np, samples_np2)
     self.samples_np = samples_np  # In case leftover use during worker init.
     self.examples = examples
     self.agent = agent
     return double_buffer, examples
    def initialize(
            self,
            agent,
            affinity=None,
            seed=None,
            bootstrap_value=False,
            traj_info_kwargs=None,
            rank=0,
            world_size=1,
            ):
        """Build the batch of environments, initialize the agent, allocate
        the sample buffer, and construct the collector (plus an optional
        evaluation collector).  Returns per-field example data, usable for
        e.g. replay-buffer allocation.
        """
        batch_B = self.batch_spec.B
        environments = [self.EnvCls(**self.env_kwargs) for _ in range(batch_B)]
        total_B = batch_B * world_size
        ranks_for_envs = list(range(rank * batch_B, (rank + 1) * batch_B))
        # All envs share the same spaces, so the first one is representative.
        agent.initialize(environments[0].spaces, share_memory=False,
            global_B=total_B, env_ranks=ranks_for_envs)
        buffer_pyt, buffer_np, examples = build_samples_buffer(
            agent, environments[0], self.batch_spec, bootstrap_value,
            agent_shared=False, env_shared=False, subprocess=False)
        if traj_info_kwargs:
            # Stored as "_"-prefixed class attributes to avoid passing at init.
            for key, value in traj_info_kwargs.items():
                setattr(self.TrajInfoCls, "_" + key, value)
        batch_collector = self.CollectorCls(
            rank=0,
            envs=environments,
            samples_np=buffer_np,
            batch_T=self.batch_spec.T,
            TrajInfoCls=self.TrajInfoCls,
            agent=agent,
            global_B=total_B,
            env_ranks=ranks_for_envs,  # Might get applied redundantly to agent.
        )
        if self.eval_n_envs > 0:  # May do evaluation.
            evaluation_envs = [self.EnvCls(**self.eval_env_kwargs)
                for _ in range(self.eval_n_envs)]
            EvalCollector = self.eval_CollectorCls or SerialEvalCollector
            self.eval_collector = EvalCollector(
                envs=evaluation_envs,
                agent=agent,
                TrajInfoCls=self.TrajInfoCls,
                max_T=self.eval_max_steps // self.eval_n_envs,
                max_trajectories=self.eval_max_trajectories,
            )

        # Decorrelate env states, then hand the agent to the collector.
        first_inputs, first_traj_infos = batch_collector.start_envs(
            self.max_decorrelation_steps)
        batch_collector.start_agent()

        self.agent = agent
        self.samples_pyt = buffer_pyt
        self.samples_np = buffer_np
        self.collector = batch_collector
        self.agent_inputs = first_inputs
        self.traj_infos = first_traj_infos
        logger.log("Serial Sampler initialized.")
        return examples
Esempio n. 3
0
 def _build_buffers(self, env, bootstrap_value):
     """Allocate the sample buffers on shared memory, store them on
     ``self``, and return the per-field example data."""
     buffers = build_samples_buffer(
         self.agent, env, self.batch_spec, bootstrap_value,
         agent_shared=True, env_shared=True, subprocess=True)
     self.samples_pyt, self.samples_np, examples = buffers
     return examples
 def async_initialize(self,
                      agent,
                      bootstrap_value=False,
                      traj_info_kwargs=None,
                      seed=None):
     """Initialize the agent on shared memory using a throwaway example
     environment, and pre-allocate the two shared sample buffers (the
     double buffer).  Returns ``(double_buffer, examples)``."""
     self.seed = seed if seed is not None else make_seed()
     # Construct an example of each kind of data that needs to be stored.
     example_env = self.EnvCls(**self.env_kwargs)
     # Sampler always receives new params through shared memory:
     agent.initialize(example_env.spaces,
                      share_memory=True,
                      global_B=self.batch_spec.B,
                      env_ranks=list(range(self.batch_spec.B)))
     # Both halves of the double buffer are built identically.
     shared_args = (agent, example_env, self.batch_spec, bootstrap_value)
     # Would like subprocess=True, but might hang?
     _, buffer_a, examples = build_samples_buffer(
         *shared_args, agent_shared=True, env_shared=True, subprocess=True)
     _, buffer_b, _ = build_samples_buffer(
         *shared_args, agent_shared=True, env_shared=True, subprocess=True)
     example_env.close()
     del example_env
     if traj_info_kwargs:
         # Stored as "_"-prefixed class attributes to avoid passing at init.
         for key, value in traj_info_kwargs.items():
             setattr(self.TrajInfoCls, "_" + key, value)
     double_buffer = (buffer_a, buffer_b)
     self.double_buffer = double_buffer
     self.samples_np = buffer_a  # In case leftover use during worker init.
     self.examples = examples
     self.agent = agent
     return double_buffer, examples
Esempio n. 5
0
 def initialize(
     self,
     agent,
     affinity=None,
     seed=None,
     bootstrap_value=False,
     traj_info_kwargs=None,
     rank=0,
     world_size=1,
 ):
     """Should instantiate all components, including setup of parallel
     process if applicable."""
     batch_B = self.batch_spec.B
     total_B = batch_B * world_size
     ranks = list(range(rank * batch_B, (rank + 1) * batch_B))
     agent.initialize(self.env.spaces,
                      share_memory=False,
                      global_B=total_B,
                      env_ranks=ranks)
     self.env.seed(seed)
     # Probe one agent/env interaction to discover the data field shapes,
     # then size the sample buffers from those examples.
     examples = {}
     get_example_outputs_single(agent, self.env, examples, subprocess=False)
     samples_pyt, samples_np, examples = build_samples_buffer(
         agent, self.env, self.batch_spec, bootstrap_value,
         agent_shared=False, env_shared=False, subprocess=False,
         examples=examples)
     self.samples_pyt = samples_pyt
     self.samples_np = samples_np
     if traj_info_kwargs:
         # Avoid passing at init: stash as "_"-prefixed class attributes.
         for key, value in traj_info_kwargs.items():
             setattr(self.TrajInfoCls, "_" + key, value)
             setattr(self.ReturnTrajInfoCls, "_" + key, value)
     self.agent_inputs, self.traj_infos = self._decorrelate_envs()
     # Collector calls start_agent here, but doesn't apply
     self.agent = agent
     logger.log("Pomdp Sampler initialized.")
     return examples
Esempio n. 6
0
    def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
    ):
        """Construct the single batched environment, initialize the agent,
        allocate the sample buffers, and prime per-env agent inputs.
        Returns the per-field example data."""
        assert world_size == 1  # world size used in async samplers, not relevant for this class

        batch_T, batch_B = self.batch_spec
        self.agent = agent
        self.env = self.EnvCls(batch_T=batch_T, batch_B=batch_B, **self.env_kwargs)
        ranks = list(range(rank * batch_B, (rank + 1) * batch_B))
        agent.initialize(self.env.spaces,
                         share_memory=False,
                         global_B=batch_B,
                         env_ranks=ranks)
        self.samples_pyt, self.samples_np, examples = build_samples_buffer(
            agent, self.env, self.batch_spec, bootstrap_value,
            agent_shared=False, env_shared=False, subprocess=False,
            examples=self._get_example_outputs())

        # Pre-set done flags: False everywhere except the final time step.
        self.samples_np.env.done[:-1, :] = False
        self.samples_np.env.done[-1, :] = True
        self.traj_info_kwargs = traj_info_kwargs

        # One slot per env for the latest observation/action/reward.
        self.agent_inputs = AgentInputs(
            buffer_from_example(examples["observation"], (batch_B,)),
            buffer_from_example(examples["action"], (batch_B,)),
            buffer_from_example(examples["reward"], (batch_B,)))
        self._start_agent(batch_B, ranks)
        logger.log("BatchedEpisodicSampler initialized.")
        return examples
Esempio n. 7
0
    def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
    ):
        """Store the input arguments.  Instantiate the specified number of environment
        instances (``batch_B``).  Initialize the agent, and pre-allocate a memory buffer
        to hold the samples collected in each batch.  Applies ``traj_info_kwargs`` settings
        to the `TrajInfoCls` by direct class attribute assignment.  Instantiates the Collector
        and, if applicable, the evaluation Collector.

        Returns a structure of individual examples for data fields such as `observation`,
        `action`, etc, which can be used to allocate a replay buffer.
        """
        B = self.batch_spec.B
        # Each env receives a unique id; otherwise identically configured.
        envs = [self.EnvCls(id=i, **self.env_kwargs) for i in range(B)]
        global_B = B * world_size
        env_ranks = list(range(rank * B, (rank + 1) * B))
        # All envs share the same spaces, so envs[0] is representative.
        agent.initialize(envs[0].spaces,
                         share_memory=False,
                         global_B=global_B,
                         env_ranks=env_ranks)
        samples_pyt, samples_np, examples = build_samples_buffer(
            agent,
            envs[0],
            self.batch_spec,
            bootstrap_value,
            agent_shared=False,
            env_shared=False,
            subprocess=False)
        if traj_info_kwargs:
            for k, v in traj_info_kwargs.items():
                setattr(self.TrajInfoCls, "_" + k, v)  # Avoid passing at init.
        collector = self.CollectorCls(
            rank=0,
            envs=envs,
            samples_np=samples_np,
            batch_T=self.batch_spec.T,
            TrajInfoCls=self.TrajInfoCls,
            agent=agent,
            global_B=global_B,
            env_ranks=env_ranks,  # Might get applied redundantly to agent.
        )
        if self.eval_n_envs > 0:  # May do evaluation.
            eval_envs = [
                self.EnvCls(id=i, **self.eval_env_kwargs)
                for i in range(self.eval_n_envs)
            ]
            eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
            self.eval_collector = eval_CollectorCls(
                envs=eval_envs,
                agent=agent,
                TrajInfoCls=self.TrajInfoCls,
                # Total eval step budget split evenly across eval envs.
                max_T=self.eval_max_steps // self.eval_n_envs,
                max_trajectories=self.eval_max_trajectories,
            )

        # Step envs to decorrelate their states before regular sampling.
        agent_inputs, traj_infos = collector.start_envs(
            self.max_decorrelation_steps)
        collector.start_agent()

        self.agent = agent
        self.samples_pyt = samples_pyt
        self.samples_np = samples_np
        self.collector = collector
        self.agent_inputs = agent_inputs
        self.traj_infos = traj_infos
        logger.log("Serial Sampler initialized.")
        return examples
Esempio n. 8
0
    def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
    ):
        """
        initialize() is called from the startup() method of the runner class
        (e.g. MinibatchRlBase).
        """
        B = self.batch_spec.B  # Number of independent trajectories, i.e. of environment instances; >= 1.
        envs = [self.EnvCls(**self.env_kwargs)
                for _ in range(B)]  # Instantiate every environment, collected into a list.
        global_B = B * world_size  # Presumably the total number of replicated environment instances across processes.
        env_ranks = list(range(rank * B,
                               (rank + 1) * B))  # A list of size [number of environments].
        """
        Since every environment has identical spaces (meaning the action space
        and the observation space), it suffices to take one instance, envs[0],
        and use its spaces to represent all environments.  ``.spaces`` is used
        here like an attribute, but it is actually a function in class Env,
        decorated with @property so it can be accessed attribute-style.  In
        short: envs[0].spaces yields a namedtuple (EnvSpaces) with two fields:
        the observation space and the action space.
        """
        agent.initialize(envs[0].spaces,
                         share_memory=False,
                         global_B=global_B,
                         env_ranks=env_ranks)
        samples_pyt, samples_np, examples = build_samples_buffer(
            agent,
            envs[0],
            self.batch_spec,
            bootstrap_value,
            agent_shared=False,
            env_shared=False,
            subprocess=False)
        if traj_info_kwargs:
            for k, v in traj_info_kwargs.items():
                setattr(self.TrajInfoCls, "_" + k, v)  # Avoid passing at init.
        """
        For env_ranks, both collector.start_agent() and agent.initialize()
        above end up calling EpsilonGreedyAgentMixin.make_vec_eps(), which
        repeats part of the same logic — hence the original author's note on
        the collector constructor: "Might get applied redundantly to agent."
        """
        collector = self.CollectorCls(
            rank=0,
            envs=envs,
            samples_np=samples_np,
            batch_T=self.batch_spec.T,
            TrajInfoCls=self.TrajInfoCls,
            agent=agent,
            global_B=global_B,
            env_ranks=env_ranks,  # Might get applied redundantly to agent.
        )
        if self.eval_n_envs > 0:  # May do evaluation.
            eval_envs = [
                self.EnvCls(**self.eval_env_kwargs)
                for _ in range(self.eval_n_envs)
            ]
            eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
            self.eval_collector = eval_CollectorCls(
                envs=eval_envs,
                agent=agent,
                TrajInfoCls=self.TrajInfoCls,
                max_T=self.eval_max_steps //
                self.eval_n_envs,  # Floor division: per-env time-step budget, roughly an "average" number of steps.
                max_trajectories=self.eval_max_trajectories,
            )
        """
        Collect (i.e. sample) the first batch of data (observation, action,
        reward, etc.) along with the trajectory info (including reward and
        other statistics) for all environments.  The first batch is saved to
        the member variable self.agent_inputs because this is the sampler's
        initialization: when continuous data collection starts, it will step
        onward from this first batch, so fetching it is done here.
        """
        agent_inputs, traj_infos = collector.start_envs(
            self.max_decorrelation_steps)
        collector.start_agent()

        self.agent = agent  # Not used within this class; unclear whether external users of SerialSampler rely on it.
        self.samples_pyt = samples_pyt  # Samples in PyTorch format (backed by torch.Tensor).
        self.samples_np = samples_np  # Samples in numpy format (backed by numpy arrays).
        self.collector = collector  # The sample collector.
        self.agent_inputs = agent_inputs
        self.traj_infos = traj_infos  # A list.
        logger.log("Serial Sampler initialized.")
        return examples