Example #1
 def collect_evaluation(self, itr):
     traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
     completed_traj_infos = list()
     observations = list()
     for env in self.envs:
         observations.append(env.reset())
     observation = buffer_from_example(observations[0], len(self.envs))
     for b, o in enumerate(observations):
         observation[b] = o
     action = buffer_from_example(self.envs[0].action_space.null_value(),
                                  len(self.envs))
     reward = np.zeros(len(self.envs), dtype="float32")
     obs_pyt, act_pyt, rew_pyt = torchify_buffer(
         (observation, action, reward))
     self.agent.reset()
     self.agent.eval_mode(itr)
     for t in range(self.max_T):
         act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
         action = numpify_buffer(act_pyt)
         for b, env in enumerate(self.envs):
             o, r, d, env_info = env.step(action[b])
             traj_infos[b].step(observation[b], action[b], r, d,
                                agent_info[b], env_info)
             if getattr(env_info, "traj_done", d):
                 completed_traj_infos.append(traj_infos[b].terminate(o))
                 traj_infos[b] = self.TrajInfoCls()
                 o = env.reset()
             if ((type(d) is np.ndarray and d.any())
                     or (type(d) is bool and d)):
                 action[b] = 0  # Prev_action for next step.
                 r = 0
                 self.agent.reset_one(idx=b)
             observation[b] = o
             reward[b] = r
         if (self.max_trajectories is not None
                 and len(completed_traj_infos) >= self.max_trajectories):
             logger.log("Evaluation reached max num trajectories "
                        f"({self.max_trajectories}).")
             break
     if t == self.max_T - 1:
         logger.log("Evaluation reached max num time steps "
                    f"({self.max_T}).")
     return completed_traj_infos
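A note on the buffer pattern above: `torchify_buffer` returns torch tensors that share memory with the numpy buffers (it is built on `torch.from_numpy`), so the in-place writes to `observation`, `action`, and `reward` inside the env loop are visible to `agent.step` without copying. A minimal standalone sketch of that idea using plain numpy/torch rather than the rlpyt helpers:

    import numpy as np
    import torch

    # Toy stand-in for the reward buffer above (4 parallel envs).
    reward = np.zeros(4, dtype="float32")
    rew_pyt = torch.from_numpy(reward)  # Shares memory with `reward`.

    reward[2] = 1.0  # In-place numpy write, as in the env loop...
    print(rew_pyt)   # ...already visible through the torch view: tensor([0., 0., 1., 0.])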
Example #2
    def data_parallel(self):
        """Wraps the model with PyTorch's DistributedDataParallel.  The
        intention is for rlpyt to create a separate Python process to drive
        each GPU (or CPU-group for CPU-only, MPI-like configuration). Agents
        with additional model components (beyond ``self.model``) which will
        have gradients computed through them should extend this method to wrap
        those, as well.

        Typically called in the runner during startup.
        """
        if self.device.type == "cpu":
            self.model = DDPC(self.model)
            logger.log("Initialized DistributedDataParallelCPU agent model.")
        else:
            self.model = DDP(self.model,
                             device_ids=[self.device.index],
                             output_device=self.device.index)
            logger.log("Initialized DistributedDataParallel agent model on "
                       f"device {self.device}.")
Example #3
 def init_obs_norm(self, agent):
     """
     Initializes observation normalization parameters in intrinsic bonus model.
     Uses distinct environment for this purpose.
     """
     agent.set_norm_update(True)
     env = self.EnvCls(**self.env_kwargs)
     env.reset()
     logger.log(f"Sampler initializing bonus model observation normalization, steps: {self.obs_norm_steps}")
     for _ in range(self.obs_norm_steps):
         action = env.action_space.sample()
         obs, _, done, _ = env.step(action)
         obs = torch.from_numpy(obs).to(device=agent.device)
         # Prepare observation, flattening channel dim (frame-stack) into batch dim for image input
         if len(obs.shape) == 3:  # (C, H, W)
             obs = obs.view((-1, 1, *obs.shape[1:]))
         agent.bonus_model.normalize_obs(obs)
         if done:
             env.reset()
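The reshape above folds the frame-stack (channel) dimension into the batch dimension so each stacked frame is normalized as a separate single-channel image. A quick standalone check of that view, with an illustrative Atari-like shape:

    import torch

    obs = torch.zeros(4, 84, 84)                 # (C, H, W): 4 stacked 84x84 frames.
    if len(obs.shape) == 3:
        obs = obs.view((-1, 1, *obs.shape[1:]))  # Same reshape as in init_obs_norm.
    print(obs.shape)                             # torch.Size([4, 1, 84, 84])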
Example #4
 def validation(self, itr):
     logger.log("Computing validation loss...")
     val_info = ValInfo(*([] for _ in range(len(ValInfo._fields))))
     self.optimizer.zero_grad()
     for _ in range(self.n_validation_batches):
         samples = self.replay_buffer.sample_batch(self.validation_batch_B,
                                                   validation=True)
         with torch.no_grad():
             cpc_loss, cpc_accuracies, conv_output = self.cpc_loss(samples)
         val_info.cpcLoss.append(cpc_loss.item())
         val_info.cpcAccuracy1.append(cpc_accuracies[0].item())
         val_info.cpcAccuracy2.append(cpc_accuracies[1].item())
         val_info.cpcAccuracyTm1.append(cpc_accuracies[2].item())
         val_info.cpcAccuracyTm2.append(cpc_accuracies[3].item())
         val_info.convActivation.append(
             conv_output[0, 0].detach().cpu().view(-1).numpy())
     self.optimizer.zero_grad()
     logger.log("...validation loss completed.")
     return val_info
Example #5
    def data_parallel(self):
        """Wraps the model with PyTorch's DistributedDataParallel.  The
        intention is for rlpyt to create a separate Python process to drive
        each GPU (or CPU-group for CPU-only, MPI-like configuration). Agents
        with additional model components (beyond ``self.model``) which will
        have gradients computed through them should extend this method to wrap
        those, as well.

        Typically called in the runner during startup.
        """
        device_id = self.device.index  # None if cpu, else cuda index.
        self.model = DDP(
            self.model,
            device_ids=None if device_id is None else [device_id],  # 1 GPU.
            output_device=device_id,
        )
        logger.log("Initialized DistributedDataParallel agent model on "
                   f"device {self.device}.")
        return device_id
Example #6
 def __init__(self, example, **kwargs):
     field_names = [f for f in example._fields if f != "observation"]
     global BufferSamples
     BufferSamples = namedarraytuple("BufferSamples", field_names)
     buffer_example = BufferSamples(*(v for k, v in example.items()
                                      if k != "observation"))
     super().__init__(example=buffer_example, **kwargs)
     # Equivalent to image.shape[0] if observation is image array (C,H,W):
     self.n_frames = n_frames = get_leading_dims(example.observation,
                                                 n_dim=1)[0]
     logger.log(f"Frame-based buffer using {n_frames}-frame sequences.")
     # frames: oldest stored at t; duplicate n_frames - 1 beginning & end.
     self.samples_frames = buffer_from_example(
         example.observation[0], (self.T + n_frames - 1, self.B),
         share_memory=self.async_)  # [T+n_frames-1,B,H,W]
     # new_frames: shifted so newest stored at t; no duplication.
     self.samples_new_frames = self.samples_frames[n_frames -
                                                   1:]  # [T,B,H,W]
     self.off_forward = max(self.off_forward, n_frames - 1)
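The two buffers above are views of the same memory, offset by `n_frames - 1` along the time axis, so writing the newest frame for step `t` into `samples_new_frames` lands at row `t + n_frames - 1` of `samples_frames`. A toy numpy illustration of that relationship (shapes are made up):

    import numpy as np

    T, B, n_frames = 5, 1, 4
    samples_frames = np.zeros((T + n_frames - 1, B))     # [T + n_frames - 1, B]
    samples_new_frames = samples_frames[n_frames - 1:]   # [T, B] view, newest frame at t.

    samples_new_frames[0, 0] = 1.0
    print(samples_frames[n_frames - 1, 0])               # 1.0 -- the buffers share memory.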
Example #7
 def initialize_replay_buffer(self, examples, batch_spec, async_=False):
     example_to_buffer = self.examples_to_buffer(examples)
     replay_kwargs = dict(
         example=example_to_buffer,
         size=self.replay_size,
         B=batch_spec.B,
         n_step_return=self.n_step_return,
     )
     if not self.bootstrap_timelimit:
         ReplayCls = AsyncUniformReplayBuffer if async_ else UniformReplayBuffer
     else:
         ReplayCls = AsyncTlUniformReplayBuffer if async_ else TlUniformReplayBuffer
     if self.ReplayBufferCls is not None:
         ReplayCls = self.ReplayBufferCls
         logger.log(
             f"WARNING: ignoring internal selection logic and using"
             f" input replay buffer class: {ReplayCls} -- compatibility not"
             " guaranteed.")
     self.replay_buffer = ReplayCls(**replay_kwargs)
Example #8
 def get_n_itr(self):
     """
     Determine number of train loop iterations to run.  Converts logging
     interval units from environment steps to iterations.
     """
     # Log at least as often as requested (round down itrs):
     log_interval_itrs = max(self.log_interval_steps // self.itr_batch_size,
                             1)
     n_itr = self.n_steps // self.itr_batch_size
     if n_itr % log_interval_itrs > 0:  # Keep going to next log itr.
         n_itr += log_interval_itrs - (n_itr % log_interval_itrs)
     self.log_interval_itrs = log_interval_itrs
     self.n_itr = n_itr
     # If we're transferring by timestep instead of iteration, round up to next iteration
     if self.transfer_timestep:
         self.transfer_iteration = int(
             -(-self.n_steps // self.itr_batch_size))  # Ceiling divide
     logger.log(f"Running {n_itr} iterations of minibatch RL.")
     return n_itr
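A worked example of the rounding above, with illustrative numbers (not taken from any particular config):

    itr_batch_size = 256           # Env steps gathered per iteration.
    log_interval_steps = 10_000
    n_steps = 1_000_000

    log_interval_itrs = max(log_interval_steps // itr_batch_size, 1)  # 39 itrs (~9,984 steps).
    n_itr = n_steps // itr_batch_size                                 # 3906 itrs.
    if n_itr % log_interval_itrs > 0:
        n_itr += log_interval_itrs - (n_itr % log_interval_itrs)      # Rounded up to 3939.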
Example #9
 def validation(self, itr):
     logger.log("Computing validation loss...")
     val_info = ValInfo(*([] for _ in range(len(ValInfo._fields))))
     self.optimizer.zero_grad()
     for _ in range(self.n_validation_batches):
         samples = self.replay_buffer.sample_batch(self.batch_size,
                                                   validation=True)
         with torch.no_grad():
             inv_loss, ent_loss, accuracy, perplexity, conv_output = self.inverse_loss(
                 samples)
         val_info.invLoss.append(inv_loss.item())
         val_info.entLoss.append(ent_loss.item())
         val_info.accuracy.append(accuracy.item())
         val_info.perplexity.append(perplexity.item())
         val_info.convActivation.append(conv_output[0].detach().cpu().view(
             -1).numpy())  # Keep 1 full one.
     self.optimizer.zero_grad()
     logger.log("...validation loss completed.")
     return val_info
Example #10
    def log_diagnostics(self, itr, traj_infos=None, eval_time=0):
        if itr > 0:
            self.pbar.stop()
        if itr >= self.min_itr_learn - 1:
            self.save_itr_snapshot(itr)
        new_time = time.time()
        self._cum_time = new_time - self._start_time
        train_time_elapsed = new_time - self._last_time - eval_time
        new_updates = self.algo.update_counter - self._last_update_counter
        new_samples = (self.sampler.batch_size * self.world_size *
                       self.log_interval_itrs)
        updates_per_second = (float('nan') if itr == 0 else new_updates /
                              train_time_elapsed)
        samples_per_second = (float('nan') if itr == 0 else new_samples /
                              train_time_elapsed)
        replay_ratio = (new_updates * self.algo.batch_size * self.world_size /
                        new_samples)
        cum_replay_ratio = (self.algo.batch_size * self.algo.update_counter /
                            ((itr + 1) * self.sampler.batch_size)
                            )  # world_size cancels.
        cum_steps = (itr + 1) * self.sampler.batch_size * self.world_size

        if self._eval:
            logger.record_tabular(
                'CumuTrainTime',
                self._cum_time - self._cum_eval_time)  # Already added new eval_time.
        logger.record_tabular('Iteration', itr)
        logger.record_tabular('CumuTime (s)', self._cum_time)
        logger.record_tabular('CumuSteps', cum_steps)
        logger.record_tabular('CumuCompletedTrajs', self._cum_completed_trajs)
        logger.record_tabular('CumuUpdates', self.algo.update_counter)
        logger.record_tabular('StepsPerSecond', samples_per_second)
        logger.record_tabular('UpdatesPerSecond', updates_per_second)
        logger.record_tabular('ReplayRatio', replay_ratio)
        logger.record_tabular('CumuReplayRatio', cum_replay_ratio)
        self._log_infos(traj_infos)
        logger.dump_tabular(with_prefix=False)

        self._last_time = new_time
        self._last_update_counter = self.algo.update_counter
        if itr < self.n_itr - 1:
            logger.log(f"Optimizing over {self.log_interval_itrs} iterations.")
            self.pbar = ProgBarCounter(self.log_interval_itrs)
Example #11
    def _get_n_envs_list(self, affinity=None, n_worker=None, B=None):

        B = self.batch_spec.B if B is None else B
        n_worker = len(
            affinity["workers_cpus"]) if n_worker is None else n_worker
        if B < n_worker:
            logger.log(
                f"WARNING: requested fewer envs ({B}) than available worker "
                f"processes ({n_worker}). Using fewer workers (but maybe "
                "better to increase sampler's `batch_B`).")
            n_worker = B
        n_envs_list = [B // n_worker] * n_worker
        if B % n_worker != 0:
            logger.log("WARNING: unequal number of envs per process, from "
                       f"batch_B {self.batch_spec.B} and n_worker {n_worker} "
                       "(possible suboptimal speed).")
            for b in range(B % n_worker):
                n_envs_list[b] += 1
        return n_envs_list
Example #12
def buffer_concatenate(buffers, axis=0):
    assert type(buffers) == tuple
    if isinstance(buffers[0], np.ndarray):
        try:
            return np.concatenate(buffers, axis=axis)
        except ValueError:
            logger.log("Had a ValueError in buffer concat, probably action dimensions that don't line up, populating with zeros.")
            logger.log(f"buffer shapes: {[buf.shape for buf in buffers]}")
            return np.zeros((buffers[0].shape[0], sum(buf.shape[1] for buf in buffers)))
    fields = buffers[0]._fields
    for buf in buffers:
        # try to make sure they're the same structure
        assert buf._fields == fields
    new_buf = buffers[0]._make(tuple(
        buffer_concatenate(tuple(getattr(buf, field) for buf in buffers), axis=1)
        for field in fields))
    return new_buf
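A minimal usage sketch for the plain-ndarray branch (array shapes are made up for illustration); note that for namedarraytuple inputs the leaf fields are concatenated with `axis=1` regardless of the `axis` argument passed at the top level:

    import numpy as np

    a = np.zeros((8, 2), dtype="float32")
    b = np.ones((8, 3), dtype="float32")
    out = buffer_concatenate((a, b), axis=1)  # Must be a tuple, per the assert.
    print(out.shape)                          # (8, 5)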
Example #13
 def optim_startup(self):
     main_affinity = self.affinity.optimizer[0]
     p = psutil.Process()
     if main_affinity.get("set_affinity", True):
         p.cpu_affinity(main_affinity["cpus"])
     logger.log(f"Optimizer master CPU affinity: {p.cpu_affinity()}.")
     torch.set_num_threads(main_affinity["torch_threads"])
     logger.log(f"Optimizer master Torch threads: {torch.get_num_threads()}.")
     self.agent.to_device(main_affinity.get("cuda_idx", None))
     if self.world_size > 1:
         self.agent.data_parallel()
     self.algo.optim_initialize(rank=0)
     throttle_itr = 1 + getattr(self.algo,
         "min_steps_learn", 0) // self.sampler_batch_size
     delta_throttle_itr = (self.algo.batch_size * self.world_size *
         self.algo.updates_per_optimize /  # (is updates_per_sync)
         (self.sampler_batch_size * self.algo.replay_ratio))
     self.initialize_logging()
     return throttle_itr, delta_throttle_itr
Example #14
 def log_diagnostics(self,
                     itr,
                     eval_traj_infos,
                     eval_time,
                     save_cur=False,
                     prefix='Diagnostics/'):
     if not eval_traj_infos:
         logger.log("WARNING: had no complete trajectories in eval.")
     steps_in_eval = sum([info["Length"] for info in eval_traj_infos])
     with logger.tabular_prefix(prefix):
         logger.record_tabular('StepsInEval', steps_in_eval)
         logger.record_tabular('TrajsInEval', len(eval_traj_infos))
         self._cum_eval_time += eval_time
         logger.record_tabular('CumEvalTime', self._cum_eval_time)
     super().log_diagnostics(itr,
                             eval_traj_infos,
                             eval_time,
                             save_cur,
                             prefix=prefix)
Example #15
 def startup(self):
     p = psutil.Process()
     p.cpu_affinity(self.affinity["cpus"])
     logger.log("Optimizer master CPU affinity: {p.cpu_affinity()}.")
     torch.set_num_threads(self.affinity["torch_threads"])
     logger.log("Optimizer master Torch threads: {torch.get_num_threads()}.")
     set_seed(self.seed)
     self.agent.initialize_cuda(
         cuda_idx=self.affinity.get("cuda_idx", None),
         dpp=self.n_runner > 1,
     )
     self.algo.initialize_async(agent=self.agent,
         updates_per_sync=self.updates_per_sync)
     throttle_itr = 1 + self.algo.min_steps_learn // self.itr_batch_size
     delta_throttle_itr = (self.algo.batch_size * self.n_runner *
         self.algo.updates_per_optimize /  # (is updates_per_sync)
         (self.itr_batch_size * self.training_ratio))
     self.initialize_logging()
     return throttle_itr, delta_throttle_itr
Example #16
    def initialize(self, agent, affinity=None, seed=None,
            bootstrap_value=False, traj_info_kwargs=None):
        envs = [self.EnvCls(**self.env_kwargs) for _ in range(self.batch_spec.B)]
        agent.initialize(envs[0].spaces, share_memory=False)
        samples_pyt, samples_np, examples = build_samples_buffer(agent, envs[0],
            self.batch_spec, bootstrap_value, agent_shared=False,
            env_shared=False, subprocess=False)
        if traj_info_kwargs:
            for k, v in traj_info_kwargs.items():
                setattr(self.TrajInfoCls, "_" + k, v)  # Avoid passing at init.
        collector = self.CollectorCls(
            rank=0,
            envs=envs,
            samples_np=samples_np,
            batch_T=self.batch_spec.T,
            TrajInfoCls=self.TrajInfoCls,
            agent=agent,
        )
        if self.eval_n_envs > 0:  # May do evaluation.
            eval_envs = [self.EnvCls(**self.eval_env_kwargs)
                for _ in range(self.eval_n_envs)]
            eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
            self.eval_collector = eval_CollectorCls(
                envs=eval_envs,
                agent=agent,
                TrajInfoCls=self.TrajInfoCls,
                max_T=self.eval_max_steps // self.eval_n_envs,
                max_trajectories=self.eval_max_trajectories,
            )

        agent_inputs, traj_infos = collector.start_envs(
            self.max_decorrelation_steps)
        collector.start_agent()

        self.agent = agent
        self.samples_pyt = samples_pyt
        self.samples_np = samples_np
        self.collector = collector
        self.agent_inputs = agent_inputs
        self.traj_infos = traj_infos
        logger.log("Serial Sampler initialized.")
        return examples
Example #17
    def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples):
        if agent.recurrent:
            raise NotImplementedError
        self.agent = agent
        self.n_itr = n_itr
        self.mid_batch_reset = mid_batch_reset
        self.mu_optimizer = self.OptimCls(agent.mu_parameters(),
                                          lr=self.mu_learning_rate,
                                          **self.optim_kwargs)
        self.q_optimizer = self.OptimCls(agent.q_parameters(),
                                         lr=self.q_learning_rate,
                                         **self.optim_kwargs)
        if self.initial_optim_state_dict is not None:
            self.q_optimizer.load_state_dict(
                self.initial_optim_state_dict["q"])
            self.mu_optimizer.load_state_dict(
                self.initial_optim_state_dict["mu"])

        sample_bs = batch_spec.size
        train_bs = self.batch_size
        assert (self.training_ratio * sample_bs) % train_bs == 0
        self.updates_per_optimize = int(
            (self.training_ratio * sample_bs) // train_bs)
        logger.log(
            f"From sampler batch size {sample_bs}, training "
            f"batch size {train_bs}, and training ratio "
            f"{self.training_ratio}, computed {self.updates_per_optimize} "
            f"updates per iteration.")
        self.min_itr_learn = self.min_steps_learn // sample_bs
        self.agent.give_min_itr_learn(self.min_itr_learn)

        example_to_buffer = SamplesToBuffer(
            observation=examples["observation"],
            action=examples["action"],
            reward=examples["reward"],
            done=examples["done"],
        )
        replay_kwargs = dict(example=example_to_buffer,
                             size=self.replay_size,
                             B=batch_spec.B,
                             n_step_return=self.n_step_return)
        self.replay_buffer = UniformReplayBuffer(**replay_kwargs)
Example #18
 def initialize(
     self,
     agent,
     affinity=None,
     seed=None,
     bootstrap_value=False,
     traj_info_kwargs=None,
     rank=0,
     world_size=1,
 ):
     """Should instantiate all components, including setup of parallel
     process if applicable."""
     B = self.batch_spec.B
     global_B = B * world_size
     env_ranks = list(range(rank * B, (rank + 1) * B))
     agent.initialize(self.env.spaces,
                      share_memory=False,
                      global_B=global_B,
                      env_ranks=env_ranks)
     self.env.seed(seed)
     examples = dict()
     get_example_outputs_single(agent, self.env, examples, subprocess=False)
     samples_pyt, samples_np, examples = build_samples_buffer(
         agent,
         self.env,
         self.batch_spec,
         bootstrap_value,
         agent_shared=False,
         env_shared=False,
         subprocess=False,
         examples=examples)
     self.samples_pyt = samples_pyt
     self.samples_np = samples_np
     if traj_info_kwargs:
         for k, v in traj_info_kwargs.items():
             setattr(self.TrajInfoCls, "_" + k, v)  # Avoid passing at init.
             setattr(self.ReturnTrajInfoCls, "_" + k, v)
     self.agent_inputs, self.traj_infos = self._decorrelate_envs()
     # A collector would call start_agent here, but that doesn't apply to this sampler.
     self.agent = agent
     logger.log("Pomdp Sampler initialized.")
     return examples
Example #19
    def evaluate_agent(self, itr):
        """
        Record offline evaluation of agent performance, by ``sampler.evaluate_agent()``.
        """
        if itr > 0:
            self.pbar.stop()

        if itr >= self.min_itr_learn - 1 or itr == 0:
            logger.log("Evaluating agent...")
            self.agent.eval_mode(itr)  # Might be agent in sampler.
            eval_time = -time.time()
            player_traj_infos, observer_traj_infos = self.sampler.evaluate_agent(
                itr)
            eval_time += time.time()
        else:
            player_traj_infos = []
            observer_traj_infos = []
            eval_time = 0.0
        logger.log("Evaluation runs complete.")
        return player_traj_infos, observer_traj_infos, eval_time
Example #20
 def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples,
         world_size=1, rank=0):
     """Stores input arguments and initializes replay buffer and optimizer.
     Use in non-async runners.  Computes number of gradient updates per
     optimization iteration as `(replay_ratio * sampler-batch-size /
     training-batch_size)`."""
     self.agent = agent
     self.n_itr = n_itr
     self.mid_batch_reset = mid_batch_reset
     self.sampler_bs = sampler_bs = batch_spec.size
     self.updates_per_optimize = int(self.replay_ratio * sampler_bs /
         self.batch_size)
     logger.log(f"From sampler batch size {sampler_bs}, training "
         f"batch size {self.batch_size}, and replay ratio "
         f"{self.replay_ratio}, computed {self.updates_per_optimize} "
         f"updates per iteration.")
     self.min_itr_learn = self.min_steps_learn // sampler_bs
     agent.give_min_itr_learn(self.min_itr_learn)
     self.initialize_replay_buffer(examples, batch_spec)
     self.optim_initialize(rank)
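To make the docstring's formula concrete, a small worked example with made-up numbers:

    replay_ratio = 8        # Desired average reuse of each sampled transition.
    sampler_bs = 64 * 8     # batch_spec.size = T * B = 512 env steps per iteration.
    batch_size = 256        # Training minibatch size.

    updates_per_optimize = int(replay_ratio * sampler_bs / batch_size)  # 16 updates per itr.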
Example #21
    def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
    ):
        assert world_size == 1  # world size used in async samplers, not relevant for this class

        T, B = self.batch_spec
        self.agent = agent
        self.env = self.EnvCls(batch_T=T, batch_B=B, **self.env_kwargs)
        env_ranks = list(range(rank * B, (rank + 1) * B))
        agent.initialize(self.env.spaces,
                         share_memory=False,
                         global_B=B,
                         env_ranks=env_ranks)
        self.samples_pyt, self.samples_np, examples = build_samples_buffer(
            agent,
            self.env,
            self.batch_spec,
            bootstrap_value,
            agent_shared=False,
            env_shared=False,
            subprocess=False,
            examples=self._get_example_outputs())

        self.samples_np.env.done[:-1, :] = False
        self.samples_np.env.done[-1, :] = True
        self.traj_info_kwargs = traj_info_kwargs

        self.agent_inputs = AgentInputs(
            buffer_from_example(examples["observation"], (B, )),
            buffer_from_example(examples["action"], (B, )),
            buffer_from_example(examples["reward"], (B, )))
        self._start_agent(B, env_ranks)
        logger.log("BatchedEpisodicSampler initialized.")
        return examples
Example #22
    def initialize(self, affinity):
        p = psutil.Process()
        if affinity.get("set_affinity", True):
            p.cpu_affinity(affinity["master_cpus"])
        # torch.set_num_threads(affinity["master_torch_threads"])
        torch.set_num_threads(1)  # Needed to prevent MKL hang :( .
        B = self.batch_spec.B
        envs = [self.EnvCls(**self.env_kwargs) for _ in range(B)]
        sync = AttrDict(db_idx=AttrDict(value=0))  # Mimic the mp.RawValue format.
        collector = self.CollectorCls(
            rank=0,
            envs=envs,
            samples_np=self.double_buffer,
            batch_T=self.batch_spec.T,
            TrajInfoCls=self.TrajInfoCls,
            agent=self.agent,
            sync=sync,
        )
        if self.eval_n_envs > 0:
            eval_envs = [self.EnvCls(**self.eval_env_kwargs)
                for _ in range(self.eval_n_envs)]
            eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
            self.eval_collector = eval_CollectorCls(
                envs=eval_envs,
                agent=self.agent,
                TrajInfoCls=self.TrajInfoCls,
                max_T=self.eval_max_steps // self.eval_n_envs,
                max_trajectories=self.eval_max_trajectories,
            )
        self.agent.to_device(cuda_idx=affinity.get("cuda_idx", None))
        self.agent.async_cpu(share_memory=False)

        agent_inputs, traj_infos = collector.start_envs(
            self.max_decorrelation_steps)
        collector.start_agent()

        self.collector = collector
        self.agent_inputs = agent_inputs
        self.traj_infos = traj_infos
        self.sync = sync
        logger.log("Serial sampler initialized.")
Example #23
 def start_envs(self, max_decorrelation_steps=0):
     """Calls ``reset()`` on every environment instance, then steps each
     one through a random number of random actions, and returns the
     resulting agent_inputs buffer (`observation`, `prev_action`,
     `prev_reward`)."""
     traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
     observations = list()
     for env in self.envs:
         observations.append(env.reset())
     observation = buffer_from_example(observations[0], len(self.envs))
     for b, obs in enumerate(observations):
         observation[b] = obs  # numpy array or namedarraytuple
     prev_action = np.stack([env.action_space.null_value()
         for env in self.envs])
     prev_reward = np.zeros(len(self.envs), dtype="float32")
     if self.rank == 0:
         logger.log("Sampler decorrelating envs, max steps: "
             f"{max_decorrelation_steps}")
     if max_decorrelation_steps != 0:
         for b, env in enumerate(self.envs):
             n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
             for _ in range(n_steps):
                 a = env.action_space.sample()
                 o, r, d, info = env.step(a)
                 traj_infos[b].step(o, a, r, d, None, info)
                 if getattr(info, "traj_done", d):
                     o = env.reset()
                     traj_infos[b] = self.TrajInfoCls()
                 if ((type(d) is np.ndarray
                     and d.any()) or (type(d) is bool and d)):
                     a = env.action_space.null_value()
                     r = 0
             observation[b] = o
             prev_action[b] = a
             prev_reward[b] = r
     # For action-server samplers.
     if hasattr(self, "step_buffer_np") and self.step_buffer_np is not None:
         self.step_buffer_np.observation[:] = observation
         self.step_buffer_np.action[:] = prev_action
         self.step_buffer_np.reward[:] = prev_reward
     return AgentInputs(observation, prev_action, prev_reward), traj_infos
Example #24
    def serve_actions_evaluation(self, itr):
        obs_ready, act_ready = self.sync.obs_ready, self.sync.act_ready
        step_np, step_pyt = self.eval_step_buffer_np, self.eval_step_buffer_pyt
        traj_infos = list()
        self.agent.reset()
        agent_inputs = AgentInputs(step_pyt.observation, step_pyt.action,
            step_pyt.reward)  # Fixed buffer objects.

        for t in range(self.eval_max_T):
            if t % EVAL_TRAJ_CHECK == 0:  # (While workers stepping.)
                traj_infos.extend(drain_queue(self.eval_traj_infos_queue,
                    guard_sentinel=True))
            for b in obs_ready:
                b.acquire()
                # assert not b.acquire(block=False)  # Debug check.
            for b_reset in np.where(step_np.done)[0]:
                step_np.action[b_reset] = 0  # Null prev_action.
                step_np.reward[b_reset] = 0  # Null prev_reward.
                self.agent.reset_one(idx=b_reset)
            action, agent_info = self.agent.step(*agent_inputs)
            step_np.action[:] = action
            step_np.agent_info[:] = agent_info
            if self.eval_max_trajectories is not None and t % EVAL_TRAJ_CHECK == 0:
                self.sync.stop_eval.value = len(traj_infos) >= self.eval_max_trajectories
            for w in act_ready:
                # assert not w.acquire(block=False)  # Debug check.
                w.release()
            if self.sync.stop_eval.value:
                logger.log("Evaluation reach max num trajectories "
                    f"({self.eval_max_trajectories}).")
                break
        if t == self.eval_max_T - 1 and self.eval_max_trajectories is not None:
            logger.log("Evaluation reached max num time steps "
                f"({self.eval_max_T}).")
        for b in obs_ready:
            b.acquire()  # Workers always do extra release; drain it.
            assert not b.acquire(block=False)  # Debug check.
        for w in act_ready:
            assert not w.acquire(block=False)  # Debug check.

        return traj_infos
Example #25
 def log_diagnostics(self, itr, val_info, *args, **kwargs):
     self.save_itr_snapshot(itr)
     new_time = time.time()
     self._cum_time = new_time - self._start_time
     epochs = itr * self.algo.batch_size / (
         self.algo.replay_buffer.size * (1 - self.algo.validation_split))
     logger.record_tabular("Iteration", itr)
     logger.record_tabular("Epochs", epochs)
     logger.record_tabular("CumTime (s)", self._cum_time)
     logger.record_tabular("UpdatesPerSecond", itr / self._cum_time)
     if self._opt_infos:
         for k, v in self._opt_infos.items():
             logger.record_tabular_misc_stat(k, v)
     for k, v in zip(val_info._fields, val_info):
         logger.record_tabular_misc_stat("val_" + k, v)
     self._opt_infos = {k: list() for k in self._opt_infos}  # (reset)
     logger.dump_tabular(with_prefix=False)
     if itr < self.n_updates - 1:
         logger.log(
             f"Optimizing over {self.log_interval_updates} iterations.")
         self.pbar = ProgBarCounter(self.log_interval_updates)
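For reference, a worked version of the `epochs` expression above with made-up numbers:

    itr = 1000
    batch_size = 512              # algo.batch_size
    replay_size = 100_000         # algo.replay_buffer.size
    validation_split = 0.1

    epochs = itr * batch_size / (replay_size * (1 - validation_split))  # ~5.69 passes over the training split.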
Example #26
def initialize_worker(rank, seed=None, cpu=None, torch_threads=None):
    log_str = f"Sampler rank {rank} initialized"
    cpu = [cpu] if isinstance(cpu, int) else cpu
    p = psutil.Process()
    try:
        if cpu is not None:
            p.cpu_affinity(cpu)
        cpu_affin = p.cpu_affinity()
    except AttributeError:
        cpu_affin = "UNAVAILABLE MacOS"
    log_str += f", CPU affinity {cpu_affin}"
    torch_threads = (1 if torch_threads is None and cpu is not None else
        torch_threads)  # Default to 1 to avoid possible MKL hang.
    if torch_threads is not None:
        torch.set_num_threads(torch_threads)
    log_str += f", Torch threads {torch.get_num_threads()}"
    if seed is not None:
        set_seed(seed)
        time.sleep(0.3)  # (so the printing from set_seed is not intermixed)
        log_str += f", Seed {seed}"
    logger.log(log_str)
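A hypothetical call from a freshly forked sampler worker (the rank, seed, and cpu list are illustrative only):

    initialize_worker(rank=3, seed=1237, cpu=[6, 7], torch_threads=1)
    # Logs something like:
    # "Sampler rank 3 initialized, CPU affinity [6, 7], Torch threads 1, Seed 1237"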
Example #27
    def evaluate_agent(self, itr):
        """
        评估模型。

        :param itr: 第几次迭代。
        :return: 一个tuple,包含trajectory的信息以及evaluation所消耗的时间。
        """
        if itr > 0:
            self.pbar.stop()  # Stop the progress bar.

        if itr >= self.min_itr_learn - 1 or itr == 0:
            logger.log("Evaluating agent...")
            self.agent.eval_mode(itr)  # Might be agent in sampler.
            eval_time = -time.time()
            traj_infos = self.sampler.evaluate_agent(itr)  # This is where the evaluation actually runs.
            eval_time += time.time()  # eval_time now holds the wall-clock time of the call above.
        else:
            traj_infos = []
            eval_time = 0.0
        logger.log("Evaluation runs complete.")
        return traj_infos, eval_time
Example #28
    def data_parallel(self):
        """
        Wraps the intrinsic bonus model with PyTorch's DistributedDataParallel.  The
        intention is for rlpyt to create a separate Python process to drive
        each GPU (or CPU-group for CPU-only, MPI-like configuration).

        Typically called in the runner during startup.
        """
        super().data_parallel()
        if self.device.type == "cpu":
            self.bonus_model = DDPC(self.bonus_model)
            logger.log(
                "Initialized DistributedDataParallelCPU intrinsic bonus model."
            )
        else:
            self.bonus_model = DDP(self.bonus_model,
                                   device_ids=[self.device.index],
                                   output_device=self.device.index)
            logger.log(
                f"Initialized DistributedDataParallel intrinsic bonus model on device {self.device}."
            )
Example #29
 def shutdown(self):
     self.pbar.stop()
     logger.log("Master optimizer shutting down, joining sampler process...")
     self.sampler_proc.join()
     logger.log("Joining memory copiers...")
     for p in self.memcpy_procs:
         p.join()
     if self.ctrl.opt_throttle is not None:
         logger.log("Joining optimizer processes...")
         self.ctrl.quit_opt.value = True
         self.ctrl.opt_throttle.wait()
         for p in self.optimizer_procs:
             p.join()
     logger.log("All processes shutdown.  Training complete.")
Example #30
    def __init__(
        self,
        image_shape,
        action_size,
        hidden_sizes=512,
        stop_conv_grad=False,
        channels=None,  # Defaults below.
        kernel_sizes=None,
        strides=None,
        paddings=None,
        kiaming_init=True,
        normalize_conv_out=False,
    ):
        super().__init__()
        c, h, w = image_shape
        self.conv = Conv2dModel(
            in_channels=c,
            channels=channels or [32, 64, 64],
            kernel_sizes=kernel_sizes or [8, 4, 3],
            strides=strides or [4, 2, 1],
            paddings=paddings,
        )
        self._conv_out_size = self.conv.conv_out_size(h=h, w=w)
        self.pi_v_mlp = MlpModel(
            input_size=self._conv_out_size,
            hidden_sizes=hidden_sizes,
            output_size=action_size + 1,
        )
        if kiaming_init:
            self.apply(weight_init)

        self.stop_conv_grad = stop_conv_grad
        logger.log("Model stopping gradient at CONV." if stop_conv_grad else
                   "Modeul using gradients on all parameters.")
        if normalize_conv_out:
            # Haven't seen this make a difference yet.
            logger.log("Model normalizing conv output across all pixels.")
            self.conv_rms = RunningMeanStdModel((1, ))
            self.var_clip = 1e-6
        self.normalize_conv_out = normalize_conv_out
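A hypothetical instantiation of the model above (the class name `PiVConvModel` is invented for illustration; only the constructor arguments shown in the snippet are assumed):

    model = PiVConvModel(image_shape=(4, 84, 84), action_size=6, stop_conv_grad=True)
    # The MLP head outputs action_size + 1 values, presumably policy logits plus a value estimate.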