def collect_evaluation(self, itr):
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    completed_traj_infos = list()
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, o in enumerate(observations):
        observation[b] = o
    action = buffer_from_example(self.envs[0].action_space.null_value(),
        len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    self.agent.eval_mode(itr)
    for t in range(self.max_T):
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        for b, env in enumerate(self.envs):
            o, r, d, env_info = env.step(action[b])
            traj_infos[b].step(observation[b], action[b], r, d,
                agent_info[b], env_info)
            if getattr(env_info, "traj_done", d):
                completed_traj_infos.append(traj_infos[b].terminate(o))
                traj_infos[b] = self.TrajInfoCls()
                o = env.reset()
            if ((type(d) is np.ndarray and d.any()) or
                    (type(d) is bool and d)):
                action[b] = 0  # Null prev_action for next step.
                r = 0
                self.agent.reset_one(idx=b)
            observation[b] = o
            reward[b] = r
        if (self.max_trajectories is not None and
                len(completed_traj_infos) >= self.max_trajectories):
            logger.log("Evaluation reached max num trajectories "
                f"({self.max_trajectories}).")
            break
    if t == self.max_T - 1:
        logger.log("Evaluation reached max num time steps "
            f"({self.max_T}).")
    return completed_traj_infos

def data_parallel(self):
    """Wraps the model with PyTorch's DistributedDataParallel.  The
    intention is for rlpyt to create a separate Python process to drive
    each GPU (or CPU-group for CPU-only, MPI-like configuration).  Agents
    with additional model components (beyond ``self.model``) which will
    have gradients computed through them should extend this method to wrap
    those, as well.  Typically called in the runner during startup.
    """
    if self.device.type == "cpu":
        self.model = DDPC(self.model)
        logger.log("Initialized DistributedDataParallelCPU agent model.")
    else:
        self.model = DDP(self.model,
            device_ids=[self.device.index],
            output_device=self.device.index)
        logger.log("Initialized DistributedDataParallel agent model on "
            f"device {self.device}.")

def init_obs_norm(self, agent):
    """Initializes observation normalization parameters in the intrinsic
    bonus model, using a distinct environment for this purpose.
    """
    agent.set_norm_update(True)
    env = self.EnvCls(**self.env_kwargs)
    env.reset()
    logger.log("Sampler initializing bonus model observation "
        f"normalization, steps: {self.obs_norm_steps}")
    for _ in range(self.obs_norm_steps):
        action = env.action_space.sample()
        obs, _, done, _ = env.step(action)
        obs = torch.from_numpy(obs).to(device=agent.device)
        # Prepare observation, flattening channel dim (frame-stack) into
        # batch dim for image input.
        if len(obs.shape) == 3:  # (C, H, W)
            obs = obs.view((-1, 1, *obs.shape[1:]))
        agent.bonus_model.normalize_obs(obs)
        if done:
            env.reset()

def validation(self, itr):
    logger.log("Computing validation loss...")
    val_info = ValInfo(*([] for _ in range(len(ValInfo._fields))))
    self.optimizer.zero_grad()
    for _ in range(self.n_validation_batches):
        samples = self.replay_buffer.sample_batch(self.validation_batch_B,
            validation=True)
        with torch.no_grad():
            cpc_loss, cpc_accuracies, conv_output = self.cpc_loss(samples)
        val_info.cpcLoss.append(cpc_loss.item())
        val_info.cpcAccuracy1.append(cpc_accuracies[0].item())
        val_info.cpcAccuracy2.append(cpc_accuracies[1].item())
        val_info.cpcAccuracyTm1.append(cpc_accuracies[2].item())
        val_info.cpcAccuracyTm2.append(cpc_accuracies[3].item())
        val_info.convActivation.append(
            conv_output[0, 0].detach().cpu().view(-1).numpy())
    self.optimizer.zero_grad()
    logger.log("...validation loss completed.")
    return val_info

def data_parallel(self):
    """Wraps the model with PyTorch's DistributedDataParallel.  The
    intention is for rlpyt to create a separate Python process to drive
    each GPU (or CPU-group for CPU-only, MPI-like configuration).  Agents
    with additional model components (beyond ``self.model``) which will
    have gradients computed through them should extend this method to wrap
    those, as well.  Typically called in the runner during startup.
    """
    device_id = self.device.index  # None if cpu, else cuda index.
    self.model = DDP(
        self.model,
        device_ids=None if device_id is None else [device_id],  # 1 GPU.
        output_device=device_id,
    )
    logger.log("Initialized DistributedDataParallel agent model on "
        f"device {self.device}.")
    return device_id

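# A minimal, self-contained sketch (not rlpyt code) of the wrapping pattern
# used above: with device_ids=None, DDP keeps the replica on CPU and
# all-reduces gradients over the process group instead of pinning the model
# to one GPU.  The tcp address, port, and world_size=1 here are illustrative
# assumptions for a single-process demo.
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="gloo",
    init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)
model = DDP(torch.nn.Linear(4, 2), device_ids=None)  # CPU replica.
loss = model(torch.randn(8, 4)).sum()
loss.backward()  # Gradients all-reduced across the (size-1) process group.
dist.destroy_process_group()
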
def __init__(self, example, **kwargs):
    field_names = [f for f in example._fields if f != "observation"]
    global BufferSamples
    BufferSamples = namedarraytuple("BufferSamples", field_names)
    buffer_example = BufferSamples(*(v for k, v in example.items()
        if k != "observation"))
    super().__init__(example=buffer_example, **kwargs)
    # Equivalent to image.shape[0] if observation is image array (C,H,W):
    self.n_frames = n_frames = get_leading_dims(example.observation,
        n_dim=1)[0]
    logger.log(f"Frame-based buffer using {n_frames}-frame sequences.")
    # frames: oldest stored at t; duplicate n_frames - 1 beginning & end.
    self.samples_frames = buffer_from_example(example.observation[0],
        (self.T + n_frames - 1, self.B),
        share_memory=self.async_)  # [T+n_frames-1,B,H,W]
    # new_frames: shifted so newest stored at t; no duplication.
    self.samples_new_frames = self.samples_frames[n_frames - 1:]  # [T,B,H,W]
    self.off_forward = max(self.off_forward, n_frames - 1)

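# A minimal numpy sketch (hypothetical sizes) of the frame deduplication
# above: with n_frames=4, the stacked observation at time t is just a view
# of four consecutive slots of samples_frames, so each frame is stored once
# rather than n_frames times.
import numpy as np

T, B, H, W, n_frames = 8, 1, 2, 2, 4
samples_frames = np.zeros((T + n_frames - 1, B, H, W))  # [T+3, B, H, W]
t, b = 5, 0
stacked_obs = samples_frames[t:t + n_frames, b]  # View, no copy.
assert stacked_obs.shape == (n_frames, H, W)
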
def initialize_replay_buffer(self, examples, batch_spec, async_=False):
    example_to_buffer = self.examples_to_buffer(examples)
    replay_kwargs = dict(
        example=example_to_buffer,
        size=self.replay_size,
        B=batch_spec.B,
        n_step_return=self.n_step_return,
    )
    if not self.bootstrap_timelimit:
        ReplayCls = (AsyncUniformReplayBuffer if async_ else
            UniformReplayBuffer)
    else:
        ReplayCls = (AsyncTlUniformReplayBuffer if async_ else
            TlUniformReplayBuffer)
    if self.ReplayBufferCls is not None:
        ReplayCls = self.ReplayBufferCls
        logger.log("WARNING: ignoring internal selection logic and using"
            f" input replay buffer class: {ReplayCls} -- compatibility not"
            " guaranteed.")
    self.replay_buffer = ReplayCls(**replay_kwargs)

def get_n_itr(self):
    """Determine number of train loop iterations to run.  Converts logging
    interval units from environment steps to iterations.
    """
    # Log at least as often as requested (round down itrs):
    log_interval_itrs = max(self.log_interval_steps //
        self.itr_batch_size, 1)
    n_itr = self.n_steps // self.itr_batch_size
    if n_itr % log_interval_itrs > 0:  # Keep going to next log itr.
        n_itr += log_interval_itrs - (n_itr % log_interval_itrs)
    self.log_interval_itrs = log_interval_itrs
    self.n_itr = n_itr
    # If transferring by timestep instead of iteration, round up to the
    # next iteration.
    if self.transfer_timestep:
        self.transfer_iteration = int(
            -(-self.n_steps // self.itr_batch_size))  # Ceiling divide.
    logger.log(f"Running {n_itr} iterations of minibatch RL.")
    return n_itr

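# Worked example (illustrative numbers) of the rounding above: with
# n_steps=1,000,000, itr_batch_size=256, and log_interval_steps=100,000,
# the logging interval rounds down to 390 itrs and n_itr rounds up to the
# next multiple of it.
n_steps, itr_batch_size, log_interval_steps = 1_000_000, 256, 100_000
log_interval_itrs = max(log_interval_steps // itr_batch_size, 1)  # 390
n_itr = n_steps // itr_batch_size  # 3906
if n_itr % log_interval_itrs > 0:
    n_itr += log_interval_itrs - (n_itr % log_interval_itrs)
assert (log_interval_itrs, n_itr) == (390, 4290)  # 4290 = 11 * 390.
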
def validation(self, itr):
    logger.log("Computing validation loss...")
    val_info = ValInfo(*([] for _ in range(len(ValInfo._fields))))
    self.optimizer.zero_grad()
    for _ in range(self.n_validation_batches):
        samples = self.replay_buffer.sample_batch(self.batch_size,
            validation=True)
        with torch.no_grad():
            inv_loss, ent_loss, accuracy, perplexity, conv_output = \
                self.inverse_loss(samples)
        val_info.invLoss.append(inv_loss.item())
        val_info.entLoss.append(ent_loss.item())
        val_info.accuracy.append(accuracy.item())
        val_info.perplexity.append(perplexity.item())
        val_info.convActivation.append(
            conv_output[0].detach().cpu().view(-1).numpy())  # Keep 1 full one.
    self.optimizer.zero_grad()
    logger.log("...validation loss completed.")
    return val_info

def log_diagnostics(self, itr, traj_infos=None, eval_time=0):
    if itr > 0:
        self.pbar.stop()
    if itr >= self.min_itr_learn - 1:
        self.save_itr_snapshot(itr)
    new_time = time.time()
    self._cum_time = new_time - self._start_time
    train_time_elapsed = new_time - self._last_time - eval_time
    new_updates = self.algo.update_counter - self._last_update_counter
    new_samples = (self.sampler.batch_size * self.world_size *
        self.log_interval_itrs)
    updates_per_second = (float('nan') if itr == 0 else
        new_updates / train_time_elapsed)
    samples_per_second = (float('nan') if itr == 0 else
        new_samples / train_time_elapsed)
    replay_ratio = (new_updates * self.algo.batch_size * self.world_size /
        new_samples)
    cum_replay_ratio = (self.algo.batch_size * self.algo.update_counter /
        ((itr + 1) * self.sampler.batch_size))  # world_size cancels.
    cum_steps = (itr + 1) * self.sampler.batch_size * self.world_size
    if self._eval:
        logger.record_tabular('CumuTrainTime',
            self._cum_time - self._cum_eval_time)  # Already added new eval_time.
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('CumuTime (s)', self._cum_time)
    logger.record_tabular('CumuSteps', cum_steps)
    logger.record_tabular('CumuCompletedTrajs', self._cum_completed_trajs)
    logger.record_tabular('CumuUpdates', self.algo.update_counter)
    logger.record_tabular('StepsPerSecond', samples_per_second)
    logger.record_tabular('UpdatesPerSecond', updates_per_second)
    logger.record_tabular('ReplayRatio', replay_ratio)
    logger.record_tabular('CumuReplayRatio', cum_replay_ratio)
    self._log_infos(traj_infos)
    logger.dump_tabular(with_prefix=False)
    self._last_time = new_time
    self._last_update_counter = self.algo.update_counter
    if itr < self.n_itr - 1:
        logger.log(f"Optimizing over {self.log_interval_itrs} iterations.")
        self.pbar = ProgBarCounter(self.log_interval_itrs)

def _get_n_envs_list(self, affinity=None, n_worker=None, B=None):
    B = self.batch_spec.B if B is None else B
    n_worker = (len(affinity["workers_cpus"]) if n_worker is None else
        n_worker)
    if B < n_worker:
        logger.log(f"WARNING: requested fewer envs ({B}) than available "
            f"worker processes ({n_worker}); using fewer workers (but it "
            "may be better to increase the sampler's `batch_B`).")
        n_worker = B
    n_envs_list = [B // n_worker] * n_worker
    if B % n_worker > 0:
        logger.log("WARNING: unequal number of envs per process, from "
            f"batch_B {self.batch_spec.B} and n_worker {n_worker} "
            "(possibly suboptimal speed).")
        for b in range(B % n_worker):
            n_envs_list[b] += 1
    return n_envs_list

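# Worked example (illustrative numbers): distributing B=10 envs over
# n_worker=4 processes starts from [2, 2, 2, 2], then the remainder
# 10 % 4 = 2 is spread over the first workers, yielding [3, 3, 2, 2].
B, n_worker = 10, 4
n_envs_list = [B // n_worker] * n_worker
for b in range(B % n_worker):
    n_envs_list[b] += 1
assert n_envs_list == [3, 3, 2, 2]
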
def buffer_concatenate(buffers, axis=0):
    assert isinstance(buffers, tuple)
    if isinstance(buffers[0], np.ndarray):
        try:
            return np.concatenate(buffers, axis=axis)
        except ValueError:
            logger.log("Had a ValueError in buffer concat, probably action "
                "dimensions that don't line up; populating with zeros.")
            logger.log(f"buffer shapes: {[buf.shape for buf in buffers]}")
            return np.zeros((buffers[0].shape[0],
                sum(buf.shape[1] for buf in buffers)))
    fields = buffers[0]._fields
    for buf in buffers:  # Check that all buffers share the same structure.
        assert buf._fields == fields
    # Recurse into each field; leaves concatenate along axis 1 (the feature
    # dim), matching the zero-fill fallback above.
    return buffers[0]._make(tuple(
        buffer_concatenate(tuple(getattr(buf, field) for buf in buffers),
            axis=1)
        for field in fields))

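# A minimal usage sketch with plain numpy leaves (hypothetical shapes): two
# per-agent action arrays of widths 2 and 3 concatenate along the feature
# axis into width 5.
import numpy as np

a = np.ones((4, 2), dtype="float32")
b = np.zeros((4, 3), dtype="float32")
out = buffer_concatenate((a, b), axis=1)
assert out.shape == (4, 5)
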
def optim_startup(self):
    main_affinity = self.affinity.optimizer[0]
    p = psutil.Process()
    if main_affinity.get("set_affinity", True):
        p.cpu_affinity(main_affinity["cpus"])
    logger.log(f"Optimizer master CPU affinity: {p.cpu_affinity()}.")
    torch.set_num_threads(main_affinity["torch_threads"])
    logger.log(f"Optimizer master Torch threads: {torch.get_num_threads()}.")
    self.agent.to_device(main_affinity.get("cuda_idx", None))
    if self.world_size > 1:
        self.agent.data_parallel()
    self.algo.optim_initialize(rank=0)
    throttle_itr = 1 + (getattr(self.algo, "min_steps_learn", 0) //
        self.sampler_batch_size)
    delta_throttle_itr = (self.algo.batch_size * self.world_size *
        self.algo.updates_per_optimize /  # (is updates_per_sync)
        (self.sampler_batch_size * self.algo.replay_ratio))
    self.initialize_logging()
    return throttle_itr, delta_throttle_itr

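# Worked example (illustrative numbers) of the throttling arithmetic above:
# training batch 128, world_size 2, 1 update per sync, sampler batch 256,
# replay ratio 1 -> the optimizer advances one sampler itr per optimizer
# itr, after waiting 3 itrs for min_steps_learn=512 samples.
algo_batch_size, world_size, updates_per_optimize = 128, 2, 1
sampler_batch_size, replay_ratio, min_steps_learn = 256, 1, 512
throttle_itr = 1 + min_steps_learn // sampler_batch_size  # 3
delta_throttle_itr = (algo_batch_size * world_size * updates_per_optimize /
    (sampler_batch_size * replay_ratio))  # 1.0
assert (throttle_itr, delta_throttle_itr) == (3, 1.0)
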
def log_diagnostics(self, itr, eval_traj_infos, eval_time, save_cur=False,
        prefix='Diagnostics/'):
    if not eval_traj_infos:
        logger.log("WARNING: had no complete trajectories in eval.")
    steps_in_eval = sum([info["Length"] for info in eval_traj_infos])
    with logger.tabular_prefix(prefix):
        logger.record_tabular('StepsInEval', steps_in_eval)
        logger.record_tabular('TrajsInEval', len(eval_traj_infos))
        self._cum_eval_time += eval_time
        logger.record_tabular('CumEvalTime', self._cum_eval_time)
    super().log_diagnostics(itr, eval_traj_infos, eval_time, save_cur,
        prefix=prefix)

def startup(self):
    p = psutil.Process()
    p.cpu_affinity(self.affinity["cpus"])
    logger.log(f"Optimizer master CPU affinity: {p.cpu_affinity()}.")
    torch.set_num_threads(self.affinity["torch_threads"])
    logger.log(f"Optimizer master Torch threads: {torch.get_num_threads()}.")
    set_seed(self.seed)
    self.agent.initialize_cuda(
        cuda_idx=self.affinity.get("cuda_idx", None),
        dpp=self.n_runner > 1,
    )
    self.algo.initialize_async(agent=self.agent,
        updates_per_sync=self.updates_per_sync)
    throttle_itr = 1 + self.algo.min_steps_learn // self.itr_batch_size
    delta_throttle_itr = (self.algo.batch_size * self.n_runner *
        self.algo.updates_per_optimize /  # (is updates_per_sync)
        (self.itr_batch_size * self.training_ratio))
    self.initialize_logging()
    return throttle_itr, delta_throttle_itr

def initialize(self, agent, affinity=None, seed=None, bootstrap_value=False,
        traj_info_kwargs=None):
    envs = [self.EnvCls(**self.env_kwargs)
        for _ in range(self.batch_spec.B)]
    agent.initialize(envs[0].spaces, share_memory=False)
    samples_pyt, samples_np, examples = build_samples_buffer(agent, envs[0],
        self.batch_spec, bootstrap_value, agent_shared=False,
        env_shared=False, subprocess=False)
    if traj_info_kwargs:
        for k, v in traj_info_kwargs.items():
            setattr(self.TrajInfoCls, "_" + k, v)  # Avoid passing at init.
    collector = self.CollectorCls(
        rank=0,
        envs=envs,
        samples_np=samples_np,
        batch_T=self.batch_spec.T,
        TrajInfoCls=self.TrajInfoCls,
        agent=agent,
    )
    if self.eval_n_envs > 0:  # May do evaluation.
        eval_envs = [self.EnvCls(**self.eval_env_kwargs)
            for _ in range(self.eval_n_envs)]
        eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
        self.eval_collector = eval_CollectorCls(
            envs=eval_envs,
            agent=agent,
            TrajInfoCls=self.TrajInfoCls,
            max_T=self.eval_max_steps // self.eval_n_envs,
            max_trajectories=self.eval_max_trajectories,
        )
    agent_inputs, traj_infos = collector.start_envs(
        self.max_decorrelation_steps)
    collector.start_agent()
    self.agent = agent
    self.samples_pyt = samples_pyt
    self.samples_np = samples_np
    self.collector = collector
    self.agent_inputs = agent_inputs
    self.traj_infos = traj_infos
    logger.log("Serial Sampler initialized.")
    return examples

def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples):
    if agent.recurrent:
        raise NotImplementedError
    self.agent = agent
    self.n_itr = n_itr
    self.mid_batch_reset = mid_batch_reset
    self.mu_optimizer = self.OptimCls(agent.mu_parameters(),
        lr=self.mu_learning_rate, **self.optim_kwargs)
    self.q_optimizer = self.OptimCls(agent.q_parameters(),
        lr=self.q_learning_rate, **self.optim_kwargs)
    if self.initial_optim_state_dict is not None:
        self.q_optimizer.load_state_dict(
            self.initial_optim_state_dict["q"])
        self.mu_optimizer.load_state_dict(
            self.initial_optim_state_dict["mu"])
    sample_bs = batch_spec.size
    train_bs = self.batch_size
    assert (self.training_ratio * sample_bs) % train_bs == 0
    self.updates_per_optimize = int(
        (self.training_ratio * sample_bs) // train_bs)
    logger.log(f"From sampler batch size {sample_bs}, training "
        f"batch size {train_bs}, and training ratio "
        f"{self.training_ratio}, computed {self.updates_per_optimize} "
        "updates per iteration.")
    self.min_itr_learn = self.min_steps_learn // sample_bs
    self.agent.give_min_itr_learn(self.min_itr_learn)
    example_to_buffer = SamplesToBuffer(
        observation=examples["observation"],
        action=examples["action"],
        reward=examples["reward"],
        done=examples["done"],
    )
    replay_kwargs = dict(
        example=example_to_buffer,
        size=self.replay_size,
        B=batch_spec.B,
        n_step_return=self.n_step_return,
    )
    self.replay_buffer = UniformReplayBuffer(**replay_kwargs)

def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
        ):
    """Should instantiate all components, including setup of parallel
    process if applicable."""
    B = self.batch_spec.B
    global_B = B * world_size
    env_ranks = list(range(rank * B, (rank + 1) * B))
    agent.initialize(self.env.spaces, share_memory=False,
        global_B=global_B, env_ranks=env_ranks)
    self.env.seed(seed)
    examples = dict()
    get_example_outputs_single(agent, self.env, examples, subprocess=False)
    samples_pyt, samples_np, examples = build_samples_buffer(
        agent, self.env, self.batch_spec, bootstrap_value,
        agent_shared=False, env_shared=False, subprocess=False,
        examples=examples)
    self.samples_pyt = samples_pyt
    self.samples_np = samples_np
    if traj_info_kwargs:
        for k, v in traj_info_kwargs.items():
            setattr(self.TrajInfoCls, "_" + k, v)  # Avoid passing at init.
            setattr(self.ReturnTrajInfoCls, "_" + k, v)
    self.agent_inputs, self.traj_infos = self._decorrelate_envs()
    # (A parallel collector would call start_agent here; not applicable.)
    self.agent = agent
    logger.log("Pomdp Sampler initialized.")
    return examples

def evaluate_agent(self, itr):
    """Record offline evaluation of agent performance, by
    ``sampler.evaluate_agent()``.
    """
    if itr > 0:
        self.pbar.stop()
    if itr >= self.min_itr_learn - 1 or itr == 0:
        logger.log("Evaluating agent...")
        self.agent.eval_mode(itr)  # Might be agent in sampler.
        eval_time = -time.time()
        player_traj_infos, observer_traj_infos = \
            self.sampler.evaluate_agent(itr)
        eval_time += time.time()
    else:
        player_traj_infos = []
        observer_traj_infos = []
        eval_time = 0.0
    logger.log("Evaluation runs complete.")
    return player_traj_infos, observer_traj_infos, eval_time

def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples,
        world_size=1, rank=0):
    """Stores input arguments and initializes replay buffer and optimizer.
    Use in non-async runners.  Computes number of gradient updates per
    optimization iteration as
    `(replay_ratio * sampler-batch-size / training-batch-size)`.
    """
    self.agent = agent
    self.n_itr = n_itr
    self.mid_batch_reset = mid_batch_reset
    self.sampler_bs = sampler_bs = batch_spec.size
    self.updates_per_optimize = int(self.replay_ratio * sampler_bs /
        self.batch_size)
    logger.log(f"From sampler batch size {sampler_bs}, training "
        f"batch size {self.batch_size}, and replay ratio "
        f"{self.replay_ratio}, computed {self.updates_per_optimize} "
        "updates per iteration.")
    self.min_itr_learn = self.min_steps_learn // sampler_bs
    agent.give_min_itr_learn(self.min_itr_learn)
    self.initialize_replay_buffer(examples, batch_spec)
    self.optim_initialize(rank)

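# Worked example (illustrative numbers) of the docstring formula: with
# replay_ratio=8, a sampler batch of T=5 * B=16 = 80 steps, and a training
# batch of 32, each iteration runs int(8 * 80 / 32) = 20 gradient updates.
replay_ratio, sampler_bs, train_bs = 8, 5 * 16, 32
updates_per_optimize = int(replay_ratio * sampler_bs / train_bs)
assert updates_per_optimize == 20
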
def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
        ):
    # World size is used in async samplers; not relevant for this class.
    assert world_size == 1
    T, B = self.batch_spec
    self.agent = agent
    self.env = self.EnvCls(batch_T=T, batch_B=B, **self.env_kwargs)
    env_ranks = list(range(rank * B, (rank + 1) * B))
    agent.initialize(self.env.spaces, share_memory=False,
        global_B=B, env_ranks=env_ranks)
    self.samples_pyt, self.samples_np, examples = build_samples_buffer(
        agent, self.env, self.batch_spec, bootstrap_value,
        agent_shared=False, env_shared=False, subprocess=False,
        examples=self._get_example_outputs())
    self.samples_np.env.done[:-1, :] = False
    self.samples_np.env.done[-1, :] = True
    self.traj_info_kwargs = traj_info_kwargs
    self.agent_inputs = AgentInputs(
        buffer_from_example(examples["observation"], (B,)),
        buffer_from_example(examples["action"], (B,)),
        buffer_from_example(examples["reward"], (B,)))
    self._start_agent(B, env_ranks)
    logger.log("BatchedEpisodicSampler initialized.")
    return examples

def initialize(self, affinity):
    p = psutil.Process()
    if affinity.get("set_affinity", True):
        p.cpu_affinity(affinity["master_cpus"])
    # torch.set_num_threads(affinity["master_torch_threads"])
    torch.set_num_threads(1)  # Needed to prevent MKL hang.
    B = self.batch_spec.B
    envs = [self.EnvCls(**self.env_kwargs) for _ in range(B)]
    sync = AttrDict(db_idx=AttrDict(value=0))  # Mimic the mp.RawValue format.
    collector = self.CollectorCls(
        rank=0,
        envs=envs,
        samples_np=self.double_buffer,
        batch_T=self.batch_spec.T,
        TrajInfoCls=self.TrajInfoCls,
        agent=self.agent,
        sync=sync,
    )
    if self.eval_n_envs > 0:
        eval_envs = [self.EnvCls(**self.eval_env_kwargs)
            for _ in range(self.eval_n_envs)]
        eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
        self.eval_collector = eval_CollectorCls(
            envs=eval_envs,
            agent=self.agent,
            TrajInfoCls=self.TrajInfoCls,
            max_T=self.eval_max_steps // self.eval_n_envs,
            max_trajectories=self.eval_max_trajectories,
        )
    self.agent.to_device(cuda_idx=affinity.get("cuda_idx", None))
    self.agent.async_cpu(share_memory=False)
    agent_inputs, traj_infos = collector.start_envs(
        self.max_decorrelation_steps)
    collector.start_agent()
    self.collector = collector
    self.agent_inputs = agent_inputs
    self.traj_infos = traj_infos
    self.sync = sync
    logger.log("Serial sampler initialized.")

def start_envs(self, max_decorrelation_steps=0):
    """Calls ``reset()`` on every environment instance, then steps each one
    through a random number of random actions, and returns the resulting
    agent_inputs buffer (`observation`, `prev_action`, `prev_reward`)."""
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, obs in enumerate(observations):
        observation[b] = obs  # numpy array or namedarraytuple
    prev_action = np.stack([env.action_space.null_value()
        for env in self.envs])
    prev_reward = np.zeros(len(self.envs), dtype="float32")
    if self.rank == 0:
        logger.log("Sampler decorrelating envs, max steps: "
            f"{max_decorrelation_steps}")
    if max_decorrelation_steps != 0:
        for b, env in enumerate(self.envs):
            n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
            for _ in range(n_steps):
                a = env.action_space.sample()
                o, r, d, info = env.step(a)
                traj_infos[b].step(o, a, r, d, None, info)
                if getattr(info, "traj_done", d):
                    o = env.reset()
                    traj_infos[b] = self.TrajInfoCls()
                if ((type(d) is np.ndarray and d.any()) or
                        (type(d) is bool and d)):
                    a = env.action_space.null_value()
                    r = 0
            observation[b] = o
            prev_action[b] = a
            prev_reward[b] = r
    # For action-server samplers.
    if hasattr(self, "step_buffer_np") and self.step_buffer_np is not None:
        self.step_buffer_np.observation[:] = observation
        self.step_buffer_np.action[:] = prev_action
        self.step_buffer_np.reward[:] = prev_reward
    return AgentInputs(observation, prev_action, prev_reward), traj_infos

def serve_actions_evaluation(self, itr):
    obs_ready, act_ready = self.sync.obs_ready, self.sync.act_ready
    step_np, step_pyt = self.eval_step_buffer_np, self.eval_step_buffer_pyt
    traj_infos = list()
    self.agent.reset()
    agent_inputs = AgentInputs(step_pyt.observation, step_pyt.action,
        step_pyt.reward)  # Fixed buffer objects.
    for t in range(self.eval_max_T):
        if t % EVAL_TRAJ_CHECK == 0:  # (While workers stepping.)
            traj_infos.extend(drain_queue(self.eval_traj_infos_queue,
                guard_sentinel=True))
        for b in obs_ready:
            b.acquire()
            # assert not b.acquire(block=False)  # Debug check.
        for b_reset in np.where(step_np.done)[0]:
            step_np.action[b_reset] = 0  # Null prev_action.
            step_np.reward[b_reset] = 0  # Null prev_reward.
            self.agent.reset_one(idx=b_reset)
        action, agent_info = self.agent.step(*agent_inputs)
        step_np.action[:] = action
        step_np.agent_info[:] = agent_info
        if (self.eval_max_trajectories is not None and
                t % EVAL_TRAJ_CHECK == 0):
            self.sync.stop_eval.value = (len(traj_infos) >=
                self.eval_max_trajectories)
        for w in act_ready:
            # assert not w.acquire(block=False)  # Debug check.
            w.release()
        if self.sync.stop_eval.value:
            logger.log("Evaluation reached max num trajectories "
                f"({self.eval_max_trajectories}).")
            break
    if t == self.eval_max_T - 1 and self.eval_max_trajectories is not None:
        logger.log("Evaluation reached max num time steps "
            f"({self.eval_max_T}).")
    for b in obs_ready:
        b.acquire()  # Workers always do extra release; drain it.
        assert not b.acquire(block=False)  # Debug check.
    for w in act_ready:
        assert not w.acquire(block=False)  # Debug check.
    return traj_infos

def log_diagnostics(self, itr, val_info, *args, **kwargs):
    self.save_itr_snapshot(itr)
    new_time = time.time()
    self._cum_time = new_time - self._start_time
    epochs = itr * self.algo.batch_size / (self.algo.replay_buffer.size *
        (1 - self.algo.validation_split))
    logger.record_tabular("Iteration", itr)
    logger.record_tabular("Epochs", epochs)
    logger.record_tabular("CumTime (s)", self._cum_time)
    logger.record_tabular("UpdatesPerSecond", itr / self._cum_time)
    if self._opt_infos:
        for k, v in self._opt_infos.items():
            logger.record_tabular_misc_stat(k, v)
    for k, v in zip(val_info._fields, val_info):
        logger.record_tabular_misc_stat("val_" + k, v)
    self._opt_infos = {k: list() for k in self._opt_infos}  # (Reset.)
    logger.dump_tabular(with_prefix=False)
    if itr < self.n_updates - 1:
        logger.log(f"Optimizing over {self.log_interval_updates} iterations.")
        self.pbar = ProgBarCounter(self.log_interval_updates)

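# Worked example (illustrative numbers) of the epoch count above: after
# itr=50,000 updates with batch 256, a replay buffer of 1,000,000 samples,
# and a 10% validation split, the learner has made about 14.22 passes over
# the training partition.
itr, batch_size, buffer_size, validation_split = 50_000, 256, 1_000_000, 0.1
epochs = itr * batch_size / (buffer_size * (1 - validation_split))
assert round(epochs, 2) == 14.22
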
def initialize_worker(rank, seed=None, cpu=None, torch_threads=None):
    log_str = f"Sampler rank {rank} initialized"
    cpu = [cpu] if isinstance(cpu, int) else cpu
    p = psutil.Process()
    try:
        if cpu is not None:
            p.cpu_affinity(cpu)
        cpu_affin = p.cpu_affinity()
    except AttributeError:
        cpu_affin = "UNAVAILABLE MacOS"
    log_str += f", CPU affinity {cpu_affin}"
    torch_threads = (1 if torch_threads is None and cpu is not None else
        torch_threads)  # Default to 1 to avoid possible MKL hang.
    if torch_threads is not None:
        torch.set_num_threads(torch_threads)
    log_str += f", Torch threads {torch.get_num_threads()}"
    if seed is not None:
        set_seed(seed)
        time.sleep(0.3)  # (So the printing from set_seed is not intermixed.)
        log_str += f", Seed {seed}"
    logger.log(log_str)

def evaluate_agent(self, itr):
    """Evaluate the agent.

    :param itr: Iteration number.
    :return: A tuple of trajectory infos and the time spent in evaluation.
    """
    if itr > 0:
        self.pbar.stop()  # Stop the progress bar.
    if itr >= self.min_itr_learn - 1 or itr == 0:
        logger.log("Evaluating agent...")
        self.agent.eval_mode(itr)  # Might be agent in sampler.
        eval_time = -time.time()
        traj_infos = self.sampler.evaluate_agent(itr)  # The actual evaluation.
        eval_time += time.time()  # Now holds the elapsed time of the call above.
    else:
        traj_infos = []
        eval_time = 0.0
    logger.log("Evaluation runs complete.")
    return traj_infos, eval_time

def data_parallel(self):
    """Wraps the intrinsic bonus model with PyTorch's
    DistributedDataParallel.  The intention is for rlpyt to create a
    separate Python process to drive each GPU (or CPU-group for CPU-only,
    MPI-like configuration).  Typically called in the runner during startup.
    """
    super().data_parallel()
    if self.device.type == "cpu":
        self.bonus_model = DDPC(self.bonus_model)
        logger.log("Initialized DistributedDataParallelCPU intrinsic "
            "bonus model.")
    else:
        self.bonus_model = DDP(self.bonus_model,
            device_ids=[self.device.index],
            output_device=self.device.index)
        logger.log("Initialized DistributedDataParallel intrinsic bonus "
            f"model on device {self.device}.")

def shutdown(self):
    self.pbar.stop()
    logger.log("Master optimizer shutting down, joining sampler process...")
    self.sampler_proc.join()
    logger.log("Joining memory copiers...")
    for p in self.memcpy_procs:
        p.join()
    if self.ctrl.opt_throttle is not None:
        logger.log("Joining optimizer processes...")
        self.ctrl.quit_opt.value = True
        self.ctrl.opt_throttle.wait()
        for p in self.optimizer_procs:
            p.join()
    logger.log("All processes shut down.  Training complete.")

def __init__(
        self,
        image_shape,
        action_size,
        hidden_sizes=512,
        stop_conv_grad=False,
        channels=None,  # Defaults below.
        kernel_sizes=None,
        strides=None,
        paddings=None,
        kiaming_init=True,
        normalize_conv_out=False,
        ):
    super().__init__()
    c, h, w = image_shape
    self.conv = Conv2dModel(
        in_channels=c,
        channels=channels or [32, 64, 64],
        kernel_sizes=kernel_sizes or [8, 4, 3],
        strides=strides or [4, 2, 1],
        paddings=paddings,
    )
    self._conv_out_size = self.conv.conv_out_size(h=h, w=w)
    self.pi_v_mlp = MlpModel(
        input_size=self._conv_out_size,
        hidden_sizes=hidden_sizes,
        output_size=action_size + 1,
    )
    if kiaming_init:
        self.apply(weight_init)
    self.stop_conv_grad = stop_conv_grad
    logger.log("Model stopping gradient at CONV." if stop_conv_grad
        else "Model using gradients on all parameters.")
    if normalize_conv_out:  # Haven't seen this make a difference yet.
        logger.log("Model normalizing conv output across all pixels.")
        self.conv_rms = RunningMeanStdModel((1,))
        self.var_clip = 1e-6
    self.normalize_conv_out = normalize_conv_out

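# Worked example (illustrative input size): for an 84x84 input with the
# default channels [32, 64, 64], kernels [8, 4, 3], strides [4, 2, 1], and
# no padding, each conv layer maps spatial size s to (s - k) // stride + 1,
# so the MLP input above is 64 * 7 * 7 = 3136 features.
def conv_out(s, k, stride):
    return (s - k) // stride + 1

s = 84
for k, stride in zip([8, 4, 3], [4, 2, 1]):
    s = conv_out(s, k, stride)  # 84 -> 20 -> 9 -> 7
assert 64 * s * s == 3136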