def _startup(self): """ Setup the system and connect all components """ # system setup logger.log(f"Runner{getattr(self, 'rank', '')} master Torch threads: " f"{torch.get_num_threads()}.") # view cuda configuration for this environment logger.log(f"Runner{getattr(self, 'rank', '')} CUDA_VISIBLE_DEVICES: " f"{os.environ.get('CUDA_VISIBLE_DEVICES', '')}.") set_gpu_from_visibles(self.affinity.get("cuda_idx", 0)) # components setup from examples traj_example, info_example, env_space = self.sampler.make_trajectory_example( ) self.agent.initialize(*env_space) self.algo.initialize(self.agent) self.sampler.initialize(self.agent) # post components setup if torch.cuda.is_available(): device = torch.device("cuda") self.agent.to(device=device) self.agent.sample_mode() # logging memory setup self._env_infos = {k: list() for k in info_example._fields} self._train_infos = {k: list() for k in self.algo.train_info_fields} self.epoch_i = 0
def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples,
               world_size=1, rank=0):
    """Stores input arguments and initializes replay buffer and optimizer.
    Use in non-async runners.  Computes number of gradient updates per
    optimization iteration as `(replay_ratio * sampler-batch-size /
    training-batch_size)`.
    """
    self.agent = agent
    self.n_itr = n_itr
    self.mid_batch_reset = mid_batch_reset
    self.sampler_bs = sampler_bs = batch_spec.size
    # max(1, ...) guarantees at least one gradient update per iteration.
    self.updates_per_optimize = max(
        1, round(self.replay_ratio * sampler_bs / self.batch_size))
    logger.log(
        f"From sampler batch size {sampler_bs}, training "
        f"batch size {self.batch_size}, and replay ratio "
        f"{self.replay_ratio}, computed {self.updates_per_optimize} "
        f"updates per iteration.")
    self.min_itr_learn = int(self.min_steps_learn // sampler_bs)
    # Agent give min itr learn.?
    # NOTE(review): a sibling initialize() in this file calls
    # agent.give_min_itr_learn(self.min_itr_learn) at this point --
    # confirm whether the omission here is intentional.
    self.initialize_replay_buffer(examples, batch_spec)
    self.optim_initialize(rank)
def _get_n_envs_lists(self, affinity):
    """Split the total env count across action servers, assigning any
    remainder in pairs, and return one n_envs_list per server (each
    produced by ``self._get_n_envs_list``)."""
    for aff in affinity:
        assert aff.get("alternating", False), "Need alternating affinity."
    B = self.batch_spec.B
    n_server = len(affinity)
    n_workers = [len(aff["workers_cpus"]) for aff in affinity]
    if B < n_server:
        raise ValueError(f"Request fewer envs ({B}) than action servers "
                         f"({n_server}).")
    if n_workers.count(n_workers[0]) != len(n_workers):
        logger.log(
            "WARNING: affinity requested different number of "
            "environment workers per action server, but environments "
            "will be assigned equally across action servers anyway.")
    server_Bs = [B // n_server for _ in range(n_server)]
    extra = B % n_server
    if extra > 0:
        assert extra % 2 == 0, "Need even num extra envs per server."
        for s in range(extra // 2):
            server_Bs[s] += 2  # Spread across action servers in pairs.
    return [self._get_n_envs_list(n_worker=s_worker, B=s_B)
            for s_worker, s_B in zip(n_workers, server_Bs)]
def initialize_worker(rank, seed=None, cpu=None, torch_threads=None):
    """Assign CPU affinity, set random seed, set torch_threads if needed to
    prevent MKL deadlock.

    :param rank: worker index (used only in the log message).
    :param seed: if given, seeds this process via ``set_seed``.
    :param cpu: int or list of ints; core(s) to pin this process to.
    :param torch_threads: Torch thread count; defaults to 1 when pinned.
    """
    log_str = f"Sampler rank {rank} initialized"
    cpu = [cpu] if isinstance(cpu, int) else cpu
    p = psutil.Process()
    try:
        if cpu is not None:
            p.cpu_affinity(cpu)
        cpu_affin = p.cpu_affinity()
    except AttributeError:
        # psutil has no cpu_affinity on MacOS.
        cpu_affin = "UNAVAILABLE MacOS"
    log_str += f", CPU affinity {cpu_affin}"
    torch_threads = (
        1 if torch_threads is None and cpu is not None else torch_threads
    )  # Default to 1 to avoid possible MKL hang.
    if torch_threads is not None:
        torch.set_num_threads(torch_threads)
    log_str += f", Torch threads {torch.get_num_threads()}"
    if seed is not None:
        set_seed(seed)
        time.sleep(0.3)  # (so the printing from set_seed is not intermixed)
        log_str += f", Seed {seed}"
    logger.log(log_str)
def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples,
               world_size=1, rank=0):
    """Stores input arguments, computes gradient updates per iteration,
    and initializes the replay buffer and optimizer.  Used in basic or
    synchronous multi-GPU runners, not async.

    Parameters
    ----------
    agent: SacAgent
    """
    self.agent = agent
    self.n_itr = n_itr
    self.mid_batch_reset = mid_batch_reset
    self.sampler_bs = sampler_bs = batch_spec.size
    # Guard with max(1, ...): plain int() truncation could yield 0 updates
    # per iteration (silently disabling learning) when
    # replay_ratio * sampler_bs < batch_size.  Matches the sibling
    # initialize() elsewhere in this file.
    self.updates_per_optimize = max(
        1, int(self.replay_ratio * sampler_bs / self.batch_size))
    logger.log(
        f"From sampler batch size {sampler_bs}, training "
        f"batch size {self.batch_size}, and replay ratio "
        f"{self.replay_ratio}, computed {self.updates_per_optimize} "
        f"updates per iteration.")
    self.min_itr_learn = self.min_steps_learn // sampler_bs
    agent.give_min_itr_learn(self.min_itr_learn)  # Agent delays exploration switch.
    self.initialize_replay_buffer(examples, batch_spec)
    self.optim_initialize(rank)
def run_async_sampler_eval(sampler, affinity, ctrl, traj_infos_queue, n_itr,
                           eval_itrs):
    """Target for the asynchronous sampler process when offline evaluation
    is used.  Alternates writing into the two sample buffers (double
    buffering), runs agent evaluation every ``eval_itrs`` iterations, and
    forwards trajectory infos to the master via ``traj_infos_queue``.
    """
    sampler.initialize(affinity)
    db_idx = 0
    for itr in range(n_itr + 1):  # +1 to get last eval :)
        # Wait until the memory copier has drained this buffer.
        ctrl.sample_copied[db_idx].acquire()
        # assert not ctrl.sample_copied[db_idx].acquire(block=False)  # Debug check.
        sampler.obtain_samples(itr, db_idx)
        ctrl.sample_ready[db_idx].release()  # Signal copier: buffer full.
        if itr % eval_itrs == 0:
            eval_time = -time.time()
            traj_infos = sampler.evaluate_agent(itr)
            eval_time += time.time()
            ctrl.eval_time.value += eval_time  # Not atomic but only writer.
            # Hold the itr lock so the master observes a consistent
            # (queued infos, sampler_itr) pair.
            with ctrl.sampler_itr.get_lock():
                for traj_info in traj_infos:
                    traj_infos_queue.put(traj_info)
                traj_infos_queue.put(None)  # Master will get until None sentinel.
                ctrl.sampler_itr.value = itr
        else:
            ctrl.sampler_itr.value = itr
        db_idx ^= 1  # Double buffer
    logger.log(f"Async sampler reached final itr: {itr + 1}, quitting.")
    ctrl.quit.value = True  # This ends the experiment.
    sampler.shutdown()
    for s in ctrl.sample_ready:
        s.release()  # Let memcpy workers finish and quit.
def build_and_train(affinity_code, log_dir, run_ID, **kwargs):
    """Demo experiment entry point: decode the launcher-assigned affinity,
    load the variant config, and log dummy metrics under a logger context.

    :param affinity_code: encoded hardware-resource assignment from the launcher.
    :param log_dir: directory holding the variant config; receives progress.csv.
    :param run_ID: index distinguishing repeated runs of the same variant.
    """
    # Preference: put all tunable default configs into the launch file.
    # Acquire the affinity assigned by the launcher.
    # NOTE: If the affinity is a list, it means multiple resources (gpu)
    # were assigned to the current experiment.
    affinity = affinity_from_code(affinity_code)
    # `config` is a dictionary in the same structure as the default
    # configuration defined at launch.
    config = load_variant(log_dir)
    name = "demo_experiment"
    # The GPU recommended for this experiment; when multiple resources were
    # assigned, `affinity` is a list -- take the first entry rather than
    # indexing the list with a string (which would raise TypeError).
    gpu_idx = (affinity[0] if isinstance(affinity, list) else affinity)["cuda_idx"]
    # Under a logger context, run the experiment.
    with logger_context(log_dir, run_ID, name, config):
        logger.log("Start running experiment")
        logger.log(f"Recommended GPU index: {gpu_idx}")
        for epoch_i in range(10):
            # Log a scalar with this function, for example.
            logger.record_tabular("metric1", epoch_i, epoch_i)
            # Dump all logs into the csv file (this is the exact function
            # that writes one line into progress.csv).
            logger.dump_tabular()
def initialize_replay_buffer(self, examples, batch_spec, async_=False):
    """Allocate the uniform replay buffer from example samples, choosing
    the timeout-aware variant when ``self.bootstrap_timelimit`` is set
    (async classes when ``async_``), unless ``self.ReplayBufferCls``
    overrides the selection."""
    buffer_example = SamplesToBuffer(
        observation=examples["observation"],
        action=examples["action"],
        reward=examples["reward"],
        done=examples["done"],
        timeout=getattr(examples["env_info"], "timeout", None),
    )
    if self.bootstrap_timelimit:
        ReplayCls = AsyncTlUniformReplayBuffer if async_ else TlUniformReplayBuffer
    else:
        ReplayCls = AsyncUniformReplayBuffer if async_ else UniformReplayBuffer
    if self.ReplayBufferCls is not None:
        ReplayCls = self.ReplayBufferCls
        logger.log(
            f"WARNING: ignoring internal selection logic and using"
            f" input replay buffer class: {ReplayCls} -- compatibility not"
            " guaranteed.")
    self.replay_buffer = ReplayCls(
        example=buffer_example,
        size=self.replay_size,
        B=batch_spec.B,
        n_step_return=self.n_step_return,
    )
def evaluate_agent(self, itr):
    """Signal worker processes to perform agent evaluation.  If a max
    number of evaluation trajectories was specified, keep watch over the
    number of trajectories finished and signal an early end if the limit
    is reached.  Return a list of trajectory-info objects from the
    completed episodes.
    """
    self.ctrl.itr.value = itr
    self.ctrl.do_eval.value = True
    self.sync.stop_eval.value = False
    self.ctrl.barrier_in.wait()  # Release workers into evaluation.
    traj_infos = list()
    if self.eval_max_trajectories is not None:
        while True:
            time.sleep(EVAL_TRAJ_CHECK)  # Polling period for finished trajs.
            traj_infos.extend(
                drain_queue(self.eval_traj_infos_queue, guard_sentinel=True))
            if len(traj_infos) >= self.eval_max_trajectories:
                self.sync.stop_eval.value = True  # Workers poll this flag.
                logger.log("Evaluation reached max num trajectories "
                           f"({self.eval_max_trajectories}).")
                break  # Stop possibly before workers reach max_T.
            # If all parties but this one are already at the exit barrier,
            # the workers finished max_T on their own.
            if self.ctrl.barrier_out.parties - self.ctrl.barrier_out.n_waiting == 1:
                logger.log("Evaluation reached max num time steps "
                           f"({self.eval_max_T}).")
                break  # Workers reached max_T.
    self.ctrl.barrier_out.wait()
    # Drain remaining infos; each worker posts one sentinel.
    traj_infos.extend(
        drain_queue(self.eval_traj_infos_queue, n_sentinel=self.n_worker))
    self.ctrl.do_eval.value = False
    return traj_infos
def initialize_replay_buffer(self, examples, batch_spec, async_=False):
    """Allocate the frame-based replay buffer from example samples.

    Frame-wise buffers store only unique frames, using less memory when
    observations are stacks of recent frames (only the newest frame is
    distinct from the previous observation).  Chooses a prioritized or
    uniform frame buffer (async variants when ``async_``), unless
    ``self.ReplayBufferCls`` overrides the selection.
    """
    replay_kwargs = dict(
        example=self.examples_to_buffer(examples),
        size=self.replay_size,
        B=batch_spec.B,
        discount=self.discount,
        n_step_return=self.n_step_return,
    )
    if self.prioritized_replay:
        replay_kwargs["alpha"] = self.pri_alpha
        replay_kwargs["beta"] = self.pri_beta_init
        replay_kwargs["default_priority"] = self.default_priority
        ReplayCls = (AsyncPrioritizedReplayFrameBuffer if async_
                     else PrioritizedReplayFrameBuffer)
    else:
        ReplayCls = (AsyncUniformReplayFrameBuffer if async_
                     else UniformReplayFrameBuffer)
    if self.ReplayBufferCls is not None:
        ReplayCls = self.ReplayBufferCls
        logger.log(
            f"WARNING: ignoring internal selection logic and using"
            f" input replay buffer class: {ReplayCls} -- compatibility not"
            " guaranteed.")
    self.replay_buffer = ReplayCls(**replay_kwargs)
def sample_mode(self, itr):
    """Enter sampling mode: eval() the critic and set the exploration
    std (pretrain std before learning begins, action std afterward)."""
    super().sample_mode(itr)
    self.q2_model.eval()
    if itr < self.min_itr_learn:
        std = self.pretrain_std
    else:
        std = self.action_std
    if itr in (0, self.min_itr_learn):
        logger.log(f"Agent at itr {itr}, sample std: {std}.")
    self.distribution.set_std(std)
def initialize(self, affinity):
    """Initialization inside the main sampler process.  Builds one level
    of parallel synchronization objects, and forks action-server
    processes, one per GPU to be used.
    """
    torch.set_num_threads(1)  # Needed to avoid MKL hang :( .
    self.world_size = n_server = len(affinity)
    n_envs_lists = self._get_n_envs_lists(affinity)
    # NOTE(review): n_server re-derived here; appears equal to len(affinity)
    # set above -- confirm.
    n_server = len(n_envs_lists)
    n_worker = sum([len(n_envs_list) for n_envs_list in n_envs_lists])
    self.n_worker = n_worker
    if self.eval_n_envs > 0:
        # Distribute eval envs evenly over workers (at least 1 each);
        # totals are rounded to a multiple of n_worker.
        self.eval_n_envs_per = max(1, self.eval_n_envs // n_worker)
        self.eval_n_envs = eval_n_envs = self.eval_n_envs_per * n_worker
        logger.log(f"Total parallel evaluation envs: {eval_n_envs}.")
        self.eval_max_T = eval_max_T = int(self.eval_max_steps // eval_n_envs)
    self._build_parallel_ctrl(n_server, n_worker)
    servers_kwargs = self._assemble_servers_kwargs(affinity, self.seed,
                                                   n_envs_lists)
    servers = [mp.Process(target=self.action_server_process, kwargs=s_kwargs)
               for s_kwargs in servers_kwargs]
    for s in servers:
        s.start()
    self.servers = servers
    self.ctrl.barrier_out.wait()  # Wait for workers to decorrelate envs.
def initialize_logging(self):
    """Reset trajectory-window accumulators, defer to base logging setup,
    and start the progress bar for the first logging interval."""
    self._traj_infos = deque(maxlen=self.log_traj_window)
    self._cum_completed_trajs = 0
    self._new_completed_trajs = 0
    super().initialize_logging()
    logger.log(
        f"Optimizing over {self.log_interval_itrs} sampler iterations.")
    self.pbar = ProgBarCounter(self.log_interval_itrs)
def get_n_itr(self):
    """Convert the step-based budgets into sampler iteration counts and
    return the total number of iterations to run."""
    self.log_interval_itrs = max(
        self.log_interval_steps // self.sampler_batch_size, 1)
    self.n_itr = (math.ceil(self.n_steps / self.log_interval_steps)
                  * self.log_interval_itrs)
    logger.log(f"Running {self.n_itr} sampler iterations.")
    return self.n_itr
def save_itr_snapshot(self, itr):
    """Calls the logger to save training checkpoint/snapshot (logger
    itself may or may not save, depending on mode selected)."""
    logger.log("saving snapshot...")
    logger.save_itr_params(itr, self.get_itr_snapshot(itr))
    logger.log("saved")
def _save_epoch_snapshot(self, epoch_i):
    """Calls the logger to save training checkpoint/snapshot for this
    epoch (logger itself may or may not save, depending on mode
    selected)."""
    logger.log("saving snapshot...")
    logger.save_itr_params(epoch_i, self._get_epoch_snapshot(epoch_i))
    logger.log("saved")
def _load_snapshot(self, filename):
    """Load parameters from a snapshot and continue training from it.

    Restores the epoch counter, agent state, and algo state.

    NOTE: ``filename`` must be an absolute path, and this must be called
    after ``_startup`` (agent/algo must already exist).
    """
    # Log before loading so a failure inside torch.load is attributable
    # to the named file (original logged only after a successful load).
    logger.log("Loading snapshot from {}".format(filename))
    state_dict = torch.load(filename)
    self.epoch_i = state_dict["epoch_i"]
    self.agent.load_state_dict(state_dict["agent_state_dict"])
    self.algo.load_state_dict(state_dict["algo_state_dict"])
def log_diagnostics(self, itr, sampler_itr, throttle_time, prefix='Diagnostics/'):
    """Record evaluation tabulars (steps, trajs, cumulative eval time),
    defer to the base class, then clear stored trajectory infos."""
    if not self._traj_infos:
        logger.log("WARNING: had no complete trajectories in eval.")
    eval_steps = sum(info["Length"] for info in self._traj_infos)
    with logger.tabular_prefix(prefix):
        logger.record_tabular('StepsInEval', eval_steps)
        logger.record_tabular('TrajsInEval', len(self._traj_infos))
        logger.record_tabular('CumEvalTime', self.ctrl.eval_time.value)
    super().log_diagnostics(itr, sampler_itr, throttle_time, prefix=prefix)
    self._traj_infos = list()  # Clear after each eval.
def serve_actions_evaluation(self, itr):
    """Action-server loop for offline evaluation with alternating worker
    groups.  Steps the agent for each group in turn, watches for
    completed trajectories, and signals early stop when the trajectory
    limit is reached.  Returns the collected trajectory infos.
    """
    obs_ready, act_ready = self.sync.obs_ready, self.sync.act_ready
    obs_ready_pair = self.obs_ready_pair
    act_ready_pair = self.act_ready_pair
    step_np_pair = self.eval_step_buffer_np_pair
    agent_inputs_pair = self.eval_agent_inputs_pair
    traj_infos = list()
    self.agent.reset()
    stop = False
    for t in range(self.eval_max_T):
        if t % EVAL_TRAJ_CHECK == 0:  # (While workers stepping.)
            traj_infos.extend(drain_queue(self.eval_traj_infos_queue,
                                          guard_sentinel=True))
        for alt in range(2):
            step_h = step_np_pair[alt]
            for b in obs_ready_pair[alt]:
                b.acquire()  # Wait for this group's observations.
                # assert not b.acquire(block=False)  # Debug check.
            for b_reset in np.where(step_h.done)[0]:
                step_h.action[b_reset] = 0  # Null prev_action.
                step_h.reward[b_reset] = 0  # Null prev_reward.
                self.agent.reset_one(idx=b_reset)
            action, agent_info = self.agent.step(*agent_inputs_pair[alt])
            step_h.action[:] = action
            step_h.agent_info[:] = agent_info
            if (self.eval_max_trajectories is not None and
                    t % EVAL_TRAJ_CHECK == 0 and alt == 0):
                if len(traj_infos) >= self.eval_max_trajectories:
                    for b in obs_ready_pair[1 - alt]:
                        b.acquire()  # Now all workers waiting.
                    self.sync.stop_eval.value = stop = True
                    # FIX: release this group's act semaphores via the
                    # pair partition; the flat act_ready list holds single
                    # semaphores, so iterating act_ready[alt] (as before)
                    # would fail at runtime.
                    for w in act_ready_pair[alt]:
                        w.release()
                    break
            for w in act_ready_pair[alt]:
                # assert not w.acquire(block=False)  # Debug check.
                w.release()  # Let this group's workers step envs.
        if stop:
            logger.log("Evaluation reached max num trajectories "
                       f"({self.eval_max_trajectories}).")
            break  # TODO: check exit logic for/while ..?
    if not stop:
        logger.log("Evaluation reached max num time steps "
                   f"({self.eval_max_T}).")
    for b in obs_ready:
        b.acquire()  # Workers always do extra release; drain it.
        assert not b.acquire(block=False)  # Debug check.
    for w in act_ready:
        assert not w.acquire(block=False)  # Debug check.
    return traj_infos
def sample_mode(self, itr):
    """Enter sampling mode: eval() the critic/value nets and set the
    policy std (pretrain std before learning starts, learned std after)."""
    super().sample_mode(itr)
    for model in (self.q1_model, self.q2_model, self.v_model):
        model.eval()
    if itr == 0:
        logger.log(f"Agent at itr {itr}, sample std: {self.pretrain_std}")
    if itr == self.min_itr_learn:
        logger.log(f"Agent at itr {itr}, sample std: learned.")
    std = self.pretrain_std if itr < self.min_itr_learn else None
    self.distribution.set_std(std)  # If None: std from policy dist_info.
def evaluate_agent(self, itr):
    """Record offline evaluation of agent performance via
    ``sampler.evaluate_agent()``; returns (traj_infos, eval_time)."""
    if itr > 0:
        self.pbar.stop()
    logger.log("Evaluating agent...")
    self.agent.eval_mode(itr)  # Might be agent in sampler.
    t_start = time.time()
    traj_infos = self.sampler.evaluate_agent(itr)
    eval_time = time.time() - t_start
    logger.log("Evaluation runs complete.")
    return traj_infos, eval_time
def log_diagnostics(self, itr, traj_infos=None, eval_time=0,
                    prefix='Diagnostics/'):
    """
    Write diagnostics (including stored ones) to csv via the logger.

    Computes per-interval throughput (steps/updates per second) from the
    deltas since the previous call, saves a snapshot, and restarts the
    progress bar for the next interval.
    """
    if itr > 0:
        self.pbar.stop()
    self.save_itr_snapshot(itr)
    new_time = time.time()
    self._cum_time = new_time - self._start_time
    # Training time excludes the eval time spent this interval.
    train_time_elapsed = new_time - self._last_time - eval_time
    new_updates = self.algo.update_counter - self._last_update_counter
    new_samples = (self.sampler.batch_size * self.world_size *
                   self.log_interval_itrs)
    # Rates undefined on the first iteration (no elapsed interval yet).
    updates_per_second = (float('nan') if itr == 0 else
                          new_updates / train_time_elapsed)
    samples_per_second = (float('nan') if itr == 0 else
                          new_samples / train_time_elapsed)
    replay_ratio = (new_updates * self.algo.batch_size * self.world_size /
                    new_samples)
    cum_replay_ratio = (self.algo.batch_size * self.algo.update_counter /
                        ((itr + 1) * self.sampler.batch_size)
                        )  # world_size cancels.
    cum_steps = (itr + 1) * self.sampler.batch_size * self.world_size
    with logger.tabular_prefix(prefix):
        if self._eval:
            logger.record_tabular(
                'CumTrainTime',
                self._cum_time - self._cum_eval_time)  # Already added new eval_time.
        logger.record_tabular('Iteration', itr)
        logger.record_tabular('CumTime (s)', self._cum_time)
        logger.record_tabular('CumSteps', cum_steps)
        logger.record_tabular('CumCompletedTrajs', self._cum_completed_trajs)
        logger.record_tabular('CumUpdates', self.algo.update_counter)
        logger.record_tabular('StepsPerSecond', samples_per_second)
        logger.record_tabular('UpdatesPerSecond', updates_per_second)
        logger.record_tabular('ReplayRatio', replay_ratio)
        logger.record_tabular('CumReplayRatio', cum_replay_ratio)
    self._log_infos(traj_infos)
    logger.dump_tabular(with_prefix=False)
    # Roll interval bookkeeping forward for the next call.
    self._last_time = new_time
    self._last_update_counter = self.algo.update_counter
    if itr < self.n_itr - 1:
        logger.log(f"Optimizing over {self.log_interval_itrs} iterations.")
        self.pbar = ProgBarCounter(self.log_interval_itrs)
def startup(self):
    """
    Sets hardware affinities, initializes the following: 1) sampler (which
    should initialize the agent), 2) agent device and data-parallel wrapper
    (if applicable), 3) algorithm, 4) logger.

    Returns the total number of training iterations, ``n_itr``.
    """
    p = psutil.Process()
    try:
        if (self.affinity.get("master_cpus", None) is not None and
                self.affinity.get("set_affinity", True)):
            p.cpu_affinity(self.affinity["master_cpus"])
        cpu_affin = p.cpu_affinity()
    except AttributeError:
        # psutil has no cpu_affinity on MacOS.
        cpu_affin = "UNAVAILABLE MacOS"
    logger.log(f"Runner {getattr(self, 'rank', '')} master CPU affinity: "
               f"{cpu_affin}.")
    if self.affinity.get("master_torch_threads", None) is not None:
        torch.set_num_threads(self.affinity["master_torch_threads"])
    logger.log(f"Runner {getattr(self, 'rank', '')} master Torch threads: "
               f"{torch.get_num_threads()}.")
    if self.seed is None:
        self.seed = make_seed()
    set_seed(self.seed)
    self.rank = rank = getattr(self, "rank", 0)
    self.world_size = world_size = getattr(self, "world_size", 1)
    examples = self.sampler.initialize(
        agent=self.agent,  # Agent gets initialized in sampler.
        affinity=self.affinity,
        seed=self.seed + 1,  # Sampler seed offset from master seed.
        bootstrap_value=getattr(self.algo, "bootstrap_value", False),
        traj_info_kwargs=self.get_traj_info_kwargs(),
        rank=rank,
        world_size=world_size,
    )
    self.itr_batch_size = self.sampler.batch_spec.size * world_size
    n_itr = self.get_n_itr()
    self.agent.to_device(self.affinity.get("cuda_idx", None))
    if world_size > 1:
        self.agent.data_parallel()
    self.algo.initialize(
        agent=self.agent,
        n_itr=n_itr,
        batch_spec=self.sampler.batch_spec,
        mid_batch_reset=self.sampler.mid_batch_reset,
        examples=examples,
        world_size=world_size,
        rank=rank,
    )
    self.initialize_logging()
    return n_itr
def log_diagnostics(self, itr, eval_traj_infos, eval_time,
                    prefix='Diagnostics/'):
    """Record evaluation tabulars and accumulate cumulative eval time,
    then defer to the base class diagnostics."""
    if not eval_traj_infos:
        logger.log("WARNING: had no complete trajectories in eval.")
    eval_steps = sum(info["Length"] for info in eval_traj_infos)
    self._cum_eval_time += eval_time
    with logger.tabular_prefix(prefix):
        logger.record_tabular('StepsInEval', eval_steps)
        logger.record_tabular('TrajsInEval', len(eval_traj_infos))
        logger.record_tabular('CumEvalTime', self._cum_eval_time)
    super().log_diagnostics(itr, eval_traj_infos, eval_time, prefix=prefix)
def initialize(self, affinity):
    """Initialization inside the main sampler process.  Sets process
    hardware affinities, creates specified number of environment instances
    and instantiates the collector with them.  If applicable, does the same
    for evaluation environment instances.  Moves the agent to device (could
    be GPU), and calls on ``agent.async_cpu()`` initialization.  Starts up
    collector.
    """
    p = psutil.Process()
    if affinity.get("set_affinity", True):
        p.cpu_affinity(affinity["master_cpus"])
    # torch.set_num_threads(affinity["master_torch_threads"])
    torch.set_num_threads(1)  # Needed to prevent MKL hang :( .
    B = self.batch_spec.B
    envs = [self.EnvCls(**self.env_kwargs) for _ in range(B)]
    sync = AttrDict(
        db_idx=AttrDict(value=0))  # Mimic the mp.RawValue format.
    collector = self.CollectorCls(
        rank=0,
        envs=envs,
        samples_np=self.double_buffer,
        batch_T=self.batch_spec.T,
        TrajInfoCls=self.TrajInfoCls,
        agent=self.agent,
        sync=sync,
    )
    if self.eval_n_envs > 0:
        eval_envs = [
            self.EnvCls(**self.eval_env_kwargs)
            for _ in range(self.eval_n_envs)
        ]
        eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
        self.eval_collector = eval_CollectorCls(
            envs=eval_envs,
            agent=self.agent,
            TrajInfoCls=self.TrajInfoCls,
            max_T=self.eval_max_steps // self.eval_n_envs,
            max_trajectories=self.eval_max_trajectories,
        )
    self.agent.to_device(cuda_idx=affinity.get("cuda_idx", None))
    self.agent.async_cpu(share_memory=False)
    # Decorrelate envs before training starts; returns initial agent inputs.
    agent_inputs, traj_infos = collector.start_envs(
        self.max_decorrelation_steps)
    collector.start_agent()
    self.collector = collector
    self.agent_inputs = agent_inputs
    self.traj_infos = traj_infos
    self.sync = sync
    logger.log("Serial sampler initialized.")
def get_n_itr(self):
    """Determine number of train loop iterations to run, converting the
    logging interval from environment steps to iterations (logging at
    least as often as requested)."""
    # Log at least as often as requested (round down itrs):
    self.log_interval_itrs = max(
        self.log_interval_steps // self.itr_batch_size, 1)
    # FIXME: To run at least as many steps as requested, round up log interval?
    self.n_itr = (math.ceil(self.n_steps / self.log_interval_steps)
                  * self.log_interval_itrs)
    logger.log(f"Running {self.n_itr} iterations of minibatch RL.")
    return self.n_itr
def log_diagnostics(self, itr, sampler_itr, throttle_time,
                    prefix='Diagnostics/'):
    """Write async-runner diagnostics to csv via the logger.

    Tracks optimizer iterations and sampler iterations separately,
    computing per-interval throughput from deltas since the last call,
    then restarts the progress bar for the next interval.
    """
    self.pbar.stop()
    self.save_itr_snapshot(itr, sampler_itr)
    new_time = time.time()
    time_elapsed = new_time - self._last_time
    new_updates = self.algo.update_counter - self._last_update_counter
    new_samples = self.sampler.batch_size * (sampler_itr -
                                             self._last_sampler_itr)
    # Rates undefined on the first iteration (no elapsed interval yet).
    updates_per_second = (float('nan') if itr == 0 else
                          new_updates / time_elapsed)
    samples_per_second = (float('nan') if itr == 0 else
                          new_samples / time_elapsed)
    if self._eval:
        # Separate eval wall time (accumulated by the sampler process)
        # from sampling/training time.
        new_eval_time = self.ctrl.eval_time.value
        eval_time_elapsed = new_eval_time - self._last_eval_time
        non_eval_time_elapsed = time_elapsed - eval_time_elapsed
        non_eval_samples_per_second = (float('nan') if itr == 0 else
                                       new_samples / non_eval_time_elapsed)
        self._last_eval_time = new_eval_time
    cum_steps = sampler_itr * self.sampler.batch_size  # No * world_size.
    # max(1, ...) avoids division by zero before any samples/steps exist.
    replay_ratio = (new_updates * self.algo.batch_size * self.world_size /
                    max(1, new_samples))
    cum_replay_ratio = (self.algo.update_counter * self.algo.batch_size *
                        self.world_size / max(1, cum_steps))
    with logger.tabular_prefix(prefix):
        logger.record_tabular('Iteration', itr)
        logger.record_tabular('SamplerIteration', sampler_itr)
        logger.record_tabular('CumTime (s)', new_time - self._start_time)
        logger.record_tabular('CumSteps', cum_steps)
        logger.record_tabular('CumUpdates', self.algo.update_counter)
        logger.record_tabular('ReplayRatio', replay_ratio)
        logger.record_tabular('CumReplayRatio', cum_replay_ratio)
        logger.record_tabular('StepsPerSecond', samples_per_second)
        if self._eval:
            logger.record_tabular('NonEvalSamplesPerSecond',
                                  non_eval_samples_per_second)
        logger.record_tabular('UpdatesPerSecond', updates_per_second)
        # Fraction of the interval actually spent optimizing (not throttled).
        logger.record_tabular('OptThrottle', (time_elapsed - throttle_time) /
                              time_elapsed)
    self._log_infos()
    # Roll interval bookkeeping forward for the next call.
    self._last_time = new_time
    self._last_itr = itr
    self._last_sampler_itr = sampler_itr
    self._last_update_counter = self.algo.update_counter
    logger.dump_tabular(with_prefix=False)
    logger.log(f"Optimizing over {self.log_interval_itrs} sampler "
               "iterations.")
    self.pbar = ProgBarCounter(self.log_interval_itrs)
def memory_copier(sample_buffer, samples_to_buffer, replay_buffer, ctrl):
    """Target for the memory-copier process: waits for each full sample
    buffer and appends it to the shared replay buffer, until quit is
    signaled.
    """
    # Needed on some systems to avoid mysterious hang.
    # (Experienced hang on Ubuntu Server 16.04 machines (but not Desktop) when
    # appending samples to make replay buffer full, but only for batch_B > 84
    # (dqn + r2d1 atari), regardless of replay size or batch_T. Would seem to
    # progress through all code in replay.append_samples() but simply would
    # not return from it. Some tipping point for MKL threading?)
    torch.set_num_threads(1)
    while True:
        ctrl.sample_ready.acquire()  # Sampler signals a full batch.
        # assert not ctrl.sample_ready.acquire(block=False)  # Debug check.
        if ctrl.quit.value:
            break
        replay_buffer.append_samples(samples_to_buffer(sample_buffer))
        ctrl.sample_copied.release()  # Sampler may now overwrite the buffer.
    logger.log("Memory copier shutting down.")
def run_async_sampler(sampler, affinity, ctrl, traj_infos_queue, n_itr):
    """Target for the asynchronous sampler process (no offline evaluation).

    Alternates writing into the two sample buffers (double buffering) and
    forwards online trajectory infos to the master via ``traj_infos_queue``.
    """
    sampler.initialize(affinity)
    db_idx = 0
    for itr in range(n_itr):
        # Wait until the memory copier has drained this buffer.
        ctrl.sample_copied[db_idx].acquire()
        traj_infos = sampler.obtain_samples(itr, db_idx)
        ctrl.sample_ready[db_idx].release()  # Signal copier: buffer full.
        # Hold the itr lock so the master observes a consistent
        # (queued infos, sampler_itr) pair.
        with ctrl.sampler_itr.get_lock():
            for traj_info in traj_infos:
                traj_infos_queue.put(traj_info)
            ctrl.sampler_itr.value = itr
        db_idx ^= 1  # Double buffer.
    logger.log(f"Async sampler reached final itr: {itr + 1}, quitting.")
    ctrl.quit.value = True  # This ends the experiment.
    sampler.shutdown()
    for s in ctrl.sample_ready:
        s.release()  # Let memcpy workers finish and quit.
def initialize_replay_buffer(self, examples, batch_spec, async_=False):
    """Allocate a sequence replay buffer (as for DQN, but returning time
    sequences, and optionally storing the agent's recurrent state for
    warmup).  Returns the buffer (also stored as ``self.replay_buffer``)."""
    buffer_example = SamplesToBuffer(
        observation=examples["observation"],
        action=examples["action"],
        reward=examples["reward"],
        done=examples["done"],
    )
    if self.store_rnn_state_interval > 0:
        # Extend the example with the recurrent state when it is stored.
        buffer_example = SamplesToBufferRnn(
            *buffer_example,
            prev_rnn_state=examples["agent_info"].prev_rnn_state,
        )
    replay_kwargs = dict(
        example=buffer_example,
        size=self.replay_size,
        B=batch_spec.B,
        discount=self.discount,
        n_step_return=self.n_step_return,
        rnn_state_interval=self.store_rnn_state_interval,
        # batch_T fixed for prioritized, (relax if rnn_state_interval=1 or 0).
        batch_T=self.batch_T + self.warmup_T,
    )
    if self.prioritized_replay:
        replay_kwargs["alpha"] = self.pri_alpha
        replay_kwargs["beta"] = self.pri_beta_init
        replay_kwargs["default_priority"] = self.default_priority
        replay_kwargs["input_priorities"] = self.input_priorities  # True/False.
        replay_kwargs["input_priority_shift"] = self.input_priority_shift
        ReplayCls = (AsyncPrioritizedSequenceReplayFrameBuffer if async_
                     else PrioritizedSequenceReplayFrameBuffer)
    else:
        ReplayCls = (AsyncUniformSequenceReplayFrameBuffer if async_
                     else UniformSequenceReplayFrameBuffer)
    if self.ReplayBufferCls is not None:
        ReplayCls = self.ReplayBufferCls
        logger.log(
            f"WARNING: ignoring internal selection logic and using"
            f" input replay buffer class: {ReplayCls} -- compatibility not"
            " guaranteed.")
    self.replay_buffer = ReplayCls(**replay_kwargs)
    return self.replay_buffer