Example #1
    def _startup(self):
        """ Setup the system and connect all components
        """
        # system setup
        logger.log(f"Runner{getattr(self, 'rank', '')} master Torch threads: "
                   f"{torch.get_num_threads()}.")
        # view cuda configuration for this environment
        logger.log(f"Runner{getattr(self, 'rank', '')} CUDA_VISIBLE_DEVICES: "
                   f"{os.environ.get('CUDA_VISIBLE_DEVICES', '')}.")
        set_gpu_from_visibles(self.affinity.get("cuda_idx", 0))

        # components setup from examples
        traj_example, info_example, env_space = self.sampler.make_trajectory_example(
        )

        self.agent.initialize(*env_space)
        self.algo.initialize(self.agent)
        self.sampler.initialize(self.agent)

        # post components setup
        if torch.cuda.is_available():
            device = torch.device("cuda")
            self.agent.to(device=device)
        self.agent.sample_mode()

        # logging memory setup
        self._env_infos = {k: list() for k in info_example._fields}
        self._train_infos = {k: list() for k in self.algo.train_info_fields}
        self.epoch_i = 0
Example #2
 def initialize(self,
                agent,
                n_itr,
                batch_spec,
                mid_batch_reset,
                examples,
                world_size=1,
                rank=0):
     """Stores input arguments and initializes replay buffer and optimizer.
     Use in non-async runners.  Computes number of gradient updates per
     optimization iteration as ``replay_ratio * sampler_batch_size /
     training_batch_size``."""
     self.agent = agent
     self.n_itr = n_itr
     self.mid_batch_reset = mid_batch_reset
     self.sampler_bs = sampler_bs = batch_spec.size
     self.updates_per_optimize = max(
         1, round(self.replay_ratio * sampler_bs / self.batch_size))
     logger.log(
         f"From sampler batch size {sampler_bs}, training "
         f"batch size {self.batch_size}, and replay ratio "
         f"{self.replay_ratio}, computed {self.updates_per_optimize} "
         f"updates per iteration.")
     self.min_itr_learn = int(self.min_steps_learn // sampler_bs)
     # Give the agent min_itr_learn here?
     self.initialize_replay_buffer(examples, batch_spec)
     self.optim_initialize(rank)
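
A worked sketch of the update-count arithmetic described in the docstring above, using made-up numbers (the replay ratio, sampler batch size, and minimum learning steps are assumptions for illustration only):

# Hypothetical values, for illustration only.
replay_ratio = 8          # desired ratio of replayed to newly sampled data
sampler_bs = 64 * 32      # sampler batch size: batch_T * batch_B = 2048
batch_size = 256          # training minibatch size
min_steps_learn = 50_000  # env steps before learning starts

# Same formulas as in `initialize` above, floored to at least 1 update.
updates_per_optimize = max(1, round(replay_ratio * sampler_bs / batch_size))
min_itr_learn = int(min_steps_learn // sampler_bs)

print(updates_per_optimize)  # -> 64
print(min_itr_learn)         # -> 24
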
Example #3
    def _get_n_envs_lists(self, affinity):
        for aff in affinity:
            assert aff.get("alternating", False), "Need alternating affinity."
        B = self.batch_spec.B
        n_server = len(affinity)
        n_workers = [len(aff["workers_cpus"]) for aff in affinity]
        if B < n_server:
            raise ValueError(f"Request fewer envs ({B}) than action servers "
                             f"({n_server}).")
        server_Bs = [B // n_server] * n_server
        if n_workers.count(n_workers[0]) != len(n_workers):
            logger.log(
                "WARNING: affinity requested different number of "
                "environment workers per action server, but environments "
                "will be assigned equally across action servers anyway.")
        if B % n_server > 0:
            assert (B %
                    n_server) % 2 == 0, "Need even num extra envs per server."
            for s in range((B % n_server) // 2):
                server_Bs[s] += 2  # Spread across action servers in pairs.

        n_envs_lists = list()
        for s_worker, s_B in zip(n_workers, server_Bs):
            n_envs_lists.append(self._get_n_envs_list(n_worker=s_worker,
                                                      B=s_B))

        return n_envs_lists
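
A standalone sketch of the same pairing logic, showing how a batch of environments might be spread across action servers with the extras added two at a time (the numbers are illustrative):

def split_envs_across_servers(B, n_server):
    """Spread B environments over n_server servers, extras added in pairs."""
    if B < n_server:
        raise ValueError(f"Requested fewer envs ({B}) than servers ({n_server}).")
    server_Bs = [B // n_server] * n_server
    extra = B % n_server
    assert extra % 2 == 0, "Need an even number of extra envs per server."
    for s in range(extra // 2):
        server_Bs[s] += 2  # Add extras two at a time (one per alternating half).
    return server_Bs

print(split_envs_across_servers(22, 4))  # -> [7, 5, 5, 5]
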
Example #4
def initialize_worker(rank, seed=None, cpu=None, torch_threads=None):
    """Assign CPU affinity, set random seed, set torch_threads if needed to
    prevent MKL deadlock.
    """
    log_str = f"Sampler rank {rank} initialized"
    cpu = [cpu] if isinstance(cpu, int) else cpu
    p = psutil.Process()
    try:
        if cpu is not None:
            p.cpu_affinity(cpu)
        cpu_affin = p.cpu_affinity()
    except AttributeError:
        cpu_affin = "UNAVAILABLE MacOS"
    log_str += f", CPU affinity {cpu_affin}"
    torch_threads = (
        1 if torch_threads is None and cpu is not None else torch_threads
    )  # Default to 1 to avoid possible MKL hang.
    if torch_threads is not None:
        torch.set_num_threads(torch_threads)
    log_str += f", Torch threads {torch.get_num_threads()}"
    if seed is not None:
        set_seed(seed)
        time.sleep(0.3)  # (so the printing from set_seed is not intermixed)
        log_str += f", Seed {seed}"
    logger.log(log_str)
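
A self-contained sketch of how such worker initialization is typically used from forked processes; the worker target and the CPU layout here are illustrative, not part of the snippet above:

import multiprocessing as mp
import os

import psutil
import torch


def worker_target(rank, cpus):
    # Pin this worker to its CPUs and keep torch single-threaded,
    # mirroring initialize_worker() above.
    p = psutil.Process()
    try:
        p.cpu_affinity(cpus)
    except AttributeError:  # cpu_affinity is unavailable on macOS.
        pass
    torch.set_num_threads(1)
    print(f"Worker {rank}, pid {os.getpid()}, CPU affinity {cpus}, "
          f"Torch threads {torch.get_num_threads()}.")


if __name__ == "__main__":
    procs = [mp.Process(target=worker_target, args=(rank, [rank]))
             for rank in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
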
Example #5
 def initialize(self,
                agent,
                n_itr,
                batch_spec,
                mid_batch_reset,
                examples,
                world_size=1,
                rank=0):
     """Used in basic or synchronous multi-GPU runners, not async.
     Parameters
     ----------
         agent: SacAgent
     """
     self.agent = agent
     self.n_itr = n_itr
     self.mid_batch_reset = mid_batch_reset
     self.sampler_bs = sampler_bs = batch_spec.size
     self.updates_per_optimize = int(self.replay_ratio * sampler_bs /
                                     self.batch_size)
     logger.log(
         f"From sampler batch size {sampler_bs}, training "
         f"batch size {self.batch_size}, and replay ratio "
         f"{self.replay_ratio}, computed {self.updates_per_optimize} "
         f"updates per iteration.")
     self.min_itr_learn = self.min_steps_learn // sampler_bs
     agent.give_min_itr_learn(self.min_itr_learn)
     self.initialize_replay_buffer(examples, batch_spec)
     self.optim_initialize(rank)
Example #6
def run_async_sampler_eval(sampler, affinity, ctrl, traj_infos_queue,
        n_itr, eval_itrs):
    sampler.initialize(affinity)
    db_idx = 0
    for itr in range(n_itr + 1):  # +1 to get last eval :)
        ctrl.sample_copied[db_idx].acquire()
        # assert not ctrl.sample_copied[db_idx].acquire(block=False)  # Debug check.
        sampler.obtain_samples(itr, db_idx)
        ctrl.sample_ready[db_idx].release()
        if itr % eval_itrs == 0:
            eval_time = -time.time()
            traj_infos = sampler.evaluate_agent(itr)
            eval_time += time.time()
            ctrl.eval_time.value += eval_time  # Not atomic but only writer.
            with ctrl.sampler_itr.get_lock():
                for traj_info in traj_infos:
                    traj_infos_queue.put(traj_info)
                traj_infos_queue.put(None)  # Master will get until None sentinel.
                ctrl.sampler_itr.value = itr
        else:
            ctrl.sampler_itr.value = itr
        db_idx ^= 1  # Double buffer
    logger.log(f"Async sampler reached final itr: {itr + 1}, quitting.")
    ctrl.quit.value = True  # This ends the experiment.
    sampler.shutdown()
    for s in ctrl.sample_ready:
        s.release()  # Let memcpy workers finish and quit.
Example #7
def build_and_train(affinity_code, log_dir, run_ID, **kwargs):
    # I prefer to put all tunable default configs into the launch file.

    # Acquire the affinity assigned by the launcher.
    # NOTE: If the affinity is a list, multiple resources (GPUs) are
    # assigned to the current experiment.
    affinity = affinity_from_code(affinity_code)

    # Now `config` is a dictionary with the same structure as your
    # default configuration.
    config = load_variant(log_dir)

    name = "demo_experiment"
    # This tells you which GPU is recommended for this experiment.
    gpu_idx = affinity["cuda_idx"]

    # Run your experiment under a logger context.
    with logger_context(log_dir, run_ID, name, config):
        logger.log("Start running experiment")
        for epoch_i in range(10):
            # Log a scalar with this function, for example.
            logger.record_tabular("metric1", epoch_i, epoch_i)
            # Dump all logged values to the csv file (this is the function
            # that writes one line into progress.csv).
            logger.dump_tabular()
Example #8
 def initialize_replay_buffer(self, examples, batch_spec, async_=False):
     example_to_buffer = SamplesToBuffer(
         observation=examples["observation"],
         action=examples["action"],
         reward=examples["reward"],
         done=examples["done"],
         timeout=getattr(examples["env_info"], "timeout", None),
     )
     replay_kwargs = dict(
         example=example_to_buffer,
         size=self.replay_size,
         B=batch_spec.B,
         n_step_return=self.n_step_return,
     )
     if not self.bootstrap_timelimit:
         ReplayCls = AsyncUniformReplayBuffer if async_ else UniformReplayBuffer
     else:
         ReplayCls = AsyncTlUniformReplayBuffer if async_ else TlUniformReplayBuffer
     if self.ReplayBufferCls is not None:
         ReplayCls = self.ReplayBufferCls
         logger.log(
             f"WARNING: ignoring internal selection logic and using"
             f" input replay buffer class: {ReplayCls} -- compatibility not"
             " guaranteed.")
     self.replay_buffer = ReplayCls(**replay_kwargs)
Example #9
 def evaluate_agent(self, itr):
     """Signal worker processes to perform agent evaluation.  If a max
     number of evaluation trajectories was specified, keep watch over the
     number of trajectories finished and signal an early end if the limit
     is reached.  Return a list of trajectory-info objects from the
     completed episodes.
     """
     self.ctrl.itr.value = itr
     self.ctrl.do_eval.value = True
     self.sync.stop_eval.value = False
     self.ctrl.barrier_in.wait()
     traj_infos = list()
     if self.eval_max_trajectories is not None:
         while True:
             time.sleep(EVAL_TRAJ_CHECK)
             traj_infos.extend(
                 drain_queue(self.eval_traj_infos_queue,
                             guard_sentinel=True))
             if len(traj_infos) >= self.eval_max_trajectories:
                 self.sync.stop_eval.value = True
                 logger.log("Evaluation reached max num trajectories "
                            f"({self.eval_max_trajectories}).")
                 break  # Stop possibly before workers reach max_T.
             if self.ctrl.barrier_out.parties - self.ctrl.barrier_out.n_waiting == 1:
                 logger.log("Evaluation reached max num time steps "
                            f"({self.eval_max_T}).")
                 break  # Workers reached max_T.
     self.ctrl.barrier_out.wait()
     traj_infos.extend(
         drain_queue(self.eval_traj_infos_queue, n_sentinel=self.n_worker))
     self.ctrl.do_eval.value = False
     return traj_infos
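
``drain_queue`` itself is not shown in these snippets; a minimal single-process sketch of draining a queue of trajectory infos up to a ``None`` sentinel (the same pattern the master uses with ``traj_infos_queue``), built only on the standard library:

import queue


def drain_until_sentinel(q):
    """Collect items until a None sentinel (or the queue runs dry)."""
    items = []
    while True:
        try:
            item = q.get(block=False)
        except queue.Empty:
            break
        if item is None:  # Sentinel: producer finished this round.
            break
        items.append(item)
    return items


q = queue.Queue()
for traj_info in ({"Length": 120}, {"Length": 87}):
    q.put(traj_info)
q.put(None)  # Sentinel.
print(drain_until_sentinel(q))  # -> [{'Length': 120}, {'Length': 87}]
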
Example #10
 def initialize_replay_buffer(self, examples, batch_spec, async_=False):
     """
     Allocates replay buffer using examples and with the fields in `SamplesToBuffer`
     namedarraytuple.  Uses frame-wise buffers, so that only unique frames are stored,
     using less memory (usually observations are the 4 most recent frames, with only the
     newest frame distinct from the previous observation).
     """
     example_to_buffer = self.examples_to_buffer(examples)
     replay_kwargs = dict(
         example=example_to_buffer,
         size=self.replay_size,
         B=batch_spec.B,
         discount=self.discount,
         n_step_return=self.n_step_return,
     )
     if self.prioritized_replay:
         replay_kwargs.update(
             dict(
                 alpha=self.pri_alpha,
                 beta=self.pri_beta_init,
                 default_priority=self.default_priority,
             ))
         ReplayCls = (AsyncPrioritizedReplayFrameBuffer
                      if async_ else PrioritizedReplayFrameBuffer)
     else:
         ReplayCls = (AsyncUniformReplayFrameBuffer
                      if async_ else UniformReplayFrameBuffer)
     if self.ReplayBufferCls is not None:
         ReplayCls = self.ReplayBufferCls
         logger.log(
             f"WARNING: ignoring internal selection logic and using"
             f" input replay buffer class: {ReplayCls} -- compatibility not"
             " guaranteed.")
     self.replay_buffer = ReplayCls(**replay_kwargs)
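
The class selection above follows a 2x2 pattern (prioritized x async), with an optional user override. A library-agnostic sketch of the same pattern as a dictionary lookup, using placeholder class names:

# Placeholder classes standing in for the four replay-buffer variants.
class UniformBuf: ...
class AsyncUniformBuf: ...
class PrioritizedBuf: ...
class AsyncPrioritizedBuf: ...

REPLAY_CLS = {
    # (prioritized, async_) -> replay buffer class
    (False, False): UniformBuf,
    (False, True): AsyncUniformBuf,
    (True, False): PrioritizedBuf,
    (True, True): AsyncPrioritizedBuf,
}

def select_replay_cls(prioritized, async_, override_cls=None):
    # An explicit override wins, as in the WARNING branch above.
    if override_cls is not None:
        return override_cls
    return REPLAY_CLS[(prioritized, async_)]

print(select_replay_cls(True, False).__name__)  # -> PrioritizedBuf
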
Example #11
 def sample_mode(self, itr):
     super().sample_mode(itr)
     self.q2_model.eval()
     std = self.action_std if itr >= self.min_itr_learn else self.pretrain_std
     if itr == 0 or itr == self.min_itr_learn:
         logger.log(f"Agent at itr {itr}, sample std: {std}.")
     self.distribution.set_std(std)
Example #12
    def initialize(self, affinity):
        """Initialization inside the main sampler process.  Builds one level
        of parallel synchronization objects, and forks action-server processes,
        one per GPU to be used.
        """
        torch.set_num_threads(1)  # Needed to avoid MKL hang :( .
        self.world_size = n_server = len(affinity)
        n_envs_lists = self._get_n_envs_lists(affinity)
        n_server = len(n_envs_lists)
        n_worker = sum([len(n_envs_list) for n_envs_list in n_envs_lists])
        self.n_worker = n_worker

        if self.eval_n_envs > 0:
            self.eval_n_envs_per = max(1, self.eval_n_envs // n_worker)
            self.eval_n_envs = eval_n_envs = self.eval_n_envs_per * n_worker
            logger.log(f"Total parallel evaluation envs: {eval_n_envs}.")
            self.eval_max_T = eval_max_T = int(self.eval_max_steps // eval_n_envs)

        self._build_parallel_ctrl(n_server, n_worker)

        servers_kwargs = self._assemble_servers_kwargs(affinity, self.seed,
            n_envs_lists)
        servers = [mp.Process(target=self.action_server_process,
            kwargs=s_kwargs)
            for s_kwargs in servers_kwargs]
        for s in servers:
            s.start()
        self.servers = servers
        self.ctrl.barrier_out.wait()  # Wait for workers to decorrelate envs.
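
A worked example of the evaluation sizing arithmetic above, with assumed numbers (the worker count, requested eval envs, and step budget are illustrative):

# Assumed values, for illustration only.
n_worker = 12
eval_n_envs = 50          # requested evaluation environments
eval_max_steps = 125_000  # total env steps allowed per evaluation

eval_n_envs_per = max(1, eval_n_envs // n_worker)  # 4 envs per worker
eval_n_envs = eval_n_envs_per * n_worker           # rounded down to 48 total
eval_max_T = int(eval_max_steps // eval_n_envs)    # 2604 time steps per eval

print(eval_n_envs_per, eval_n_envs, eval_max_T)  # -> 4 48 2604
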
Example #13
 def initialize_logging(self):
     self._traj_infos = deque(maxlen=self.log_traj_window)
     self._cum_completed_trajs = 0
     self._new_completed_trajs = 0
     super().initialize_logging()
     logger.log(f"Optimizing over {self.log_interval_itrs} sampler "
         "iterations.")
     self.pbar = ProgBarCounter(self.log_interval_itrs)
Example #14
 def get_n_itr(self):
     log_interval_itrs = max(self.log_interval_steps //
         self.sampler_batch_size, 1)
     n_itr = math.ceil(self.n_steps / self.log_interval_steps) * log_interval_itrs
     self.log_interval_itrs = log_interval_itrs
     self.n_itr = n_itr
     logger.log(f"Running {n_itr} sampler iterations.")
     return n_itr
Example #15
 def save_itr_snapshot(self, itr):
     """
     Calls the logger to save training checkpoint/snapshot (logger itself
     may or may not save, depending on mode selected).
     """
     logger.log("saving snapshot...")
     params = self.get_itr_snapshot(itr)
     logger.save_itr_params(itr, params)
     logger.log("saved")
Example #16
 def _save_epoch_snapshot(self, epoch_i):
     """
     Calls the logger to save training checkpoint/snapshot (logger itself
     may or may not save, depending on mode selected).
     """
     logger.log("saving snapshot...")
     params = self._get_epoch_snapshot(epoch_i)
     logger.save_itr_params(epoch_i, params)
     logger.log("saved")
Example #17
 def _load_snapshot(self, filename):
     """ A method to load parameters from snapshot and keep on training
     NOTE: filename has to be absolute path. And this has to be done after
     _startup
     """
     state_dict = torch.load(filename)
     logger.log("Loading snapshot from {}".format(filename))
     self.epoch_i = state_dict["epoch_i"]
     self.agent.load_state_dict(state_dict["agent_state_dict"])
     self.algo.load_state_dict(state_dict["algo_state_dict"])
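
For context, a hedged sketch of a matching save side, assuming the snapshot packs exactly the three keys that ``_load_snapshot`` reads; the helper name and signature here are hypothetical:

import torch


def save_epoch_snapshot(filename, epoch_i, agent, algo):
    # Hypothetical helper: persist the same three keys _load_snapshot expects.
    state_dict = dict(
        epoch_i=epoch_i,
        agent_state_dict=agent.state_dict(),
        algo_state_dict=algo.state_dict(),
    )
    torch.save(state_dict, filename)  # filename should be an absolute path.
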
Example #18
 def log_diagnostics(self, itr, sampler_itr, throttle_time, prefix='Diagnostics/'):
     if not self._traj_infos:
         logger.log("WARNING: had no complete trajectories in eval.")
     steps_in_eval = sum([info["Length"] for info in self._traj_infos])
     with logger.tabular_prefix(prefix):
         logger.record_tabular('StepsInEval', steps_in_eval)
         logger.record_tabular('TrajsInEval', len(self._traj_infos))
         logger.record_tabular('CumEvalTime', self.ctrl.eval_time.value)
     super().log_diagnostics(itr, sampler_itr, throttle_time, prefix=prefix)
     self._traj_infos = list()  # Clear after each eval.
Example #19
    def serve_actions_evaluation(self, itr):
        obs_ready, act_ready = self.sync.obs_ready, self.sync.act_ready
        obs_ready_pair = self.obs_ready_pair
        act_ready_pair = self.act_ready_pair
        step_np_pair = self.eval_step_buffer_np_pair
        agent_inputs_pair = self.eval_agent_inputs_pair
        traj_infos = list()
        self.agent.reset()
        stop = False

        for t in range(self.eval_max_T):
            if t % EVAL_TRAJ_CHECK == 0:  # (While workers stepping.)
                traj_infos.extend(drain_queue(self.eval_traj_infos_queue,
                    guard_sentinel=True))
            for alt in range(2):
                step_h = step_np_pair[alt]
                for b in obs_ready_pair[alt]:
                    b.acquire()
                    # assert not b.acquire(block=False)  # Debug check.
                for b_reset in np.where(step_h.done)[0]:
                    step_h.action[b_reset] = 0  # Null prev_action.
                    step_h.reward[b_reset] = 0  # Null prev_reward.
                    self.agent.reset_one(idx=b_reset)
                action, agent_info = self.agent.step(*agent_inputs_pair[alt])
                step_h.action[:] = action
                step_h.agent_info[:] = agent_info
                if (self.eval_max_trajectories is not None and
                        t % EVAL_TRAJ_CHECK == 0 and alt == 0):
                    if len(traj_infos) >= self.eval_max_trajectories:
                        for b in obs_ready_pair[1 - alt]:
                            b.acquire()  # Now all workers waiting.
                        self.sync.stop_eval.value = stop = True
                        for w in act_ready:
                            w.release()  # Let all workers proceed and see stop.
                        break
                for w in act_ready_pair[alt]:
                    # assert not w.acquire(block=False)  # Debug check.
                    w.release()
            if stop:
                logger.log("Evaluation reached max num trajectories "
                    f"({self.eval_max_trajectories}).")
                break

        # TODO: check exit logic of the for/while loops?
        if not stop:
            logger.log("Evaluation reached max num time steps "
                f"({self.eval_max_T}).")

        for b in obs_ready:
            b.acquire()  # Workers always do extra release; drain it.
            assert not b.acquire(block=False)  # Debug check.
        for w in act_ready:
            assert not w.acquire(block=False)  # Debug check.

        return traj_infos
Example #20
 def sample_mode(self, itr):
     super().sample_mode(itr)
     self.q1_model.eval()
     self.q2_model.eval()
     self.v_model.eval()
     if itr == 0:
         logger.log(f"Agent at itr {itr}, sample std: {self.pretrain_std}")
     if itr == self.min_itr_learn:
         logger.log(f"Agent at itr {itr}, sample std: learned.")
     std = None if itr >= self.min_itr_learn else self.pretrain_std
     self.distribution.set_std(std)  # If None: std from policy dist_info.
Example #21
 def evaluate_agent(self, itr):
     """
     Record offline evaluation of agent performance, by ``sampler.evaluate_agent()``.
     """
     if itr > 0:
         self.pbar.stop()
     logger.log("Evaluating agent...")
     self.agent.eval_mode(itr)  # Might be agent in sampler.
     eval_time = -time.time()
     traj_infos = self.sampler.evaluate_agent(itr)
     eval_time += time.time()
     logger.log("Evaluation runs complete.")
     return traj_infos, eval_time
Example #22
    def log_diagnostics(self,
                        itr,
                        traj_infos=None,
                        eval_time=0,
                        prefix='Diagnostics/'):
        """
        Write diagnostics (including stored ones) to csv via the logger.
        """
        if itr > 0:
            self.pbar.stop()
        self.save_itr_snapshot(itr)
        new_time = time.time()
        self._cum_time = new_time - self._start_time
        train_time_elapsed = new_time - self._last_time - eval_time
        new_updates = self.algo.update_counter - self._last_update_counter
        new_samples = (self.sampler.batch_size * self.world_size *
                       self.log_interval_itrs)
        updates_per_second = (float('nan') if itr == 0 else new_updates /
                              train_time_elapsed)
        samples_per_second = (float('nan') if itr == 0 else new_samples /
                              train_time_elapsed)
        replay_ratio = (new_updates * self.algo.batch_size * self.world_size /
                        new_samples)
        cum_replay_ratio = (self.algo.batch_size * self.algo.update_counter /
                            ((itr + 1) * self.sampler.batch_size)
                            )  # world_size cancels.
        cum_steps = (itr + 1) * self.sampler.batch_size * self.world_size

        with logger.tabular_prefix(prefix):
            if self._eval:
                logger.record_tabular(
                    'CumTrainTime', self._cum_time -
                    self._cum_eval_time)  # Already added new eval_time.
            logger.record_tabular('Iteration', itr)
            logger.record_tabular('CumTime (s)', self._cum_time)
            logger.record_tabular('CumSteps', cum_steps)
            logger.record_tabular('CumCompletedTrajs',
                                  self._cum_completed_trajs)
            logger.record_tabular('CumUpdates', self.algo.update_counter)
            logger.record_tabular('StepsPerSecond', samples_per_second)
            logger.record_tabular('UpdatesPerSecond', updates_per_second)
            logger.record_tabular('ReplayRatio', replay_ratio)
            logger.record_tabular('CumReplayRatio', cum_replay_ratio)
        self._log_infos(traj_infos)
        logger.dump_tabular(with_prefix=False)

        self._last_time = new_time
        self._last_update_counter = self.algo.update_counter
        if itr < self.n_itr - 1:
            logger.log(f"Optimizing over {self.log_interval_itrs} iterations.")
            self.pbar = ProgBarCounter(self.log_interval_itrs)
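
A worked example of the throughput and replay-ratio arithmetic above, with assumed numbers (all constants here are illustrative, not library defaults):

# Assumed values, for illustration only.
sampler_batch_size = 2048   # env steps gathered per sampler iteration
algo_batch_size = 256       # training minibatch size
world_size = 1
log_interval_itrs = 50
new_updates = 3200          # optimizer steps since the last log
train_time_elapsed = 40.0   # seconds of training since the last log

new_samples = sampler_batch_size * world_size * log_interval_itrs       # 102400
samples_per_second = new_samples / train_time_elapsed                   # 2560.0
updates_per_second = new_updates / train_time_elapsed                   # 80.0
replay_ratio = new_updates * algo_batch_size * world_size / new_samples # 8.0

print(samples_per_second, updates_per_second, replay_ratio)  # -> 2560.0 80.0 8.0
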
Example #23
 def startup(self):
     """
     Sets hardware affinities, initializes the following: 1) sampler (which
     should initialize the agent), 2) agent device and data-parallel wrapper (if applicable),
     3) algorithm, 4) logger.
     """
     p = psutil.Process()
     try:
         if (self.affinity.get("master_cpus", None) is not None
                 and self.affinity.get("set_affinity", True)):
             p.cpu_affinity(self.affinity["master_cpus"])
         cpu_affin = p.cpu_affinity()
     except AttributeError:
         cpu_affin = "UNAVAILABLE MacOS"
     logger.log(f"Runner {getattr(self, 'rank', '')} master CPU affinity: "
                f"{cpu_affin}.")
     if self.affinity.get("master_torch_threads", None) is not None:
         torch.set_num_threads(self.affinity["master_torch_threads"])
     logger.log(f"Runner {getattr(self, 'rank', '')} master Torch threads: "
                f"{torch.get_num_threads()}.")
     if self.seed is None:
         self.seed = make_seed()
     set_seed(self.seed)
     self.rank = rank = getattr(self, "rank", 0)
     self.world_size = world_size = getattr(self, "world_size", 1)
     examples = self.sampler.initialize(
         agent=self.agent,  # Agent gets initialized in sampler.
         affinity=self.affinity,
         seed=self.seed + 1,
         bootstrap_value=getattr(self.algo, "bootstrap_value", False),
         traj_info_kwargs=self.get_traj_info_kwargs(),
         rank=rank,
         world_size=world_size,
     )
     self.itr_batch_size = self.sampler.batch_spec.size * world_size
     n_itr = self.get_n_itr()
     self.agent.to_device(self.affinity.get("cuda_idx", None))
     if world_size > 1:
         self.agent.data_parallel()
     self.algo.initialize(
         agent=self.agent,
         n_itr=n_itr,
         batch_spec=self.sampler.batch_spec,
         mid_batch_reset=self.sampler.mid_batch_reset,
         examples=examples,
         world_size=world_size,
         rank=rank,
     )
     self.initialize_logging()
     return n_itr
Example #24
 def log_diagnostics(self,
                     itr,
                     eval_traj_infos,
                     eval_time,
                     prefix='Diagnostics/'):
     if not eval_traj_infos:
         logger.log("WARNING: had no complete trajectories in eval.")
     steps_in_eval = sum([info["Length"] for info in eval_traj_infos])
     with logger.tabular_prefix(prefix):
         logger.record_tabular('StepsInEval', steps_in_eval)
         logger.record_tabular('TrajsInEval', len(eval_traj_infos))
         self._cum_eval_time += eval_time
         logger.record_tabular('CumEvalTime', self._cum_eval_time)
     super().log_diagnostics(itr, eval_traj_infos, eval_time, prefix=prefix)
Example #25
    def initialize(self, affinity):
        """Initialization inside the main sampler process.  Sets process hardware
        affinities, creates specified number of environment instances and instantiates
        the collector with them.  If applicable, does the same for evaluation
        environment instances.  Moves the agent to device (possibly GPU) and
        calls ``agent.async_cpu()`` initialization.  Starts up the collector.
        """
        p = psutil.Process()
        if affinity.get("set_affinity", True):
            p.cpu_affinity(affinity["master_cpus"])
        # torch.set_num_threads(affinity["master_torch_threads"])
        torch.set_num_threads(1)  # Needed to prevent MKL hang :( .
        B = self.batch_spec.B
        envs = [self.EnvCls(**self.env_kwargs) for _ in range(B)]
        sync = AttrDict(
            db_idx=AttrDict(value=0))  # Mimic the mp.RawValue format.
        collector = self.CollectorCls(
            rank=0,
            envs=envs,
            samples_np=self.double_buffer,
            batch_T=self.batch_spec.T,
            TrajInfoCls=self.TrajInfoCls,
            agent=self.agent,
            sync=sync,
        )
        if self.eval_n_envs > 0:
            eval_envs = [
                self.EnvCls(**self.eval_env_kwargs)
                for _ in range(self.eval_n_envs)
            ]
            eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
            self.eval_collector = eval_CollectorCls(
                envs=eval_envs,
                agent=self.agent,
                TrajInfoCls=self.TrajInfoCls,
                max_T=self.eval_max_steps // self.eval_n_envs,
                max_trajectories=self.eval_max_trajectories,
            )
        self.agent.to_device(cuda_idx=affinity.get("cuda_idx", None))
        self.agent.async_cpu(share_memory=False)

        agent_inputs, traj_infos = collector.start_envs(
            self.max_decorrelation_steps)
        collector.start_agent()

        self.collector = collector
        self.agent_inputs = agent_inputs
        self.traj_infos = traj_infos
        self.sync = sync
        logger.log("Serial sampler initialized.")
Example #26
 def get_n_itr(self):
     """
     Determine number of train loop iterations to run.  Converts logging
     interval units from environment steps to iterations.
     """
     # Log at least as often as requested (round down itrs):
     log_interval_itrs = max(self.log_interval_steps // self.itr_batch_size,
                             1)
     # FIXME: To run at least as many steps as requested, round up log interval?
     n_itr = math.ceil(
         self.n_steps / self.log_interval_steps) * log_interval_itrs
     self.log_interval_itrs = log_interval_itrs
     self.n_itr = n_itr
     logger.log(f"Running {n_itr} iterations of minibatch RL.")
     return n_itr
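
A worked example of this unit conversion with assumed numbers:

import math

# Assumed values, for illustration only.
n_steps = 1_000_000        # total environment steps requested
log_interval_steps = 10_000
itr_batch_size = 2048      # env steps per sampler iteration (all ranks)

log_interval_itrs = max(log_interval_steps // itr_batch_size, 1)     # 4
n_itr = math.ceil(n_steps / log_interval_steps) * log_interval_itrs  # 400

print(log_interval_itrs, n_itr)  # -> 4 400
print(n_itr * itr_batch_size)    # -> 819200 env steps actually run

Note that with these numbers the run covers 819,200 env steps, fewer than the 1,000,000 requested, because the log interval is rounded down; this is the concern raised by the FIXME comment in the snippet above.
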
Example #27
    def log_diagnostics(self, itr, sampler_itr, throttle_time, prefix='Diagnostics/'):
        self.pbar.stop()
        self.save_itr_snapshot(itr, sampler_itr)
        new_time = time.time()
        time_elapsed = new_time - self._last_time
        new_updates = self.algo.update_counter - self._last_update_counter
        new_samples = self.sampler.batch_size * (sampler_itr - self._last_sampler_itr)
        updates_per_second = (float('nan') if itr == 0 else
            new_updates / time_elapsed)
        samples_per_second = (float('nan') if itr == 0 else
            new_samples / time_elapsed)
        if self._eval:
            new_eval_time = self.ctrl.eval_time.value
            eval_time_elapsed = new_eval_time - self._last_eval_time
            non_eval_time_elapsed = time_elapsed - eval_time_elapsed
            non_eval_samples_per_second = (float('nan') if itr == 0 else
                new_samples / non_eval_time_elapsed)
            self._last_eval_time = new_eval_time
        cum_steps = sampler_itr * self.sampler.batch_size  # No * world_size.
        replay_ratio = (new_updates * self.algo.batch_size * self.world_size /
            max(1, new_samples))
        cum_replay_ratio = (self.algo.update_counter * self.algo.batch_size *
            self.world_size / max(1, cum_steps))

        with logger.tabular_prefix(prefix):
            logger.record_tabular('Iteration', itr)
            logger.record_tabular('SamplerIteration', sampler_itr)
            logger.record_tabular('CumTime (s)', new_time - self._start_time)
            logger.record_tabular('CumSteps', cum_steps)
            logger.record_tabular('CumUpdates', self.algo.update_counter)
            logger.record_tabular('ReplayRatio', replay_ratio)
            logger.record_tabular('CumReplayRatio', cum_replay_ratio)
            logger.record_tabular('StepsPerSecond', samples_per_second)
            if self._eval:
                logger.record_tabular('NonEvalSamplesPerSecond', non_eval_samples_per_second)
            logger.record_tabular('UpdatesPerSecond', updates_per_second)
            logger.record_tabular('OptThrottle', (time_elapsed - throttle_time) /
                time_elapsed)

        self._log_infos()
        self._last_time = new_time
        self._last_itr = itr
        self._last_sampler_itr = sampler_itr
        self._last_update_counter = self.algo.update_counter
        logger.dump_tabular(with_prefix=False)
        logger.log(f"Optimizing over {self.log_interval_itrs} sampler "
            "iterations.")
        self.pbar = ProgBarCounter(self.log_interval_itrs)
Example #28
def memory_copier(sample_buffer, samples_to_buffer, replay_buffer, ctrl):
    # Needed on some systems to avoid mysterious hang.
    # (Experienced hang on Ubuntu Server 16.04 machines (but not Desktop) when
    # appending samples to make replay buffer full, but only for batch_B > 84
    # (dqn + r2d1 atari), regardless of replay size or batch_T.  Would seem to
    # progress through all code in replay.append_samples() but simply would
    # not return from it.  Some tipping point for MKL threading?)
    torch.set_num_threads(1)
    while True:
        ctrl.sample_ready.acquire()
        # assert not ctrl.sample_ready.acquire(block=False)  # Debug check.
        if ctrl.quit.value:
            break
        replay_buffer.append_samples(samples_to_buffer(sample_buffer))
        ctrl.sample_copied.release()
    logger.log("Memory copier shutting down.")
Example #29
def run_async_sampler(sampler, affinity, ctrl, traj_infos_queue, n_itr):
    sampler.initialize(affinity)
    db_idx = 0
    for itr in range(n_itr):
        ctrl.sample_copied[db_idx].acquire()
        traj_infos = sampler.obtain_samples(itr, db_idx)
        ctrl.sample_ready[db_idx].release()
        with ctrl.sampler_itr.get_lock():
            for traj_info in traj_infos:
                traj_infos_queue.put(traj_info)
            ctrl.sampler_itr.value = itr
        db_idx ^= 1  # Double buffer.
    logger.log(f"Async sampler reached final itr: {itr + 1}, quitting.")
    ctrl.quit.value = True  # This ends the experiment.
    sampler.shutdown()
    for s in ctrl.sample_ready:
        s.release()  # Let memcpy workers finish and quit.
Example #30
 def initialize_replay_buffer(self, examples, batch_spec, async_=False):
     """Similar to DQN but uses replay buffers which return sequences, and
     stores the agent's recurrent state."""
     example_to_buffer = SamplesToBuffer(
         observation=examples["observation"],
         action=examples["action"],
         reward=examples["reward"],
         done=examples["done"],
     )
     if self.store_rnn_state_interval > 0:
         example_to_buffer = SamplesToBufferRnn(
             *example_to_buffer,
             prev_rnn_state=examples["agent_info"].prev_rnn_state,
         )
     replay_kwargs = dict(
         example=example_to_buffer,
         size=self.replay_size,
         B=batch_spec.B,
         discount=self.discount,
         n_step_return=self.n_step_return,
         rnn_state_interval=self.store_rnn_state_interval,
         # batch_T fixed for prioritized, (relax if rnn_state_interval=1 or 0).
         batch_T=self.batch_T + self.warmup_T,
     )
     if self.prioritized_replay:
         replay_kwargs.update(
             dict(
                 alpha=self.pri_alpha,
                 beta=self.pri_beta_init,
                 default_priority=self.default_priority,
                 input_priorities=self.input_priorities,  # True/False.
                 input_priority_shift=self.input_priority_shift,
             ))
         ReplayCls = (AsyncPrioritizedSequenceReplayFrameBuffer
                      if async_ else PrioritizedSequenceReplayFrameBuffer)
     else:
         ReplayCls = (AsyncUniformSequenceReplayFrameBuffer
                      if async_ else UniformSequenceReplayFrameBuffer)
     if self.ReplayBufferCls is not None:
         ReplayCls = self.ReplayBufferCls
         logger.log(
             f"WARNING: ignoring internal selection logic and using"
             f" input replay buffer class: {ReplayCls} -- compatibility not"
             " guaranteed.")
     self.replay_buffer = ReplayCls(**replay_kwargs)
     return self.replay_buffer