def __init__(
    self,
    _agent: AgentAbstract,
    _env: EnvAbstract,
    _epoch_max: int,
    _epoch_train: int,
    _train_update_target: int,
    _train_save: int,
    _process_core: int = None,
    _save_path: str = './save',
    _use_cmd: bool = True,
):
    self.agent: AgentAbstract = _agent
    self.agent.training()
    self.env: EnvAbstract = _env

    # multiprocessing for sampling
    self.mp = mp.get_context('spawn')
    self.process_core = _process_core
    self.pool = self.mp.Pool(self.process_core)

    # training control
    self.epoch = 0
    self.train_times = 0
    self.epoch_max = _epoch_max
    self.epoch_train = _epoch_train
    self.train_update_target = _train_update_target
    self.train_save = _train_save

    self.total_reward_buf = []

    self.save_path = _save_path

    self.use_cmd = _use_cmd
    if self.use_cmd:
        self.shell = TrainShell(self)
def mp_search(self, graph, other_info, model_id, train_data, test_data):
    ctx = mp.get_context()
    q = ctx.Queue()
    p = ctx.Process(target=train, args=(q, graph, train_data, test_data,
                                        self.trainer_args, self.metric,
                                        self.loss, self.verbose, self.path))
    try:
        p.start()
        search_results = self._search_common(q)
        metric_value, loss, graph = q.get(block=True)
        if time.time() >= self._timeout:
            raise TimeoutError
        if self.verbose and search_results:
            for (generated_graph, generated_other_info, new_model_id) in search_results:
                verbose_print(generated_other_info, generated_graph, new_model_id)
        if metric_value is not None:
            self.add_model(metric_value, loss, graph, model_id)
            self.update(other_info, model_id, graph, metric_value)
    except (TimeoutError, queue.Empty) as e:
        raise TimeoutError from e
    finally:
        # terminate and join the subprocess to prevent any resource leak
        p.terminate()
        p.join()
def test_cuda_small_tensors(self):
    # Check multiple small tensors which will likely use the same
    # underlying cached allocation
    ctx = mp.get_context('spawn')
    tensors = []
    for i in range(5):
        device = i % 2
        tensors += [torch.arange(i * 5, (i + 1) * 5).cuda(device)]

    inq = ctx.Queue()
    outq = ctx.Queue()
    inq.put(tensors)
    p = ctx.Process(target=sum_tensors, args=(inq, outq))
    p.start()

    results = []
    for i in range(5):
        results.append(outq.get())
    p.join()

    for i, tensor in enumerate(tensors):
        v, device, tensor_size, storage_size = results[i]
        self.assertEqual(v, torch.arange(i * 5, (i + 1) * 5).sum())
        self.assertEqual(device, i % 2)
        self.assertEqual(tensor_size, 5)
        self.assertEqual(storage_size, 5)
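The worker function `sum_tensors` referenced above is not part of this listing. A minimal sketch of what such a worker might look like, assuming it reads the shared CUDA tensors from the input queue and reports one summary tuple per tensor (the exact body is an assumption, not the original test helper):

# Hypothetical sketch of the sum_tensors worker (assumed): it receives the
# list of shared CUDA tensors and puts (sum, device, numel, storage size)
# onto the output queue for each one.
import torch

def sum_tensors(inq, outq):
    with torch.cuda.device(1):
        tensors = inq.get()
        for tensor in tensors:
            outq.put((tensor.sum().item(), tensor.get_device(),
                      tensor.numel(), tensor.storage().size()))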
def __init__(self, device_ids=None, multiprocessing_method='spawn'):
    super().__init__()
    self.device_ids = tuple(device_ids)
    self.num_replicas = len(device_ids)
    self.rank = None
    self._mp = multiprocessing.get_context(multiprocessing_method)
    self._start_error_handler()
    self._start_multiprocessing()
def __init__(self, loader, prepro, sort_key, batchify,
             single_run=True, queue_size=8, fork=True):
    self._loader = loader
    self._prepro = prepro
    self._sort_key = sort_key
    self._batchify = batchify
    self._single_run = single_run
    if fork:
        ctx = mp.get_context('forkserver')
        self._queue = ctx.Queue(queue_size)
    else:
        # for easier debugging
        self._queue = None
    self._process = None
def test_event(self):
    ctx = mp.get_context('spawn')
    queue = ctx.Queue()
    ready = ctx.Event()
    done = ctx.Event()
    p = ctx.Process(target=cuda_multiply_two, args=(queue, ready, done))
    p.start()

    ready.wait()
    with torch.cuda.stream(torch.cuda.Stream()):
        tensor = torch.cuda.FloatTensor([1, 1, 1, 1])
        # Use a sleep kernel to test events. Without the event, the
        # multiply happens before the add.
        event = torch.cuda.Event(interprocess=True)
        torch.cuda._sleep(20000000)  # about 30 ms
        tensor.add_(1)
        event.record()
        queue.put((event, tensor))

    done.wait()  # must wait until subprocess records event
    event.synchronize()
    self.assertEqual(list(tensor), [4, 4, 4, 4])
    p.join()
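The consumer function `cuda_multiply_two` is not shown in this listing. A minimal sketch of what such a worker might look like, assuming it receives the interprocess CUDA event and the shared tensor over the queue (the body is an assumption, not the original helper):

# Hypothetical sketch of the worker used above: wait on the producer's event,
# double the shared CUDA tensor in place, re-record the event, and signal done.
import torch

def cuda_multiply_two(queue, ready, done):
    ready.set()                      # tell the parent we are up
    with torch.cuda.stream(torch.cuda.Stream()):
        event, tensor = queue.get()  # interprocess event + shared CUDA tensor
        event.wait()                 # wait until the parent's add has been recorded
        tensor.mul_(2)               # (1 + 1) * 2 == 4 for every element
        event.record()               # record so the parent can synchronize on it
        done.set()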
def __call__(self, batch_size: int):
    def get_batches(hyper_batch):
        indexes = list(range(0, len(hyper_batch), batch_size))
        if not self._single_run:
            # random shuffle for training batches
            random.shuffle(hyper_batch)
            random.shuffle(indexes)
        hyper_batch.sort(key=self._sort_key)
        for i in indexes:
            batch = self._batchify(hyper_batch[i:i+batch_size])
            yield batch

    if self._queue is not None:
        ctx = mp.get_context('forkserver')
        self._process = ctx.Process(
            target=_batch2q,
            args=(self._loader, self._prepro,
                  self._queue, self._single_run)
        )
        self._process.start()
        while True:
            d = self._queue.get()
            if d is None:
                break
            if isinstance(d, int):
                print('\nepoch {} done'.format(d))
                continue
            yield from get_batches(d)
        self._process.join()
    else:
        i = 0
        while True:
            for batch in self._loader:
                yield from get_batches(self._prepro(batch))
            if self._single_run:
                break
            i += 1
            print('\nepoch {} done'.format(i))
def setup(self, model):
    os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())

    # pass in a state q
    smp = mp.get_context("spawn")
    self.mp_queue = smp.SimpleQueue()
def __init__(
    self,
    make_sampler_fn: Callable[..., TaskSampler],
    sampler_fn_args: Sequence[Dict[str, Any]] = None,
    auto_resample_when_done: bool = True,
    multiprocessing_start_method: Optional[str] = "forkserver",
    mp_ctx: Optional[BaseContext] = None,
    metrics_out_queue: mp.Queue = None,
    should_log: bool = True,
    max_processes: Optional[int] = None,
) -> None:
    self._is_waiting = False
    self._is_closed = True
    self.should_log = should_log
    self.max_processes = max_processes

    assert (
        sampler_fn_args is not None and len(sampler_fn_args) > 0
    ), "number of processes to be created should be greater than 0"

    self._num_task_samplers = len(sampler_fn_args)
    self._num_processes = (
        self._num_task_samplers
        if max_processes is None
        else min(max_processes, self._num_task_samplers)
    )

    self._auto_resample_when_done = auto_resample_when_done

    assert (multiprocessing_start_method is None) != (
        mp_ctx is None
    ), "Exactly one of `multiprocessing_start_method`, and `mp_ctx` must be not None."
    if multiprocessing_start_method is not None:
        assert multiprocessing_start_method in self._valid_start_methods, (
            "multiprocessing_start_method must be one of {}. Got '{}'"
        ).format(self._valid_start_methods, multiprocessing_start_method)
        self._mp_ctx = mp.get_context(multiprocessing_start_method)
    else:
        self._mp_ctx = typing.cast(BaseContext, mp_ctx)

    self.metrics_out_queue = metrics_out_queue or self._mp_ctx.Queue()

    self.npaused_per_process = [0] * self._num_processes
    self.sampler_index_to_process_ind_and_subprocess_ind: Optional[
        List[List[int]]
    ] = None
    self._reset_sampler_index_to_process_ind_and_subprocess_ind()

    self._workers: Optional[List] = None
    for args in sampler_fn_args:
        args["mp_ctx"] = self._mp_ctx
    (
        self._connection_read_fns,
        self._connection_write_fns,
    ) = self._spawn_workers(  # noqa
        make_sampler_fn=make_sampler_fn,
        sampler_fn_args_list=[
            args_list
            for args_list in self._partition_to_processes(sampler_fn_args)
        ],
    )

    self._is_closed = False

    for write_fn in self._connection_write_fns:
        write_fn((OBSERVATION_SPACE_COMMAND, None))

    observation_spaces = [
        space for read_fn in self._connection_read_fns for space in read_fn()
    ]

    if any(os is None for os in observation_spaces):
        raise NotImplementedError(
            "It appears that the `all_observation_spaces_equal`"
            " is not True for some task sampler created by"
            " VectorSampledTasks. This is not currently supported."
        )

    if any(observation_spaces[0] != os for os in observation_spaces):
        raise NotImplementedError(
            "It appears that the observation spaces of the samplers"
            " created in VectorSampledTasks are not equal."
            " This is not currently supported."
        )

    self.observation_space = observation_spaces[0]
    for write_fn in self._connection_write_fns:
        write_fn((ACTION_SPACE_COMMAND, None))
    self.action_spaces = [
        space for read_fn in self._connection_read_fns for space in read_fn()
    ]
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-e', '--epochs', default='last', type=str) parser.add_argument('-d', '--devices', default='1', type=str) parser.add_argument('-v', '--verbose', default=False, action='store_true') parser.add_argument('--show_image', '-s', default=False, action='store_true') parser.add_argument('--save_path', '-p', default=None) args = parser.parse_args() all_dev = parse_devices(args.devices) mp_ctx = mp.get_context('spawn') network = CPNet(config.num_classes, criterion=None) data_setting = { 'img_root': config.img_root_folder, 'gt_root': config.gt_root_folder, 'train_source': config.train_source, 'eval_source': config.eval_source } dataset = ADE(data_setting, 'val', None) with torch.no_grad(): segmentor = SegEvaluator(dataset, config.num_classes, config.image_mean, config.image_std, network, config.eval_scale_array, config.eval_flip, all_dev, args.verbose, args.save_path, args.show_image)
import torch
from torch import nn
from torch import optim
from torch import autograd
from torch import multiprocessing
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import TensorDataset
import numpy as np
import os

multiprocessing = multiprocessing.get_context('spawn')


class Concept(nn.Module):
    def __init__(self):
        super(Concept, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=0),
            nn.BatchNorm2d(64, momentum=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(64, 64, kernel_size=3, padding=0),
            nn.BatchNorm2d(64, momentum=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64, momentum=1),
            nn.ReLU(inplace=True),

            nn.Conv2d(64, 64, kernel_size=3, padding=1),
def train(self):
    if self.multiprocessing and self.view_id < 0:
        import torch.multiprocessing as mp
        mp = mp.get_context('spawn')
        process = []
        start = time.time()
        for v in range(self.n_view):
            process.append(mp.Process(target=self.train_view, args=(v, )))
            process[v].daemon = True
            process[v].start()
        for p in process:
            p.join()
    elif self.view_id >= 0:
        start = time.time()
        self.train_view(self.view_id)
    else:
        start = time.time()
        for v in range(self.n_view):
            self.train_view(v)
    end = time.time()
    runing_time = end - start
    if self.runing_time:
        print('runing_time: ' + str(runing_time))
        return runing_time

    if not self.just_valid:
        # valid = [self.resutls[v][0] for v in range(self.n_view)]
        # test = [self.resutls[v][1] for v in range(self.n_view)]
        valid_fea, valid_lab, test_fea, test_lab = [], [], [], []
        for v in range(self.n_view):
            tmp = sio.loadmat('features/' + self.datasets + '_' + str(v) + '.mat')
            valid_fea.append(tmp['valid_fea'])
            valid_lab.append(tmp['valid_lab'].reshape([-1, ])
                             if min(tmp['valid_lab'].shape) == 1 else tmp['valid_lab'])
            test_fea.append(tmp['test_fea'])
            test_lab.append(tmp['test_lab'].reshape([-1, ])
                            if min(tmp['test_lab'].shape) == 1 else tmp['test_lab'])

        valid_results = utils.multi_test(valid_fea, valid_lab, self.MAP)
        test_results = utils.multi_test(test_fea, test_lab, self.MAP)
        print("valid results: " + self.view_result(valid_results)
              + ",\t test results: " + self.view_result(test_results))
        sio.savemat('features/' + self.datasets + '_SDML_test_feature_results.mat',
                    {'test': test_fea, 'test_labels': test_lab})
        return valid_results, test_results
    else:
        return (np.concatenate([np.array(loss).reshape([1, -1])
                                for loss in self.val_d_loss], axis=0),
                np.concatenate([np.array(loss).reshape([1, -1])
                                for loss in self.tr_d_loss], axis=0),
                np.concatenate([np.array(loss).reshape([1, -1])
                                for loss in self.tr_ae_loss], axis=0))
def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):
    r"""Applies each `module` in :attr:`modules` in parallel on arguments
    contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword)
    on each of :attr:`devices`.

    Args:
        modules (Module): modules to be parallelized
        inputs (tensor): inputs to the modules
        devices (list of int or torch.device): CUDA devices

    :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and
    :attr:`devices` (if given) should all have same length. Moreover, each
    element of :attr:`inputs` can either be a single object as the only
    argument to a module, or a collection of positional arguments.
    """
    assert len(modules) == len(inputs)
    if kwargs_tup is not None:
        assert len(modules) == len(kwargs_tup)
    else:
        kwargs_tup = ({}, ) * len(modules)
    if devices is not None:
        assert len(modules) == len(devices)
    else:
        devices = [None] * len(modules)
    devices = list(map(lambda x: _get_device_index(x, True), devices))
    context = mp.get_context('spawn')
    # lock = threading.Lock()
    # results = {}
    # results = []
    results_queue = context.Queue(len(devices))
    grad_enabled = torch.is_grad_enabled()

    def _worker(i, module, input, kwargs, device=None):
        torch.set_grad_enabled(grad_enabled)
        if device is None:
            device = get_a_var(input).get_device()
        try:
            with torch.cuda.device(device):
                # this also avoids accidental slicing of `input` if it is a Tensor
                if not isinstance(input, (list, tuple)):
                    input = (input, )
                output = module(*input, **kwargs)
            results_queue.put(output)
            # with lock:
            #     results[i] = output
        except Exception as e:
            results_queue.put(e)
            # with lock:
            #     results[i] = e

    if len(modules) > 1:
        processes = [
            context.Process(target=_worker,
                            args=(i, module, input, kwargs, device))
            for i, (module, input, kwargs, device) in
            enumerate(zip(modules, inputs, kwargs_tup, devices))
        ]
        for process in processes:
            process.start()
        for process in processes:
            process.join()
    else:
        _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0])

    outputs = []
    for i in range(len(inputs)):
        output = results_queue.get()
        if isinstance(output, Exception):
            raise output
        outputs.append(output)
    return outputs
def test_cuda_parameter_sharing(self):
    param = Parameter(torch.arange(1., 26, device='cuda').view(5, 5))
    self._test_autograd_sharing(param, mp.get_context('spawn'), is_parameter=True)
def test_cuda(self):
    torch.cuda.FloatTensor([1])  # initialize CUDA outside of leak checker
    self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor)
def train(flags):  # pylint: disable=too-many-branches, too-many-statements
    if flags.xpid is None:
        flags.xpid = 'torchbeast-%s' % time.strftime('%Y%m%d-%H%M%S')
    plogger = file_writer.FileWriter(
        xpid=flags.xpid,
        xp_args=flags.__dict__,
        rootdir=flags.savedir,
        symlink_latest=False,
    )
    checkpointpath = os.path.expandvars(
        os.path.expanduser('%s/%s/%s' % (flags.savedir, flags.xpid, 'model.tar')))

    T = flags.unroll_length
    B = flags.batch_size

    flags.device = None
    if not flags.disable_cuda and torch.cuda.is_available():
        logging.info(f'Using CUDA {flags.cuda_id}')
        flags.device = torch.device(f'cuda:{flags.cuda_id}')
    else:
        logging.info('Not using CUDA.')
        flags.device = torch.device('cpu')

    env = Net.create_env(flags)
    model = Net.make(flags, env)
    buffers = create_buffers(env.observation_space, len(env.action_space), flags)

    model.share_memory()

    actor_processes = []
    ctx = mp.get_context('fork')
    free_queue = ctx.SimpleQueue()
    full_queue = ctx.SimpleQueue()

    for i in range(flags.num_actors):
        actor = ctx.Process(target=act,
                            args=(i, free_queue, full_queue, model, buffers, flags))
        actor.start()
        actor_processes.append(actor)

    learner_model = Net.make(flags, env).to(device=flags.device)

    optimizer = torch.optim.RMSprop(learner_model.parameters(),
                                    lr=flags.learning_rate,
                                    momentum=flags.momentum,
                                    eps=flags.epsilon,
                                    alpha=flags.alpha)

    def lr_lambda(epoch):
        return 1 - min(epoch * T * B, flags.total_frames) / flags.total_frames

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    if flags.resume:
        save = torch.load(flags.resume, map_location='cpu')
        learner_model.load_state_dict(save['model_state_dict'])
        optimizer.load_state_dict(save['optimizer_state_dict'])
        if flags.resume_scheduler:
            scheduler.load_state_dict(save['scheduler_state_dict'])
        # tune only the embedding layer
        if flags.resume_strategy == 'emb':
            keep = []
            for group in optimizer.param_groups:
                if group['params'][0].size() == (len(learner_model.vocab), flags.demb):
                    keep.append(group)
            optimizer.param_groups = keep

    logger = logging.getLogger('logfile')
    stat_keys = [
        'total_loss',
        'mean_episode_return',
        'pg_loss',
        'baseline_loss',
        'entropy_loss',
        'aux_loss',
        'mean_win_rate',
        'mean_episode_len',
    ]
    logger.info('# Step\t%s', '\t'.join(stat_keys))

    frames, stats = 0, {}

    def batch_and_learn(i, lock=threading.Lock()):
        """Thread target for the learning process."""
        nonlocal frames, stats
        timings = prof.Timings()
        while frames < flags.total_frames:
            timings.reset()
            batch = get_batch(free_queue, full_queue, buffers, flags, timings)
            tmp_t1 = time.time()
            stats = learn(model, learner_model, batch, optimizer, scheduler, flags)
            tmp_t2 = time.time()
            print('learn time: ', tmp_t2 - tmp_t1)
            timings.time('learn')
            with lock:
                to_log = dict(frames=frames)
                to_log.update({k: stats[k] for k in stat_keys})
                plogger.log(to_log)
                frames += T * B
                print('learned frames: ', frames)

        if i == 0:
            logging.info('Batch and learn: %s', timings.summary())

    for m in range(flags.num_buffers):
        free_queue.put(m)

    threads = []
    for i in range(flags.num_threads):
        thread = threading.Thread(target=batch_and_learn,
                                  name='batch-and-learn-%d' % i, args=(i, ))
        thread.start()
        threads.append(thread)

    def checkpoint():
        if flags.disable_checkpoint:
            return
        logging.info('Saving checkpoint to %s', checkpointpath)
        torch.save(
            {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'flags': vars(flags),
            }, checkpointpath)

    timer = timeit.default_timer
    try:
        last_checkpoint_time = timer()
        while frames < flags.total_frames:
            start_frames = frames
            start_time = timer()
            time.sleep(5)

            if timer() - last_checkpoint_time > 10 * 60:  # Save every 10 min.
                checkpoint()
                last_checkpoint_time = timer()

            fps = (frames - start_frames) / (timer() - start_time)
            if stats.get('episode_returns', None):
                mean_return = 'Return per episode: %.1f. ' % stats['mean_episode_return']
            else:
                mean_return = ''
            total_loss = stats.get('total_loss', float('inf'))
            logging.info('After %i frames: loss %f @ %.1f fps. %sStats:\n%s',
                         frames, total_loss, fps, mean_return,
                         pprint.pformat(stats))
    except KeyboardInterrupt:
        return  # Try joining actors then quit.
    else:
        for thread in threads:
            thread.join()
        logging.info('Learning finished after %d frames.', frames)
    finally:
        for _ in range(flags.num_actors):
            free_queue.put(None)
        for actor in actor_processes:
            actor.join(timeout=1)

    checkpoint()
    plogger.close()
def train(flags):  # pylint: disable=too-many-branches, too-many-statements
    if flags.xpid is None:
        flags.xpid = "torchbeast-%s" % time.strftime("%Y%m%d-%H%M%S")
    plogger = file_writer.FileWriter(xpid=flags.xpid,
                                     xp_args=flags.__dict__,
                                     rootdir=flags.savedir)
    checkpointpath = os.path.expandvars(
        os.path.expanduser("%s/%s/%s" % (flags.savedir, flags.xpid, "model.tar")))

    if flags.num_buffers is None:  # Set sensible default for num_buffers.
        flags.num_buffers = max(2 * flags.num_actors, flags.batch_size)
    if flags.num_actors >= flags.num_buffers:
        raise ValueError("num_buffers should be larger than num_actors")
    if flags.num_buffers < flags.batch_size:
        raise ValueError("num_buffers should be larger than batch_size")

    T = flags.unroll_length
    B = flags.batch_size

    flags.device = None
    if not flags.disable_cuda and torch.cuda.is_available():
        logging.info("Using CUDA.")
        flags.device = torch.device("cuda")
    else:
        logging.info("Not using CUDA.")
        flags.device = torch.device("cpu")

    env = create_env(flags)

    Net = get_model(flags)
    model = Net(env.observation_space.shape, env.action_space, flags.use_lstm)

    def get_action_shape(flags, env: gym.envs) -> typing.Tuple[int]:
        # return the action shape as a tuple
        if flags.env_type == 'mujoco':
            return env.action_space.shape
        if not flags.env_type == 'atari':
            raise ValueError("Unknown env_type")
        return (env.action_space.n, )

    action_shape = get_action_shape(flags, env)
    buffers = create_buffers(flags, env.observation_space.shape, action_shape,
                             get_num_agents(flags))

    model.share_memory()

    # Add initial RNN state.
    initial_agent_state_buffers = []
    for _ in range(flags.num_buffers):
        state = model.initial_state(batch_size=1)
        for t in state:
            t.share_memory_()
        initial_agent_state_buffers.append(state)

    actor_processes = []
    ctx = mp.get_context("fork")
    free_queue = ctx.SimpleQueue()
    full_queue = ctx.SimpleQueue()

    for i in range(flags.num_actors):
        actor = ctx.Process(
            target=act,
            args=(
                flags,
                i,
                free_queue,
                full_queue,
                model,
                buffers,
                initial_agent_state_buffers,
            ),
        )
        actor.start()
        actor_processes.append(actor)

    Net = get_model(flags)
    learner_model = Net(env.observation_space.shape, env.action_space,
                        flags.use_lstm).to(device=flags.device)

    optimizer = torch.optim.RMSprop(
        learner_model.parameters(),
        lr=flags.learning_rate,
        momentum=flags.momentum,
        eps=flags.epsilon,
        alpha=flags.alpha,
    )

    def lr_lambda(epoch):
        return 1 - min(epoch * T * B, flags.total_steps) / flags.total_steps

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    logger = logging.getLogger("logfile")
    stat_keys = [
        "total_loss",
        "mean_episode_return",
        "pg_loss",
        "baseline_loss",
        "entropy_loss",
    ]
    logger.info("# Step\t%s", "\t".join(stat_keys))

    step, stats = 0, {}

    def batch_and_learn(i, lock=threading.Lock()):
        """Thread target for the learning process."""
        nonlocal step, stats
        timings = prof.Timings()
        while step < flags.total_steps:
            timings.reset()
            batch, agent_state = get_batch(
                flags,
                free_queue,
                full_queue,
                buffers,
                initial_agent_state_buffers,
                timings,
            )
            stats = learn(flags, model, learner_model, batch, agent_state,
                          optimizer, scheduler)
            timings.time("learn")
            with lock:
                to_log = dict(step=step)
                to_log.update({k: stats[k] for k in stat_keys})
                plogger.log(to_log)
                step += T * B

        if i == 0:
            logging.info("Batch and learn: %s", timings.summary())

    for m in range(flags.num_buffers):
        free_queue.put(m)

    threads = []
    for i in range(flags.num_learner_threads):
        thread = threading.Thread(target=batch_and_learn,
                                  name="batch-and-learn-%d" % i, args=(i, ))
        thread.start()
        threads.append(thread)

    def checkpoint():
        if flags.disable_checkpoint:
            return
        logging.info("Saving checkpoint to %s", checkpointpath)
        torch.save(
            {
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
                "flags": vars(flags),
            },
            checkpointpath,
        )

    timer = timeit.default_timer
    try:
        last_checkpoint_time = timer()
        while step < flags.total_steps:
            start_step = step
            start_time = timer()
            time.sleep(5)

            if timer() - last_checkpoint_time > 10 * 60:  # Save every 10 min.
                checkpoint()
                last_checkpoint_time = timer()

            sps = (step - start_step) / (timer() - start_time)
            if stats.get("episode_returns", None):
                mean_return = ("Return per episode: %.1f. " %
                               stats["mean_episode_return"])
            else:
                mean_return = ""
            total_loss = stats.get("total_loss", float("inf"))
            logging.info(
                "Steps %i @ %.1f SPS. Loss %f. %sStats:\n%s",
                step,
                sps,
                total_loss,
                mean_return,
                pprint.pformat(stats),
            )
            # for mujoco, need to manually print...
            print(f"sps is: {sps}")
            print(pprint.pformat(stats))
    except KeyboardInterrupt:
        return  # Try joining actors then quit.
    else:
        for thread in threads:
            thread.join()
        logging.info("Learning finished after %d steps.", step)
    finally:
        for _ in range(flags.num_actors):
            free_queue.put(None)
        for actor in actor_processes:
            actor.join(timeout=1)

    checkpoint()
    plogger.close()
    if reg_scores is not None:
        reg_scores = reg_scores.view(-1, num_class, 2)
        reg_scores[:, :, 0] = reg_scores[:, :, 0] * stats[1, 0] + stats[0, 0]
        reg_scores[:, :, 1] = reg_scores[:, :, 1] * stats[1, 1] + stats[0, 1]

    # perform stpp on scores
    result_queue.put((dataset.video_list[index].id, rel_props.numpy(),
                      act_scores.cpu().numpy(), comp_scores.cpu().numpy(),
                      reg_scores.cpu().numpy(), 0))


if __name__ == '__main__':
    ctx = multiprocessing.get_context('spawn')  # this is crucial to using multiprocessing processes with PyTorch

    # This net is used to provide setup settings. It is not used for testing.
    checkpoint = torch.load(args.weights,
                            map_location=lambda storage, loc: storage.cuda(0))
    print("model epoch {} loss: {}".format(checkpoint['epoch'],
                                           checkpoint['best_loss']))

    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    stats = checkpoint['reg_stats'].cpu().numpy()
class GPUProcess(mp.get_context("spawn").Process):
    def __init__(
        self,
        gpu_idx: GPURank,
        subprocess_init: Optional[Callable[[], None]] = None,
        embedding_storage_freelist: Optional[Set[torch.FloatStorage]] = None,
    ) -> None:
        super().__init__(daemon=True, name=f"GPU #{gpu_idx}")
        self.gpu_idx = gpu_idx
        self.master_endpoint, self.worker_endpoint = mp.get_context("spawn").Pipe()
        self.subprocess_init = subprocess_init
        self.sub_holder: Dict[
            Tuple[EntityName, Partition, SubPartition],
            Tuple[torch.nn.Parameter, RowAdagrad],
        ] = {}
        self.embedding_storage_freelist = embedding_storage_freelist

    @property
    def my_device(self) -> torch.device:
        return torch.device("cuda", index=self.gpu_idx)

    def run(self) -> None:
        torch.set_num_threads(1)
        torch.cuda.set_device(self.my_device)
        if self.subprocess_init is not None:
            self.subprocess_init()
        self.master_endpoint.close()
        for s in self.embedding_storage_freelist:
            assert s.is_shared()
            cudart = torch.cuda.cudart()
            res = cudart.cudaHostRegister(s.data_ptr(), s.size() * s.element_size(), 0)
            torch.cuda.check_error(res)
            assert s.is_pinned()

        logger.info(f"GPU subprocess {self.gpu_idx} up and running")
        while True:
            try:
                job: SubprocessArgs = self.worker_endpoint.recv()
            except EOFError:
                break

            stats = self.do_one_job(
                lhs_types=job.lhs_types,
                rhs_types=job.rhs_types,
                lhs_part=job.lhs_part,
                rhs_part=job.rhs_part,
                lhs_subpart=job.lhs_subpart,
                rhs_subpart=job.rhs_subpart,
                next_lhs_subpart=job.next_lhs_subpart,
                next_rhs_subpart=job.next_rhs_subpart,
                model=job.model,
                trainer=job.trainer,
                all_embs=job.all_embs,
                subpart_slices=job.subpart_slices,
                subbuckets=job.subbuckets,
                batch_size=job.batch_size,
                lr=job.lr,
            )

            self.worker_endpoint.send(
                SubprocessReturn(gpu_idx=self.gpu_idx, stats=stats)
            )

    def do_one_job(  # noqa
        self,
        lhs_types: Set[str],
        rhs_types: Set[str],
        lhs_part: Partition,
        rhs_part: Partition,
        lhs_subpart: SubPartition,
        rhs_subpart: SubPartition,
        next_lhs_subpart: Optional[SubPartition],
        next_rhs_subpart: Optional[SubPartition],
        model: MultiRelationEmbedder,
        trainer: Trainer,
        all_embs: Dict[Tuple[EntityName, Partition], FloatTensorType],
        subpart_slices: Dict[Tuple[EntityName, Partition, SubPartition], slice],
        subbuckets: Dict[
            Tuple[int, int], Tuple[LongTensorType, LongTensorType, LongTensorType]
        ],
        batch_size: int,
        lr: float,
    ) -> Stats:
        tk = TimeKeeper()

        for embeddings in all_embs.values():
            assert embeddings.is_pinned()

        occurrences: Dict[
            Tuple[EntityName, Partition, SubPartition], Set[Side]
        ] = defaultdict(set)
        for entity_name in lhs_types:
            occurrences[entity_name, lhs_part, lhs_subpart].add(Side.LHS)
        for entity_name in rhs_types:
            occurrences[entity_name, rhs_part, rhs_subpart].add(Side.RHS)

        if lhs_part != rhs_part:  # Bipartite
            assert all(len(v) == 1 for v in occurrences.values())

        tk.start("copy_to_device")
        for entity_name, part, subpart in occurrences.keys():
            if (entity_name, part, subpart) in self.sub_holder:
                continue
            embeddings = all_embs[entity_name, part]
            optimizer = trainer.partitioned_optimizers[entity_name, part]
            subpart_slice = subpart_slices[entity_name, part, subpart]

            # TODO have two permanent storages on GPU and move stuff in and out
            # from them
            # logger.info(f"GPU #{self.gpu_idx} allocating {(subpart_slice.stop - subpart_slice.start) * embeddings.shape[1] * 4:,} bytes")
            gpu_embeddings = torch.empty(
                (subpart_slice.stop - subpart_slice.start, embeddings.shape[1]),
                dtype=torch.float32,
                device=self.my_device,
            )
            gpu_embeddings.copy_(embeddings[subpart_slice], non_blocking=True)
            gpu_embeddings = torch.nn.Parameter(gpu_embeddings)
            gpu_optimizer = RowAdagrad([gpu_embeddings], lr=lr)
            (cpu_state,) = optimizer.state.values()
            (gpu_state,) = gpu_optimizer.state.values()
            # logger.info(f"GPU #{self.gpu_idx} allocating {(subpart_slice.stop - subpart_slice.start) * 4:,} bytes")
            gpu_state["sum"].copy_(cpu_state["sum"][subpart_slice], non_blocking=True)

            self.sub_holder[entity_name, part, subpart] = (
                gpu_embeddings,
                gpu_optimizer,
            )
        logger.debug(
            f"Time spent copying subparts to GPU: {tk.stop('copy_to_device'):.4f} s"
        )

        for (
            (entity_name, part, subpart),
            (gpu_embeddings, gpu_optimizer),
        ) in self.sub_holder.items():
            for side in occurrences[entity_name, part, subpart]:
                model.set_embeddings(entity_name, side, gpu_embeddings)
            trainer.partitioned_optimizers[
                entity_name, part, subpart
            ] = gpu_optimizer

        tk.start("translate_edges")
        num_edges = subbuckets[lhs_subpart, rhs_subpart][0].shape[0]
        edge_perm = torch.randperm(num_edges)
        edges_lhs, edges_rhs, edges_rel = subbuckets[lhs_subpart, rhs_subpart]
        _C.shuffle(edges_lhs, edge_perm, os.cpu_count())
        _C.shuffle(edges_rhs, edge_perm, os.cpu_count())
        _C.shuffle(edges_rel, edge_perm, os.cpu_count())
        assert edges_lhs.is_pinned()
        assert edges_rhs.is_pinned()
        assert edges_rel.is_pinned()
        gpu_edges = EdgeList(
            EntityList.from_tensor(edges_lhs),
            EntityList.from_tensor(edges_rhs),
            edges_rel,
        ).to(self.my_device, non_blocking=True)
        logger.debug(f"GPU #{self.gpu_idx} got {num_edges} edges")
        logger.debug(
            f"Time spent copying edges to GPU: {tk.stop('translate_edges'):.4f} s"
        )

        tk.start("processing")
        stats = process_in_batches(
            batch_size=batch_size, model=model, batch_processor=trainer, edges=gpu_edges
        )
        logger.debug(f"Time spent processing: {tk.stop('processing'):.4f} s")

        next_occurrences: Dict[
            Tuple[EntityName, Partition, SubPartition], Set[Side]
        ] = defaultdict(set)
        if next_lhs_subpart is not None:
            for entity_name in lhs_types:
                next_occurrences[entity_name, lhs_part, next_lhs_subpart].add(Side.LHS)
        if next_rhs_subpart is not None:
            for entity_name in rhs_types:
                next_occurrences[entity_name, rhs_part, next_rhs_subpart].add(Side.RHS)

        tk.start("copy_from_device")
        for (entity_name, part, subpart), (gpu_embeddings, gpu_optimizer) in list(
            self.sub_holder.items()
        ):
            if (entity_name, part, subpart) in next_occurrences:
                continue
            embeddings = all_embs[entity_name, part]
            optimizer = trainer.partitioned_optimizers[entity_name, part]
            subpart_slice = subpart_slices[entity_name, part, subpart]

            embeddings[subpart_slice].data.copy_(
                gpu_embeddings.detach(), non_blocking=True
            )
            del gpu_embeddings
            (cpu_state,) = optimizer.state.values()
            (gpu_state,) = gpu_optimizer.state.values()
            cpu_state["sum"][subpart_slice].copy_(gpu_state["sum"], non_blocking=True)
            del gpu_state["sum"]

            del self.sub_holder[entity_name, part, subpart]
        logger.debug(
            f"Time spent copying subparts from GPU: {tk.stop('copy_from_device'):.4f} s"
        )

        logger.debug(f"do_one_job: Time unaccounted for: {tk.unaccounted():.4f} s")

        return stats
from .zmq_adapter import InitSender, WaitAll, SendAll

import abc
import torch.multiprocessing as _mp

mp = _mp.get_context('spawn')


class Simulator(mp.Process):
    ''' Wrapper for simulator.
    Functions to override:
        on_init: Initialization after the process has started.
        restart: restart the environment.
        terminal: property that tells whether the game has reached terminal
        get_key: from the key, get the content. e.g. ``get_key("s")`` will
            give the encoded state of the game.
        set_key: set the key from replies. e.g., ``set_key("a", 2)`` set the
            action to be 2 (and the underlying game can continue).
    '''
    def __init__(self, id, desc):
        '''
        Example:
            desc = dict(
                actor = dict(
                    input = dict(s="", last_terminal=""),
                    reply = dict(a=""),
                    connector = "name1"
                ),
                train = dict(
                    input = dict(s="", r="", a=""),
                    reply = None,
                    connector = "name2"
                )
            )
        '''
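To illustrate the interface the docstring describes, here is a hypothetical subclass sketch; the game logic and key names are assumptions for illustration only, not part of the original library:

# Hypothetical subclass sketch: a trivial "game" whose state is a counter.
# It overrides the hooks listed in the Simulator docstring above.
class CounterSimulator(Simulator):
    def on_init(self):
        self.state = 0          # initialization after the process has started

    def restart(self):
        self.state = 0          # restart the environment

    @property
    def terminal(self):
        return self.state >= 10  # whether the game has reached terminal

    def get_key(self, key):
        if key == "s":
            return self.state    # encoded state of the game
        if key == "last_terminal":
            return int(self.terminal)
        raise KeyError(key)

    def set_key(self, key, value):
        if key == "a":           # apply the chosen action
            self.state += value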
def __init__( self, make_env_fn: Callable[..., Union[Env, RLEnv]] = _make_env_fn, env_fn_args: Sequence[Tuple] = None, auto_reset_done: bool = True, multiprocessing_start_method: str = "forkserver", workers_ignore_signals: bool = False, ) -> None: """.. :param make_env_fn: function which creates a single environment. An environment can be of type :ref:`env.Env` or :ref:`env.RLEnv` :param env_fn_args: tuple of tuple of args to pass to the :ref:`_make_env_fn`. :param auto_reset_done: automatically reset the environment when done. This functionality is provided for seamless training of vectorized environments. :param multiprocessing_start_method: the multiprocessing method used to spawn worker processes. Valid methods are :py:`{'spawn', 'forkserver', 'fork'}`; :py:`'forkserver'` is the recommended method as it works well with CUDA. If :py:`'fork'` is used, the subproccess must be started before any other GPU useage. :param workers_ignore_signals: Whether or not workers will ignore SIGINT and SIGTERM and instead will only exit when :ref:`close` is called """ self._is_waiting = False self._is_closed = True assert ( env_fn_args is not None and len(env_fn_args) > 0 ), "number of environments to be created should be greater than 0" self._num_envs = len(env_fn_args) assert multiprocessing_start_method in self._valid_start_methods, ( "multiprocessing_start_method must be one of {}. Got '{}'").format( self._valid_start_methods, multiprocessing_start_method) self._auto_reset_done = auto_reset_done self._mp_ctx = mp.get_context(multiprocessing_start_method) self._workers = [] ( self._connection_read_fns, self._connection_write_fns, ) = self._spawn_workers( # noqa env_fn_args, make_env_fn, workers_ignore_signals=workers_ignore_signals, ) self._is_closed = False for write_fn in self._connection_write_fns: write_fn((OBSERVATION_SPACE_COMMAND, None)) self.observation_spaces = [ read_fn() for read_fn in self._connection_read_fns ] for write_fn in self._connection_write_fns: write_fn((ACTION_SPACE_COMMAND, None)) self.action_spaces = [ read_fn() for read_fn in self._connection_read_fns ] for write_fn in self._connection_write_fns: write_fn((NUMBER_OF_EPISODES_COMMAND, None)) self.number_of_episodes = [ read_fn() for read_fn in self._connection_read_fns ] self._paused: List[Tuple] = []
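The docstring above recommends 'forkserver' (or 'spawn') because a child created with 'fork' inherits an already-initialized CUDA context, which CUDA does not support. A minimal sketch of the underlying pattern, assuming a trivial module-level worker and command names invented for illustration (forkserver is POSIX-only):

# Minimal sketch of the start-method + Pipe pattern described above.
# The worker function and the 'ping'/'close' commands are illustrative assumptions.
import torch
import torch.multiprocessing as mp

def _worker(conn):
    while True:
        cmd, data = conn.recv()
        if cmd == 'ping':
            conn.send(torch.ones(1) + data)   # do a tiny bit of tensor work
        elif cmd == 'close':
            conn.close()
            break

if __name__ == '__main__':
    ctx = mp.get_context('forkserver')        # safe to combine with CUDA in the parent
    parent_conn, child_conn = ctx.Pipe()
    p = ctx.Process(target=_worker, args=(child_conn,), daemon=True)
    p.start()
    parent_conn.send(('ping', 41))
    print(parent_conn.recv())                 # tensor([42.])
    parent_conn.send(('close', None))
    p.join()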
if __name__ == '__main__':
    parser = get_parser()
    argv = sys.argv[1:]
    args, _ = parser.parse_known_args(argv)

    if args.config == 'multiwoz':
        config = MultiWozConfig()
    else:
        raise NotImplementedError(
            'Config of the dataset {} not implemented'.format(args.config))

    init_logging_handler(args.log_dir)
    logging.debug(str(args))

    try:
        mp = mp.get_context('spawn')
    except RuntimeError:
        pass

    if args.pretrain:
        pass
    elif args.test:
        pass
    else:  # training
        current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
        logging.debug('train {}'.format(current_time))
        print("training start")
        output[cnt: cnt + sc.size(0), :] = sc
        cnt += sc.size(0)
    act_scores, comp_scores, reg_scores = reorg_stpp.forward(output, prop_ticks, prop_scaling)

    if reg_scores is not None:
        reg_scores = reg_scores.view(-1, num_class, 2)
        reg_scores[:, :, 0] = reg_scores[:, :, 0] * stats[1, 0] + stats[0, 0]
        reg_scores[:, :, 1] = reg_scores[:, :, 1] * stats[1, 1] + stats[0, 1]

    # perform stpp on scores
    result_queue.put((dataset.video_list[index].id, rel_props.numpy(),
                      act_scores.cpu().numpy(), comp_scores.cpu().numpy(),
                      reg_scores.cpu().numpy(), output.cpu().numpy()))


if __name__ == '__main__':
    ctx = multiprocessing.get_context('spawn')  # this is crucial to using multiprocessing processes with PyTorch

    # This net is used to provide setup settings. It is not used for testing.
    net = SSN(num_class, 2, 5, 2,
              args.modality, test_mode=True,
              base_model=args.arch, no_regression=args.no_regression,
              stpp_cfg=stpp_configs)

    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.input_size, net.scale_size)
        ])
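The comment above notes that a 'spawn' context is required when worker processes use CUDA, because a CUDA context cannot be re-initialized in a forked child. A minimal sketch of that worker/result-queue pattern, with an illustrative (assumed) worker body in place of the real per-video evaluation:

# Minimal sketch of the spawn-context worker + result-queue pattern used above.
# The worker body is a stand-in assumption, not the SSN evaluation code.
import torch
import torch.multiprocessing as multiprocessing

def _eval_worker(index, result_queue):
    scores = torch.full((4,), float(index), device='cuda')  # stand-in for real per-video scores
    result_queue.put((index, scores.cpu().numpy()))

if __name__ == '__main__':
    ctx = multiprocessing.get_context('spawn')  # required when children initialize CUDA
    result_queue = ctx.Queue()
    workers = [ctx.Process(target=_eval_worker, args=(i, result_queue)) for i in range(2)]
    for w in workers:
        w.start()
    results = [result_queue.get() for _ in workers]
    for w in workers:
        w.join()
    print(sorted(r[0] for r in results))  # [0, 1]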
def test_cuda_variable_sharing(self):
    for requires_grad in [True, False]:
        var = torch.arange(1., 26, device='cuda').view(5, 5).requires_grad_(requires_grad)
        self._test_autograd_sharing(var, mp.get_context('spawn'))