def tpu_launcher(args):
    import torch_xla.distributed.xla_multiprocessing as xmp

    # Import training_script as a module.
    script_path = Path(args.training_script)
    sys.path.append(str(script_path.parent.resolve()))
    mod_name = script_path.stem
    mod = importlib.import_module(mod_name)

    if not hasattr(mod, args.main_training_function):
        raise ValueError(
            f"Your training script should have a function named {args.main_training_function}, or you should pass a "
            "different value to `--main_training_function`."
        )
    main_function = getattr(mod, args.main_training_function)

    # Patch sys.argv
    sys.argv = [args.training_script] + args.training_script_args

    # If the function does not take one argument, launch will fail
    launcher_sig = inspect.signature(main_function)
    if len(launcher_sig.parameters) == 0:
        xmp.spawn(_AddOneArg(main_function), args=(), nprocs=args.num_processes)
    else:
        xmp.spawn(main_function, args=(), nprocs=args.num_processes)
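# Illustrative sketch (assumption, not part of the launcher above): `_AddOneArg` is referenced
# but not defined in this snippet. xmp.spawn always passes the process index as the first
# positional argument, so a zero-argument main needs a small adapter that discards it:
class _AddOneArg:
    def __init__(self, launcher):
        self.launcher = launcher

    def __call__(self, index):
        # Ignore the XLA process index and call the user's zero-argument entry point.
        self.launcher()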
def call_main(cfg: FairseqConfig, main, **kwargs):
    if cfg.distributed_training.distributed_init_method is None:
        infer_init_method(cfg.distributed_training)

    if cfg.distributed_training.distributed_init_method is not None:
        # distributed training
        if not cfg.distributed_training.distributed_no_spawn:
            start_rank = cfg.distributed_training.distributed_rank
            cfg.distributed_training.distributed_rank = None  # assign automatically
            kwargs["start_rank"] = start_rank
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(main, cfg, kwargs),
                nprocs=min(
                    torch.cuda.device_count(),
                    cfg.distributed_training.distributed_world_size,
                ),
                join=True,
            )
        else:
            distributed_main(cfg.distributed_training.device_id, main, cfg, kwargs)
    elif cfg.common.tpu and cfg.distributed_training.distributed_world_size > 1:
        import torch_xla.distributed.xla_multiprocessing as xmp

        torch.multiprocessing.set_sharing_strategy("file_system")
        xmp.spawn(
            fn=distributed_main,
            args=(main, cfg, kwargs),
            nprocs=8,  # use all 8 TPU cores
        )
    else:
        # single GPU main
        main(cfg, **kwargs)
def cli_main():
    args = get_args()
    if args.use_gpu:
        return cli_main_gpu(args)
    # From here on out we are in TPU context
    args = adjust_args_tpu(args)
    xmp.spawn(_mp_fn, args=(args,), nprocs=args.num_cores)
def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] = None, **kwargs: Any) -> Any:
    """Launches processes that run the given function in parallel.

    The function is allowed to have a return value. However, when all processes join, only the return value
    of worker process 0 gets returned from this `launch` method in the main process.

    Arguments:
        function: The entry point for all launched processes.
        *args: Optional positional arguments to be passed to the given function.
        trainer: Optional reference to the :class:`~pytorch_lightning.trainer.trainer.Trainer` for which
            a selected set of attributes get restored in the main process after processes join.
        **kwargs: Optional keyword arguments to be passed to the given function.
    """
    context = mp.get_context(self._start_method)
    return_queue = context.SimpleQueue()
    xmp.spawn(
        self._wrapping_function,
        args=(trainer, function, args, kwargs, return_queue),
        nprocs=len(self._strategy.parallel_devices),
        start_method=self._start_method,
    )
    worker_output = return_queue.get()
    if trainer is None:
        return worker_output
    self._recover_results_in_main_process(worker_output, trainer)
    return worker_output.trainer_results
def train():
    global net
    torch.cuda.empty_cache()

    if not config.USE_TPU:
        if not config.PARALLEL_FOLD_TRAIN:
            # for fold in range(2, FOLDS):
            #     run_fold(fold)
            # run_fold(0)
            for fold in [3, 4]:
                net = get_net(name=config.NET, pretrained=config.PRETRAINED)
                run_fold(fold)
            # config.NET = "tf_efficientnet_b4_ns"
            # for fold in [0]:
            #     # global net
            #     net = get_net(name=config.NET, pretrained=config.PRETRAINED)
            #     run_fold(fold)

        if config.PARALLEL_FOLD_TRAIN:
            n_jobs = config.FOLDS
            parallel = Parallel(n_jobs=n_jobs, backend="threading")
            parallel(delayed(run_fold)(fold) for fold in range(config.FOLDS))

    if config.USE_TPU:
        # if config.MIXED_PRECISION_TRAIN:
        os.environ["XLA_USE_BF16"] = "1"
        os.environ["XLA_TENSOR_ALLOCATOR_MAXSIZE"] = "100000000"

        net = get_net(name=config.NET, pretrained=config.PRETRAINED)
        for fold in [0]:
            global FLAGS
            FLAGS = {"fold": fold}
            xmp.spawn(tpu, args=(FLAGS,), nprocs=8, start_method="fork")
def xla_lr_find(self: Learner, num_cores=8, start_method='fork', **kwargs):
    lr_find_args = {
        'start_lr': 1e-7,
        'end_lr': 10.,
        'num_it': 100,
        'stop_div': True
    }
    fn = Path('_plt_loss.pkl')
    if fn.is_file():
        fn.unlink()
    # remove show_plot and suggestions param
    show_plot = kwargs.pop('show_plot', True)
    suggestions = kwargs.pop('suggestions', True)
    # override default with kwargs
    lr_find_args = {**lr_find_args, **kwargs}
    ctrl_args = self.pre_xla_fit()
    learner_args, add_args = self.pack_learner_args()
    xmp.spawn(xla_run_lr_find,
              args=(learner_args, add_args, lr_find_args, ctrl_args),
              nprocs=num_cores,
              start_method=start_method)
    self.post_xla_fit(ctrl_args)
    # self.recorder.reload_lr_find_attrs()
    if show_plot:
        # show_loss()
        self.recorder.plot_lr_find()
    if suggestions:
        return self.get_suggested_lrs(lr_find_args['num_it'])
def fit(self, train_dataset, dev_dataset, lr, epochs, batch_size, callbacks):
    if self.using_tpu:
        xmp.spawn(self.map_fn,
                  args=(train_dataset, dev_dataset, lr, epochs, batch_size, callbacks),
                  nprocs=8,
                  start_method='fork')  # hard coding
    else:
        index = -1
        self.map_fn(index, train_dataset, dev_dataset, lr, epochs, batch_size, callbacks)
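# Hypothetical sketch of the per-process entry point assumed by the `fit` above: xmp.spawn
# prepends the process index to the arguments given in `args=`, and the non-TPU branch mimics
# that by passing index = -1. The standalone function below is illustrative, not the original.
import torch_xla.core.xla_model as xm

def map_fn(index, train_dataset, dev_dataset, lr, epochs, batch_size, callbacks):
    device = xm.xla_device()  # each spawned process binds to its own TPU core
    print(f"process {index} training on {device}")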
def call_main(args, main, **kwargs):
    if args.distributed_init_method is None:
        infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            kwargs['start_rank'] = start_rank
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(main, args, kwargs),
                nprocs=args.distributed_num_procs,
            )
        else:
            distributed_main(args.device_id, main, args, kwargs)
    elif getattr(args, "tpu", False):
        import torch_xla.distributed.xla_multiprocessing as xmp

        torch.multiprocessing.set_sharing_strategy("file_system")
        xmp.spawn(
            fn=distributed_main,
            args=(main, args, kwargs),
            nprocs=8,  # use all 8 TPU cores
        )
    else:
        # single GPU main
        main(args, **kwargs)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--tokenizer_path', default='cache/vocab.txt', type=str, required=False,
                        help='vocabulary file to use')
    parser.add_argument('--raw_data_path', default='data/', type=str, required=False,
                        help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/', type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--segment', action='store_true', help='segment Chinese text by word')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    # if raw:
    #     print('building files')
    #     build_files(raw_data_path=raw_data_path, tokenized_data_path=tokenized_data_path,
    #                 full_tokenizer=full_tokenizer, num_pieces=num_pieces)
    #     print('files built')

    raw_data_files = [
        join(raw_data_path, f) for f in listdir(raw_data_path)
        if isfile(join(raw_data_path, f))
    ]
    random.shuffle(raw_data_files)
    each_size = len(raw_data_files) // 8
    split_raw_data_files = []
    for i in range(8):
        split_raw_data_files.append(raw_data_files[i * each_size:(i + 1) * each_size])

    def tokenization(index, raw_data_files):
        for file_path in raw_data_files[index]:
            get_tokenization(file_path, tokenized_data_path, full_tokenizer)

    xmp.spawn(tokenization, args=(split_raw_data_files, ), nprocs=8, start_method='fork')
def spawn(self, function: Callable, *args: Any, **kwargs: Any) -> Optional[Union[Any, "_SpawnOutput"]]:
    context = mp.get_context(self.start_method or "fork")
    return_queue = context.SimpleQueue()
    xmp.spawn(self._wrapped_function, args=(function, args, kwargs, return_queue), **self.get_mp_spawn_kwargs())
    return return_queue.get()
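# A minimal sketch (assumption, not the library's actual implementation) of the
# `_wrapped_function` used above: each spawned process runs the user function, and only the
# process with index 0 puts its result on the queue that `spawn` reads back in the main process.
def _wrapped_function(self, process_idx, function, args, kwargs, return_queue):
    result = function(*args, **kwargs)
    if process_idx == 0 and return_queue is not None:
        return_queue.put(result)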
def test_tpu_sync_dist():
    """Test tpu spawn sync dist operation."""

    def test_sync_dist(_):
        sync = _Sync(TPUSpawnPlugin().reduce, should=True, _op=torch.distributed.ReduceOp.SUM)
        value = torch.tensor([1.0])
        value = sync(value)
        assert value.item() == 8

    xmp.spawn(test_sync_dist, nprocs=8, start_method="fork")
def test_broadcast_on_tpu():
    """Checks if an object from the master process is broadcasted to other processes correctly."""

    def test_broadcast(rank):
        trainer = Trainer(tpu_cores=8)
        backend = TPUAccelerator(trainer)
        obj = ("ver_0.5", "logger_name", rank)
        result = backend.broadcast(obj)
        assert result == ("ver_0.5", "logger_name", 0)

    xmp.spawn(test_broadcast, nprocs=8, start_method='fork')
def train_on_tpu():
    def trainer(rank, CONFIG):
        global config
        config = CONFIG
        config.device = xm.xla_device()
        torch.set_default_tensor_type('torch.FloatTensor')
        train()

    xmp.spawn(trainer, args=(config,), nprocs=config.num_cores, start_method='fork')
def run():
    if config.MULTI_CORE:
        flags = {}
        flags['batch_size'] = config.BATCH_SIZE
        flags['num_workers'] = 8
        flags['num_epochs'] = config.NUM_EPOCHS
        flags['seed'] = 1234
        xmp.spawn(map_fn, args=(flags, ), nprocs=8, start_method='fork')
    else:
        map_fn()
def spawn(self, function: Callable, *args: Any, return_result: bool = True, **kwargs: Any) -> Optional[Any]:
    context = mp.get_context(self.start_method or "fork")
    return_queue = context.SimpleQueue() if return_result else None
    xmp.spawn(self._wrapped_function, args=(function, args, kwargs, return_queue), **self.get_mp_spawn_kwargs())
    return return_queue.get() if return_result else None
def test_broadcast_on_tpu():
    """Checks if an object from the main process is broadcasted to other processes correctly."""

    def test_broadcast(rank):
        trainer = Trainer(accelerator="tpu", devices=8)
        assert isinstance(trainer.accelerator, TPUAccelerator)
        assert isinstance(trainer.strategy, TPUSpawnStrategy)
        obj = ("ver_0.5", "logger_name", rank)
        result = trainer.strategy.broadcast(obj)
        assert result == ("ver_0.5", "logger_name", 0)

    xmp.spawn(test_broadcast, nprocs=8, start_method="fork")
def test_tpu_sync_dist():
    """Test tpu spawn sync dist operation."""

    def test_sync_dist(_):
        value = LightningModule._LightningModule__sync(
            torch.tensor([1.0]),
            sync_fn=TPUSpawnPlugin().reduce,
            sync_dist=True,
            sync_dist_op=torch.distributed.ReduceOp.SUM)
        assert value.item() == 8

    xmp.spawn(test_sync_dist, nprocs=8, start_method='fork')
def main():
    args = parse_args()

    # Import training_script as a module.
    mod_name = trim_suffix(os.path.basename(args.training_script), ".py")
    mod = importlib.import_module(mod_name)

    # Patch sys.argv
    sys.argv = [args.training_script] + args.training_script_args + ["--tpu_num_cores", str(args.num_cores)]

    xmp.spawn(mod._mp_fn, args=(), nprocs=args.num_cores)
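# `trim_suffix` is not defined in this snippet; a plausible helper (assumption) that strips a
# trailing ".py" so the training script's basename can be imported as a module:
def trim_suffix(text, suffix):
    return text[: -len(suffix)] if text.endswith(suffix) else text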
def main():
    if len(sys.argv) == 1 or sys.argv[1] not in ('inverter',):
        print(parse_tools.test_usage, file=stderr)
        return

    mode = sys.argv[1]
    del sys.argv[1]
    if mode == 'inverter':
        inv_parser = parse_tools.wav_gen_parser()
        opts = parse_tools.two_stage_parse(inv_parser)

    if opts.hwtype == 'GPU':
        if not torch.cuda.is_available():
            raise RuntimeError('GPU requested but not available')
    elif opts.hwtype in ('TPU', 'TPU-single'):
        import torch_xla.distributed.xla_multiprocessing as xmp
    elif opts.hwtype == 'CPU':
        pass
    else:
        raise RuntimeError(
            ('Invalid device {} requested. ' +
             'Must be GPU or TPU').format(opts.hwtype))

    print('Using {}'.format(opts.hwtype), file=stderr)
    stderr.flush()

    # generate requested data
    # n_quant = ch.state.model.wavenet.n_quant
    assert opts.hwtype in ('GPU', 'CPU'), 'Currently, only GPU or CPU supported for sampling'

    if opts.hwtype in ('CPU', 'GPU'):
        chs = chassis.InferenceChassis(mode, opts)
        if opts.jit_script_path:
            # data_scr = torch.jit.script(chs.state.data_loader.dataset)
            model_scr = torch.jit.script(chs.state.model.wavenet)
            model_scr.save(opts.jit_script_path)
            model_scr.to(chs.device)
            # print(model_scr.code)
            print('saved {}'.format(opts.jit_script_path))
            chs.infer(model_scr)
            return
        # chs.state.model.print_geometry()
        chs.infer()
    elif opts.hwtype == 'TPU':
        def _mp_fn(index, mode, opts):
            m = chassis.InferenceChassis(mode, opts)
            m.infer(index)

        xmp.spawn(_mp_fn, args=(mode, opts), nprocs=1, start_method='fork')
    elif opts.hwtype == 'TPU-single':
        chs = chassis.InferenceChassis(mode, opts)
        chs.infer()
def test_broadcast_on_tpu():
    """Checks if an object from the master process is broadcasted to other processes correctly."""

    def test_broadcast(rank):
        trainer = Trainer(tpu_cores=8)
        assert isinstance(trainer.accelerator, TPUAccelerator)
        assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin)
        obj = ("ver_0.5", "logger_name", rank)
        result = trainer.training_type_plugin.broadcast(obj)
        assert result == ("ver_0.5", "logger_name", 0)

    xmp.spawn(test_broadcast, nprocs=8, start_method='fork')
def benchmark_comms(self):
    self.initialize_backend(
        self.comms_world_info.master_ip,
        self.comms_world_info.master_port,
        self.commsParams.backend,
    )
    xmp.spawn(
        fn=self.commsParams.benchTime,
        args=(self.commsParams, self),
        nprocs=self.comms_world_info.num_tpu_cores,
    )
    return
def fit(self, model):
    r"""
    Runs the full optimization routine.

    Example::

        trainer = Trainer()
        model = LightningModule()

        trainer.fit(model)
    """
    # when using multi-node or DDP within a node start each module in a separate process
    if self.use_ddp2:
        task = int(os.environ['SLURM_LOCALID'])
        self.ddp_train(task, model)

    elif self.use_ddp:
        if self.is_slurm_managing_tasks:
            task = int(os.environ['SLURM_LOCALID'])
            self.ddp_train(task, model)
        else:
            mp.spawn(self.ddp_train, nprocs=self.num_gpus, args=(model,))

    # 1 gpu or dp option triggers training using DP module
    # easier to avoid NCCL issues
    elif self.use_dp:
        self.dp_train(model)

    elif self.single_gpu:
        self.single_gpu_train(model)

    elif self.use_tpu:
        log.info(f'training on {self.num_tpu_cores} TPU cores')

        # COLAB_GPU is an env var available by default in Colab environments.
        start_method = 'fork' if os.getenv('COLAB_GPU') else 'spawn'
        xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method)

    # ON CPU
    else:
        # run through amp wrapper
        if self.use_amp:
            raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option')

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

        self.run_pretrain_routine(model)

    # return 1 when finished
    # used for testing or when we need to know that training succeeded
    return 1
def _xla_execute(fn, args, nprocs):
    import torch_xla.distributed.xla_multiprocessing as xmp

    spawn_kwargs = {}
    if "COLAB_TPU_ADDR" in os.environ:
        spawn_kwargs["start_method"] = "fork"

    try:
        xmp.spawn(_xla_template_worker_task, args=(fn, args), nprocs=nprocs, **spawn_kwargs)
    except SystemExit as ex_:
        assert ex_.code == 0, "Didn't successfully exit in XLA test"
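# Illustrative guess (assumption, not shown in this snippet) at the worker-task shim referenced
# above: xmp.spawn passes the process index first, and the shim forwards it to the test function.
def _xla_template_worker_task(index, fn, args):
    fn(index, *args)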
def train(self):
    model = self.trainer.model

    # train
    if self.trainer.tpu_id is not None:
        self.tpu_train_in_process(self.trainer.tpu_id, model, self.trainer, self.mp_queue)
    else:
        xmp.spawn(
            self.tpu_train_in_process,
            args=(model, self.trainer, self.mp_queue),
            nprocs=self.trainer.tpu_cores,
            start_method=self.start_method
        )
def main():
    args = parse_args()

    # Import training_script as a module.
    script_fpath = Path(args.training_script)
    sys.path.append(str(script_fpath.parent.resolve()))
    mod_name = script_fpath.stem
    mod = importlib.import_module(mod_name)

    # Patch sys.argv
    sys.argv = [args.training_script] + args.training_script_args + ["--tpu_num_cores", str(args.num_cores)]

    xmp.spawn(mod._mp_fn, args=(), nprocs=args.num_cores)
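# The launcher above expects the training script to expose `_mp_fn`; a minimal sketch of that
# convention (an assumption based on this launcher, not taken from the snippet itself):
def _mp_fn(index):
    # `index` is the XLA process index injected by xmp.spawn; the script's usual entry point
    # then runs once per TPU core.
    main()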
def train(self, model):
    self.trainer.model = model

    # train
    if self.trainer.tpu_id is not None:
        self.tpu_train_in_process(self.trainer.tpu_id, model)
    else:
        xmp.spawn(
            self.tpu_train_in_process,
            args=(model,),
            nprocs=self.trainer.tpu_cores,
            start_method=self.start_method
        )
def main():
    if len(sys.argv) == 1 or sys.argv[1] not in ('new', 'resume'):
        print(parse_tools.top_usage, file=stderr)
        return

    print('Command line: ', ' '.join(sys.argv), file=stderr)
    stderr.flush()

    mode = sys.argv[1]
    del sys.argv[1]
    if mode == 'new':
        cold_parser = parse_tools.cold_parser()
        opts = parse_tools.two_stage_parse(cold_parser)
    elif mode == 'resume':
        resume_parser = parse_tools.resume_parser()
        opts = resume_parser.parse_args()

    if opts.hwtype == 'GPU':
        if not torch.cuda.is_available():
            raise RuntimeError('GPU requested but not available')
    elif opts.hwtype in ('TPU', 'TPU-single'):
        import torch_xla.distributed.xla_multiprocessing as xmp
    else:
        raise RuntimeError(
            ('Invalid device {} requested. ' +
             'Must be GPU or TPU').format(opts.hwtype))

    print('Using {}'.format(opts.hwtype), file=stderr)
    stderr.flush()

    # Start training
    print('Training parameters used:', file=stderr)
    pprint(opts, stderr)

    # set this to zero if you want to print out a logging header in resume mode as well
    netmisc.set_print_iter(0)

    if opts.hwtype == 'GPU':
        chs = ch.Chassis(mode, opts)
        # chs.state.model.print_geometry()
        chs.train(0)
    elif opts.hwtype == 'TPU':
        def _mp_fn(index, mode, opts):
            m = ch.Chassis(mode, opts)
            m.train(index)

        xmp.spawn(_mp_fn, args=(mode, opts), nprocs=1, start_method='fork')
    elif opts.hwtype == 'TPU-single':
        ch.Chassis(mode, opts).train(0)
def test_tpu_sync_dist():
    """Test tpu spawn sync dist operation."""

    def test_sync_dist(rank):
        tensor = torch.tensor([1.0])
        training_type_plugin = TPUSpawnPlugin()

        res = Result()
        res.log(
            "test_tensor",
            tensor,
            sync_fn=training_type_plugin.reduce,
            sync_dist=True,
            sync_dist_op=torch.distributed.ReduceOp.SUM
        )

        assert res["test_tensor"].item() == 8, \
            "Result-Log does not work properly with TPU Spawn and Tensors"

    xmp.spawn(test_sync_dist, nprocs=8, start_method='fork')
def spawn(
    fn: Callable,
    args: Tuple,
    kwargs_dict: Optional[Mapping] = None,
    nproc_per_node: int = 1,
    nnodes: int = 1,
    node_rank: int = 0,
    backend: str = XLA_TPU,
    **kwargs
):
    if "start_method" not in kwargs:
        kwargs["start_method"] = "fork"

    xmp.spawn(
        _XlaDistModel._dist_worker_task_fn,
        args=(backend, fn, args, kwargs_dict),
        nprocs=nproc_per_node,
        **kwargs,
    )
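# Usage sketch (hypothetical, assuming the wrapper above): launching a simple worker on all
# 8 cores of a single TPU host. The worker receives its local rank as the first argument.
def _worker(local_rank, message):
    print(f"worker {local_rank}: {message}")

# spawn(_worker, args=("hello from TPU",), nproc_per_node=8)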
def cli_main(modify_parser=None):
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)
    print_options_meaning_changes(args)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        if not getattr(args, 'tpu', False):
            # fallback for single node with multiple GPUs
            assert args.distributed_world_size <= torch.cuda.device_count()
            port = random.randint(10000, 20000)
            args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
            args.distributed_rank = None  # set based on device id
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, ),
                nprocs=args.distributed_world_size,
            )
        else:
            import torch_xla.distributed.xla_multiprocessing as xmp
            torch.multiprocessing.set_sharing_strategy('file_system')
            xmp.spawn(
                fn=distributed_main,
                args=(args, ),
                nprocs=8,  # use all 8 TPU cores
            )
    else:
        # single GPU training
        main(args)