def __dp_train(self, model):
    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers = self.init_optimizers(model.configure_optimizers())

    # move the model to the root GPU of the requested device list before wrapping in DataParallel
    root_gpu = 0
    if isinstance(self.data_parallel_device_ids, list):
        root_gpu = self.data_parallel_device_ids[0]
    model.cuda(root_gpu)

    # check for this bug (amp + dp + opt level other than O1 doesn't work)
    # https://github.com/NVIDIA/apex/issues/227
    if self.use_dp and self.use_amp:
        m = f"""
        Amp level {self.amp_level} with DataParallel is not supported.
        See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227.
        We recommend you switch to ddp if you want to use amp
        """
        raise MisconfigurationException(m)

    model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)

    self.__run_pretrain_routine(model)
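# NOTE: self.init_optimizers above is assumed to normalize whatever
# configure_optimizers() returns into an (optimizer_list, lr_scheduler_list) pair.
# A minimal sketch of such a helper, assuming torch is imported and that only the
# single-optimizer, list-of-optimizers, and (optimizers, schedulers) return shapes
# need to be handled:
def init_optimizers(self, optimizers):
    # a single optimizer instance
    if isinstance(optimizers, torch.optim.Optimizer):
        return [optimizers], []

    # an (optimizers, lr schedulers) pair
    if isinstance(optimizers, tuple) and len(optimizers) == 2:
        optimizer_list, lr_scheduler_list = optimizers
        return list(optimizer_list), list(lr_scheduler_list)

    # a plain list or tuple of optimizers
    return list(optimizers), []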
def fit(self, model):
    # give model convenience properties
    model.trainer = self
    model.experiment = self.experiment

    # transfer data loaders from model
    self.__get_dataloaders(model)

    # init training constants
    self.__layout_bookeeping(model)

    # CHOOSE OPTIMIZER
    # init the optimizers defined by the LightningModule
    self.optimizers = model.configure_optimizers()

    if self.use_amp:
        # wrap the model and the first optimizer with NVIDIA apex amp
        model, optimizer = amp.initialize(
            model, self.optimizers[0],
            opt_level=self.amp_level,
        )
        self.optimizers[0] = optimizer
        model.trainer = self

    # add lr schedulers
    if self.lr_scheduler_milestones is not None:
        for optimizer in self.optimizers:
            scheduler = MultiStepLR(optimizer, self.lr_scheduler_milestones)
            self.lr_schedulers.append(scheduler)

    # print model summary
    model.summarize()

    # put on gpu if needed
    if self.on_gpu:
        model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)

    # run tiny validation to make sure program won't crash during val
    _ = self.validate(model, self.val_dataloader, max_batches=self.nb_sanity_val_steps)

    # save exp to get started
    self.experiment.save()

    # enable cluster checkpointing
    if self.cluster is not None:
        self.enable_auto_hpc_walltime_manager()

    # ---------------------------
    # CORE TRAINING LOOP
    # ---------------------------
    self.model = model
    self.__train()
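# A minimal, hypothetical module sketch illustrating the configure_optimizers() hook
# that fit() consumes above. ExampleModule and its layer sizes are assumptions for
# illustration only; a real module passed to fit() would also need the data loader,
# summarize(), and training/validation hooks referenced elsewhere in this class.
import torch


class ExampleModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return self.layer(x)

    def configure_optimizers(self):
        # fit() stores this list in self.optimizers, optionally wraps the first entry
        # with apex amp, and attaches a MultiStepLR scheduler to each optimizer
        return [torch.optim.Adam(self.parameters(), lr=0.001)]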
def __dp_train(self, model):
    # CHOOSE OPTIMIZER
    # init the optimizers defined by the LightningModule
    self.optimizers = model.configure_optimizers()

    # move the model to the first requested GPU before wrapping in DataParallel
    model.cuda(self.data_parallel_device_ids[0])

    # check for this bug (amp + dp + opt level other than O1 doesn't work)
    # https://github.com/NVIDIA/apex/issues/227
    if self.use_dp and self.use_amp:
        m = f'Amp level {self.amp_level} with DataParallel is not supported. ' \
            'See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. ' \
            'We recommend you switch to ddp if you want to use amp'
        raise MisconfigurationException(m)

    model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids)

    self.__run_pretrain_routine(model)