def create_checkpoint(epoch, netG, optG, netD, optD, max_checkpoint, save_path=config.CHECKPOINT_DIR):
    print('Saving Model and Optimizer weights.....')
    checkpoint = {
        'epoch': epoch,
        'generator_state_dict': netG.state_dict(),
        'generator_optimizer': optG.state_dict(),
        'discriminator_state_dict': netD.state_dict(),
        'discriminator_optimizer': optD.state_dict()
    }
    if config.USE_TPU:
        xm.save(checkpoint, f'{save_path}{epoch}_checkpoint.pt')
    else:
        torch.save(checkpoint, f'{save_path}{epoch}_checkpoint.pt')
    print('Weights Saved !!')
    del checkpoint
    # keep at most `max_checkpoint` files: drop the oldest one by modification time
    files = glob.glob(os.path.expanduser(f"{save_path}*"))
    sorted_files = sorted(files, key=lambda t: -os.stat(t).st_mtime)
    if len(sorted_files) > max_checkpoint:
        os.remove(sorted_files[-1])
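# Hedged sketch, not part of the original code: restoring the dictionary written by
# create_checkpoint above. Both xm.save and torch.save produce a file torch.load can read,
# and xm.save moves tensors to CPU before writing, so map_location='cpu' is safe here.
# netG, optG, netD, optD are assumed to be already-constructed modules/optimizers.
def load_checkpoint(path, netG, optG, netD, optD):
    checkpoint = torch.load(path, map_location='cpu')
    netG.load_state_dict(checkpoint['generator_state_dict'])
    optG.load_state_dict(checkpoint['generator_optimizer'])
    netD.load_state_dict(checkpoint['discriminator_state_dict'])
    optD.load_state_dict(checkpoint['discriminator_optimizer'])
    return checkpoint['epoch']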
def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None:
    """Save model/training states as a checkpoint file through state-dump and file-write.

    Args:
        checkpoint: dict containing model and trainer state
        path: write-target path
        storage_options: not used in ``XLACheckpointIO.save_checkpoint``

    Raises:
        TypeError:
            If ``storage_options`` arg is passed in
    """
    if storage_options is not None:
        raise TypeError(
            "`Trainer.save_checkpoint(..., storage_options=...)` with `storage_options` arg"
            f" is not supported for `{self.__class__.__name__}`. Please implement your custom `CheckpointIO`"
            " to define how you'd like to use `storage_options`."
        )
    fs = get_filesystem(path)
    fs.makedirs(os.path.dirname(path), exist_ok=True)
    # Todo: TypeError: 'mappingproxy' object does not support item assignment
    # Ref: https://github.com/pytorch/xla/issues/2773
    if _OMEGACONF_AVAILABLE:
        checkpoint = apply_to_collection(checkpoint, (DictConfig, ListConfig), OmegaConf.to_container)
    xm.save({k: v for k, v in checkpoint.items() if k != "callbacks"}, path)
def save(self, ckpt_file):
    # cur_device = self.device
    # self.to(torch.device('cpu'))
    mstate = pickle.dumps(self.model)
    mstate_dict = self.model.state_dict()
    dstate = pickle.dumps(self.data_loader.dataset)
    ostate = self.optim.state_dict()
    state = {
        'step': self.step,
        'model': mstate,
        'model_state_dict': mstate_dict,
        'dataset': dstate,
        'optim': ostate,
        'rand_state': torch.get_rng_state(),
        'cuda_rand_states': (torch.cuda.get_rng_state_all() if torch.cuda.is_available() else None)
    }
    if self.device.type == 'cuda':
        torch.save(state, ckpt_file)
    elif self.device.type == 'xla':
        import torch_xla.core.xla_model as xm
        xm.save(state, ckpt_file)
def save_pretrained(model, save_directory):
    """ Save a model and its configuration file to a directory, so that it can be re-loaded
    using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
    """
    assert os.path.isdir(
        save_directory
    ), "Saving path should be a directory where the model and configuration can be saved"

    # Only save the model itself if we are using distributed training
    model_to_save = model.module if hasattr(model, 'module') else model

    # Save configuration file
    model_to_save.config.save_pretrained(save_directory)

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
    xm.save(model_to_save.state_dict(), output_model_file)
    '''
    def movecpu(obj):
        if hasattr(obj, 'cpu'):
            return obj.cpu()
        elif hasattr(obj, '__dict__'):
            return {k: movecpu(v) for k, v in obj.__dict__.items()}
        else:
            return obj

    torch.save(movecpu(model_to_save), os.path.join(save_directory, 'debug2.bin'))
    '''
    log_info(f"Model weights saved in {output_model_file}")
def save(self, ckpt_file, epoch, step):
    # cur_device = self.device
    old_device = self.to(t.device('cpu'))
    mstate_dict = self.model.state_dict()
    ostate = self.optim.state_dict()
    state = {
        'hps': self.hps,
        'epoch': epoch,
        'step': step,
        'optim_step': self.optim_step,
        'model_state_dict': mstate_dict,
        'optim': ostate,
        'rand_state': t.get_rng_state(),
        'cuda_rand_states': (t.cuda.get_rng_state_all() if t.cuda.is_available() else None)
    }
    if self.hps.hw in ('GPU', 'CPU'):
        t.save(state, ckpt_file)
    else:
        xm.save(state, ckpt_file, master_only=True)
    self.to(old_device)
def _mp_fn(rank, args):
    print("rank", rank)
    device = xm.xla_device()
    # devices = (
    #     xm.get_xla_supported_devices(
    #         max_devices=args.num_cores) if args.num_cores != 0 else [])
    # with _LOAD_LOCK:
    #     _MODEL.to(device)
    xm.master_print('done loading model')

    criterion = LabelSmoothedLengthGan_CrossEntropyCriterion(args, translation_self.tgt_dict)
    params = list(filter(lambda p: p.requires_grad, _MODEL.parameters()))
    optimizer = FairseqAdam(args, params)
    lr_scheduler = InverseSquareRootSchedule(args, optimizer)

    for epoch in range(args.num_epochs):
        # train_loop_fn(args, _MODEL, criterion, optimizer, device)
        # valid_log = eval_loop_fn(args, _MODEL, criterion, device)
        para_loader = pl.ParallelLoader(valid_dataloader, [device])
        train_loop_fn(para_loader.per_device_loader(device), args, _MODEL, criterion, optimizer, device)

        para_loader = pl.ParallelLoader(valid_dataloader, [device])
        valid_log = eval_loop_fn(para_loader.per_device_loader(device), args, _MODEL, criterion, device)

        xm.master_print('Finished training epoch {}'.format(epoch))
        xm.master_print(
            "Epoch {}, loss {:.4f}, nll_loss {:.4f}, length_loss {:.4f}, dis_loss {:.4f}".format(
                epoch, valid_log["loss"], valid_log["nll_loss"],
                valid_log["length_loss"], valid_log["dis_loss"]))
        lr_scheduler.step(epoch)

        if args.checkpoint_path:
            xm.save(_MODEL.state_dict(), args.checkpoint_path)
def save(self, model_path, weights_only=False):
    model_state_dict = self.state_dict()
    if weights_only:
        if self.using_tpu:
            xm.save(model_state_dict, model_path)
        else:
            torch.save(model_state_dict, model_path)
        return

    if self.optimizer is not None:
        opt_state_dict = self.optimizer.state_dict()
    else:
        opt_state_dict = None

    if self.scheduler is not None:
        sch_state_dict = self.scheduler.state_dict()
    else:
        sch_state_dict = None

    model_dict = {}
    model_dict["state_dict"] = model_state_dict
    model_dict["optimizer"] = opt_state_dict
    model_dict["scheduler"] = sch_state_dict
    model_dict["epoch"] = self.current_epoch
    model_dict["fp16"] = self.fp16

    if self.using_tpu:
        xm.save(model_dict, model_path)
    else:
        torch.save(model_dict, model_path)
def run(index):
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 32
    EPOCHS = 50

    # train_dataset is defined globally; sharding is handled by the DistributedSampler
    train_sampler = torch.utils.data.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, sampler=train_sampler)

    valid_dataset = val_dataset
    valid_sampler = torch.utils.data.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=32,  # can make changes here
        sampler=valid_sampler)

    device = xm.xla_device()  # delegating to TPUs
    lr = 2e-5 * xm.xrt_world_size()  # can make changes here
    num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE / xm.xrt_world_size() * EPOCHS)

    model = FCN(model, 2048).to(device)
    PATH = '../input/mymodels/model_niz.pth'
    model.load_state_dict(torch.load(PATH))

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)  # eps=1e-8 to prevent any division by zero
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for epoch in range(EPOCHS):
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler)

        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        o, t = eval_loop_fn(para_loader.per_device_loader(device), model, device)

        log_loss = []
        for jj in range(t.shape[1]):
            p1 = list(t[:, jj])
            p2 = list(o[:, jj])
            l = np.nan_to_num(calculate_loss(p1, p2))
            log_loss.append(l)
        log_loss = np.mean(log_loss)
        xm.master_print(f"epoch={epoch},spearman={log_loss}")
        xm.save(model.state_dict(), "model3.bin")  # change every time
def _save_checkpoint(args, device, step, model, is_epoch=False):
    if is_epoch:
        xm.master_print(f"Saving checkpoint at end of epoch")
    else:
        xm.master_print(f"Saving checkpoint as step closure of step : {step}")
    file_name = f"test_train_mnist_cpk_{step}.mdl"
    xm.save(model, file_name)
    xm.master_print('done...')
    xm.master_print(f"Checkpoint saved for device: {device}")
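# Hedged usage sketch (not from the original): the messages in _save_checkpoint refer to a
# "step closure"; in torch_xla such a callback is typically registered with
# xm.add_step_closure so it runs once the current step's lazy graph has executed.
# The argument values passed here are illustrative.
xm.add_step_closure(_save_checkpoint, args=(args, device, step, model))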
def save(self, name):
    self.model_path.mkdir(parents=True, exist_ok=True)
    path = (self.model_path / name).with_suffix('.bin')
    if self.use_SWA:
        self.optimizer.swap_swa_sgd()
    xm.save(self.model.state_dict(), path)
    self.log(f'Model has been saved')
def test_save_api(self):
    xla_device = xm.xla_device()
    model = XlaMNIST().to(xla_device)
    with tempfile.NamedTemporaryFile() as tf:
        xm.save(model.state_dict(), tf)
        state_dict = torch.load(tf.name)
    cpu_model = XlaMNIST()
    cpu_model.load_state_dict(state_dict)
    loaded_model = cpu_model.to(xla_device)
    self.assertEqual(model.state_dict(), loaded_model.state_dict())
def save(self, path):
    self.model.eval()
    # xser.save(self.model.state_dict(), path, master_only=True, global_master=True)
    xm.save({
        'model_state_dict': self.model.state_dict(),
        'optimizer_state_dict': self.optimizer.state_dict(),
        'scheduler_state_dict': self.scheduler.state_dict(),
        'best_summary_loss': self.best_summary_loss,
        'epoch': self.epoch,
    }, path)
def _save_checkpoint(self) -> None:
    """
    Save the model's current parameters and the training state to a checkpoint.

    The training state contains the total number of training steps, the total number of
    training tokens, the best checkpoint score and iteration so far, and optimizer and
    scheduler states.
    """
    model_path = "{}/{}.ckpt".format(self.model_dir, self.steps)
    state = {
        "steps": self.steps,
        "total_tokens": self.total_tokens,
        "best_ckpt_score": self.best_ckpt_score,
        "best_ckpt_iteration": self.best_ckpt_iteration,
        "model_state": self.model.state_dict(),
        "optimizer_state": self.optimizer.state_dict(),
        "scheduler_state": self.scheduler.state_dict() if self.scheduler is not None else None,
    }
    if not self.use_tpu:
        torch.save(state, model_path)
    else:
        xm.save(state, model_path)

    if self.ckpt_queue.full():
        to_delete = self.ckpt_queue.get()  # delete oldest ckpt
        try:
            os.remove(to_delete)
        except FileNotFoundError:
            self.logger.warning(
                "Wanted to delete old checkpoint %s but "
                "file does not exist.", to_delete)

    self.ckpt_queue.put(model_path)

    best_path = "{}/best.ckpt".format(self.model_dir)
    try:
        # create/modify symbolic link for best checkpoint
        symlink_update("{}.ckpt".format(self.steps), best_path)
    except OSError:
        # overwrite best.ckpt
        if not self.use_tpu:
            torch.save(state, best_path)
        else:
            xm.save(state, best_path)
def save(self, state_dict: Dict, path: str) -> None:
    """
    Saving with ``xm.save`` can be unstable and miss the rendez-vous after ``torch.save``.
    The rendez-vous doesn't directly affect the saving itself.
    We can ignore the ``RuntimeError`` to reduce friction with TPUs.
    """
    try:
        xm.save(state_dict, path)
    except RuntimeError as e:
        if "Failed to meet rendezvous" not in str(e):
            raise e
def save_checkpoint(self, filepath, weights_only: bool = False):
    """Save model/training states as a checkpoint file through state-dump and file-write.

    Args:
        filepath: write-target file's path
        weights_only: saving model weights only
    """
    # dump states as a checkpoint dictionary object
    _checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
    # Todo: TypeError: 'mappingproxy' object does not support item assignment
    xm.save({k: v for k, v in _checkpoint.items() if k != "callbacks"}, filepath)
def save_model_state_dict(self, save_dir: str):
    path = f"{save_dir}/model.pt"
    module = self.model.module if hasattr(self.model, "module") else self.model
    state_dict = module.state_dict()
    if self.tpus > 0:
        xm.save(state_dict, path)
    else:
        torch.save(state_dict, path)
def save(obj, f):
    """
    Save the data to disk. Use in place of :obj:`torch.save()`.

    Args:
        obj: The data to save
        f: The file (or file-like object) to use to save the data
    """
    if AcceleratorState().distributed_type == DistributedType.TPU:
        xm.save(obj, f)
    elif AcceleratorState().local_process_index == 0:
        torch.save(obj, f)
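# Hedged usage sketch (not in the original): the wrapper above is a drop-in replacement for
# torch.save that defers to xm.save on TPU and otherwise writes from a single process only.
# `my_model` is a hypothetical nn.Module.
save(my_model.state_dict(), "pytorch_model.bin")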
def save_checkpoint(self, epoch_score, model, model_path):
    if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:
        if self.tpu:
            xm.master_print(
                "Validation score improved ({} --> {}). Saving model!".format(
                    self.val_score, epoch_score))
        else:
            print("Validation score improved ({} --> {}). Saving model!".format(
                self.val_score, epoch_score))
        if self.tpu:
            xm.save(model.state_dict(), model_path)
        else:
            torch.save(model.state_dict(), model_path)
    self.val_score = epoch_score
def save_checkpoint(self):
    state = {
        'epoch': self.epoch,
        'state_dict': self.model.state_dict(),
        'optimizer': self.optimizer.state_dict()
    }
    filename = str(self.save_dir + '/models/checkpoint-epoch{}.pth'.format(self.epoch))
    if self.tpu:
        xm.save(state, filename)
    else:
        torch.save(state, filename)
    print("Saving checkpoint: {} ...".format(filename))
def __call__(self, checkpoint: Mapping, filename: str) -> None:
    path = os.path.join(self.dirname, filename)
    if not self._atomic:
        xm.save(checkpoint, path)
    else:
        tmp = tempfile.NamedTemporaryFile(delete=False, dir=self.dirname)
        try:
            xm.save(checkpoint, tmp.file)
        except BaseException:
            tmp.close()
            os.remove(tmp.name)
            raise
        else:
            tmp.close()
            os.rename(tmp.name, path)
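# Added commentary, not original code: the atomic branch above writes to a NamedTemporaryFile
# in the same directory and only then os.rename()-s it onto the target. On POSIX filesystems
# the rename is atomic, so a crash mid-save leaves any previous checkpoint at `path` intact;
# the new file only becomes visible once it has been fully written.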
def save_pretrained(model, save_directory):
    """ Save a model and its configuration file to a directory, so that it can be re-loaded
    using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
    """
    assert os.path.isdir(
        save_directory
    ), "Saving path should be a directory where the model and configuration can be saved"

    # Only save the model itself if we are using distributed training
    model_to_save = model.module if hasattr(model, 'module') else model

    # Save configuration file
    model_to_save.config.save_pretrained(save_directory)

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
    xm.save(model_to_save.state_dict(), output_model_file)
    log_info(f"Model weights saved in {output_model_file}")
def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None:
    """Save model/training states as a checkpoint file through state-dump and file-write.

    Args:
        checkpoint: dict containing model and trainer state
        path: write-target path
        storage_options: Optional parameters when saving the model/training states.
    """
    # Todo: TypeError: 'mappingproxy' object does not support item assignment
    # Ref: https://github.com/pytorch/xla/issues/2773
    if _OMEGACONF_AVAILABLE:
        checkpoint = apply_to_collection(checkpoint, (DictConfig, ListConfig), OmegaConf.to_container)
    xm.save({k: v for k, v in checkpoint.items() if k != "callbacks"}, path)
def transfer_distrib_spawn_state_on_fit_end(self, results):
    # TODO: is there a better way than accessing callback through model -> trainer -> callback?
    best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path

    if self.mp_queue is not None:
        rank_zero_warn("cleaning up ddp environment...")

        # save the last weights
        last_path = None
        # TODO: is there a better way than accessing trainer through model -> trainer?
        if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0:
            last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path)
            xm.save(self.lightning_module.state_dict(), last_path)

        if self.global_rank == 0:
            # todo, pass complete checkpoint as state dictionary
            self.mp_queue.put(best_model_path)
            self.mp_queue.put(last_path)
            self.mp_queue.put(results)
def _mp_fn(index, temp_file):
    device = xm.xla_device()
    dd = _create_state_dict(device)
    xm.save(dd, temp_file)
    ldd = torch.load(temp_file)
    pdd = _get_data_str(ldd)
    data = xm.rendezvous('xm_save_test', pdd)
    if xm.get_local_ordinal() == 0:
        os.remove(temp_file)
    for i in range(1, len(data)):
        bio = io.BytesIO(data[i])
        ildd = torch.load(bio)
        for k, v in ldd.items():
            if isinstance(v, torch.Tensor):
                assert v.allclose(ildd[k])
            elif isinstance(v, (list, tuple)):
                iv = ildd[k]
                for a, b in zip(v, iv):
                    assert a.allclose(b)
            else:
                raise RuntimeError('Invalid data type')
def save(self, state_dict: Dict, path: str, save_spawn: bool = False) -> None:
    """
    Saving with ``xm.save`` can be unstable and miss the rendez-vous after ``torch.save``.
    The rendez-vous doesn't directly affect the saving itself.
    We can ignore the ``RuntimeError`` to reduce friction with TPUs.
    """
    try:
        rank_zero_warn("Calling save function @ path " + str(path) + " " + str(save_spawn))
        if save_spawn:
            rank_zero_warn("Using xser save @ path " + str(path))
            xser.save(state_dict, path, master_only=True)
        else:
            rank_zero_warn("Using xm save @ path " + str(path))
            xm.save(state_dict, path)
        rank_zero_warn("Finished saving @ path " + str(path))
    except RuntimeError as e:
        if "Failed to meet rendezvous" not in str(e):
            raise e
def save_pretrained(self, save_directory):
    """ Save a model and its configuration file to a directory, so that it can be re-loaded
    using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.

    Arguments:
        save_directory (:obj:`str`):
            Directory to which to save. Will be created if it doesn't exist.
    """
    if os.path.isfile(save_directory):
        print("Provided path ({}) should be a directory, not a file".format(save_directory))
        return
    os.makedirs(save_directory, exist_ok=True)

    # Only save the model itself if we are using distributed training
    model_to_save = self.module if hasattr(self, "module") else self

    # Attach architecture to the config
    model_to_save.config.architectures = [model_to_save.__class__.__name__]

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(save_directory, WEIGHTS_NAME)

    if getattr(self.config, "xla_device", False):
        import torch_xla.core.xla_model as xm

        if xm.is_master_ordinal():
            # Save configuration file
            model_to_save.config.save_pretrained(save_directory)
        # xm.save takes care of saving only from master
        xm.save(model_to_save.state_dict(), output_model_file)
    else:
        model_to_save.config.save_pretrained(save_directory)
        torch.save(model_to_save.state_dict(), output_model_file)

    print("Model weights saved in {}".format(output_model_file))
def train(self, model_path: Optional[str] = None):
    """
    Main training entry point.

    Args:
        model_path:
            (Optional) Local path to model if model to train has been instantiated from a local path
            If present, we will try reloading the optimizer/scheduler states from there.
    """
    train_dataloader = self.get_train_dataloader()
    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = (self.args.max_steps //
                            (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1)
    else:
        t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
        num_train_epochs = self.args.num_train_epochs

    optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (model_path is not None
            and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
            and os.path.isfile(os.path.join(model_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device))
        scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

    model = self.model
    if self.args.fp16:
        if not is_apex_available():
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if self.args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[self.args.local_rank],
            output_device=self.args.local_rank,
            find_unused_parameters=True,
        )

    if self.tb_writer is not None:
        self.tb_writer.add_text("args", self.args.to_json_string())
        self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={})

    # Train!
    if is_torch_tpu_available():
        total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
    else:
        total_train_batch_size = (self.args.train_batch_size
                                  * self.args.gradient_accumulation_steps
                                  * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1))
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", self.num_examples(train_dataloader))
    logger.info("  Num Epochs = %d", num_train_epochs)
    logger.info("  Instantaneous batch size per device = %d", self.args.per_device_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    self.global_step = 0
    self.epoch = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if model_path is not None:
        # set global_step to global_step of last saved checkpoint from model path
        try:
            self.global_step = int(model_path.split("-")[-1].split("/")[0])
            epochs_trained = self.global_step // (
                len(train_dataloader) // self.args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = self.global_step % (
                len(train_dataloader) // self.args.gradient_accumulation_steps)
            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", self.global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            self.global_step = 0
            logger.info("  Starting fine-tuning.")

    tr_loss = 0.0
    logging_loss = 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch",
                            disable=not self.is_local_master())
    for epoch in train_iterator:
        if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
            train_dataloader.sampler.set_epoch(epoch)

        if is_torch_tpu_available():
            parallel_loader = pl.ParallelLoader(
                train_dataloader, [self.args.device]).per_device_loader(self.args.device)
            epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master())
        else:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_master())

        for step, inputs in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            tr_loss += self._training_step(model, inputs, optimizer)

            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                    # last step in epoch but step is always smaller than gradient_accumulation_steps
                    len(epoch_iterator) <= self.args.gradient_accumulation_steps
                    and (step + 1) == len(epoch_iterator)):
                if self.args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                if is_torch_tpu_available():
                    xm.optimizer_step(optimizer)
                else:
                    optimizer.step()

                scheduler.step()
                model.zero_grad()
                self.global_step += 1
                self.epoch = epoch + (step + 1) / len(epoch_iterator)

                if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or (
                        self.global_step == 1 and self.args.logging_first_step):
                    logs: Dict[str, float] = {}
                    logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps
                    # backward compatibility for pytorch schedulers
                    logs["learning_rate"] = (
                        scheduler.get_last_lr()[0]
                        if version.parse(torch.__version__) >= version.parse("1.4")
                        else scheduler.get_lr()[0])
                    logging_loss = tr_loss

                    self._log(logs)

                    if self.args.evaluate_during_training:
                        self.evaluate()

                if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                    # In all cases (even distributed/parallel), self.model is always a reference
                    # to the model we want to save.
                    if hasattr(model, "module"):
                        assert model.module is self.model
                    else:
                        assert model is self.model
                    # Save model checkpoint
                    output_dir = os.path.join(
                        self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")

                    self.save_model(output_dir)
                    if self.is_world_master():
                        self._rotate_checkpoints()

                    if is_torch_tpu_available():
                        xm.rendezvous("saving_optimizer_states")
                        xm.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        xm.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    elif self.is_world_master():
                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

            if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                epoch_iterator.close()
                break
        if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
            train_iterator.close()
            break
        if self.args.tpu_metrics_debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

    if self.tb_writer:
        self.tb_writer.close()

    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
    return TrainOutput(self.global_step, tr_loss / self.global_step)
def run(index):
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 16
    EPOCHS = 50

    dfx = pd.read_csv("/home/nizamphoenix/dataset/train.csv").fillna("none")
    df_train, df_valid = model_selection.train_test_split(dfx, random_state=42, test_size=0.3)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    sample = pd.read_csv("/home/nizamphoenix/dataset/sample_submission.csv")
    target_cols = list(sample.drop("qa_id", axis=1).columns)
    train_targets = df_train[target_cols].values
    valid_targets = df_valid[target_cols].values

    tokenizer = transformers.BertTokenizer.from_pretrained("/home/nizamphoenix/bert-base-uncased/")

    train_dataset = BERTDatasetTraining(qtitle=df_train.question_title.values,
                                        qbody=df_train.question_body.values,
                                        answer=df_train.answer.values,
                                        targets=train_targets,
                                        tokenizer=tokenizer,
                                        max_len=MAX_LEN)
    train_sampler = torch.utils.data.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, sampler=train_sampler)

    valid_dataset = BERTDatasetTraining(qtitle=df_valid.question_title.values,
                                        qbody=df_valid.question_body.values,
                                        answer=df_valid.answer.values,
                                        targets=valid_targets,
                                        tokenizer=tokenizer,
                                        max_len=MAX_LEN)
    valid_sampler = torch.utils.data.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=8,  # can make changes here
        sampler=valid_sampler)

    device = xm.xla_device()
    lr = 2e-5 * xm.xrt_world_size()  # can make changes here
    num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE / xm.xrt_world_size() * EPOCHS)

    model = BERTBaseUncased("/home/nizamphoenix/bert-base-uncased/").to(device)
    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)  # eps=1e-8 to prevent any division by zero
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for epoch in range(EPOCHS):
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        train_loop_fn(para_loader.per_device_loader(device), model, optimizer, device, scheduler)

        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        o, t = eval_loop_fn(para_loader.per_device_loader(device), model, device)

        spear = []
        for jj in range(t.shape[1]):
            p1 = list(t[:, jj])
            p2 = list(o[:, jj])
            coef, _ = np.nan_to_num(stats.spearmanr(p1, p2))
            spear.append(coef)
        spear = np.mean(spear)
        xm.master_print(f"epoch={epoch},spearman={spear}")
        xm.save(model.state_dict(), "model3.bin")  # change every time
def train_loop(folds, fold):

    if CFG.device == 'GPU':
        LOGGER.info(f"========== fold: {fold} training ==========")
    elif CFG.device == 'TPU':
        if CFG.nprocs == 1:
            LOGGER.info(f"========== fold: {fold} training ==========")
        elif CFG.nprocs == 8:
            xm.master_print(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(train_folds, transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds, transform=get_transforms(data='valid'))

    if CFG.device == 'GPU':
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers,
                                  pin_memory=True,
                                  drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers,
                                  pin_memory=True,
                                  drop_last=False)
    elif CFG.device == 'TPU':
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=CFG.batch_size,
                                                   sampler=train_sampler,
                                                   drop_last=True,
                                                   num_workers=CFG.num_workers)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False)
        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=CFG.batch_size * 2,
                                                   sampler=valid_sampler,
                                                   drop_last=False,
                                                   num_workers=CFG.num_workers)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor,
                                          patience=CFG.patience, verbose=True, eps=CFG.eps)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1,
                                                    eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    if CFG.device == 'TPU':
        device = xm.xla_device()
    elif CFG.device == 'GPU':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = CustomResNet200D_WLF(CFG.model_name, pretrained=False)
    model.load_state_dict(torch.load(CFG.student, map_location=torch.device('cpu'))['model'])
    model.to(device)

    optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    best_score = 0.
    best_loss = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)
            elif CFG.nprocs == 8:
                para_train_loader = pl.ParallelLoader(train_loader, [device])
                avg_loss = train_fn(para_train_loader.per_device_loader(device),
                                    model, criterion, optimizer, epoch, scheduler, device)
        elif CFG.device == 'GPU':
            avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_val_loss, preds, _ = valid_fn(valid_loader, model, criterion, device)
            elif CFG.nprocs == 8:
                para_valid_loader = pl.ParallelLoader(valid_loader, [device])
                avg_val_loss, preds, valid_labels = valid_fn(para_valid_loader.per_device_loader(device),
                                                             model, criterion, device)
                preds = idist.all_gather(torch.tensor(preds)).to('cpu').numpy()
                valid_labels = idist.all_gather(torch.tensor(valid_labels)).to('cpu').numpy()
        elif CFG.device == 'GPU':
            avg_val_loss, preds, _ = valid_fn(valid_loader, model, criterion, device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        if CFG.device == 'GPU':
            LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
            LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}')
        elif CFG.device == 'TPU':
            if CFG.nprocs == 1:
                LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
                LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}')
            elif CFG.nprocs == 8:
                xm.master_print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
                xm.master_print(f'Epoch {epoch+1} - Score: {score:.4f} Scores: {np.round(scores, decimals=4)}')

        if score > best_score:
            best_score = score
            if CFG.device == 'GPU':
                LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                torch.save({'model': model.state_dict(), 'preds': preds},
                           OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                xm.save({'model': model.state_dict(), 'preds': preds},
                        OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            if CFG.device == 'GPU':
                LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                torch.save({'model': model.state_dict(), 'preds': preds},
                           OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                elif CFG.nprocs == 8:
                    xm.master_print(f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                xm.save({'model': model.state_dict(), 'preds': preds},
                        OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')

    if CFG.nprocs != 8:
        check_point = torch.load(OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
        for c in [f'pred_{c}' for c in CFG.target_cols]:
            valid_folds[c] = np.nan
        valid_folds[[f'pred_{c}' for c in CFG.target_cols]] = check_point['preds']

    return valid_folds
def save_func(self, *args):
    return xm.save(*args) if is_xla() else torch.save(*args)
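# Hedged counterpart sketch (not part of the original): because xm.save moves tensors to CPU
# before writing, a single torch.load-based loader can read checkpoints produced on either
# backend; no XLA-specific load path is required here. `is_xla` is the same helper used by
# save_func above, shown only for context.
def load_func(self, path, map_location='cpu'):
    return torch.load(path, map_location=map_location)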