Code Example #1
def create_checkpoint(epoch,
                      netG,
                      optG,
                      netD,
                      optD,
                      max_checkpoint,
                      save_path=config.CHECKPOINT_DIR):
    print('Saving Model and Optimizer weights.....')
    checkpoint = {
        'epoch': epoch,
        'generator_state_dict': netG.state_dict(),
        'generator_optimizer': optG.state_dict(),
        'discriminator_state_dict': netD.state_dict(),
        'discriminator_optimizer': optD.state_dict()
    }
    if config.USE_TPU:
        xm.save(checkpoint, f'{save_path}{epoch}_checkpoint.pt')
    else:
        torch.save(checkpoint, f'{save_path}{epoch}_checkpoint.pt')
    print('Weights Saved !!')
    del checkpoint
    files = glob.glob(os.path.expanduser(f"{save_path}*"))
    sorted_files = sorted(files, key=lambda t: -os.stat(t).st_mtime)
    if len(sorted_files) > max_checkpoint:
        os.remove(sorted_files[-1])
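For reference, a matching loader for checkpoints written by `create_checkpoint` might look like the sketch below; it assumes the same `config` module and the same network/optimizer objects as above and is not part of the original snippet.

def load_checkpoint(epoch, netG, optG, netD, optD, load_path=config.CHECKPOINT_DIR):
    # xm.save and torch.save both produce ordinary torch pickles with CPU tensors,
    # so torch.load works regardless of which branch wrote the file
    checkpoint = torch.load(f'{load_path}{epoch}_checkpoint.pt', map_location='cpu')
    netG.load_state_dict(checkpoint['generator_state_dict'])
    optG.load_state_dict(checkpoint['generator_optimizer'])
    netD.load_state_dict(checkpoint['discriminator_state_dict'])
    optD.load_state_dict(checkpoint['discriminator_optimizer'])
    return checkpoint['epoch']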
Code Example #2
    def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None:
        """Save model/training states as a checkpoint file through state-dump and file-write.

        Args:
            checkpoint: dict containing model and trainer state
            path: write-target path
            storage_options: not used in ``XLACheckpointIO.save_checkpoint``

        Raises:
            TypeError:
                If ``storage_options`` arg is passed in
        """
        if storage_options is not None:
            raise TypeError(
                "`Trainer.save_checkpoint(..., storage_options=...)` with `storage_options` arg"
                f" is not supported for `{self.__class__.__name__}`. Please implement your custom `CheckpointIO`"
                " to define how you'd like to use `storage_options`."
            )
        fs = get_filesystem(path)
        fs.makedirs(os.path.dirname(path), exist_ok=True)
        # Todo: TypeError: 'mappingproxy' object does not support item assignment
        # Ref: https://github.com/pytorch/xla/issues/2773
        if _OMEGACONF_AVAILABLE:
            checkpoint = apply_to_collection(checkpoint, (DictConfig, ListConfig), OmegaConf.to_container)
        xm.save({k: v for k, v in checkpoint.items() if k != "callbacks"}, path)
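The `apply_to_collection` call above works around pytorch/xla#2773: OmegaConf containers are converted to plain dicts and lists before `xm.save` touches them. Outside Lightning, the same idea can be sketched with a small recursive helper (the helper name is hypothetical):

from omegaconf import DictConfig, ListConfig, OmegaConf

def to_plain_containers(obj):
    # recursively replace OmegaConf nodes with ordinary dicts/lists so the
    # checkpoint only contains types that xm.save can serialize
    if isinstance(obj, (DictConfig, ListConfig)):
        return OmegaConf.to_container(obj, resolve=True)
    if isinstance(obj, dict):
        return {k: to_plain_containers(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(to_plain_containers(v) for v in obj)
    return obj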
Code Example #3
    def save(self, ckpt_file):
        # cur_device = self.device
        # self.to(torch.device('cpu'))

        mstate = pickle.dumps(self.model)
        mstate_dict = self.model.state_dict()
        dstate = pickle.dumps(self.data_loader.dataset)
        ostate = self.optim.state_dict()
        state = {
            'step': self.step,
            'model': mstate,
            'model_state_dict': mstate_dict,
            'dataset': dstate,
            'optim': ostate,
            'rand_state': torch.get_rng_state(),
            'cuda_rand_states': (torch.cuda.get_rng_state_all()
                                 if torch.cuda.is_available() else None)
        }
        if self.device.type == 'cuda':
            torch.save(state, ckpt_file)
        elif self.device.type == 'xla':
            import torch_xla.core.xla_model as xm
            xm.save(state, ckpt_file)
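A restore counterpart (a hedged sketch; it assumes the same attributes as the `save` method above and skips the pickled model/dataset blobs) would reload the state and put the RNG back:

    def load(self, ckpt_file):
        # reload on CPU, then restore weights, optimizer state, step counter and RNG state
        state = torch.load(ckpt_file, map_location='cpu')
        self.model.load_state_dict(state['model_state_dict'])
        self.optim.load_state_dict(state['optim'])
        self.step = state['step']
        torch.set_rng_state(state['rand_state'])
        if state['cuda_rand_states'] is not None and torch.cuda.is_available():
            torch.cuda.set_rng_state_all(state['cuda_rand_states'])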
Code Example #4
def save_pretrained(model, save_directory):
    """ Save a model and its configuration file to a directory, so that it
        can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.
    """
    assert os.path.isdir(
        save_directory
    ), "Saving path should be a directory where the model and configuration can be saved"

    # Only save the model it-self if we are using distributed training
    model_to_save = model.module if hasattr(model, 'module') else model

    # Save configuration file
    model_to_save.config.save_pretrained(save_directory)

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(save_directory, WEIGHTS_NAME)

    xm.save(model_to_save.state_dict(), output_model_file)
    '''
    def movecpu(obj):
        if hasattr(obj,'cpu'):
            return obj.cpu()
        elif hasattr(obj, '__dict__'):
            return {k:movecpu(v) for k,v in obj.__dict__.items()}
        else: 
            return obj

    torch.save(movecpu(model_to_save), os.path.join(save_directory, 'debug2.bin'))
    '''
    log_info(f"Model weights saved in {output_model_file}")
Code Example #5
 def save(self, ckpt_file, epoch, step):
     # cur_device = self.device
     # note: this custom to() is assumed to return the previous device so it can be restored below
     old_device = self.to(t.device('cpu'))
     mstate_dict = self.model.state_dict()
     ostate = self.optim.state_dict()
     state = {
         'hps': self.hps,
         'epoch': epoch,
         'step': step,
         'optim_step': self.optim_step,
         'model_state_dict': mstate_dict,
         'optim': ostate,
         'rand_state': t.get_rng_state(),
         'cuda_rand_states': (t.cuda.get_rng_state_all()
                              if t.cuda.is_available() else None)
     }
     if self.hps.hw in ('GPU', 'CPU'):
         t.save(state, ckpt_file)
     else:
         xm.save(state, ckpt_file, master_only=True)
     self.to(old_device)
Code Example #6
File: train.py  Project: PanXiebit/fairseq-tpu
def _mp_fn(rank, args):
    print("rank", rank)
    device = xm.xla_device()
    # devices = (
    #   xm.get_xla_supported_devices(
    #       max_devices=args.num_cores) if args.num_cores != 0 else [])
    # with _LOAD_LOCK:
    #     _MODEL.to(device)
    xm.master_print('done loading model')

    criterion = LabelSmoothedLengthGan_CrossEntropyCriterion(
        args, translation_self.tgt_dict)

    params = list(filter(lambda p: p.requires_grad, _MODEL.parameters()))
    optimizer = FairseqAdam(args, params)
    lr_scheduler = InverseSquareRootSchedule(args, optimizer)

    for epoch in range(args.num_epochs):
        # train_loop_fn(args, _MODEL, criterion, optimizer, device)
        # valid_log = eval_loop_fn(args, _MODEL, criterion, device)
        # NOTE: as written, both the train and eval loops iterate over valid_dataloader;
        # a separate train dataloader is presumably defined at module level
        para_loader = pl.ParallelLoader(valid_dataloader, [device])
        train_loop_fn(para_loader.per_device_loader(device), args, _MODEL,
                      criterion, optimizer, device)
        para_loader = pl.ParallelLoader(valid_dataloader, [device])
        valid_log = eval_loop_fn(para_loader.per_device_loader(device), args,
                                 _MODEL, criterion, device)
        xm.master_print('Finished training epoch {}'.format(epoch))

        xm.master_print(
            "Epoch {}, loss {:.4f}, nll_loss {:.4f}, length_loss {:.4f}, dis_loss {:.4f}"
            .format(epoch, valid_log["loss"], valid_log["nll_loss"],
                    valid_log["length_loss"], valid_log["dis_loss"]))
        lr_scheduler.step(epoch)
        if args.checkpoint_path:
            xm.save(_MODEL.state_dict(), args.checkpoint_path)
Code Example #7
File: model.py  Project: abhi1thakur/tez
 def save(self, model_path, weights_only=False):
     model_state_dict = self.state_dict()
     if weights_only:
         if self.using_tpu:
             xm.save(model_state_dict, model_path)
         else:
             torch.save(model_state_dict, model_path)
         return
     if self.optimizer is not None:
         opt_state_dict = self.optimizer.state_dict()
     else:
         opt_state_dict = None
     if self.scheduler is not None:
         sch_state_dict = self.scheduler.state_dict()
     else:
         sch_state_dict = None
     model_dict = {}
     model_dict["state_dict"] = model_state_dict
     model_dict["optimizer"] = opt_state_dict
     model_dict["scheduler"] = sch_state_dict
     model_dict["epoch"] = self.current_epoch
     model_dict["fp16"] = self.fp16
     if self.using_tpu:
         xm.save(model_dict, model_path)
     else:
         torch.save(model_dict, model_path)
Code Example #8
File: train_tpu.py  Project: nizamphoenix/kaggle
def run(index):
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 32
    EPOCHS = 50

    # NOTE: train_dataset / val_dataset are expected to come from module-level globals;
    # as written, this self-assignment would raise UnboundLocalError inside run()
    train_dataset = train_dataset
    train_sampler = torch.utils.data.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, sampler=train_sampler)

    valid_dataset = val_dataset
    valid_sampler = torch.utils.data.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=32,  #can make changes here
        sampler=valid_sampler)

    device = xm.xla_device()  # delegating to TPUs

    lr = 2e-5 * xm.xrt_world_size()  #can make changes here
    num_train_steps = int(
        len(train_dataset) / TRAIN_BATCH_SIZE / xm.xrt_world_size() * EPOCHS)

    # NOTE: the backbone passed to FCN is expected to come from an outer scope
    model = FCN(model, 2048).to(device)
    PATH = '../input/mymodels/model_niz.pth'
    model.load_state_dict(torch.load(PATH))

    optimizer = AdamW(model.parameters(), lr=lr,
                      eps=1e-8)  #eps = 1e-8: to prevent any division by zero

    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for epoch in range(EPOCHS):
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        train_loop_fn(para_loader.per_device_loader(device), model, optimizer,
                      device, scheduler)

        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        o, t = eval_loop_fn(para_loader.per_device_loader(device), model,
                            device)

        log_loss = []
        for jj in range(t.shape[1]):
            p1 = list(t[:, jj])
            p2 = list(o[:, jj])
            l = np.nan_to_num(calculate_loss(p1, p2))
            log_loss.append(l)
        log_loss = np.mean(log_loss)
        xm.master_print(f"epoch={epoch},spearman={log_loss}")
        xm.save(model.state_dict(), "model3.bin")  #change every time
Code Example #9
File: test_train_while.py  Project: Cerebras/ptwse
def _save_checkpoint(args, device, step, model, is_epoch=False):
    if is_epoch:
        xm.master_print(f"Saving checkpoint at end of epoch")
    else:
        xm.master_print(f"Saving checkpoint as step closure of step : {step}")
    file_name = f"test_train_mnist_cpk_{step}.mdl"
    xm.save(model, file_name)
    xm.master_print('done...')
    xm.master_print(f"Checkpoint saved for device: {device}")
Code Example #10
    def save(self, name):
        self.model_path.mkdir(parents=True, exist_ok=True)
        path = (self.model_path/name).with_suffix('.bin')
        
        if self.use_SWA:
            self.optimizer.swap_swa_sgd()

        xm.save(self.model.state_dict(), path)
        self.log(f'Model has been saved')
Code Example #11
 def test_save_api(self):
   xla_device = xm.xla_device()
   model = XlaMNIST().to(xla_device)
   with tempfile.NamedTemporaryFile() as tf:
     xm.save(model.state_dict(), tf)
     state_dict = torch.load(tf.name)
   cpu_model = XlaMNIST()
   cpu_model.load_state_dict(state_dict)
   loaded_model = cpu_model.to(xla_device)
   self.assertEqual(model.state_dict(), loaded_model.state_dict())
Code Example #12
 def save(self, path):
     self.model.eval()
     #xser.save(self.model.state_dict(), path, master_only=True, global_master=True )
     xm.save({
         'model_state_dict': self.model.state_dict(),
         'optimizer_state_dict': self.optimizer.state_dict(),
         'scheduler_state_dict': self.scheduler.state_dict(),
         'best_summary_loss': self.best_summary_loss,
         'epoch': self.epoch,
     }, path)
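A matching resume method for this fitter might look like the following (a sketch under the assumption that the class exposes the same attributes; it is not part of the original code):

 def load(self, path):
     checkpoint = torch.load(path, map_location='cpu')
     self.model.load_state_dict(checkpoint['model_state_dict'])
     self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
     self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
     self.best_summary_loss = checkpoint['best_summary_loss']
     self.epoch = checkpoint['epoch'] + 1  # assumption: resume from the epoch after the saved one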
Code Example #13
    def _save_checkpoint(self) -> None:
        """
        Save the model's current parameters and the training state to a
        checkpoint.

        The training state contains the total number of training steps,
        the total number of training tokens,
        the best checkpoint score and iteration so far,
        and optimizer and scheduler states.

        """
        model_path = "{}/{}.ckpt".format(self.model_dir, self.steps)
        state = {
            "steps":
            self.steps,
            "total_tokens":
            self.total_tokens,
            "best_ckpt_score":
            self.best_ckpt_score,
            "best_ckpt_iteration":
            self.best_ckpt_iteration,
            "model_state":
            self.model.state_dict(),
            "optimizer_state":
            self.optimizer.state_dict(),
            "scheduler_state":
            self.scheduler.state_dict()
            if self.scheduler is not None else None,
        }
        if not self.use_tpu:
            torch.save(state, model_path)
        else:
            xm.save(state, model_path)

        if self.ckpt_queue.full():
            to_delete = self.ckpt_queue.get()  # delete oldest ckpt
            try:
                os.remove(to_delete)
            except FileNotFoundError:
                self.logger.warning(
                    "Wanted to delete old checkpoint %s but "
                    "file does not exist.", to_delete)

        self.ckpt_queue.put(model_path)

        best_path = "{}/best.ckpt".format(self.model_dir)
        try:
            # create/modify symbolic link for best checkpoint
            symlink_update("{}.ckpt".format(self.steps), best_path)
        except OSError:
            # overwrite best.ckpt
            if not self.use_tpu:
                torch.save(state, best_path)
            else:
                xm.save(state, best_path)
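The `symlink_update` helper is not shown in this excerpt; a plausible implementation (an assumption, not necessarily the project's exact code) points `best.ckpt` at the newest checkpoint and leaves the `OSError` fallback above to handle filesystems without symlink support:

def symlink_update(target, link_name):
    # (re)create link_name -> target; FileExistsError means an old link is present
    try:
        os.symlink(target, link_name)
    except FileExistsError:
        os.remove(link_name)
        os.symlink(target, link_name)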
Code Example #14
 def save(self, state_dict: Dict, path: str) -> None:
     """
     Saving with ``xm.save`` can be unstable and may miss the rendezvous that follows the
     internal ``torch.save``. Since the rendezvous does not affect the save itself,
     the ``RuntimeError`` can be ignored to reduce friction with TPUs.
     """
     try:
         xm.save(state_dict, path)
     except RuntimeError as e:
         if "Failed to meet rendezvous" not in str(e):
             raise e
Code Example #15
    def save_checkpoint(self, filepath, weights_only: bool = False):
        """Save model/training states as a checkpoint file through state-dump and file-write.

        Args:
            filepath: write-target file's path
            weights_only: saving model weights only
        """
        # dump states as a checkpoint dictionary object
        _checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
        # Todo: TypeError: 'mappingproxy' object does not support item assignment
        xm.save({k: v for k, v in _checkpoint.items() if k != "callbacks"}, filepath)
Code Example #16
    def save_model_state_dict(self, save_dir: str):

        path = f"{save_dir}/model.pt"

        module = self.model.module if hasattr(self.model, "module") else self.model
        state_dict = module.state_dict()

        if self.tpus > 0:
            xm.save(state_dict, path)
        else:
            torch.save(state_dict, path)
Code Example #17
File: utils.py  Project: lichao312214129/accelerate
def save(obj, f):
    """
    Save the data to disk. Use in place of :obj:`torch.save()`.

    Args:
        obj: The data to save
        f: The file (or file-like object) to use to save the data
    """
    if AcceleratorState().distributed_type == DistributedType.TPU:
        xm.save(obj, f)
    elif AcceleratorState().local_process_index == 0:
        torch.save(obj, f)
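As a usage sketch (the variable names are assumed), the wrapper is called exactly like `torch.save`, and only the XLA master or local process 0 actually writes the file:

# drop-in replacement for torch.save inside an accelerate training script
save(model.state_dict(), "weights.pt")
save({"optimizer": optimizer.state_dict(), "epoch": epoch}, "training_state.pt")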
Code Example #18
 def save_checkpoint(self, epoch_score, model, model_path):
     # note: this membership test catches +/-inf but will not reliably catch NaN scores,
     # because nan compares unequal to every value (including np.nan)
     if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:
         if self.tpu:
             xm.master_print(
                 "Validation score improved ({} --> {}). Saving model!".
                 format(self.val_score, epoch_score))
         else:
             print("Validation score improved ({} --> {}). Saving model!".
                   format(self.val_score, epoch_score))
         if self.tpu:
             xm.save(model.state_dict(), model_path)
         else:
             torch.save(model.state_dict(), model_path)
     self.val_score = epoch_score
Code Example #19
    def save_checkpoint(self):
        state = {
            'epoch': self.epoch,
            'state_dict': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }

        filename = str(self.save_dir +
                       '/models/checkpoint-epoch{}.pth'.format(self.epoch))
        if self.tpu:
            xm.save(state, filename)
        else:
            torch.save(state, filename)
        print("Saving checkpoint: {} ...".format(filename))
Code Example #20
 def __call__(self, checkpoint: Mapping, filename: str) -> None:
     path = os.path.join(self.dirname, filename)
     if not self._atomic:
         xm.save(checkpoint, path)
     else:
         tmp = tempfile.NamedTemporaryFile(delete=False, dir=self.dirname)
         try:
             xm.save(checkpoint, tmp.file)
         except BaseException:
             tmp.close()
             os.remove(tmp.name)
             raise
         else:
             tmp.close()
             os.rename(tmp.name, path)
Code Example #21
def save_pretrained(model, save_directory):
    """ Save a model and its configuration file to a directory, so that it
        can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.
    """
    assert os.path.isdir(
        save_directory
    ), "Saving path should be a directory where the model and configuration can be saved"

    # Only save the model it-self if we are using distributed training
    model_to_save = model.module if hasattr(model, 'module') else model

    # Save configuration file
    model_to_save.config.save_pretrained(save_directory)

    # If we save using the predefined names, we can load using `from_pretrained`
    output_model_file = os.path.join(save_directory, WEIGHTS_NAME)

    xm.save(model_to_save.state_dict(), output_model_file)
    log_info(f"Model weights saved in {output_model_file}")
Code Example #22
    def save_checkpoint(self,
                        checkpoint: Dict[str, Any],
                        path: _PATH,
                        storage_options: Optional[Any] = None) -> None:
        """Save model/training states as a checkpoint file through state-dump and file-write.

        Args:
            checkpoint: dict containing model and trainer state
            path: write-target path
            storage_options: Optional parameters when saving the model/training states.
        """
        # Todo: TypeError: 'mappingproxy' object does not support item assignment
        # Ref: https://github.com/pytorch/xla/issues/2773
        if _OMEGACONF_AVAILABLE:
            checkpoint = apply_to_collection(checkpoint,
                                             (DictConfig, ListConfig),
                                             OmegaConf.to_container)
        xm.save({k: v
                 for k, v in checkpoint.items() if k != "callbacks"}, path)
Code Example #23
    def transfer_distrib_spawn_state_on_fit_end(self, results):
        # TODO: is there a better way than accessing callback through model -> trainer -> callback?
        best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path

        if self.mp_queue is not None:
            rank_zero_warn("cleaning up ddp environment...")

            # save the last weights
            last_path = None
            # TODO: is there a better way than accessing trainer through model -> trainer?
            if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0:
                last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path)
                xm.save(self.lightning_module.state_dict(), last_path)

            if self.global_rank == 0:
                # todo, pass complete checkpoint as state dictionary
                self.mp_queue.put(best_model_path)
                self.mp_queue.put(last_path)
                self.mp_queue.put(results)
Code Example #24
def _mp_fn(index, temp_file):
    device = xm.xla_device()
    dd = _create_state_dict(device)
    xm.save(dd, temp_file)
    ldd = torch.load(temp_file)
    pdd = _get_data_str(ldd)
    data = xm.rendezvous('xm_save_test', pdd)
    if xm.get_local_ordinal() == 0:
        os.remove(temp_file)
    for i in range(1, len(data)):
        bio = io.BytesIO(data[i])
        ildd = torch.load(bio)
        for k, v in ldd.items():
            if isinstance(v, torch.Tensor):
                assert v.allclose(ildd[k])
            elif isinstance(v, (list, tuple)):
                iv = ildd[k]
                for a, b in zip(v, iv):
                    assert a.allclose(b)
            else:
                raise RuntimeError('Invalid data type')
Code Example #25
 def save(self,
          state_dict: Dict,
          path: str,
          save_spawn: bool = False) -> None:
     """
     Saving with ``xm.save`` can be unstable and may miss the rendezvous that follows the
     internal ``torch.save``. Since the rendezvous does not affect the save itself,
     the ``RuntimeError`` can be ignored to reduce friction with TPUs.
     """
     try:
         rank_zero_warn("Calling save function @ path " + str(path) + " " +
                        str(save_spawn))
         if save_spawn:
             rank_zero_warn("Using xser save @ path " + str(path))
             xser.save(state_dict, path, master_only=True)
         else:
             rank_zero_warn("Using xm save @ path " + str(path))
             xm.save(state_dict, path)
         rank_zero_warn("Finished saving @ path " + str(path))
     except RuntimeError as e:
         if "Failed to meet rendezvous" not in str(e):
             raise e
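For reference, checkpoints written through the `xser.save` branch use PyTorch/XLA's serialization format and should be read back with `torch_xla.utils.serialization.load` rather than plain `torch.load` (a short usage sketch; `path` and `model` are assumed):

import torch_xla.utils.serialization as xser

# reload a state_dict previously written with xser.save(state_dict, path)
state_dict = xser.load(path)
model.load_state_dict(state_dict)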
Code Example #26
    def save_pretrained(self, save_directory):
        """
        Save a model and its configuration file to a directory, so that it can be re-loaded using the
        :func:`~transformers.PreTrainedModel.from_pretrained` class method.
        Arguments:
            save_directory (:obj:`str`):
                Directory to which to save. Will be created if it doesn't exist.
        """
        if os.path.isfile(save_directory):
            print(
                "Provided path ({}) should be a directory, not a file".format(
                    save_directory))
            return
        os.makedirs(save_directory, exist_ok=True)

        # Only save the model itself if we are using distributed training
        model_to_save = self.module if hasattr(self, "module") else self

        # Attach architecture to the config
        model_to_save.config.architectures = [model_to_save.__class__.__name__]

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)

        if getattr(self.config, "xla_device", False):
            import torch_xla.core.xla_model as xm

            if xm.is_master_ordinal():
                # Save configuration file
                model_to_save.config.save_pretrained(save_directory)
            # xm.save takes care of saving only from master
            xm.save(model_to_save.state_dict(), output_model_file)
        else:
            model_to_save.config.save_pretrained(save_directory)
            torch.save(model_to_save.state_dict(), output_model_file)

        print("Model weights saved in {}".format(output_model_file))
Code Example #27
    def train(self, model_path: Optional[str] = None):
        """
        Main training entry point.

        Args:
            model_path:
                (Optional) Local path to model if model to train has been instantiated from a local path
                If present, we will try reloading the optimizer/scheduler states from there.
        """
        train_dataloader = self.get_train_dataloader()
        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            num_train_epochs = (self.args.max_steps //
                                (len(train_dataloader) //
                                 self.args.gradient_accumulation_steps) + 1)
        else:
            t_total = int(
                len(train_dataloader) //
                self.args.gradient_accumulation_steps *
                self.args.num_train_epochs)
            num_train_epochs = self.args.num_train_epochs

        optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)

        # Check if saved optimizer or scheduler states exist
        if (model_path is not None
                and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
                and os.path.isfile(os.path.join(model_path, "scheduler.pt"))):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(
                torch.load(os.path.join(model_path, "optimizer.pt"),
                           map_location=self.args.device))
            scheduler.load_state_dict(
                torch.load(os.path.join(model_path, "scheduler.pt")))

        model = self.model
        if self.args.fp16:
            if not is_apex_available():
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=self.args.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Distributed training (should be after apex fp16 initialization)
        if self.args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[self.args.local_rank],
                output_device=self.args.local_rank,
                find_unused_parameters=True,
            )

        if self.tb_writer is not None:
            self.tb_writer.add_text("args", self.args.to_json_string())
            self.tb_writer.add_hparams(self.args.to_sanitized_dict(),
                                       metric_dict={})

        # Train!
        if is_torch_tpu_available():
            total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
        else:
            total_train_batch_size = (self.args.train_batch_size *
                                      self.args.gradient_accumulation_steps *
                                      (torch.distributed.get_world_size()
                                       if self.args.local_rank != -1 else 1))
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", self.num_examples(train_dataloader))
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info("  Instantaneous batch size per device = %d",
                    self.args.per_device_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            total_train_batch_size)
        logger.info("  Gradient Accumulation steps = %d",
                    self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        self.global_step = 0
        self.epoch = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        # Check if continuing training from a checkpoint
        if model_path is not None:
            # set global_step to global_step of last saved checkpoint from model path
            try:
                self.global_step = int(model_path.split("-")[-1].split("/")[0])
                epochs_trained = self.global_step // (
                    len(train_dataloader) //
                    self.args.gradient_accumulation_steps)
                steps_trained_in_current_epoch = self.global_step % (
                    len(train_dataloader) //
                    self.args.gradient_accumulation_steps)

                logger.info(
                    "  Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("  Continuing training from epoch %d",
                            epochs_trained)
                logger.info("  Continuing training from global step %d",
                            self.global_step)
                logger.info(
                    "  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)
            except ValueError:
                self.global_step = 0
                logger.info("  Starting fine-tuning.")

        tr_loss = 0.0
        logging_loss = 0.0
        model.zero_grad()
        train_iterator = trange(epochs_trained,
                                int(num_train_epochs),
                                desc="Epoch",
                                disable=not self.is_local_master())
        for epoch in train_iterator:
            if isinstance(train_dataloader, DataLoader) and isinstance(
                    train_dataloader.sampler, DistributedSampler):
                train_dataloader.sampler.set_epoch(epoch)

            if is_torch_tpu_available():
                parallel_loader = pl.ParallelLoader(
                    train_dataloader,
                    [self.args.device]).per_device_loader(self.args.device)
                epoch_iterator = tqdm(parallel_loader,
                                      desc="Iteration",
                                      disable=not self.is_local_master())
            else:
                epoch_iterator = tqdm(train_dataloader,
                                      desc="Iteration",
                                      disable=not self.is_local_master())

            for step, inputs in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                tr_loss += self._training_step(model, inputs, optimizer)

                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                        # last step in epoch but step is always smaller than gradient_accumulation_steps
                        len(epoch_iterator) <=
                        self.args.gradient_accumulation_steps and
                    (step + 1) == len(epoch_iterator)):
                    if self.args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer),
                            self.args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       self.args.max_grad_norm)

                    if is_torch_tpu_available():
                        xm.optimizer_step(optimizer)
                    else:
                        optimizer.step()

                    scheduler.step()
                    model.zero_grad()
                    self.global_step += 1
                    self.epoch = epoch + (step + 1) / len(epoch_iterator)

                    if (self.args.logging_steps > 0
                            and self.global_step % self.args.logging_steps
                            == 0) or (self.global_step == 1
                                      and self.args.logging_first_step):
                        logs: Dict[str, float] = {}
                        logs["loss"] = (tr_loss -
                                        logging_loss) / self.args.logging_steps
                        # backward compatibility for pytorch schedulers
                        logs["learning_rate"] = (
                            scheduler.get_last_lr()[0]
                            if version.parse(torch.__version__) >=
                            version.parse("1.4") else scheduler.get_lr()[0])
                        logging_loss = tr_loss

                        self._log(logs)

                        if self.args.evaluate_during_training:
                            self.evaluate()

                    if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                        # In all cases (even distributed/parallel), self.model is always a reference
                        # to the model we want to save.
                        if hasattr(model, "module"):
                            assert model.module is self.model
                        else:
                            assert model is self.model
                        # Save model checkpoint
                        output_dir = os.path.join(
                            self.args.output_dir,
                            f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")

                        self.save_model(output_dir)

                        if self.is_world_master():
                            self._rotate_checkpoints()

                        if is_torch_tpu_available():
                            xm.rendezvous("saving_optimizer_states")
                            xm.save(optimizer.state_dict(),
                                    os.path.join(output_dir, "optimizer.pt"))
                            xm.save(scheduler.state_dict(),
                                    os.path.join(output_dir, "scheduler.pt"))
                        elif self.is_world_master():
                            torch.save(
                                optimizer.state_dict(),
                                os.path.join(output_dir, "optimizer.pt"))
                            torch.save(
                                scheduler.state_dict(),
                                os.path.join(output_dir, "scheduler.pt"))

                if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                    epoch_iterator.close()
                    break
            if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                train_iterator.close()
                break
            if self.args.tpu_metrics_debug:
                # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
                xm.master_print(met.metrics_report())

        if self.tb_writer:
            self.tb_writer.close()

        logger.info(
            "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n"
        )
        return TrainOutput(self.global_step, tr_loss / self.global_step)
Code Example #28
def run(index):
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 16
    EPOCHS = 50
    dfx = pd.read_csv("/home/nizamphoenix/dataset/train.csv").fillna("none")
    df_train, df_valid = model_selection.train_test_split(dfx,
                                                          random_state=42,
                                                          test_size=0.3)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    sample = pd.read_csv("/home/nizamphoenix/dataset/sample_submission.csv")
    target_cols = list(sample.drop("qa_id", axis=1).columns)
    train_targets = df_train[target_cols].values
    valid_targets = df_valid[target_cols].values

    tokenizer = transformers.BertTokenizer.from_pretrained(
        "/home/nizamphoenix/bert-base-uncased/")

    train_dataset = BERTDatasetTraining(qtitle=df_train.question_title.values,
                                        qbody=df_train.question_body.values,
                                        answer=df_train.answer.values,
                                        targets=train_targets,
                                        tokenizer=tokenizer,
                                        max_len=MAX_LEN)

    train_sampler = torch.utils.data.DistributedSampler(
        train_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, sampler=train_sampler)

    valid_dataset = BERTDatasetTraining(qtitle=df_valid.question_title.values,
                                        qbody=df_valid.question_body.values,
                                        answer=df_valid.answer.values,
                                        targets=valid_targets,
                                        tokenizer=tokenizer,
                                        max_len=MAX_LEN)

    valid_sampler = torch.utils.data.DistributedSampler(
        valid_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=8,  #can make changes here
        sampler=valid_sampler)

    device = xm.xla_device()

    lr = 2e-5 * xm.xrt_world_size()  #can make changes here
    num_train_steps = int(
        len(train_dataset) / TRAIN_BATCH_SIZE / xm.xrt_world_size() * EPOCHS)
    model = BERTBaseUncased("/home/nizamphoenix/bert-base-uncased/").to(device)
    optimizer = AdamW(model.parameters(), lr=lr,
                      eps=1e-8)  #eps = 1e-8: to prevent any division by zero
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    for epoch in range(EPOCHS):
        para_loader = pl.ParallelLoader(train_data_loader, [device])
        train_loop_fn(para_loader.per_device_loader(device), model, optimizer,
                      device, scheduler)

        para_loader = pl.ParallelLoader(valid_data_loader, [device])
        o, t = eval_loop_fn(para_loader.per_device_loader(device), model,
                            device)

        spear = []
        for jj in range(t.shape[1]):
            p1 = list(t[:, jj])
            p2 = list(o[:, jj])
            coef, _ = np.nan_to_num(stats.spearmanr(p1, p2))
            spear.append(coef)
        spear = np.mean(spear)
        xm.master_print(f"epoch={epoch},spearman={spear}")
        xm.save(model.state_dict(), "model3.bin")  #change every time
Code Example #29
def train_loop(folds, fold):

    if CFG.device == 'GPU':
        LOGGER.info(f"========== fold: {fold} training ==========")
    elif CFG.device == 'TPU':
        if CFG.nprocs == 1:
            LOGGER.info(f"========== fold: {fold} training ==========")
        elif CFG.nprocs == 8:
            xm.master_print(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    train_folds = folds.loc[trn_idx].reset_index(drop=True)
    valid_folds = folds.loc[val_idx].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(train_folds,
                                 transform=get_transforms(data='train'))
    valid_dataset = TrainDataset(valid_folds,
                                 transform=get_transforms(data='valid'))

    if CFG.device == 'GPU':
        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,
                                  num_workers=CFG.num_workers,
                                  pin_memory=True,
                                  drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size * 2,
                                  shuffle=False,
                                  num_workers=CFG.num_workers,
                                  pin_memory=True,
                                  drop_last=False)

    elif CFG.device == 'TPU':
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=CFG.batch_size,
                                                   sampler=train_sampler,
                                                   drop_last=True,
                                                   num_workers=CFG.num_workers)

        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False)
        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=CFG.batch_size *
                                                   2,
                                                   sampler=valid_sampler,
                                                   drop_last=False,
                                                   num_workers=CFG.num_workers)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer,
                                          mode='min',
                                          factor=CFG.factor,
                                          patience=CFG.patience,
                                          verbose=True,
                                          eps=CFG.eps)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer,
                                          T_max=CFG.T_max,
                                          eta_min=CFG.min_lr,
                                          last_epoch=-1)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer,
                                                    T_0=CFG.T_0,
                                                    T_mult=1,
                                                    eta_min=CFG.min_lr,
                                                    last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    if CFG.device == 'TPU':
        device = xm.xla_device()
    elif CFG.device == 'GPU':
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = CustomResNet200D_WLF(CFG.model_name, pretrained=False)
    model.load_state_dict(
        torch.load(CFG.student, map_location=torch.device('cpu'))['model'])
    model.to(device)

    optimizer = Adam(model.parameters(),
                     lr=CFG.lr,
                     weight_decay=CFG.weight_decay,
                     amsgrad=False)
    scheduler = get_scheduler(optimizer)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    best_score = 0.
    best_loss = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_loss = train_fn(train_loader, model, criterion, optimizer,
                                    epoch, scheduler, device)
            elif CFG.nprocs == 8:
                para_train_loader = pl.ParallelLoader(train_loader, [device])
                avg_loss = train_fn(
                    para_train_loader.per_device_loader(device), model,
                    criterion, optimizer, epoch, scheduler, device)
        elif CFG.device == 'GPU':
            avg_loss = train_fn(train_loader, model, criterion, optimizer,
                                epoch, scheduler, device)

        # eval
        if CFG.device == 'TPU':
            if CFG.nprocs == 1:
                avg_val_loss, preds, _ = valid_fn(valid_loader, model,
                                                  criterion, device)
            elif CFG.nprocs == 8:
                para_valid_loader = pl.ParallelLoader(valid_loader, [device])
                avg_val_loss, preds, valid_labels = valid_fn(
                    para_valid_loader.per_device_loader(device), model,
                    criterion, device)
                preds = idist.all_gather(torch.tensor(preds)).to('cpu').numpy()
                valid_labels = idist.all_gather(
                    torch.tensor(valid_labels)).to('cpu').numpy()
        elif CFG.device == 'GPU':
            avg_val_loss, preds, _ = valid_fn(valid_loader, model, criterion,
                                              device)

        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        score, scores = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        if CFG.device == 'GPU':
            LOGGER.info(
                f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s'
            )
            LOGGER.info(
                f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}'
            )
        elif CFG.device == 'TPU':
            if CFG.nprocs == 1:
                LOGGER.info(
                    f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s'
                )
                LOGGER.info(
                    f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}'
                )
            elif CFG.nprocs == 8:
                xm.master_print(
                    f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s'
                )
                xm.master_print(
                    f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {np.round(scores, decimals=4)}'
                )

        if score > best_score:
            best_score = score
            if CFG.device == 'GPU':
                LOGGER.info(
                    f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model'
                )
                torch.save({
                    'model': model.state_dict(),
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(
                        f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model'
                    )
                elif CFG.nprocs == 8:
                    xm.master_print(
                        f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model'
                    )
                xm.save({
                    'model': model.state_dict(),
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')

        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            if CFG.device == 'GPU':
                LOGGER.info(
                    f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model')
                torch.save({
                    'model': model.state_dict(),
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')
            elif CFG.device == 'TPU':
                if CFG.nprocs == 1:
                    LOGGER.info(
                        f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model'
                    )
                elif CFG.nprocs == 8:
                    xm.master_print(
                        f'Epoch {epoch+1} - Save Best Loss: {best_loss:.4f} Model'
                    )
                xm.save({
                    'model': model.state_dict(),
                    'preds': preds
                }, OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_loss.pth')

        if CFG.nprocs != 8:
            check_point = torch.load(
                OUTPUT_DIR + f'{CFG.model_name}_fold{fold}_best_score.pth')
            for c in [f'pred_{c}' for c in CFG.target_cols]:
                valid_folds[c] = np.nan
            valid_folds[[f'pred_{c}'
                         for c in CFG.target_cols]] = check_point['preds']

    return valid_folds
Code Example #30
File: checkpoint.py  Project: vishalbelsare/pythia
 def save_func(self, *args):
     return xm.save(*args) if is_xla() else torch.save(*args)