def fit(
        self,
        train_dataset,
        learning_rate=5e-5,
        per_gpu_batch_size=8,
        num_epochs=1,
        recover_step=-1,
        recover_dir=None,
        save_model_to_dir=None,
        max_steps=-1,
        local_rank=-1,
        num_gpus=None,
        gpu_ids=None,
        gradient_accumulation_steps=1,
        weight_decay=0.01,
        adam_epsilon=1e-8,
        warmup_steps=0,
        fp16=False,
        fp16_opt_level="O1",
        max_grad_norm=1.0,
        verbose=True,
        seed=None,
        random_prob=0.1,
        keep_prob=0.1,
    ):
        """
        Method for model fine-tuning.

        Args:
            train_dataset (S2SAbsSumDataset): Training dataset.
            learning_rate (float, optional): Learning rate. Defaults to 5e-5.
            per_gpu_batch_size (int, optional): Number of samples in each batch per
                GPU. Defaults to 8.
            num_epochs (int, optional): Number of passes through the entire training
                dataset. Ignored if `max_steps` is set. Defaults to 1.
            recover_step (int, optional): Step number to resume model fine-tuning from,
                assuming the model was saved by `S2SAbstractiveSummarizer.save_model`
                and the name is in the format "model.{recover_step}.bin". The
                matching "optim.{recover_step}.bin" file is loaded as well.
                Defaults to -1, which means start model fine-tuning from scratch.
            recover_dir (str, optional): Directory to load model from if recover_step is
                provided. Required when `recover_step` > 0. Defaults to None.
            save_model_to_dir (str, optional): Directory to save the model to. Defaults
                to None and the fine-tuned model is not saved.
            max_steps (int, optional): Maximum number of training steps. Defaults to -1
                and the number of training steps is determined by `num_epochs` and the
                length of `train_dataset`.
            local_rank (int, optional): Rank of the device in distributed training.
                Defaults to -1 which means non-distributed training.
            num_gpus (int, optional): Number of GPUs to use. Ignored if `gpu_ids` is
                provided. Defaults to None and all available GPUs are used.
            gpu_ids (list, optional): List of GPU IDs to use. Defaults to None and GPUs
                used are determined by num_gpus.
            gradient_accumulation_steps (int, optional): Number of steps to accumulate
                gradient before each back propagation and model parameters update.
                Defaults to 1.
            weight_decay (float, optional): Weight decay to apply after each parameter
                update. Defaults to 0.01.
            adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
                Defaults to 1e-8.
            warmup_steps (int, optional): Number of steps taken to increase learning
                rate from 0 to `learning rate`. Defaults to 0.
            fp16 (bool, optional): Whether to use 16-bit mixed precision through Apex.
                Defaults to False.
            fp16_opt_level (str, optional): Apex AMP optimization level for fp16.
                One of ['O0', 'O1', 'O2', 'O3'].
                See https://nvidia.github.io/apex/amp.html
                Defaults to "O1".
            max_grad_norm (float, optional): Maximum gradient norm for gradient
                clipping. Defaults to 1.0.
            verbose (bool, optional): Whether to output training log. Defaults to True.
            seed (int, optional): Random seed for model initialization.
                Defaults to None.
            random_prob (float, optional): Probability to randomly replace a masked
                token. Defaults to 0.1.
            keep_prob (float, optional): Probability to keep no change for a masked
                token. Defaults to 0.1.

        Returns:
            None. If the resolved `max_steps` is not greater than the recovered
            global step, a message is logged and the method returns before any
            training, model saving or GPU memory release takes place.

        Raises:
            ValueError: If `recover_step` > 0 but `recover_dir` is None.

        """
        global_step = 0
        if recover_step > 0:
            # Fail early with an actionable message instead of the opaque
            # TypeError that os.path.join(None, ...) would otherwise raise.
            if recover_dir is None:
                raise ValueError(
                    "`recover_dir` must be provided when `recover_step` > 0.")

            model_recover_checkpoint = os.path.join(
                recover_dir, "model.{}.bin".format(recover_step))
            logger.info(" ** Recover model checkpoint in %s ** ",
                        model_recover_checkpoint)
            # Load on CPU first; device placement is left to
            # prepare_model_and_optimizer below.
            model_state_dict = torch.load(model_recover_checkpoint,
                                          map_location="cpu")
            optimizer_recover_checkpoint = os.path.join(
                recover_dir, "optim.{}.bin".format(recover_step))
            checkpoint_state_dict = torch.load(optimizer_recover_checkpoint,
                                               map_location="cpu")

            # Merge the model weights into the optimizer checkpoint so a single
            # dict carries everything needed to resume.
            checkpoint_state_dict["model"] = model_state_dict
            global_step = recover_step
        else:
            checkpoint_state_dict = None

        device, num_gpus, amp = self.prepare_model_and_optimizer(
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
            fp16=fp16,
            fp16_opt_level=fp16_opt_level,
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            adam_epsilon=adam_epsilon,
            checkpoint_state_dict=checkpoint_state_dict,
        )

        # max(1, num_gpus) keeps the batch size non-zero when num_gpus is 0
        per_node_train_batch_size = (per_gpu_batch_size * max(1, num_gpus) *
                                     gradient_accumulation_steps)

        # actual batch size, i.e. number of samples between each parameter update
        batch_size = per_node_train_batch_size * (
            torch.distributed.get_world_size() if local_rank != -1 else 1)

        # max_steps is mainly used by the scheduler to determine the learning rate,
        # together with global_step
        if max_steps == -1:
            max_steps = max(num_epochs * len(train_dataset) // batch_size, 1)

        if max_steps <= global_step:
            logger.info(
                "Training is done. Please use a new dir or clean this dir!")

            return

        self.scheduler = Transformer.get_default_scheduler(
            optimizer=self.optimizer,
            warmup_steps=warmup_steps,
            num_training_steps=max_steps,
        )
        if recover_step > 0:
            # Restore the learning-rate schedule saved with the optimizer state
            self.scheduler.load_state_dict(
                checkpoint_state_dict["lr_scheduler"])

        # Wrap the raw features; `offset` is the number of samples already
        # consumed by the recovered steps.
        train_dataset = Seq2seqDatasetForBert(
            features=train_dataset,
            max_source_len=self.max_source_seq_length,
            max_target_len=self.max_target_seq_length,
            vocab_size=self.tokenizer.vocab_size,
            cls_id=self.tokenizer.cls_token_id,
            sep_id=self.tokenizer.sep_token_id,
            pad_id=self.tokenizer.pad_token_id,
            mask_id=self.tokenizer.mask_token_id,
            random_prob=random_prob,
            keep_prob=keep_prob,
            num_training_instances=batch_size * max_steps,
            offset=batch_size * global_step,
        )

        # Neither sampler shuffles. Original note: "The training features are
        # shuffled" (presumably already done upstream; TODO confirm).
        train_sampler = (SequentialSampler(train_dataset) if local_rank == -1
                         else DistributedSampler(train_dataset, shuffle=False))
        # batch_size of the dataloader is the number of samples to load each
        # iteration on each node
        train_dataloader = DataLoader(
            train_dataset,
            sampler=train_sampler,
            batch_size=per_node_train_batch_size //
            gradient_accumulation_steps,
            collate_fn=batch_list_to_batch_tensors,
        )

        global_step, _ = super().fine_tune(
            train_dataloader=train_dataloader,
            device=device,
            num_gpus=num_gpus,
            get_inputs=S2SAbsSumProcessor.get_inputs,
            max_steps=max_steps,
            global_step=global_step,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optimizer=self.optimizer,
            scheduler=self.scheduler,
            local_rank=local_rank,
            fp16=fp16,
            amp=amp,
            max_grad_norm=max_grad_norm,
            verbose=verbose,
            seed=seed,
        )

        # Only the non-distributed process (-1) or rank 0 writes the model
        if save_model_to_dir is not None and local_rank in [-1, 0]:
            self.save_model(save_model_to_dir, global_step, fp16)

        # release GPU memories
        self.model.cpu()
        torch.cuda.empty_cache()
    def fit(
        self,
        train_dataloader,
        num_epochs=1,
        max_steps=-1,
        gradient_accumulation_steps=1,
        num_gpus=None,
        gpu_ids=None,
        local_rank=-1,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        fp16=False,
        fp16_opt_level="O1",
        checkpoint_state_dict=None,
        verbose=True,
        seed=None,
    ):
        """
        Fine-tune a pre-trained sequence classification model.

        The method prepares the device and optimizer, resolves the total number
        of training steps, builds the default scheduler and then delegates the
        training loop to the parent class' `fine_tune`. Nothing is returned.

        Args:
            train_dataloader (Dataloader): PyTorch DataLoader supplying the
                training batches.
            num_epochs (int, optional): Number of training epochs. Defaults to 1.
            max_steps (int, optional): Total number of training steps.
                A positive value overrides num_epochs; otherwise the total is
                derived from the dataset length, gradient_accumulation_steps
                and num_epochs. Defaults to -1.
            gradient_accumulation_steps (int, optional): Number of steps to
                accumulate before performing a backward/update pass.
                Defaults to 1.
            num_gpus (int, optional): Number of GPUs to use.
                None means all available GPUs; 0, or no GPU being available,
                means the CPU device is used. Defaults to None.
            gpu_ids (list): GPU IDs to use. If None, the first num_gpus GPUs
                are used. Defaults to None.
            local_rank (int, optional): Local rank for distributed training on
                GPUs. Defaults to -1, which means non-distributed training.
            weight_decay (float, optional): Weight decay to apply after each
                parameter update. Defaults to 0.0.
            learning_rate (float, optional): Learning rate of the AdamW
                optimizer. Defaults to 5e-5.
            adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
                Defaults to 1e-8.
            warmup_steps (int, optional): Number of steps taken to increase the
                learning rate from 0 to `learning_rate`. Defaults to 0.
            fp16 (bool): Whether to use 16-bit mixed precision through Apex.
                Defaults to False.
            fp16_opt_level (str): Apex AMP optimization level for fp16, one of
                'O0', 'O1', 'O2' and 'O3'.
                See https://nvidia.github.io/apex/amp.html
                Defaults to "O1".
            checkpoint_state_dict (dict): Checkpoint states of model and
                optimizer. If given, the model and optimizer parameters are
                loaded from checkpoint_state_dict["model"] and
                checkpoint_state_dict["optimizer"]. Defaults to None.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
        """

        # Step 1: device and optimizer setup. All options are gathered in one
        # mapping and handed over as keywords.
        setup_options = {
            "num_gpus": num_gpus,
            "gpu_ids": gpu_ids,
            "local_rank": local_rank,
            "weight_decay": weight_decay,
            "learning_rate": learning_rate,
            "adam_epsilon": adam_epsilon,
            "fp16": fp16,
            "fp16_opt_level": fp16_opt_level,
            "checkpoint_state_dict": checkpoint_state_dict,
        }
        device, gpu_count, amp_handle = self.prepare_model_and_optimizer(
            **setup_options)

        # Step 2: resolve how many optimisation steps the run will take.
        total_steps = compute_training_steps(
            dataloader=train_dataloader,
            num_epochs=num_epochs,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        # Step 3: learning-rate schedule bound to the instance's optimizer.
        lr_scheduler = Transformer.get_default_scheduler(
            optimizer=self.optimizer,
            warmup_steps=warmup_steps,
            num_training_steps=total_steps,
        )

        # Step 4: run the parent's training loop; its return value is unused.
        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=Processor.get_inputs,
            device=device,
            num_gpus=gpu_count,
            max_steps=total_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optimizer=self.optimizer,
            scheduler=lr_scheduler,
            fp16=fp16,
            amp=amp_handle,
            local_rank=local_rank,
            verbose=verbose,
            seed=seed,
        )
    def fit(
        self,
        train_dataloader,
        num_epochs=1,
        max_steps=-1,
        gradient_accumulation_steps=1,
        num_gpus=None,
        gpu_ids=None,
        local_rank=-1,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        verbose=True,
        seed=None,
    ):
        """
        Fine-tune a pre-trained sequence classification model.

        A default optimizer and scheduler are created locally for this call and
        passed, together with the GPU selection, to the parent class'
        `fine_tune`. Nothing is returned.

        Args:
            train_dataloader (Dataloader): PyTorch DataLoader supplying the
                training batches.
            num_epochs (int, optional): Number of training epochs. Defaults to 1.
            max_steps (int, optional): Total number of training steps.
                A positive value overrides num_epochs; otherwise the total is
                derived from the dataset length, gradient_accumulation_steps
                and num_epochs. Defaults to -1.
            gradient_accumulation_steps (int, optional): Number of steps to
                accumulate before performing a backward/update pass.
                Defaults to 1.
            num_gpus (int, optional): Number of GPUs to use. None means all
                available GPUs; 0, or no GPU being available, means the CPU
                device is used. Defaults to None.
            gpu_ids (list): GPU IDs to use. If None, the first num_gpus GPUs
                are used. Defaults to None.
            local_rank (int, optional): Local rank for distributed training on
                GPUs. Defaults to -1, which means non-distributed training.
            weight_decay (float, optional): Weight decay to apply after each
                parameter update. Defaults to 0.0.
            learning_rate (float, optional): Learning rate of the AdamW
                optimizer. Defaults to 5e-5.
            adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
                Defaults to 1e-8.
            warmup_steps (int, optional): Number of steps taken to increase the
                learning rate from 0 to `learning_rate`. Defaults to 0.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
        """

        # Optimizer for this run; kept in a local, not stored on the instance.
        default_optimizer = Transformer.get_default_optimizer(
            self.model, weight_decay, learning_rate, adam_epsilon
        )

        # Resolve how many optimisation steps the run will take.
        total_steps = compute_training_steps(
            train_dataloader,
            num_epochs=num_epochs,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        # Learning-rate schedule tied to the optimizer created above.
        lr_scheduler = Transformer.get_default_scheduler(
            optimizer=default_optimizer,
            warmup_steps=warmup_steps,
            num_training_steps=total_steps,
        )

        # Options for the parent's training loop, gathered in one mapping and
        # passed as keywords; its return value is unused.
        training_options = {
            "train_dataloader": train_dataloader,
            "get_inputs": Processor.get_inputs,
            "num_gpus": num_gpus,
            "gpu_ids": gpu_ids,
            "max_steps": total_steps,
            "gradient_accumulation_steps": gradient_accumulation_steps,
            "optimizer": default_optimizer,
            "scheduler": lr_scheduler,
            "local_rank": local_rank,
            "verbose": verbose,
            "seed": seed,
        }
        super().fine_tune(**training_options)