    def fit(
        self,
        train_dataloader,
        num_epochs=1,
        max_steps=-1,
        gradient_accumulation_steps=1,
        num_gpus=None,
        gpu_ids=None,
        local_rank=-1,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        fp16=False,
        fp16_opt_level="O1",
        checkpoint_state_dict=None,
        verbose=True,
        seed=None,
    ):
        """
        Fine-tunes a pre-trained sequence classification model.

        Args:
            train_dataloader (DataLoader): A PyTorch DataLoader to be used for training.
            num_epochs (int, optional): Number of training epochs. Defaults to 1.
            max_steps (int, optional): Total number of training steps.
                If set to a positive value, it overrides num_epochs.
                Otherwise, it's determined by the dataset length,
                gradient_accumulation_steps, and num_epochs.
                Defaults to -1.
            gradient_accumulation_steps (int, optional): Number of steps to accumulate
                before performing a backward/update pass.
                Defaults to 1.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used.
                If set to 0 or GPUs are not available, CPU device will be used.
                Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            local_rank (int, optional): Local_rank for distributed training on GPUs.
                Defaults to -1, which means non-distributed training.
            weight_decay (float, optional): Weight decay to apply after each
                parameter update.
                Defaults to 0.0.
            learning_rate (float, optional):  Learning rate of the AdamW optimizer.
                Defaults to 5e-5.
            adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
                Defaults to 1e-8.
            warmup_steps (int, optional): Number of steps taken to increase the learning
                rate from 0 to `learning_rate`. Defaults to 0.
            fp16 (bool): Whether to use 16-bit mixed precision through Apex.
                Defaults to False.
            fp16_opt_level (str): Apex AMP optimization level for fp16.
                One of 'O0', 'O1', 'O2', and 'O3'.
                See https://nvidia.github.io/apex/amp.html for details.
                Defaults to "O1".
            checkpoint_state_dict (dict): Checkpoint states of the model and optimizer.
                If specified, the model's and optimizer's parameters are loaded from
                checkpoint_state_dict["model"] and checkpoint_state_dict["optimizer"].
                Defaults to None.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
        """

        # init device and optimizer
        device, num_gpus, amp = self.prepare_model_and_optimizer(
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            adam_epsilon=adam_epsilon,
            fp16=fp16,
            fp16_opt_level=fp16_opt_level,
            checkpoint_state_dict=checkpoint_state_dict,
        )

        # compute the max number of training steps
        max_steps = compute_training_steps(
            dataloader=train_dataloader,
            num_epochs=num_epochs,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        # init scheduler
        scheduler = Transformer.get_default_scheduler(
            optimizer=self.optimizer,
            warmup_steps=warmup_steps,
            num_training_steps=max_steps)

        # fine tune
        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=Processor.get_inputs,
            device=device,
            num_gpus=num_gpus,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optimizer=self.optimizer,
            scheduler=scheduler,
            fp16=fp16,
            amp=amp,
            local_rank=local_rank,
            verbose=verbose,
            seed=seed,
        )
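
# --- Usage sketch ------------------------------------------------------------
# A minimal, hedged example of driving the fit() above. `SequenceClassifier`
# and the tensor layout are assumptions standing in for this repo's actual
# wrapper class; only the fit() keyword arguments come from the signature.
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

input_ids = torch.randint(0, 30522, (64, 128))          # dummy token ids
attention_mask = torch.ones(64, 128, dtype=torch.long)  # no padding in the dummy batch
labels = torch.randint(0, 2, (64,))                     # binary labels
dataset = TensorDataset(input_ids, attention_mask, labels)
loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=16)

classifier = SequenceClassifier(num_labels=2)           # assumed wrapper class
classifier.fit(loader, num_epochs=1, learning_rate=5e-5, warmup_steps=10, seed=42)
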
    def fit(
        self,
        train_dataset,
        num_gpus=None,
        gpu_ids=None,
        batch_size=4,
        local_rank=-1,
        max_steps=5e4,
        warmup_steps_bert=20000,
        warmup_steps_dec=10000,
        learning_rate_bert=0.002,
        learning_rate_dec=0.2,
        optimization_method="adam",
        max_grad_norm=0,
        beta1=0.9,
        beta2=0.999,
        decay_method="noam",
        gradient_accumulation_steps=1,
        report_every=10,
        save_every=1000,
        verbose=True,
        seed=None,
        fp16=False,
        fp16_opt_level="O2",
        world_size=1,
        rank=0,
        validation_function=None,
        checkpoint=None,
        **kwargs,
    ):
        """
        Fine-tune pre-trained transformer models for abstractive summarization.

        Args:
            train_dataset (SummarizationDataset): Training dataset.
            num_gpus (int, optional): The number of GPUs to use. If None, all
                available GPUs will be used. If set to 0 or GPUs are not available,
                CPU device will be used. Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            batch_size (int, optional): Number of training examples in each batch.
                Defaults to 4.
            local_rank (int, optional): Local_rank for distributed training on GPUs.
                Local rank means the ranking of the current GPU device on the current
                node. Defaults to -1, which means non-distributed training.
            max_steps (int, optional): Maximum number of training steps. Defaults to 5e4.
            warmup_steps_bert (int, optional): Number of steps taken to increase the
                learning rate from 0 to `learning_rate_bert` for tuning the BERT encoder.
                Defaults to 2e4.
            warmup_steps_dec (int, optional): Number of steps taken to increase the
                learning rate from 0 to `learning_rate_dec` for tuning the decoder.
                Defaults to 1e4.
            learning_rate_bert (float, optional):  Learning rate of the optimizer
                for the encoder. Defaults to 0.002.
            learning_rate_dec (float, optional):  Learning rate of the optimizer
                for the decoder. Defaults to 0.2.
            optimization_method (string, optional): Optimization method used in fine
                tuning. Defaults to "adam".
            max_grad_norm (float, optional): Maximum gradient norm for gradient clipping.
                Defaults to 0.
            beta1 (float, optional): The exponential decay rate for the first moment
                estimates. Defaults to 0.9.
            beta2 (float, optional): The exponential decay rate for the second-moment
                estimates. This value should be set close to 1.0 on problems with
                a sparse gradient. Defaults to 0.999.
            decay_method (string, optional): Learning rate decay method.
                Defaults to 'noam'.
            gradient_accumulation_steps (int, optional): Number of batches to accumulate
                gradients on between each model parameter update. Defaults to 1.
            report_every (int, optional): The interval by steps to print out the
                training log. Defaults to 10.
            save_every (int, optional): The interval by steps to save the fine-tuned
                model. Defaults to 1000.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
            fp16 (bool, optional): Whether to use mixed precision training.
                Defaults to False.
            fp16_opt_level (str, optional): optimization level, refer to
                 https://nvidia.github.io/apex/amp.html#opt-levels for details.
                 Value choices are: "O0", "O1", "O2", "O3". Defaults to "O2".
            world_size (int, optional): Total number of GPUs that will be used.
                Defaults to 1.
            rank (int, optional): Global rank of the current GPU in distributed
                training. It's calculated with the rank of the current node in the
                cluster/world and the `local_rank` of the device in the current node.
                See an example in :file:`examples/text_summarization/
                abstractive_summarization_bertsum_cnndm_distributed_train.py`.
                Defaults to 0.
            validation_function (function, optional): Function used during fitting to
                validate performance. Defaults to None.
            checkpoint (str, optional): File path of a checkpoint from which training
                continues. Defaults to None.
        """

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)
        # move model to devices
        print("device is {}".format(device))
        if checkpoint:
            checkpoint = torch.load(checkpoint, map_location="cpu")
            self.model.load_checkpoint(checkpoint["model"])
        self.model = move_model_to_device(model=self.model, device=device)

        # init optimizer
        self.optim_bert = model_builder.build_optim_bert(
            self.model,
            optim=optimization_method,
            lr_bert=learning_rate_bert,
            warmup_steps_bert=warmup_steps_bert,
            max_grad_norm=max_grad_norm,
            beta1=beta1,
            beta2=beta2,
        )
        self.optim_dec = model_builder.build_optim_dec(
            self.model,
            optim=optimization_method,
            lr_dec=learning_rate_dec,
            warmup_steps_dec=warmup_steps_dec,
            max_grad_norm=max_grad_norm,
            beta1=beta1,
            beta2=beta2,
        )

        optimizers = [self.optim_bert, self.optim_dec]

        self.amp = get_amp(fp16)
        if self.amp:
            # amp.initialize returns the wrapped optimizers; rebind so the
            # wrapped versions are used for the rest of training
            self.model, optimizers = self.amp.initialize(self.model,
                                                         optimizers,
                                                         opt_level=fp16_opt_level)

        global_step = 0
        if checkpoint:
            if checkpoint["optimizers"]:
                for i in range(len(optimizers)):
                    model_builder.load_optimizer_checkpoint(
                        optimizers[i], checkpoint["optimizers"][i])
            if self.amp and "amp" in checkpoint and checkpoint["amp"]:
                self.amp.load_state_dict(checkpoint["amp"])
            if "global_step" in checkpoint and checkpoint["global_step"]:
                global_step = checkpoint["global_step"] / world_size
                print("global_step is {}".format(global_step))

        self.model = parallelize_model(model=self.model,
                                       device=device,
                                       num_gpus=num_gpus,
                                       gpu_ids=gpu_ids,
                                       local_rank=local_rank,
                                       apex=self.amp)

        if local_rank == -1:
            sampler = RandomSampler(train_dataset)
        else:
            sampler = DistributedSampler(train_dataset,
                                         num_replicas=world_size,
                                         rank=rank)

        def collate_fn(data):
            return self.processor.collate(data,
                                          block_size=self.max_pos_length,
                                          device=device)

        train_dataloader = DataLoader(train_dataset,
                                      sampler=sampler,
                                      batch_size=batch_size,
                                      collate_fn=collate_fn)

        # compute the max number of training steps
        max_steps = compute_training_steps(
            train_dataloader,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=BertSumAbsProcessor.get_inputs,
            device=device,
            num_gpus=num_gpus,
            max_steps=max_steps,
            global_step=global_step,
            max_grad_norm=max_grad_norm,
            gradient_accumulation_steps=gradient_accumulation_steps,
            verbose=verbose,
            seed=seed,
            report_every=report_every,
            save_every=save_every,
            clip_grad_norm=False,
            optimizer=optimizers,
            scheduler=None,
            fp16=fp16,
            amp=self.amp,
            validation_function=validation_function,
        )

        # release GPU memories
        self.model.cpu()
        torch.cuda.empty_cache()

        self.save_model(max_steps)
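
# --- Usage sketch ------------------------------------------------------------
# A hedged example of resuming the abstractive fit() above from a saved
# checkpoint. `AbstractiveSummarizer` and the dataset construction are
# assumptions standing in for this repo's actual classes; the checkpoint keys
# ("model", "optimizers", "amp", "global_step") mirror what the method reads.
summarizer = AbstractiveSummarizer()            # assumed wrapper class
train_dataset = load_summarization_dataset()    # assumed helper; yields a SummarizationDataset
summarizer.fit(
    train_dataset,
    batch_size=4,
    max_steps=5e4,
    report_every=10,
    save_every=1000,                            # periodic checkpointing
    checkpoint="bertsumabs_step_20000.pt",      # restores model/optimizers/global_step
)
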
    def fit(
        self,
        train_dataset,
        num_gpus=None,
        gpu_ids=None,
        batch_size=3000,
        local_rank=-1,
        max_steps=5e5,
        warmup_steps=1e5,
        learning_rate=2e-3,
        optimization_method="adam",
        max_grad_norm=0,
        beta1=0.9,
        beta2=0.999,
        decay_method="noam",
        gradient_accumulation_steps=1,
        report_every=50,
        verbose=True,
        seed=None,
        save_every=-1,
        world_size=1,
        rank=0,
        use_preprocessed_data=False,
        **kwargs,
    ):
        """
        Fine-tune pre-trained transformer models for extractive summarization.

        Args:
            train_dataset (ExtSumProcessedIterableDataset): Training dataset.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used. If set to 0 or GPUs are not
                available, CPU device will be used. Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            batch_size (int, optional): Maximum number of tokens in each batch.
                Defaults to 3000.
            local_rank (int, optional): Local_rank for distributed training on GPUs.
                Defaults to -1, which means non-distributed training.
            max_steps (int, optional): Maximum number of training steps.
                Defaults to 5e5.
            warmup_steps (int, optional): Number of steps taken to increase learning
                rate from 0 to `learning_rate`. Defaults to 1e5.
            learning_rate (float, optional): Learning rate of the optimizer.
                Defaults to 2e-3.
            optimization_method (string, optional): Optimization method used in
                fine-tuning. Defaults to "adam".
            max_grad_norm (float, optional): Maximum gradient norm for gradient
                clipping.
                Defaults to 0.
            gradient_accumulation_steps (int, optional): Number of batches to accumulate
                gradients on between each model parameter update. Defaults to 1.
            decay_method (string, optional): Learning rate decay method.
                Defaults to 'noam'.
            report_every (int, optional): The interval by steps to print out the
                training log.
                Defaults to 50.
            beta1 (float, optional): The exponential decay rate for the first moment
                estimates.
                Defaults to 0.9.
            beta2 (float, optional): The exponential decay rate for the second-moment
                estimates.
                This value should be set close to 1.0 on problems with a sparse
                gradient.
                Defaults to 0.999.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
            rank (int, optional): Global rank of the current GPU in distributed
                training. It's calculated with the rank of the current node in
                the cluster/world and the `local_rank` of the device in the current
                node. See an example in :file:`examples/text_summarization/
                extractive_summarization_cnndm_distributed_train.py`.
                Defaults to 0.
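            save_every (int, optional): The interval by steps to save the fine-tuned
                model. Defaults to -1.
            world_size (int, optional): Total number of GPUs that will be used.
                Defaults to 1.
            use_preprocessed_data (bool, optional): Whether `train_dataset` is already
                preprocessed and should be streamed through `get_dataloader`.
                Defaults to False.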
        """

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)
        # move model
        self.model = move_model_to_device(model=self.model, device=device)

        # init optimizer
        optimizer = model_builder.build_optim(
            self.model,
            optimization_method,
            learning_rate,
            max_grad_norm,
            beta1,
            beta2,
            decay_method,
            warmup_steps,
        )
        self.model = parallelize_model(
            model=self.model,
            device=device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )

        # batch_size is the number of tokens in a batch
        if use_preprocessed_data:
            train_dataloader = get_dataloader(
                train_dataset.get_stream(),
                is_labeled=True,
                batch_size=batch_size,
                world_size=world_size,
                rank=rank,
                local_rank=local_rank,
            )
        else:
            if local_rank == -1:
                sampler = RandomSampler(train_dataset)
            else:
                sampler = DistributedSampler(train_dataset,
                                             num_replicas=world_size,
                                             rank=rank)

            def collate_fn(data):
                return self.processor.collate(data,
                                              block_size=self.max_pos_length,
                                              device=device)

            train_dataloader = DataLoader(
                train_dataset,
                sampler=sampler,
                batch_size=batch_size,
                collate_fn=collate_fn,
            )

        # compute the max number of training steps
        max_steps = compute_training_steps(
            train_dataloader,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=ExtSumProcessor.get_inputs,
            device=device,
            num_gpus=num_gpus,
            max_steps=max_steps,
            max_grad_norm=max_grad_norm,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optimizer=optimizer,
            scheduler=None,
            verbose=verbose,
            seed=seed,
            report_every=report_every,
            clip_grad_norm=False,
            save_every=save_every,
        )
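
# --- Distributed launch sketch -------------------------------------------------
# The rank arithmetic the docstrings above describe, spelled out for a manual
# torch.multiprocessing launch on one node. Everything except fit()'s parameter
# names is illustrative, and MASTER_ADDR/MASTER_PORT are assumed to be set in
# the environment before init_process_group is called.
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(local_rank, node_rank, gpus_per_node, num_nodes):
    rank = node_rank * gpus_per_node + local_rank   # global rank of this GPU
    world_size = num_nodes * gpus_per_node          # total number of GPUs
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    # summarizer.fit(train_dataset, local_rank=local_rank, rank=rank,
    #                world_size=world_size, ...)
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(worker, nprocs=4, args=(0, 4, 1))      # node 0 of 1, with 4 GPUs
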
    def fit(
        self,
        train_dataset,
        num_gpus=None,
        gpu_ids=None,
        batch_size=3000,
        local_rank=-1,
        max_steps=5e5,
        warmup_steps=1e5,
        learning_rate=2e-3,
        optimization_method="adam",
        max_grad_norm=0,
        beta1=0.9,
        beta2=0.999,
        decay_method="noam",
        gradient_accumulation_steps=2,
        report_every=50,
        verbose=True,
        seed=None,
        **kwargs,
    ):
        """
        Fine-tune pre-trained transformer models for extractive summarization.

        Args:
            train_dataset (ExtSumProcessedIterableDataset): Training dataset.
            num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
                be used. If set to 0 or GPUs are not available, CPU device will
                be used. Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            batch_size (int, optional): Maximum number of tokens in each batch.
                Defaults to 3000.
            local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
                -1, which means non-distributed training.
            max_steps (int, optional): Maximum number of training steps. Defaults to 5e5.
            warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
                to `learning_rate`. Defaults to 1e5.
            learning_rate (float, optional): Learning rate of the optimizer. Defaults to
                2e-3.
            optimization_method (string, optional): Optimization method used in fine-tuning.
                Defaults to "adam".
            max_grad_norm (float, optional): Maximum gradient norm for gradient clipping.
                Defaults to 0.
            gradient_accumulation_steps (int, optional): Number of batches to accumulate
                gradients on between each model parameter update. Defaults to 2.
            decay_method (string, optional): Learning rate decay method. Defaults to 'noam'.
            report_every (int, optional): The interval by steps to print out the training log.
                Defaults to 50.
            beta1 (float, optional): The exponential decay rate for the first moment estimates.
                Defaults to 0.9.
            beta2 (float, optional): The exponential decay rate for the second-moment estimates.
                This value should be set close to 1.0 on problems with a sparse gradient.
                Defaults to 0.999.
            verbose (bool, optional): Whether to print out the training log. Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
        """

        # init optimizer
        optimizer = model_builder.build_optim(
            optimization_method,
            learning_rate,
            max_grad_norm,
            beta1,
            beta2,
            decay_method,
            warmup_steps,
            self.model,
            None,
        )

        # batch_size is the number of tokens in a batch
        train_dataloader = get_dataloader(train_dataset.get_stream(), is_labeled=True, batch_size=batch_size)

        # compute the max number of training steps
        max_steps = compute_training_steps(
            train_dataloader, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps,
        )

        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=ExtSumProcessor.get_inputs,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            max_steps=max_steps,
            max_grad_norm=max_grad_norm,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optimizer=optimizer,
            scheduler=None,
            verbose=verbose,
            seed=seed,
            report_every=report_every,
            clip_grad_norm=False,
        )
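
# --- Usage sketch ------------------------------------------------------------
# A hedged example of the extractive fit() above. `ExtractiveSummarizer` and
# the dataset loader are assumptions; note that batch_size counts tokens, not
# examples, per the "number of tokens in a batch" comment in the method body.
summarizer = ExtractiveSummarizer()             # assumed wrapper class
train_dataset = load_processed_dataset()        # assumed helper; yields an ExtSumProcessedIterableDataset
summarizer.fit(
    train_dataset,
    batch_size=3000,                            # max tokens per batch
    max_steps=1e4,
    gradient_accumulation_steps=2,
    report_every=50,
)
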
    def fit(
        self,
        train_dataloader,
        num_epochs=1,
        max_steps=-1,
        gradient_accumulation_steps=1,
        num_gpus=None,
        gpu_ids=None,
        local_rank=-1,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        verbose=True,
        seed=None,
    ):
        """
        Fine-tunes a pre-trained sequence classification model.

        Args:
            train_dataloader (DataLoader): A PyTorch DataLoader to be used for training.
            num_epochs (int, optional): Number of training epochs. Defaults to 1.
            max_steps (int, optional): Total number of training steps.
                If set to a positive value, it overrides num_epochs.
                Otherwise, it's determined by the dataset length,
                gradient_accumulation_steps, and num_epochs.
                Defaults to -1.
            gradient_accumulation_steps (int, optional): Number of steps to accumulate
                before performing a backward/update pass.
                Defaults to 1.
            num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
                be used. If set to 0 or GPUs are not available, CPU device will be used.
                Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
                -1, which means non-distributed training.
            weight_decay (float, optional): Weight decay to apply after each parameter update.
                Defaults to 0.0.
            learning_rate (float, optional):  Learning rate of the AdamW optimizer. Defaults to
                5e-5.
            adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
            warmup_steps (int, optional): Number of steps taken to increase the learning
                rate from 0 to `learning_rate`. Defaults to 0.
            verbose (bool, optional): Whether to print out the training log. Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
        """

        # init optimizer
        optimizer = Transformer.get_default_optimizer(self.model, weight_decay,
                                                      learning_rate,
                                                      adam_epsilon)

        # compute the max number of training steps
        max_steps = compute_training_steps(
            train_dataloader,
            num_epochs=num_epochs,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        # init scheduler
        scheduler = Transformer.get_default_scheduler(
            optimizer=optimizer,
            warmup_steps=warmup_steps,
            num_training_steps=max_steps,
        )

        # fine tune
        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=Processor.get_inputs,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optimizer=optimizer,
            scheduler=scheduler,
            local_rank=local_rank,
            verbose=verbose,
            seed=seed,
        )
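
# --- Helper sketch -------------------------------------------------------------
# What compute_training_steps plausibly does, reconstructed from the docstrings
# above ("determined by the dataset length, gradient_accumulation_steps, and
# num_epochs"); the real helper may differ in rounding details.
def compute_training_steps_sketch(dataloader, num_epochs=1, max_steps=-1,
                                  gradient_accumulation_steps=1):
    if max_steps > 0:                       # an explicit cap overrides epochs
        return int(max_steps)
    updates_per_epoch = len(dataloader) // gradient_accumulation_steps
    return updates_per_epoch * num_epochs   # one update per accumulated batch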