def test_get_device_gpu():
    """Requesting one GPU, by count or by id list, must yield a CUDA device."""
    by_count, count_from_num = get_device(num_gpus=1)
    assert isinstance(by_count, torch.device)
    assert (by_count.type, count_from_num) == ("cuda", 1)

    by_ids, count_from_ids = get_device(gpu_ids=[0])
    assert (by_ids.type, count_from_ids) == ("cuda", 1)
def test_get_device_cpu():
    """Requesting zero GPUs, by count or by an empty id list, must yield the CPU."""
    by_count, count_from_num = get_device(num_gpus=0)
    assert isinstance(by_count, torch.device)
    assert (by_count.type, count_from_num) == ("cpu", 0)

    by_ids, count_from_ids = get_device(gpu_ids=[])
    assert (by_ids.type, count_from_ids) == ("cpu", 0)
Ejemplo n.º 3
0
    def predict(self, eval_dataloader, num_gpus=1, verbose=True):
        """
        Predict a label index for every example served by a dataloader.

        Args:
            eval_dataloader (Dataloader): Dataloader for the evaluation data.
            num_gpus (int, optional): The number of GPUs to use. If None, all available
                GPUs will be used. If set to 0 or GPUs are not available, CPU device
                will be used. Defaults to 1.
            verbose (bool, optional): Forwarded to the parent `predict`.
                Defaults to True.

        Returns:
            1darray: numpy array of predicted label indices (argmax over axis 1 of
            the concatenated parent predictions).
        """
        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)

        # When the model is wrapped in DataParallel, move the wrapped module itself.
        is_wrapped = isinstance(self.model, nn.DataParallel)
        target = self.model.module if is_wrapped else self.model
        target.to(device)

        batch_outputs = super().predict(
            eval_dataloader=eval_dataloader,
            get_inputs=Processor.get_inputs,
            device=device,
            verbose=verbose,
        )
        scores = np.concatenate(list(batch_outputs))
        # todo generator & probs
        return scores.argmax(axis=1)
Ejemplo n.º 4
0
    def predict(self,
                eval_dataloader,
                get_inputs,
                num_gpus,
                gpu_ids,
                verbose=True):
        """
        Lazily score a dataloader, yielding the logits of one batch at a time.

        This is a generator: the model is only moved and parallelized once the
        first item is requested.

        Args:
            eval_dataloader: Iterable of batches to score.
            get_inputs (callable): Called as
                `get_inputs(batch, device, self.model_name, train_mode=False)`;
                its result is unpacked as keyword arguments for the model.
            num_gpus: Forwarded to `get_device`.
            gpu_ids: Forwarded to `get_device` and `parallelize_model`.
            verbose (bool, optional): When False the progress bar is disabled.
                Defaults to True.

        Yields:
            numpy.ndarray: The first element of the model output for each batch,
            detached and copied to host memory.
        """
        # Resolve the device once; local_rank=-1 is used throughout this method.
        device, num_gpus = get_device(num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=-1)

        # Placement happens in two steps: move first, then parallelize.
        self.model = move_model_to_device(model=self.model, device=device)
        placement = dict(device=device, num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=-1)
        self.model = parallelize_model(model=self.model, **placement)

        self.model.eval()
        progress = tqdm(eval_dataloader, desc="Scoring", disable=not verbose)
        for batch in progress:
            with torch.no_grad():
                model_inputs = get_inputs(batch, device, self.model_name, train_mode=False)
                logits = self.model(**model_inputs)[0]
            yield logits.detach().cpu().numpy()
Ejemplo n.º 5
0
    def prepare_model_and_optimizer(
        self,
        num_gpus,
        gpu_ids,
        local_rank,
        weight_decay,
        learning_rate,
        adam_epsilon,
        fp16=False,
        fp16_opt_level="O1",
        checkpoint_state_dict=None,
    ):
        """
        This function initializes an optimizer and moves the model to a device.
        It can be used by most child classes before calling fine_tune.
        Child classes that require custom optimizers need to either override this
            function or implement the steps listed below in the specified order
            before fine-tuning.

        The steps are performed in the following order:
            1. Move model to device
            2. Create optimizer
            3. Initialize amp
            4. Parallelize model

        Args:
            num_gpus: Forwarded to `get_device`; the value it returns is what is
                later passed to `parallelize_model` and returned to the caller.
            gpu_ids: Forwarded to `get_device` and `parallelize_model`.
            local_rank: Forwarded to `get_device` and `parallelize_model`.
            weight_decay: Passed to `Transformer.get_default_optimizer`.
            learning_rate: Passed to `Transformer.get_default_optimizer`.
            adam_epsilon: Passed to `Transformer.get_default_optimizer`.
            fp16 (bool, optional): Passed to `get_amp`; amp is only initialized
                when this is truthy and `get_amp` returned a truthy object.
                Defaults to False.
            fp16_opt_level (str, optional): Passed to `amp.initialize` as
                `opt_level`. Defaults to "O1".
            checkpoint_state_dict (dict, optional): When truthy, its "optimizer"
                and "model" entries are loaded into the optimizer and the model,
                and its "amp" entry into amp when fp16 is active.
                Defaults to None.

        Returns:
            tuple: `(device, num_gpus, amp)` - the device and GPU count returned
            by `get_device`, and the object returned by `get_amp(fp16)`.
        """

        # presumably falsy when mixed precision is unavailable or not requested —
        # TODO confirm against get_amp
        amp = get_amp(fp16)

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)

        # move model
        self.model = move_model_to_device(model=self.model, device=device)

        # init optimizer
        self.optimizer = Transformer.get_default_optimizer(
            self.model, weight_decay, learning_rate, adam_epsilon)

        # amp wraps both the model and the optimizer, so it runs after both exist
        if fp16 and amp:
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, opt_level=fp16_opt_level)

        # restore saved state before the model is wrapped for parallel execution
        if checkpoint_state_dict:
            self.optimizer.load_state_dict(checkpoint_state_dict["optimizer"])
            self.model.load_state_dict(checkpoint_state_dict["model"])

            if fp16 and amp:
                amp.load_state_dict(checkpoint_state_dict["amp"])

        # parallelize last (step 4 of the order documented above)
        self.model = parallelize_model(
            model=self.model,
            device=device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )

        return device, num_gpus, amp
Ejemplo n.º 6
0
    def predict_scores(self, test_dataloader, num_gpus=1, gpu_ids=None, verbose=True):
        """
        Scores a dataset using a fine-tuned model and a given dataloader.

        Args:
            test_dataloader (Dataloader): Dataloader for scoring the data.
            num_gpus (int, optional): The number of GPUs to use. If None, all available
                GPUs will be used. If set to 0 or GPUs are not available, CPU device
                will be used. Defaults to 1.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            verbose (bool, optional): Forwarded to the parent `predict`.
                Defaults to True.

        Returns:
            list: The per-batch outputs produced by the parent `predict`, in
            dataloader order.
        """

        # Resolve the GPU count with the same gpu_ids that are forwarded below, so an
        # explicit id list is not ignored here. The device itself is not needed in
        # this method.
        _, num_gpus = get_device(num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=-1)

        return list(
            super().predict(
                eval_dataloader=test_dataloader,
                get_inputs=ExtSumProcessor.get_inputs,
                num_gpus=num_gpus,
                gpu_ids=gpu_ids,
                verbose=verbose,
            )
        )
Ejemplo n.º 7
0
    def get_hidden_states(self, text, batch_size=32):
        """Extract the hidden states from the pretrained model

        Also sets `self.embedding_dim` to the size of the last dimension of the
        first returned encoder layer.

        Args:
            text: List of documents to extract features from.
            batch_size: Batch size, defaults to 32.

        Returns:
            pd.DataFrame with columns text_index (int), token (str), layer_index (int),
            values (list[float]). One row is emitted per (example, token, layer) with
            values rounded to 6 decimals.
        """
        # Use the CPU when no GPU is configured or when `self.cuda` is falsy.
        use_cpu = self.num_gpus == 0 or not self.cuda
        device = get_device("cpu" if use_cpu else "gpu")
        self.model = move_to_device(self.model, device, self.num_gpus)

        self.model.eval()

        tokens = self.tokenizer.tokenize(text)

        tokens, input_ids, input_mask, _ = self.tokenizer.preprocess_encoder_tokens(
            tokens, max_len=self.max_len
        )

        input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
        input_mask = torch.tensor(input_mask, dtype=torch.long, device=device)
        # The third dataset column is the row index of each example, used to map
        # batch rows back to their entry in `tokens`.
        example_indices = torch.arange(input_ids.size(0), dtype=torch.long, device=device)

        eval_data = TensorDataset(input_ids, input_mask, example_indices)
        eval_dataloader = DataLoader(
            eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size
        )

        hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []}
        for input_ids_tensor, input_mask_tensor, example_indices_tensor in eval_dataloader:
            with torch.no_grad():
                all_encoder_layers, _ = self.model(
                    input_ids_tensor, token_type_ids=None, attention_mask=input_mask_tensor
                )
                self.embedding_dim = all_encoder_layers[0].size()[-1]

            # Copy each requested layer to host memory once per batch rather than
            # once per token.
            layer_outputs = [
                (layer_index, all_encoder_layers[int(layer_index)].detach().cpu().numpy())
                for layer_index in self.layer_index
            ]

            for b, example_index in enumerate(example_indices_tensor):
                text_index = example_index.item()
                for i, token in enumerate(tokens[text_index]):
                    for layer_index, layer_output in layer_outputs:
                        hidden_states["text_index"].append(text_index)
                        hidden_states["token"].append(token)
                        hidden_states["layer_index"].append(layer_index)
                        hidden_states["values"].append(
                            [round(x.item(), 6) for x in layer_output[b][i]]
                        )

            # empty cache
            del input_ids_tensor, input_mask_tensor, example_indices_tensor
            torch.cuda.empty_cache()

        # empty cache
        del input_ids, input_mask, example_indices
        torch.cuda.empty_cache()

        return pd.DataFrame.from_dict(hidden_states)
Ejemplo n.º 8
0
    def fit(
        self,
        train_dataloader,
        num_epochs=1,
        num_gpus=None,
        local_rank=-1,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        verbose=True,
        seed=None,
    ):
        """
        Fit the TokenClassifier model using the given training dataset.

        The model is moved to the device chosen by `get_device`, and the remaining
        arguments are handed to the parent `fine_tune`.

        Args:
            train_dataloader (DataLoader): DataLoader instance for training.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            num_gpus (int, optional): The number of GPUs to use. If None, all
                available GPUs will be used. If set to 0 or GPUs are not available,
                CPU device will be used. Defaults to None.
            local_rank (int, optional): Local rank for distributed training.
                Defaults to -1, no distributed training.
            weight_decay (float, optional): Weight decay rate.
                Defaults to 0.
            learning_rate (float, optional): The learning rate.
                Defaults to 5e-5.
            adam_epsilon (float, optional): The 'eps' parameter for the optimizer.
                Defaults to 1e-8.
            warmup_steps (int, optional): Number of warmup steps.
                Defaults to 0.
            verbose (bool, optional): Verbose mode.
                Defaults to True.
            seed (int, optional): The seed for the transformers.
                Defaults to None, use the default seed.
        """

        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)

        # When the model is wrapped in DataParallel, move the wrapped module itself.
        is_wrapped = isinstance(self.model, nn.DataParallel)
        target = self.model.module if is_wrapped else self.model
        target.to(device)

        fine_tune_settings = dict(
            train_dataloader=train_dataloader,
            get_inputs=TokenClassificationProcessor.get_inputs,
            device=device,
            n_gpu=num_gpus,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            adam_epsilon=adam_epsilon,
            warmup_steps=warmup_steps,
            verbose=verbose,
            seed=seed,
        )
        super().fine_tune(**fine_tune_settings)
Ejemplo n.º 9
0
    def fit(
        self,
        train_dataloader,
        num_epochs=1,
        num_gpus=None,
        local_rank=-1,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        verbose=True,
        seed=None,
    ):
        """
        Fine-tunes a pre-trained sequence classification model.

        The model is moved to the device chosen by `get_device`, and the remaining
        arguments are handed to the parent `fine_tune`.

        Args:
            train_dataloader (Dataloader): Dataloader for the training data.
            num_epochs (int, optional): Number of training epochs. Defaults to 1.
            num_gpus (int, optional): The number of GPUs to use. If None, all
                available GPUs will be used. If set to 0 or GPUs are not available,
                CPU device will be used. Defaults to None.
            local_rank (int, optional): Local_rank for distributed training on GPUs.
                Defaults to -1, which means non-distributed training.
            weight_decay (float, optional): Weight decay to apply after each
                parameter update. Defaults to 0.0.
            learning_rate (float, optional): Learning rate of the optimizer.
                Defaults to 5e-5.
            adam_epsilon (float, optional): Epsilon of the optimizer.
                Defaults to 1e-8.
            warmup_steps (int, optional): Number of steps taken to increase the
                learning rate from 0 to `learning_rate`. Defaults to 0.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
        """

        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)

        # When the model is wrapped in DataParallel, move the wrapped module itself.
        is_wrapped = isinstance(self.model, nn.DataParallel)
        target = self.model.module if is_wrapped else self.model
        target.to(device)

        fine_tune_settings = dict(
            train_dataloader=train_dataloader,
            get_inputs=Processor.get_inputs,
            device=device,
            n_gpu=num_gpus,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            adam_epsilon=adam_epsilon,
            warmup_steps=warmup_steps,
            verbose=verbose,
            seed=seed,
        )
        super().fine_tune(**fine_tune_settings)
    def predict(self, eval_dataloader, num_gpus=1, verbose=True):
        """
        Predict a label index for every example served by a dataloader.

        Args:
            eval_dataloader (Dataloader): Dataloader for the evaluation data.
            num_gpus (int, optional): Forwarded to `get_device`. Defaults to 1.
            verbose (bool, optional): Forwarded to the parent `predict`.
                Defaults to True.

        Returns:
            1darray: numpy array of predicted label indices (argmax over axis 1 of
            the concatenated parent predictions).
        """
        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)

        # When the model is wrapped in DataParallel, move the wrapped module itself.
        is_wrapped = isinstance(self.model, nn.DataParallel)
        target = self.model.module if is_wrapped else self.model
        target.to(device)

        batch_outputs = super().predict(
            eval_dataloader=eval_dataloader,
            get_inputs=Processor.get_inputs,
            device=device,
            verbose=verbose,
        )
        scores = np.concatenate(list(batch_outputs))
        # todo generator & probs
        return scores.argmax(axis=1)
Ejemplo n.º 11
0
    def predict(
        self,
        eval_dataloader,
        num_gpus=None,
        verbose=True
    ):
        """
        Test on an evaluation dataset and get the token label predictions.

        Args:
            eval_dataloader: Dataloader serving the evaluation batches.
            num_gpus (int, optional): The number of GPUs to use. If None, all
                available GPUs will be used. If set to 0 or GPUs are not available,
                CPU device will be used. Defaults to None.
            verbose (bool, optional): Verbose mode, forwarded to the parent
                `predict`. Defaults to True.

        Returns:
            ndarray: Numpy ndarray of raw predictions. The shape of the ndarray is
            [number_of_examples, sequence_length, number_of_labels]. Each
            value in the ndarray is not normalized. Post-process will be needed
            to get the probability for each class label.
        """

        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)

        # When the model is wrapped in DataParallel, move the wrapped module itself.
        is_wrapped = isinstance(self.model, nn.DataParallel)
        target = self.model.module if is_wrapped else self.model
        target.to(device)

        batch_outputs = super().predict(
            eval_dataloader=eval_dataloader,
            get_inputs=TokenClassificationProcessor.get_inputs,
            device=device,
            verbose=verbose
        )
        return np.concatenate(list(batch_outputs))
Ejemplo n.º 12
0
    def predict(self, eval_dataloader, get_inputs, n_gpu=1, verbose=True):
        """
        Lazily evaluate a dataloader, yielding the logits of one batch at a time.

        This is a generator: the model is only re-wrapped and moved once the first
        item is requested.

        Args:
            eval_dataloader: Iterable of batches; every element of a batch is moved
                to the selected device with `.to(device)`.
            get_inputs (callable): Called as
                `get_inputs(batch, self.model_name, train_mode=False)`; its result
                is unpacked as keyword arguments for the model.
            n_gpu (int, optional): Forwarded to `get_device` as `num_gpus`.
                Defaults to 1.
            verbose (bool, optional): When False the progress bar is disabled.
                Defaults to True.

        Yields:
            numpy.ndarray: The first element of the model output for each batch,
            detached and copied to host memory.
        """
        device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1)

        # Strip any existing DataParallel wrapper before deciding whether to wrap
        # again for the GPU count resolved above.
        if isinstance(self.model, torch.nn.DataParallel):
            self.model = self.model.module
        if num_gpus > 1:
            gpu_indices = list(range(num_gpus))
            self.model = torch.nn.DataParallel(self.model, device_ids=gpu_indices)

        self.model.to(device)
        self.model.eval()

        progress = tqdm(eval_dataloader, desc="Evaluating", disable=not verbose)
        for raw_batch in progress:
            on_device = tuple(tensor.to(device) for tensor in raw_batch)
            with torch.no_grad():
                model_inputs = get_inputs(on_device, self.model_name, train_mode=False)
                logits = self.model(**model_inputs)[0]
            yield logits.detach().cpu().numpy()
Ejemplo n.º 13
0
    def fit(
        self,
        train_dataloader,
        num_epochs=1,
        num_gpus=None,
        local_rank=-1,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        verbose=True,
        seed=None,
    ):
        """
        Fine-tunes a pre-trained sequence classification model.

        The model is moved to the device chosen by `get_device(num_gpus, local_rank)`
        and every other argument is handed to the parent `fine_tune`
        (`num_epochs` as `num_train_epochs`, the resolved GPU count as `n_gpu`).
        """

        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)

        # When the model is wrapped in DataParallel, move the wrapped module itself.
        is_wrapped = isinstance(self.model, nn.DataParallel)
        target = self.model.module if is_wrapped else self.model
        target.to(device)

        fine_tune_settings = dict(
            train_dataloader=train_dataloader,
            get_inputs=Processor.get_inputs,
            device=device,
            n_gpu=num_gpus,
            num_train_epochs=num_epochs,
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            adam_epsilon=adam_epsilon,
            warmup_steps=warmup_steps,
            verbose=verbose,
            seed=seed,
        )
        super().fine_tune(**fine_tune_settings)
    def fit(
        self,
        train_dataset,
        num_gpus=None,
        gpu_ids=None,
        batch_size=4,
        local_rank=-1,
        max_steps=5e4,
        warmup_steps_bert=20000,
        warmup_steps_dec=10000,
        learning_rate_bert=0.002,
        learning_rate_dec=0.2,
        optimization_method="adam",
        max_grad_norm=0,
        beta1=0.9,
        beta2=0.999,
        decay_method="noam",
        gradient_accumulation_steps=1,
        report_every=10,
        save_every=1000,
        verbose=True,
        seed=None,
        fp16=False,
        fp16_opt_level="O2",
        world_size=1,
        rank=0,
        validation_function=None,
        checkpoint=None,
        **kwargs,
    ):
        """
        Fine-tune a pre-trained transformer model for summarization.

        After training, the model is moved back to the CPU, the CUDA cache is
        emptied and `self.save_model(max_steps)` is called.

        Args:
            train_dataset (SummarizationDataset): Training dataset.
            num_gpus (int, optional): The number of GPUs to use. If None, all
                available GPUs will be used. If set to 0 or GPUs are not available,
                CPU device will be used. Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            batch_size (int, optional): Batch size passed to the training
                DataLoader. Defaults to 4.
            local_rank (int, optional): Local_rank for distributed training on GPUs.
                Local rank means the ranking of the current GPU device on the current
                node. Defaults to -1, which means non-distributed training.
            max_steps (int, optional): Maximum number of training steps.
                Defaults to 5e4.
            warmup_steps_bert (int, optional): Number of steps taken to increase
                learning rate from 0 to `learning_rate` for tuning the BERT encoder.
                Defaults to 2e4.
            warmup_steps_dec (int, optional): Number of steps taken to increase
                learning rate from 0 to `learning_rate` for tuning the decoder.
                Defaults to 1e4.
            learning_rate_bert (float, optional):  Learning rate of the optimizer
                for the encoder. Defaults to 0.002.
            learning_rate_dec (float, optional):  Learning rate of the optimizer
                for the decoder. Defaults to 0.2.
            optimization_method (string, optional): Optimization method used in fine
                tuning. Defaults to "adam".
            max_grad_norm (float, optional): Maximum gradient norm for gradient
                clipping. Defaults to 0.
            beta1 (float, optional): The exponential decay rate for the first moment
                estimates. Defaults to 0.9.
            beta2 (float, optional): The exponential decay rate for the second-moment
                estimates. This value should be set close to 1.0 on problems with
                a sparse gradient. Defaults to 0.999.
            decay_method (string, optional): learning rate decrease method.
                Not referenced in the body of this method. Default to 'noam'.
            gradient_accumulation_steps (int, optional): Number of batches to
                accumulate gradients on between each model parameter update.
                Defaults to 1.
            report_every (int, optional): The interval by steps to print out the
                training log. Defaults to 10.
            save_every (int, optional): The interval by steps to save the finetuned
                model. Defaults to 1000.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
            fp16 (bool, optional): Whether to use mixed precision training.
                Defaults to False.
            fp16_opt_level (str, optional): optimization level, refer to
                 https://nvidia.github.io/apex/amp.html#opt-levels for details.
                 Value choices are: "O0", "O1", "O2", "O3". Defaults to "O2".
            world_size (int, optional): Total number of GPUs that will be used.
                Defaults to 1.
            rank (int, optional): Global rank of the current GPU in distributed
                training. It's calculated with the rank of the current node in the
                cluster/world and the `local_rank` of the device in the current node.
                See an example in :file: `examples/text_summarization/
                abstractive_summarization_bertsum_cnndm_distributed_train.py`.
                Defaults to 0.
            validation_function (function, optional): function used in fitting to
                validate the performance. Default to None.
            checkpoint (str, optional): file path for a checkpoint based on which the
                training continues. Default to None.
            **kwargs: Accepted for call compatibility; not used by this method.
        """

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)
        # move model to devices
        print("device is {}".format(device))
        if checkpoint:
            # NOTE: torch.load unpickles the file; only pass checkpoints from a
            # trusted source. From here on `checkpoint` is the loaded dict.
            checkpoint = torch.load(checkpoint, map_location="cpu")
            self.model.load_checkpoint(checkpoint["model"])
        self.model = move_model_to_device(model=self.model, device=device)

        # init optimizer
        self.optim_bert = model_builder.build_optim_bert(
            self.model,
            optim=optimization_method,
            lr_bert=learning_rate_bert,
            warmup_steps_bert=warmup_steps_bert,
            max_grad_norm=max_grad_norm,
            beta1=beta1,
            beta2=beta2,
        )
        self.optim_dec = model_builder.build_optim_dec(
            self.model,
            optim=optimization_method,
            lr_dec=learning_rate_dec,
            warmup_steps_dec=warmup_steps_dec,
            max_grad_norm=max_grad_norm,
            beta1=beta1,
            beta2=beta2,
        )

        optimizers = [self.optim_bert, self.optim_dec]

        self.amp = get_amp(fp16)
        if self.amp:
            # Only the returned model is rebound; the `optimizers` list built above
            # keeps being used for checkpoint loading and training.
            self.model, _ = self.amp.initialize(self.model,
                                                optimizers,
                                                opt_level=fp16_opt_level)

        global_step = 0
        if checkpoint:
            if checkpoint["optimizers"]:
                for index, optimizer in enumerate(optimizers):
                    model_builder.load_optimizer_checkpoint(
                        optimizer, checkpoint["optimizers"][index])
            if self.amp and checkpoint.get("amp"):
                self.amp.load_state_dict(checkpoint["amp"])
            if checkpoint.get("global_step"):
                # Keep the step counter an integer: true division would produce
                # a float (possibly fractional) step.
                global_step = int(checkpoint["global_step"] // world_size)
                print("global_step is {}".format(global_step))

        self.model = parallelize_model(model=self.model,
                                       device=device,
                                       num_gpus=num_gpus,
                                       gpu_ids=gpu_ids,
                                       local_rank=local_rank,
                                       apex=self.amp)

        # local_rank == -1 means non-distributed training, so plain random sampling
        # is used; otherwise the dataset is split across `world_size` replicas.
        if local_rank == -1:
            sampler = RandomSampler(train_dataset)
        else:
            sampler = DistributedSampler(train_dataset,
                                         num_replicas=world_size,
                                         rank=rank)

        def collate_fn(data):
            return self.processor.collate(data,
                                          block_size=self.max_pos_length,
                                          device=device)

        train_dataloader = DataLoader(train_dataset,
                                      sampler=sampler,
                                      batch_size=batch_size,
                                      collate_fn=collate_fn)

        # compute the max number of training steps
        max_steps = compute_training_steps(
            train_dataloader,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=BertSumAbsProcessor.get_inputs,
            device=device,
            num_gpus=num_gpus,
            max_steps=max_steps,
            global_step=global_step,
            max_grad_norm=max_grad_norm,
            gradient_accumulation_steps=gradient_accumulation_steps,
            verbose=verbose,
            seed=seed,
            report_every=report_every,
            save_every=save_every,
            clip_grad_norm=False,
            optimizer=optimizers,
            scheduler=None,
            fp16=fp16,
            amp=self.amp,
            validation_function=validation_function,
        )

        # release GPU memories
        self.model.cpu()
        torch.cuda.empty_cache()

        self.save_model(max_steps)
Ejemplo n.º 15
0
    def predict(self,
                token_ids,
                input_mask,
                labels=None,
                batch_size=32,
                num_gpus=None,
                probabilities=False):
        """
        Predict token labels on the testing data.

        Args:
            token_ids (list): List of lists. Each sublist contains
                numerical token ids corresponding to the tokens in the input
                text data.
            input_mask (list): List of lists. Each sublist contains
                the attention mask of the input token list, 1 for input
                tokens and 0 for padded tokens, so that padded tokens are
                not attended to.
            labels (list, optional): List of lists. Each sublist contains
                numerical token labels of an input sentence/paragraph.
                If provided, it's used to compute the evaluation loss,
                which is printed. Default value is None.
            batch_size (int, optional): Testing batch size. Defaults to 32.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used. Defaults to None.
            probabilities (bool, optional): If True, also return the softmax
                probability of each predicted class. Defaults to False.

        Returns:
            list or namedtuple(list, ndarray): List of lists of predicted
                token labels or ([token labels], probabilities) if
                probabilities is True. The probabilities output is an n x m
                array, where n is the size of the testing data and m is the
                number of tokens in each input sublist. The probability
                values are the softmax probability of the predicted class.

        Raises:
            ValueError: If the data loader yields no batches.
        """
        test_dataloader = create_data_loader(
            input_ids=token_ids,
            input_mask=input_mask,
            label_ids=labels,
            batch_size=batch_size,
            sample_method="sequential",
        )
        device, num_gpus = get_device(num_gpus)

        self.model = move_to_device(self.model, device, num_gpus)

        self.model.eval()

        # Whether batches carry a labels tensor depends only on the arguments, so
        # it is decided once here instead of on every iteration.
        true_label_available = bool(labels)
        loss_fct = nn.CrossEntropyLoss()

        eval_loss = 0
        nb_eval_steps = 0
        batch_logits = []
        for batch in tqdm(test_dataloader, desc="Iteration", mininterval=10):
            batch = tuple(t.to(device) for t in batch)
            if true_label_available:
                b_input_ids, b_input_mask, b_labels = batch
            else:
                b_input_ids, b_input_mask = batch

            with torch.no_grad():
                logits = self.model(b_input_ids, attention_mask=b_input_mask)
                if true_label_available:
                    # Only positions whose attention mask is 1 contribute to the loss.
                    active_loss = b_input_mask.view(-1) == 1
                    active_logits = logits.view(-1,
                                                self.num_labels)[active_loss]
                    active_labels = b_labels.view(-1)[active_loss]
                    tmp_eval_loss = loss_fct(active_logits, active_labels)

                    eval_loss += tmp_eval_loss.mean().item()

            # Collect per-batch arrays and join them once after the loop; growing
            # one array with np.append would re-copy all earlier logits each step.
            batch_logits.append(logits.detach().cpu().numpy())
            nb_eval_steps += 1

        if not batch_logits:
            raise ValueError("test_dataloader produced no batches; nothing to predict.")

        logits_all = np.concatenate(batch_logits, axis=0)
        predictions = [list(p) for p in np.argmax(logits_all, axis=2)]

        if true_label_available:
            validation_loss = eval_loss / nb_eval_steps
            print("Evaluation loss: {}".format(validation_loss))

        if probabilities:
            return namedtuple("Predictions", "classes probabilities")(
                predictions,
                np.max(nn.Softmax(dim=2)(torch.Tensor(logits_all)).numpy(), 2))
        else:
            return predictions
Ejemplo n.º 16
0
    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        num_gpus=None,
        num_epochs=1,
        batch_size=32,
        learning_rate=2e-5,
        warmup_proportion=None,
    ):
        """
        Fine-tunes the BERT classifier using the given training data.

        The mean training loss is printed after every epoch.

        Args:
            token_ids (list): List of lists. Each sublist contains
                numerical token ids corresponding to the tokens in the input
                text data.
            input_mask (list): List of lists. Each sublist contains
                the attention mask of the input token id list. 1 for input
                tokens and 0 for padded tokens, so that padded tokens are
                not attended to.
            labels (list): List of lists, each sublist contains numerical
                token labels of an input sentence/paragraph.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used. Defaults to None.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            batch_size (int, optional): Training batch size. Defaults to 32.
            learning_rate (float, optional): learning rate of the BertAdam
                optimizer. Defaults to 2e-5.
            warmup_proportion (float, optional): Proportion of training to
                perform linear learning rate warmup for. E.g., 0.1 = 10% of
                training. Defaults to None.
        """

        train_dataloader = create_data_loader(
            input_ids=token_ids,
            input_mask=input_mask,
            label_ids=labels,
            sample_method="random",
            batch_size=batch_size,
        )

        device, num_gpus = get_device(num_gpus)
        self.model = move_to_device(self.model, device, num_gpus)

        # The loss is averaged below only when more than one GPU is in use.
        available_gpus = torch.cuda.device_count()
        num_gpus_used = available_gpus if num_gpus is None else min(num_gpus, available_gpus)
        multi_gpu = num_gpus_used > 1

        # At least one optimization step is always reported to the optimizer factory.
        steps_per_epoch = int(len(token_ids) / batch_size)
        num_train_optimization_steps = max(steps_per_epoch * num_epochs, 1)
        optimizer = self._get_optimizer(
            learning_rate=learning_rate,
            num_train_optimization_steps=num_train_optimization_steps,
            warmup_proportion=warmup_proportion,
        )

        self.model.train()
        for _ in trange(int(num_epochs), desc="Epoch"):
            epoch_losses = []
            for batch in tqdm(train_dataloader, desc="Iteration", mininterval=30):
                b_token_ids, b_input_mask, b_label_ids = tuple(
                    t.to(device) for t in batch
                )

                loss = self.model(input_ids=b_token_ids,
                                  attention_mask=b_input_mask,
                                  labels=b_label_ids)
                if multi_gpu:
                    # mean() to average on multi-gpu.
                    loss = loss.mean()

                # Accumulate parameter gradients
                loss.backward()
                epoch_losses.append(loss.item())

                # Apply the update, then clear gradients for the next batch
                optimizer.step()
                optimizer.zero_grad()

            train_loss = sum(epoch_losses) / len(epoch_losses)
            print("Train loss: {}".format(train_loss))

            torch.cuda.empty_cache()
Ejemplo n.º 17
0
    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        val_token_ids,
        val_input_mask,
        val_labels,
        token_type_ids=None,
        val_token_type_ids=None,
        verbose=True,
        logging_steps=0,
        save_steps=0,
        val_steps=0,
    ):
        """Fine-tunes the XLNet classifier using the given training data.

        Hyper-parameters (batch size, number of epochs, learning rate, Adam
        epsilon, warmup steps, weight decay, max gradient norm, number of
        GPUs) are read from attributes of `self`. A final checkpoint
        (`checkpoints/final.pth` under the current working directory) is
        always written and logged as an mlflow artifact.

        Args:
            token_ids (list): List of training token id lists.
            input_mask (list): List of input mask lists.
            labels (list): List of training labels.
            val_token_ids (list): List of validation token id lists.
            val_input_mask (list): List of validation input mask lists.
            val_labels (list): List of validation labels.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            val_token_type_ids (list, optional): Segment ids of the validation
                data. Must be provided whenever `token_type_ids` is provided.
            verbose (bool, optional): If True, shows the training progress and
                loss values. Defaults to True.
            logging_steps (int, optional): If > 0, the learning rate and the
                training loss are logged to mlflow every `logging_steps`
                optimization steps. Defaults to 0 (no logging).
            save_steps (int, optional): If > 0, a checkpoint of the model
                state dict is saved and logged to mlflow every `save_steps`
                optimization steps. Defaults to 0 (no intermediate
                checkpoints).
            val_steps (int, optional): If > 0, the model is evaluated on the
                validation data every `val_steps` optimization steps and the
                validation loss is logged to mlflow. Defaults to 0 (no
                validation).
        """

        device, _ = get_device(self.num_gpus)
        self.model = move_to_device(self.model, device, self.num_gpus)

        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
        labels_tensor = torch.tensor(labels, dtype=torch.long)

        val_token_ids_tensor = torch.tensor(val_token_ids, dtype=torch.long)
        val_input_mask_tensor = torch.tensor(val_input_mask, dtype=torch.long)
        val_labels_tensor = torch.tensor(val_labels, dtype=torch.long)

        if token_type_ids:
            token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
            val_token_type_ids_tensor = torch.tensor(val_token_type_ids, dtype=torch.long)

            train_dataset = TensorDataset(
                token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor
            )

            val_dataset = TensorDataset(
                val_token_ids_tensor,
                val_input_mask_tensor,
                val_token_type_ids_tensor,
                val_labels_tensor,
            )

        else:

            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor)

            val_dataset = TensorDataset(
                val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor
            )

        # define optimizer and model parameters; biases and LayerNorm weights
        # are excluded from weight decay
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        val_sampler = RandomSampler(val_dataset)

        val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=self.batch_size)

        num_examples = len(token_ids)
        num_batches = int(np.ceil(num_examples / self.batch_size))
        num_train_optimization_steps = num_batches * self.num_epochs

        optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr, eps=self.adam_eps)
        scheduler = WarmupLinearSchedule(
            optimizer, warmup_steps=self.warmup_steps, t_total=num_train_optimization_steps
        )

        checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")

        # Pre-bind the batch names so the cleanup at the end of the method is
        # valid even if the training or validation loop never executed.
        x_batch = mask_batch = token_type_ids_batch = y_batch = None
        val_x_batch = val_mask_batch = val_token_type_ids_batch = val_y_batch = None

        global_step = 0
        self.model.train()
        optimizer.zero_grad()
        for epoch in range(self.num_epochs):

            train_sampler = RandomSampler(train_dataset)

            train_dataloader = DataLoader(
                train_dataset, sampler=train_sampler, batch_size=self.batch_size
            )

            tr_loss = 0.0
            logging_loss = 0.0
            val_loss = 0.0

            for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if token_type_ids:
                    x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                        t.to(device) for t in batch
                    )
                else:
                    token_type_ids_batch = None
                    x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)

                outputs = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=y_batch,
                )

                loss = outputs[0]  # model outputs are always tuple in pytorch-transformers

                # sum() reduces a per-GPU loss vector to a scalar
                batch_loss = loss.sum()
                batch_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

                tr_loss += batch_loss.item()
                optimizer.step()
                # Update learning rate schedule
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                # logging of learning rate and loss
                if logging_steps > 0 and global_step % logging_steps == 0:
                    mlflow.log_metric("learning rate", scheduler.get_lr()[0], step=global_step)
                    mlflow.log_metric(
                        "training loss",
                        (tr_loss - logging_loss) / (logging_steps * self.batch_size),
                        step=global_step,
                    )
                    logging_loss = tr_loss
                # model checkpointing
                if save_steps > 0 and global_step % save_steps == 0:
                    os.makedirs(checkpoint_dir, exist_ok=True)
                    checkpoint_path = os.path.join(checkpoint_dir, str(global_step) + ".pth")
                    torch.save(self.model.state_dict(), checkpoint_path)
                    mlflow.log_artifact(checkpoint_path)
                # model validation
                if val_steps > 0 and global_step % val_steps == 0:
                    # run model on validation set; no gradients are needed, so
                    # skip building the autograd graph
                    self.model.eval()
                    val_loss = 0.0
                    with torch.no_grad():
                        for j, val_batch in enumerate(val_dataloader):
                            if token_type_ids:
                                (
                                    val_x_batch,
                                    val_mask_batch,
                                    val_token_type_ids_batch,
                                    val_y_batch,
                                ) = tuple(t.to(device) for t in val_batch)
                            else:
                                val_token_type_ids_batch = None
                                val_x_batch, val_mask_batch, val_y_batch = tuple(
                                    t.to(device) for t in val_batch
                                )
                            val_outputs = self.model(
                                input_ids=val_x_batch,
                                token_type_ids=val_token_type_ids_batch,
                                attention_mask=val_mask_batch,
                                labels=val_y_batch,
                            )
                            vloss = val_outputs[0]
                            val_loss += vloss.sum().item()
                    mlflow.log_metric(
                        "validation loss", val_loss / len(val_dataset), step=global_step
                    )
                    self.model.train()

                # report roughly ten times per epoch
                if verbose and i % ((num_batches // 10) + 1) == 0:
                    if val_loss > 0:
                        # val_loss > 0 implies the validation loop ran, so j is bound
                        print(
                            (
                                "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f}; "
                                "average val loss:{:.6f}"
                            ).format(
                                epoch + 1,
                                self.num_epochs,
                                i + 1,
                                min(i + 1 + num_batches // 10, num_batches),
                                num_batches,
                                tr_loss / (i + 1),
                                val_loss / (j + 1),
                            )
                        )
                    else:
                        print(
                            "epoch:{}/{}; batch:{}->{}/{}; average train loss:{:.6f}".format(
                                epoch + 1,
                                self.num_epochs,
                                i + 1,
                                min(i + 1 + num_batches // 10, num_batches),
                                num_batches,
                                tr_loss / (i + 1),
                            )
                        )

        # always persist the final weights
        os.makedirs(checkpoint_dir, exist_ok=True)
        checkpoint_path = os.path.join(checkpoint_dir, "final.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        mlflow.log_artifact(checkpoint_path)

        # drop the references to the last batches so the cached GPU memory
        # they occupy can be released
        del x_batch, y_batch, mask_batch, token_type_ids_batch
        del val_x_batch, val_y_batch, val_mask_batch, val_token_type_ids_batch
        torch.cuda.empty_cache()
Ejemplo n.º 18
0
    def predict(
        self,
        token_ids,
        input_mask,
        token_type_ids=None,
        num_gpus=None,
        batch_size=8,
        probabilities=False,
    ):
        """Scores the given dataset and returns the predicted classes.

        Args:
            token_ids (list): List of training token lists.
            input_mask (list): List of input mask lists.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks. When None, no segment ids are passed
                to the model.
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            batch_size (int, optional): Scoring batch size. Defaults to 8.
            probabilities (bool, optional):
                If True, the predicted probability distribution
                is also returned. Defaults to False.
        Returns:
            1darray, namedtuple(1darray, ndarray): Predicted classes or
                (classes, probabilities) if probabilities is True.
        """

        device, num_gpus = get_device(num_gpus)
        self.model = move_to_device(self.model, device, num_gpus)

        self.model.eval()
        preds = []

        num_examples = len(token_ids)
        with tqdm(total=num_examples) as pbar:
            for start in range(0, num_examples, batch_size):
                end = start + batch_size
                x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device)
                mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device)

                # Segment ids are optional; slicing None would raise TypeError.
                if token_type_ids is not None:
                    token_type_ids_batch = torch.tensor(
                        token_type_ids[start:end], dtype=torch.long, device=device
                    )
                else:
                    token_type_ids_batch = None

                with torch.no_grad():
                    pred_batch = self.model(
                        input_ids=x_batch,
                        token_type_ids=token_type_ids_batch,
                        attention_mask=mask_batch,
                        labels=None,
                    )
                # First element of the model output holds the logits.
                preds.append(pred_batch[0].cpu())
                # The last batch may be short; advance by the rows actually scored.
                pbar.update(min(batch_size, num_examples - start))

        preds = np.concatenate(preds)

        if probabilities:
            return namedtuple("Predictions", "classes probabilities")(
                preds.argmax(axis=1), nn.Softmax(dim=1)(torch.Tensor(preds)).numpy()
            )
        else:
            return preds.argmax(axis=1)
Ejemplo n.º 19
0
def test_get_device_local_rank():
    """Requesting local rank 0 must yield the cuda:0 device and one GPU."""
    selected_device, gpu_count = get_device(local_rank=0)

    assert isinstance(selected_device, torch.device)
    # Device type and index are checked together.
    assert (selected_device.type, selected_device.index) == ("cuda", 0)
    assert gpu_count == 1
Ejemplo n.º 20
0
    def predict(
        self,
        token_ids,
        input_mask,
        token_type_ids=None,
        num_gpus=None,
        batch_size=32,
        probabilities=False,
    ):
        """Run the classifier over a dataset and return the predicted classes.

        Args:
            token_ids (list): List of token id lists to score.
            input_mask (list): List of input mask lists.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            num_gpus (int, optional): The number of gpus to use. If None,
                all available GPUs will be used. Defaults to None.
            batch_size (int, optional): Scoring batch size. Defaults to 32.
            probabilities (bool, optional): If True, the predicted
                probability distribution is also returned. Defaults to False.
        Returns:
            1darray, namedtuple(1darray, ndarray): Predicted classes or
                (classes, probabilities) if probabilities is True.
        """
        device, num_gpus = get_device(num_gpus)
        self.model = move_model_to_device(self.model, device, num_gpus)

        # inference mode
        self.model.eval()

        columns = [
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(input_mask, dtype=torch.long),
        ]
        has_segment_ids = bool(token_type_ids)
        if has_segment_ids:
            columns.append(torch.tensor(token_type_ids, dtype=torch.long))
        dataset = TensorDataset(*columns)

        # Sequential order keeps predictions aligned with the inputs.
        loader = DataLoader(
            dataset,
            sampler=SequentialSampler(dataset),
            batch_size=batch_size,
        )

        batch_outputs = []
        for cpu_batch in tqdm(loader, desc="Iteration"):
            on_device = tuple(t.to(device) for t in cpu_batch)
            if has_segment_ids:
                x_batch, mask_batch, token_type_ids_batch = on_device
            else:
                x_batch, mask_batch = on_device
                token_type_ids_batch = None

            with torch.no_grad():
                p_batch = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
            batch_outputs.append(p_batch.cpu())

        scores = np.concatenate(batch_outputs)
        predicted_classes = scores.argmax(axis=1)

        if not probabilities:
            return predicted_classes

        class_probabilities = nn.Softmax(dim=1)(torch.Tensor(scores)).numpy()
        return namedtuple("Predictions", "classes probabilities")(
            predicted_classes, class_probabilities
        )
    def predict(
        self,
        test_dataset,
        per_gpu_batch_size=4,
        max_tgt_length=64,
        beam_size=1,
        need_score_traces=False,
        length_penalty=0,
        forbid_duplicate_ngrams=True,
        forbid_ignore_word=".",
        s2s_config=S2SConfig(),
        num_gpus=None,
        gpu_ids=None,
        local_rank=-1,
        fp16=False,
        verbose=True,
    ):
        """
        Method for predicting, i.e. generating summaries.

        Args:
            test_dataset (S2SAbsSumDataset): Testing dataset.
            per_gpu_batch_size (int, optional): Number of testing samples in each
                batch per GPU. Defaults to 4.
            max_tgt_length (int, optional): Maximum number of tokens in output
                sequence. Defaults to 64.
            beam_size (int, optional): Beam size of beam search. Defaults to 1.
            need_score_traces (bool, optional): Whether to return score traces of
                beam search. Defaults to False.
            length_penalty (float, optional): Length penalty for beam search.
                Defaults to 0.
            forbid_duplicate_ngrams (bool, optional): Whether to forbid duplicate
                n-grams when generating output. Size of the n-gram is determined by
                `S2SConfig.ngram_size` which defaults to 3. Defaults to True.
            forbid_ignore_word (str, optional): Words to ignore when forbidding
                duplicate ngrams. Multiple words should be separated by "|", for
                example, ".|[X_SEP]". Defaults to ".".
            s2s_config (S2SConfig, optional): Some default decoding settings that
                the users usually don't need to change. Defaults to S2SConfig().
            num_gpus (int, optional): Number of GPUs to use. Ignored if `gpu_ids` is
                provided. Defaults to None and all available GPUs are used.
            gpu_ids (list, optional): List of GPU IDs to use. Defaults to None and GPUs
                used are determined by num_gpus.
            local_rank (int, optional): Rank of the device in distributed training.
                Defaults to -1 which means non-distributed training.
            fp16 (bool, optional): Whether to use 16-bit mixed precision through Apex.
                Defaults to False.
            verbose(bool, optional): Whether to output predicting log. Defaults to True.

        Returns:
            List or tuple of lists: List of generated summaries. If `need_score_traces`
                is True, also returns the score traces of beam search.

        Raises:
            ValueError: If `need_score_traces` is True while `beam_size` <= 1, or
                if `max_tgt_length` is not smaller than `self.max_seq_length - 2`.

        """

        if need_score_traces and beam_size <= 1:
            raise ValueError(
                "Score trace is only available for beam search with beam size > 1."
            )
        if max_tgt_length >= self.max_seq_length - 2:
            raise ValueError("Maximum tgt length exceeds max seq length - 2.")

        # preprocessing pipeline
        is_roberta = self._model_type == "roberta"
        no_segment_embedding = is_roberta
        vocab = self.tokenizer.encoder if is_roberta else self.tokenizer.vocab

        # Neither prefix can match a name starting with "unilm1.2", so no
        # separate exclusion of "unilm1.2" models is needed.
        new_segment_ids = self._model_name.startswith(("unilm-", "unilm1-"))

        cls_token = "<s>" if is_roberta else "[CLS]"
        sep_token = "</s>" if is_roberta else "[SEP]"
        pad_token = "<pad>" if is_roberta else "[PAD]"
        mask_token = "<mask>" if is_roberta else "[MASK]"

        # the source text gets whatever room is left after the target tokens
        # and two reserved positions
        max_src_length = self.max_seq_length - 2 - max_tgt_length
        bi_uni_pipeline = [
            seq2seq_loader.Preprocess4Seq2seqDecoder(
                list(vocab.keys()),
                self.tokenizer.convert_tokens_to_ids,
                self.max_seq_length,
                max_tgt_length=max_tgt_length,
                new_segment_ids=new_segment_ids,
                mode=s2s_config.mode,
                num_qkv=s2s_config.num_qkv,
                s2s_special_token=s2s_config.s2s_special_token,
                s2s_add_segment=s2s_config.s2s_add_segment,
                s2s_share_segment=s2s_config.s2s_share_segment,
                pos_shift=s2s_config.pos_shift,
                cls_token=cls_token,
                sep_token=sep_token,
                pad_token=pad_token,
            )
        ]

        def collate_fn(input_batch):
            # Each item is indexed as (id, tokens): truncate the tokens, run
            # them through the preprocessing pipeline and batch the results.
            buf_id = [x[0] for x in input_batch]
            buf = [x[1][:max_src_length] for x in input_batch]
            max_a_len = max(len(x) for x in buf)
            instances = []
            for instance in [(x, max_a_len) for x in buf]:
                for proc in bi_uni_pipeline:
                    instance = proc(instance)
                instances.append(instance)
            batch = seq2seq_loader.batch_list_to_batch_tensors(instances)

            return (batch, buf_id)

        # prepare decoder
        pair_num_relation = 0
        cls_num_labels = 2
        type_vocab_size = (6 + (1 if s2s_config.s2s_add_segment else 0)
                           if new_segment_ids else 2)
        # the separator token serves as both end-of-sequence and
        # start-of-sequence id
        (
            mask_word_id,
            eos_word_ids,
            sos_word_id,
        ) = self.tokenizer.convert_tokens_to_ids(
            [mask_token, sep_token, sep_token])
        forbid_ignore_set = None
        if forbid_ignore_word:
            # bracketed special tokens such as "[x_sep]" are upper-cased
            # before the id lookup
            w_list = [
                w.upper() if w.startswith("[") and w.endswith("]") else w
                for w in forbid_ignore_word.split("|")
            ]
            forbid_ignore_set = set(
                self.tokenizer.convert_tokens_to_ids(w_list))

        # unwrap a (Distributed)DataParallel-style wrapper if present
        if hasattr(self.model, "module"):
            state_dict = self.model.module.state_dict()
        else:
            state_dict = self.model.state_dict()

        model = BertForSeq2SeqDecoder.from_pretrained(
            self._bert_model_name,
            state_dict=state_dict,
            num_labels=cls_num_labels,
            num_rel=pair_num_relation,
            type_vocab_size=type_vocab_size,
            task_idx=3,
            mask_word_id=mask_word_id,
            search_beam_size=beam_size,
            length_penalty=length_penalty,
            eos_id=eos_word_ids,
            sos_id=sos_word_id,
            forbid_duplicate_ngrams=forbid_duplicate_ngrams,
            forbid_ignore_set=forbid_ignore_set,
            ngram_size=s2s_config.forbid_ngram_size,
            min_len=s2s_config.min_len,
            mode=s2s_config.mode,
            max_position_embeddings=self.max_seq_length,
            ffn_type=s2s_config.ffn_type,
            num_qkv=s2s_config.num_qkv,
            seg_emb=s2s_config.seg_emb,
            pos_shift=s2s_config.pos_shift,
            is_roberta=is_roberta,
            no_segment_embedding=no_segment_embedding,
        )

        del state_dict

        if fp16:
            model.half()
        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)

        # move model
        model = move_model_to_device(model=model, device=device)

        batch_size = per_gpu_batch_size * max(1, num_gpus)

        model = parallelize_model(
            model=model,
            device=device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )

        model.eval()
        first_batch = True
        batch_count = 0

        # results are written by dataset index (buf_id), so the output order
        # matches the dataset order
        output_lines = [""] * len(test_dataset)
        score_trace_list = [None] * len(test_dataset)

        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(
            test_dataset,
            sampler=test_sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )

        # Pre-bind so the cleanup after the loop is valid even when the
        # dataloader yields no batch (empty dataset).
        batch = None
        for batch, buf_id in tqdm(test_dataloader,
                                  desc="Evaluating",
                                  disable=not verbose):
            batch_count += 1
            with torch.no_grad():
                batch = [
                    t.to(device) if t is not None else None for t in batch
                ]
                (
                    input_ids,
                    token_type_ids,
                    position_ids,
                    input_mask,
                    mask_qkv,
                    task_idx,
                ) = batch
                traces = model(
                    input_ids,
                    token_type_ids,
                    position_ids,
                    input_mask,
                    task_idx=task_idx,
                    mask_qkv=mask_qkv,
                )
                if beam_size > 1:
                    # beam search returns a dict of tensors
                    traces = {k: v.tolist() for k, v in traces.items()}
                    output_ids = traces["pred_seq"]
                else:
                    output_ids = traces.tolist()

                for i in range(len(batch[0])):
                    w_ids = output_ids[i]
                    output_buf = self.tokenizer.convert_ids_to_tokens(w_ids)
                    # keep tokens up to the first separator/padding token
                    output_tokens = []
                    for t in output_buf:
                        if t in (sep_token, pad_token):
                            break
                        output_tokens.append(t)
                    if is_roberta:
                        output_sequence = self.tokenizer.convert_tokens_to_string(
                            output_tokens)
                    else:
                        output_sequence = " ".join(detokenize(output_tokens))
                    # keep every summary on a single line
                    if "\n" in output_sequence:
                        output_sequence = " [X_SEP] ".join(
                            output_sequence.split("\n"))
                    output_lines[buf_id[i]] = output_sequence
                    # log every sample of the first batch, then of every 50th
                    if first_batch or batch_count % 50 == 0:
                        logger.info("{} = {}".format(buf_id[i],
                                                     output_sequence))
                    if need_score_traces:
                        score_trace_list[buf_id[i]] = {
                            "scores": traces["scores"][i],
                            "wids": traces["wids"][i],
                            "ptrs": traces["ptrs"][i],
                        }

            first_batch = False

        # drop the decoder and the last batch before releasing cached GPU memory
        del model
        del batch
        torch.cuda.empty_cache()

        if need_score_traces:
            return output_lines, score_trace_list
        else:
            return output_lines
Ejemplo n.º 22
0
    def fit(
        self,
        train_dataset,
        num_gpus=None,
        gpu_ids=None,
        batch_size=3000,
        local_rank=-1,
        max_steps=5e5,
        warmup_steps=1e5,
        learning_rate=2e-3,
        optimization_method="adam",
        max_grad_norm=0,
        beta1=0.9,
        beta2=0.999,
        decay_method="noam",
        gradient_accumulation_steps=1,
        report_every=50,
        verbose=True,
        seed=None,
        save_every=-1,
        world_size=1,
        rank=0,
        use_preprocessed_data=False,
        **kwargs,
    ):
        """
        Fine-tune pre-trained transofmer models for extractive summarization.

        Args:
            train_dataset (ExtSumProcessedIterableDataset): Training dataset.
            num_gpus (int, optional): The number of GPUs to use.
                If None, all available GPUs will be used. If set to 0 or GPUs are not
                available, CPU device will be used. Defaults to None.
            gpu_ids (list): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            batch_size (int, optional): Maximum number of tokens in each batch.
            local_rank (int, optional): Local_rank for distributed training on GPUs.
                Defaults to -1, which means non-distributed training.
            max_steps (int, optional): Maximum number of training steps.
                Defaults to 5e5.
            warmup_steps (int, optional): Number of steps taken to increase learning
                rate from 0 to `learning_rate`. Defaults to 1e5.
            learning_rate (float, optional):  Learning rate of the AdamW optimizer.
                Defaults to 5e-5.
            optimization_method (string, optional): Optimization method used in
                fine tuning.
            max_grad_norm (float, optional): Maximum gradient norm for gradient
                clipping.
                Defaults to 0.
            gradient_accumulation_steps (int, optional): Number of batches to accumulate
                gradients on between each model parameter update. Defaults to 1.
            decay_method (string, optional): learning rate decrease method.
                Defaulta to 'noam'.
            report_every (int, optional): The interval by steps to print out the
                trainint log.
                Defaults to 50.
            beta1 (float, optional): The exponential decay rate for the first moment
                estimates.
                Defaults to 0.9.
            beta2 (float, optional): The exponential decay rate for the second-moment
                estimates.
                This value should be set close to 1.0 on problems with a sparse
                gradient.
                Defaults to 0.99.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
            rank (int, optional): Global rank of the current GPU in distributed
                training. It's calculated with the rank of the current node in
                the cluster/world and the `local_rank` of the device in the current
                node. See an example in :file: `examples/text_summarization/
                extractive_summarization_cnndm_distributed_train.py`.
                Defaults to 0.
        """

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)
        # move model
        self.model = move_model_to_device(model=self.model, device=device)

        # init optimizer
        optimizer = model_builder.build_optim(
            self.model,
            optimization_method,
            learning_rate,
            max_grad_norm,
            beta1,
            beta2,
            decay_method,
            warmup_steps,
        )
        self.model = parallelize_model(
            model=self.model,
            device=device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )

        # batch_size is the number of tokens in a batch
        if use_preprocessed_data:
            train_dataloader = get_dataloader(
                train_dataset.get_stream(),
                is_labeled=True,
                batch_size=batch_size,
                world_size=world_size,
                rank=rank,
                local_rank=local_rank,
            )
        else:
            if local_rank == -1:
                sampler = RandomSampler(train_dataset)
            else:
                sampler = DistributedSampler(train_dataset,
                                             num_replicas=world_size,
                                             rank=rank)

            def collate_fn(data):
                return self.processor.collate(data,
                                              block_size=self.max_pos_length,
                                              device=device)

            train_dataloader = DataLoader(
                train_dataset,
                sampler=sampler,
                batch_size=batch_size,
                collate_fn=collate_fn,
            )

        # compute the max number of training steps
        max_steps = compute_training_steps(
            train_dataloader,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=ExtSumProcessor.get_inputs,
            device=device,
            num_gpus=num_gpus,
            max_steps=max_steps,
            max_grad_norm=max_grad_norm,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optimizer=optimizer,
            scheduler=None,
            verbose=verbose,
            seed=seed,
            report_every=report_every,
            clip_grad_norm=False,
            save_every=save_every,
        )
Ejemplo n.º 23
0
    def predict(
        self,
        test_dataset,
        num_gpus=None,
        gpu_ids=None,
        batch_size=16,
        sentence_separator="<q>",
        top_n=3,
        block_trigram=True,
        cal_lead=False,
        verbose=True,
        local_rank=-1,
    ):
        """
        Produce extractive summaries for every example in `test_dataset`.

        Sentence scores are obtained from `self.predict_scores` and turned into
        summaries, one example at a time, by `get_pred`.

        Args:
            test_dataset (Dataset): Examples to summarize. Must support `len()`
                and integer indexing.
            num_gpus (int, optional): Number of GPUs to use; forwarded to
                `get_device`. Defaults to None.
            gpu_ids (list, optional): GPU IDs to use; forwarded to `get_device`
                and `predict_scores`. Defaults to None.
            batch_size (int, optional): Number of test examples per batch.
                Defaults to 16.
            sentence_separator (str, optional): String inserted between the
                selected sentences. Defaults to '<q>'.
            top_n (int, optional): Number of sentences to select for each
                summary. Defaults to 3.
            block_trigram (bool, optional): Boolean forwarded to `get_pred`;
                controls whether a sentence sharing a trigram with the
                already selected sentences may be included. Defaults to True.
            cal_lead (bool, optional): Whether the prediction simply uses the
                first few sentences as the summary. Defaults to False.
            verbose (bool, optional): Currently not used inside this method.
                Defaults to True.
            local_rank (int, optional): Forwarded to `get_device`.
                Defaults to -1.

        Returns:
            list: The concatenated results of `get_pred` for every example, or
            None when `test_dataset` is empty.

        """
        device, num_gpus = get_device(
            num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank
        )

        def collate_processed_data(dict_list):
            # Collate examples that already carry preprocessed features.
            if dict_list is None or len(dict_list) <= 0:
                return None
            rows = [list(example.values()) for example in dict_list]
            # Labels are never needed at prediction time, hence is_labeled=False.
            return Batch(rows, is_labeled=False)

        def collate(data):
            # Collate raw examples through the processor in inference mode.
            return self.processor.collate(
                data,
                block_size=self.max_pos_length,
                train_mode=False,
                device=device,
            )

        if len(test_dataset) == 0:
            return None

        # Examples holding a "segs" entry are treated as already preprocessed.
        collate_fn = collate_processed_data if "segs" in test_dataset[0] else collate

        test_dataloader = DataLoader(
            test_dataset,
            sampler=SequentialSampler(test_dataset),
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
        batch_scores = self.predict_scores(
            test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids
        )

        # Flatten the per-batch score collections into one per-example list.
        scores_list = [scores for chunk in batch_scores for scores in chunk]

        prediction = []
        for idx in range(len(test_dataset)):
            prediction.extend(
                get_pred(
                    test_dataset[idx],
                    scores_list[idx],
                    cal_lead=cal_lead,
                    sentence_separator=sentence_separator,
                    block_trigram=block_trigram,
                    top_n=top_n,
                )
            )

        # Release GPU memory held by the model and the CUDA caching allocator.
        self.model.cpu()
        torch.cuda.empty_cache()

        return prediction
    def fit(
        self,
        train_loader,
        epoch,
        bert_optimizer=None,
        num_epochs=1,
        num_gpus=None,
        lr=2e-5,
        warmup_proportion=None,
        fp16_allreduce=False,
        num_train_optimization_steps=10,
    ):
        """
        Run one pass over `train_loader` to fine-tune the BERT classifier.

        Args:
            train_loader (torch.DataLoader): Yields dict-like batches with the
                keys "token_ids", "labels", "input_mask" and, optionally,
                "token_type_ids".
            epoch (int): Current epoch number; only used in the progress
                message.
            bert_optimizer (optimizer, optional): Optimizer to use. If None,
                one is built with `self.create_optimizer`. A Horovod
                distributed optimizer is additionally synchronized before
                each step.
            num_epochs (int): Total number of epochs; only used in the
                progress message.
            num_gpus (int): Number of GPUs, forwarded to `get_device`.
            lr (float): Learning rate passed to `create_optimizer`.
                Defaults to 2e-5.
            warmup_proportion (float, optional): Warmup proportion passed to
                `create_optimizer`. Defaults to None.
            fp16_allreduce (bool): Passed to `create_optimizer`.
            num_train_optimization_steps (int): Passed to `create_optimizer`.
        """

        device, num_gpus = get_device(num_gpus)

        self.model = move_to_device(self.model, device, num_gpus)

        if bert_optimizer is None:
            bert_optimizer = self.create_optimizer(
                num_train_optimization_steps=num_train_optimization_steps,
                lr=lr,
                warmup_proportion=warmup_proportion,
                fp16_allreduce=fp16_allreduce,
            )

        if self.use_distributed:
            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

        loss_func = nn.CrossEntropyLoss().to(device)

        # train
        self.model.train()  # training mode

        # Pre-bind the batch names so the cleanup below cannot raise NameError
        # when the loader yields nothing.
        x_batch = y_batch = mask_batch = token_type_ids_batch = None

        # Plain (non-distributed) optimizers have no `synchronize` method.
        synchronize = getattr(bert_optimizer, "synchronize", None)

        num_print = 1000
        for batch_idx, data in enumerate(train_loader):
            # Inputs go to the same device the model and loss were moved to,
            # so this also works when `get_device` selects the CPU.
            x_batch = data["token_ids"].to(device)
            y_batch = data["labels"].to(device)
            mask_batch = data["input_mask"].to(device)

            # Reset every iteration so segment ids from a previous batch are
            # never reused for a batch that does not provide them.
            token_type_ids_batch = None
            if "token_type_ids" in data and data["token_type_ids"] is not None:
                token_type_ids_batch = data["token_type_ids"].to(device)

            bert_optimizer.zero_grad()

            y_h = self.model(
                input_ids=x_batch,
                token_type_ids=token_type_ids_batch,
                attention_mask=mask_batch,
                labels=None,
            )

            loss = loss_func(y_h, y_batch).mean()
            loss.backward()

            if synchronize is not None:
                synchronize()
            bert_optimizer.step()

            if batch_idx % num_print == 0:
                print(
                    "Train Epoch: {}/{} ({:.0f}%) \t Batch:{} \tLoss: {:.6f}".
                    format(
                        epoch,
                        num_epochs,
                        100.0 * batch_idx / len(train_loader),
                        batch_idx + 1,
                        loss.item(),
                    ))

        # Drop the references to the last batch before emptying the CUDA cache.
        del x_batch, y_batch, mask_batch, token_type_ids_batch
        torch.cuda.empty_cache()
Ejemplo n.º 25
0
    def fine_tune(
        self,
        train_dataloader,
        get_inputs,
        num_gpus=None,
        gpu_ids=None,
        max_steps=-1,
        max_grad_norm=1.0,
        gradient_accumulation_steps=1,
        optimizer=None,
        scheduler=None,
        fp16=False,
        fp16_opt_level="O1",
        local_rank=-1,
        verbose=True,
        seed=None,
        report_every=10,
        clip_grad_norm=True,
    ):
        """
        Train `self.model` for exactly `max_steps` optimizer steps.

        The dataloader is iterated repeatedly until `max_steps` optimizer steps
        have been taken.

        Args:
            train_dataloader (Dataloader): Iterable of training batches.
            get_inputs (callable): Called as
                `get_inputs(batch, device, self.model_name)`; its result is
                unpacked as keyword arguments for the model.
            num_gpus (int, optional): Forwarded to `get_device`.
                Defaults to None.
            gpu_ids (list, optional): Forwarded to `move_model_to_device`.
                Defaults to None.
            max_steps (int, optional): Number of optimizer steps to run. With
                a non-positive value (the default, -1) no training happens.
            max_grad_norm (float, optional): Maximum gradient norm used when
                `clip_grad_norm` is True. Defaults to 1.0.
            gradient_accumulation_steps (int, optional): Number of batches
                whose gradients are accumulated per optimizer step.
                Defaults to 1.
            optimizer (optimizer, optional): Optimizer stepped after each
                accumulation window. Although the default is None, an
                optimizer is needed as soon as a step is taken.
            scheduler (optional): If given, stepped right after the optimizer.
            fp16 (bool, optional): Whether to use apex mixed precision.
                Defaults to False.
            fp16_opt_level (str, optional): apex `opt_level`. Defaults to "O1".
            local_rank (int, optional): Forwarded to `get_device` and
                `move_model_to_device`; the progress bar is only shown for
                ranks -1 and 0. Defaults to -1.
            verbose (bool, optional): Whether to show the progress bar and
                print the training log. Defaults to True.
            seed (int, optional): If not None, passed to
                `Transformer.set_seed`. Defaults to None.
            report_every (int, optional): Interval, in optimizer steps,
                between log lines. Defaults to 10.
            clip_grad_norm (bool, optional): Whether to clip gradients before
                each optimizer step. Defaults to True.

        Returns:
            tuple: `(global_step, average_loss)` where `average_loss` is the
            accumulated loss divided by the number of optimizer steps, or 0.0
            when no step was taken.

        Raises:
            ImportError: If `fp16` is True and apex is not installed.
        """

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)

        if seed is not None:
            Transformer.set_seed(seed, num_gpus > 0)

        if fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex")
            self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level)

        # move model
        self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank)

        # init training
        global_step = 0
        tr_loss = 0.0
        accum_loss = 0
        self.model.train()
        self.model.zero_grad()

        # train
        start = time.time()
        while global_step < max_steps:
            steps_at_epoch_start = global_step
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose)
            for step, batch in enumerate(epoch_iterator):
                inputs = get_inputs(batch, device, self.model_name)
                outputs = self.model(**inputs)
                loss = outputs[0]

                if num_gpus > 1:
                    # one loss value per replica; reduce to a scalar
                    loss = loss.mean()
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                loss_value = loss.item()
                tr_loss += loss_value
                accum_loss += loss_value

                if (step + 1) % gradient_accumulation_steps == 0:
                    global_step += 1

                    if clip_grad_norm:
                        if fp16:
                            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                        else:
                            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)

                    if global_step % report_every == 0 and verbose:
                        end = time.time()
                        print(
                            "loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, step:{3:.0f}/{4:.0f}".format(
                                accum_loss / report_every, end - start, len(batch), global_step, max_steps,
                            )
                        )
                        accum_loss = 0
                        start = end

                    optimizer.step()
                    if scheduler:
                        scheduler.step()
                    self.model.zero_grad()

                # `>=` (not `>`) so that exactly `max_steps` steps are taken.
                if global_step >= max_steps:
                    epoch_iterator.close()
                    break

            if global_step == steps_at_epoch_start:
                # A full pass produced no optimizer step (empty dataloader or
                # fewer batches than `gradient_accumulation_steps`); another
                # pass would not either, so stop instead of looping forever.
                break

        # Guard against division by zero when no optimizer step was taken.
        average_loss = tr_loss / global_step if global_step else 0.0
        return global_step, average_loss
    def predict(
        self,
        test_dataset,
        num_gpus=None,
        gpu_ids=None,
        local_rank=-1,
        batch_size=16,
        alpha=0.6,
        beam_size=5,
        min_length=15,
        max_length=150,
        fp16=False,
        verbose=True,
    ):
        """
        Predict the summarization for the input data iterator.

        Args:
            test_dataset (SummarizationDataset): Dataset for which the summary
                to be predicted.
            num_gpus (int, optional): The number of GPUs used in prediction;
                forwarded to `get_device`. Defaults to None.
            gpu_ids (list): List of GPU IDs to be used; forwarded to
                `get_device` and `parallelize_model`. Defaults to None.
            local_rank (int, optional): Local rank of the device in distributed
                inferencing. Defaults to -1, which means non-distributed inferencing.
            batch_size (int, optional): The number of test examples in each batch.
                Defaults to 16.
            alpha (float, optional): Length penalty. Defaults to 0.6.
            beam_size (int, optional): Beam size of beam search. Defaults to 5.
            min_length (int, optional): Minimum number of tokens in the output sequence.
                Defaults to 15.
            max_length (int, optional):  Maximum number of tokens in output
                sequence. Defaults to 150.
            fp16 (bool, optional): Whether to use half-precision model for prediction.
                Defaults to False.
            verbose (bool, optional): Whether to show the progress bar.
                Defaults to True.

        Returns:
            List of strings which are the summaries

        """
        # Local import keeps this method self-contained.
        import re

        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)

        # move model to devices
        def this_model_move_callback(model, device):
            model = move_model_to_device(model, device)
            return parallelize_model(model,
                                     device,
                                     num_gpus=num_gpus,
                                     gpu_ids=gpu_ids,
                                     local_rank=local_rank)

        if fp16:
            # Convert the weights to half precision; the previous
            # self-assignment left the `fp16` flag without any effect.
            self.model = self.model.half()

        self.model = move_model_to_device(self.model, device)
        self.model.eval()

        predictor = build_predictor(
            self.processor.tokenizer,
            self.processor.symbols,
            self.model,
            alpha=alpha,
            beam_size=beam_size,
            min_length=min_length,
            max_length=max_length,
        )
        predictor = this_model_move_callback(predictor, device)
        self.model = parallelize_model(
            self.model,
            device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )

        test_sampler = SequentialSampler(test_dataset)

        def collate_fn(data):
            return self.processor.collate(data,
                                          self.max_pos_length,
                                          device,
                                          train_mode=False)

        test_dataloader = DataLoader(
            test_dataset,
            sampler=test_sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
        )
        print("dataset length is {}".format(len(test_dataset)))

        # Special tokens that are removed outright from the decoded text.
        dropped_tokens = ("[unused0]", "[unused3]", "[CLS]", "[SEP]", "[PAD]",
                          "[unused1]")

        def format_summary(translation):
            """ Transforms the output of the `from_batch` function
            into nicely formatted summaries.
            """
            summary = translation
            for token in dropped_tokens:
                summary = summary.replace(token, "")
            # Collapse runs of spaces. `str.replace(r" +", " ")` only matched
            # the literal two characters " +", so it never did this.
            summary = re.sub(r" +", " ", summary)
            # "[unused2]" becomes "." between words and is dropped elsewhere.
            summary = summary.replace(" [unused2] ", ".").replace("[unused2]", "")
            return summary.strip()

        def generate_summary_from_tokenid(preds, pred_score):
            # `pred_score` is accepted but not used.
            translations = []
            for b in range(preds.size()[0]):
                if len(preds[b]) < 1:
                    pred_sents = ""
                else:
                    # Token id 0 is skipped before converting ids to tokens.
                    pred_sents = self.processor.tokenizer.convert_ids_to_tokens(
                        [int(n) for n in preds[b] if int(n) != 0])
                    # Merge word pieces back into whole words.
                    pred_sents = " ".join(pred_sents).replace(" ##", "")
                translations.append(pred_sents)
            return translations

        generated_summaries = []

        for batch in tqdm(test_dataloader,
                          desc="Generating summary",
                          disable=not verbose):
            # Named `inputs` to avoid shadowing the builtin `input`.
            inputs = self.processor.get_inputs(batch,
                                               device,
                                               "bert",
                                               train_mode=False)
            translations, scores = predictor(**inputs)

            translations_text = generate_summary_from_tokenid(
                translations, scores)
            generated_summaries.extend(
                format_summary(t) for t in translations_text)

        # The model is left on `device`; the CUDA cache is not emptied here.

        return generated_summaries
Ejemplo n.º 27
0
    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        token_type_ids=None,
        num_gpus=None,
        num_epochs=1,
        batch_size=32,
        lr=2e-5,
        warmup_proportion=None,
        verbose=True,
    ):
        """Fine-tunes the BERT classifier using the given training data.

        Args:
            token_ids (list): List of training token id lists.
            input_mask (list): List of input mask lists.
            labels (list): List of training labels.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            num_gpus (int, optional): The number of gpus to use; forwarded
                to `get_device`. Defaults to None.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            batch_size (int, optional): Training batch size. Defaults to 32.
            lr (float): Learning rate of the Adam optimizer. Defaults to 2e-5.
            warmup_proportion (float, optional): Proportion of training to
                perform linear learning rate warmup for. E.g., 0.1 = 10% of
                training. Defaults to None.
            verbose (bool, optional): If True, shows the training progress and
                loss values. Defaults to True.
        """

        device, num_gpus = get_device(num_gpus)

        self.model = move_model_to_device(self.model, device, num_gpus)

        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
        labels_tensor = torch.tensor(labels, dtype=torch.long)

        # Evaluated once; decides both the dataset layout and how each batch
        # is unpacked in the training loop.
        use_token_types = bool(token_type_ids)

        if use_token_types:
            token_type_ids_tensor = torch.tensor(token_type_ids,
                                                 dtype=torch.long)
            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor,
                                          token_type_ids_tensor, labels_tensor)
        else:
            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor,
                                          labels_tensor)
        train_sampler = RandomSampler(train_dataset)

        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=batch_size)

        # define optimizer and model parameters: parameters whose name
        # contains one of the `no_decay` entries get no weight decay
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        num_batches = len(train_dataloader)
        num_train_optimization_steps = num_batches * num_epochs

        if warmup_proportion is None:
            opt = BertAdam(optimizer_grouped_parameters, lr=lr)
        else:
            opt = BertAdam(
                optimizer_grouped_parameters,
                lr=lr,
                t_total=num_train_optimization_steps,
                warmup=warmup_proportion,
            )

        # define loss function
        loss_func = nn.CrossEntropyLoss().to(device)

        # train
        self.model.train()  # training mode

        # Pre-bind the batch names so the cleanup at the end cannot raise
        # NameError when no batch is processed (e.g. num_epochs=0).
        x_batch = y_batch = mask_batch = token_type_ids_batch = None

        # Progress is printed roughly ten times per epoch.
        report_interval = (num_batches // 10) + 1

        for epoch in range(num_epochs):
            training_loss = 0
            for i, batch in enumerate(tqdm(train_dataloader,
                                           desc="Iteration")):
                moved = tuple(t.to(device) for t in batch)
                if use_token_types:
                    x_batch, mask_batch, token_type_ids_batch, y_batch = moved
                else:
                    x_batch, mask_batch, y_batch = moved
                    token_type_ids_batch = None

                opt.zero_grad()

                y_h = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
                loss = loss_func(y_h, y_batch).mean()

                training_loss += loss.item()

                loss.backward()
                opt.step()
                if verbose and i % report_interval == 0:
                    print(
                        "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f}"
                        .format(
                            epoch + 1,
                            num_epochs,
                            i + 1,
                            min(i + 1 + num_batches // 10, num_batches),
                            num_batches,
                            training_loss / (i + 1),
                        ))

        # empty cache
        del x_batch, y_batch, mask_batch, token_type_ids_batch
        torch.cuda.empty_cache()
Ejemplo n.º 28
0
def test_get_device_all_gpus():
    """Check that `get_device()` called without arguments picks CUDA and
    reports as many GPUs as `torch.cuda.device_count()`."""
    selected = get_device()
    device, gpus = selected
    assert isinstance(device, torch.device)
    assert "cuda" == device.type
    assert torch.cuda.device_count() == gpus
    def predict(self, test_loader, num_gpus=None, probabilities=False):
        """
        Score every batch of `test_loader` with the model in eval mode.

        Args:
            test_loader (torch Dataloader): Yields dict-like batches with the
                keys "token_ids", "input_mask", "labels" and, optionally,
                "token_type_ids".
            num_gpus (int, optional): The number of gpus to use; forwarded to
                `get_device`. Defaults to None.
            probabilities (bool, optional):
                If True, the predicted probability distribution
                is also returned. Defaults to False.

        Returns:
            1darray, dict(1darray, 1darray, ndarray): Predicted classes and target labels or
                a dictionary with classes, target labels, probabilities) if probabilities is True.
        """
        device, num_gpus = get_device(num_gpus)
        self.model = move_to_device(self.model, device, num_gpus)

        # score
        self.model.eval()

        preds = []
        test_labels = []
        for data in tqdm(test_loader, desc="Iteration"):
            # Inputs go to the same device the model was moved to, so this
            # also works when `get_device` selects the CPU.
            x_batch = data["token_ids"].to(device)
            mask_batch = data["input_mask"].to(device)

            # Labels are only collected, never fed to the model, so they are
            # not moved to the device.
            y_batch = data["labels"]

            token_type_ids_batch = None
            if "token_type_ids" in data and data["token_type_ids"] is not None:
                token_type_ids_batch = data["token_type_ids"].to(device)

            with torch.no_grad():
                p_batch = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=None,
                )
            preds.append(p_batch.cpu())
            test_labels.append(y_batch)

        preds = np.concatenate(preds)
        test_labels = np.concatenate(test_labels)

        if probabilities:
            return {
                "Predictions":
                preds.argmax(axis=1),
                "Target":
                test_labels,
                "classes probabilities":
                nn.Softmax(dim=1)(torch.Tensor(preds)).numpy(),
            }
        else:
            return preds.argmax(axis=1), test_labels
Ejemplo n.º 30
0
    def fine_tune(
        self,
        train_dataloader,
        get_inputs,
        max_steps=-1,
        num_train_epochs=1,
        max_grad_norm=1.0,
        gradient_accumulation_steps=1,
        n_gpu=1,
        optimizer=None,
        scheduler=None,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        fp16=False,
        fp16_opt_level="O1",
        local_rank=-1,
        verbose=True,
        seed=None,
    ):
        """
        Fine-tune `self.model` on the batches of `train_dataloader`.

        Args:
            train_dataloader (Dataloader): Yields batches of tensors.
            get_inputs (callable): Called as
                `get_inputs(batch, self.model_name)`; its result is unpacked
                as keyword arguments for the model.
            max_steps (int, optional): If positive, the number of optimizer
                steps to run; it then overrides `num_train_epochs`.
                Defaults to -1.
            num_train_epochs (int, optional): Number of passes over the data
                when `max_steps` is not positive. Defaults to 1.
            max_grad_norm (float, optional): Maximum gradient norm for
                clipping. Defaults to 1.0.
            gradient_accumulation_steps (int, optional): Number of batches
                whose gradients are accumulated per optimizer step.
                Defaults to 1.
            n_gpu (int, optional): Number of GPUs, forwarded to `get_device`.
                Defaults to 1.
            optimizer (optional): If None, an `AdamW` optimizer is built from
                `weight_decay`, `learning_rate` and `adam_epsilon`.
            scheduler (optional): If None, a linear schedule with
                `warmup_steps` warmup steps is built.
            weight_decay (float, optional): Only used when `optimizer` is
                None. Defaults to 0.0.
            learning_rate (float, optional): Only used when `optimizer` is
                None. Defaults to 5e-5.
            adam_epsilon (float, optional): Only used when `optimizer` is
                None. Defaults to 1e-8.
            warmup_steps (int, optional): Only used when `scheduler` is None.
                Defaults to 0.
            fp16 (bool, optional): Whether to use apex mixed precision.
                Defaults to False.
            fp16_opt_level (str, optional): apex `opt_level`. Defaults to "O1".
            local_rank (int, optional): Forwarded to `get_device`; when not
                -1 the model is wrapped in `DistributedDataParallel` on that
                device. Defaults to -1.
            verbose (bool, optional): Whether to show progress bars and print
                the loss. Defaults to True.
            seed (int, optional): If not None, passed to
                `Transformer.set_seed`. Defaults to None.

        Returns:
            tuple: `(global_step, average_loss)` where `average_loss` is the
            accumulated loss divided by the number of optimizer steps, or 0.0
            when no step was taken.

        Raises:
            ImportError: If `fp16` is True and apex is not installed.
        """

        # Honour the caller's `local_rank` instead of a hard-coded -1.
        device, num_gpus = get_device(num_gpus=n_gpu, local_rank=local_rank)

        if seed is not None:
            Transformer.set_seed(seed, num_gpus > 0)

        steps_per_epoch = len(train_dataloader) // gradient_accumulation_steps

        if max_steps > 0:
            t_total = max_steps
            # max(1, ...) avoids a ZeroDivisionError when the dataloader has
            # fewer batches than `gradient_accumulation_steps`.
            num_train_epochs = max_steps // max(1, steps_per_epoch) + 1
        else:
            t_total = steps_per_epoch * num_train_epochs

        if optimizer is None:
            # Parameters whose name contains one of these get no weight decay.
            no_decay = ["bias", "LayerNorm.weight"]
            optimizer_grouped_parameters = [
                {
                    "params": [
                        p for n, p in self.model.named_parameters()
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    weight_decay,
                },
                {
                    "params": [
                        p for n, p in self.model.named_parameters()
                        if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay":
                    0.0,
                },
            ]
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=learning_rate,
                              eps=adam_epsilon)

        if scheduler is None:
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=t_total)

        # Place the model on the target device BEFORE amp initialization and
        # DistributedDataParallel wrapping; DDP with `device_ids` expects the
        # module to already reside on that device.
        self.model.to(device)

        if fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex"
                )
            self.model, optimizer = amp.initialize(self.model,
                                                   optimizer,
                                                   opt_level=fp16_opt_level)

        if local_rank != -1:
            self.model = torch.nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[local_rank],
                output_device=local_rank,
                find_unused_parameters=True,
            )
        else:
            # Unwrap a model left wrapped by a previous call before deciding
            # whether to wrap it again.
            if isinstance(self.model, torch.nn.DataParallel):
                self.model = self.model.module

            if num_gpus > 1:
                self.model = torch.nn.DataParallel(self.model,
                                                   device_ids=list(
                                                       range(num_gpus)))

        self.model.train()

        global_step = 0
        tr_loss = 0.0
        self.model.zero_grad()
        train_iterator = trange(int(num_train_epochs),
                                desc="Epoch",
                                disable=local_rank not in [-1, 0]
                                or not verbose)

        for _ in train_iterator:
            # Pre-bind so the cleanup below works for an empty dataloader.
            batch = None
            epoch_iterator = tqdm(train_dataloader,
                                  desc="Iteration",
                                  disable=local_rank not in [-1, 0]
                                  or not verbose)
            for step, batch in enumerate(epoch_iterator):
                batch = tuple(t.to(device) for t in batch)
                inputs = get_inputs(batch, self.model_name)
                outputs = self.model(**inputs)
                loss = outputs[0]

                if num_gpus > 1:
                    # one loss value per replica; reduce to a scalar
                    loss = loss.mean()
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                if step % 10 == 0 and verbose:
                    tqdm.write("Loss:{:.6f}".format(loss))

                # Gradients are clipped after every backward pass, i.e. once
                # per batch rather than once per optimizer step.
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   max_grad_norm)

                tr_loss += loss.item()
                if (step + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    self.model.zero_grad()
                    global_step += 1

                # `>=` (not `>`) so that exactly `max_steps` steps are taken.
                if max_steps > 0 and global_step >= max_steps:
                    epoch_iterator.close()
                    break
            if max_steps > 0 and global_step >= max_steps:
                train_iterator.close()
                break

            # empty cache
            del batch
            torch.cuda.empty_cache()

        # Guard against division by zero when no optimizer step was taken.
        average_loss = tr_loss / global_step if global_step else 0.0
        return global_step, average_loss